In [None]:
# Initialize Git repository
git init

# Initialize DVC
dvc init

# Add and commit code files to Git
git add data_preprocessing.py model_training.py
git commit -m "Add data preprocessing and model training scripts"

# Add large dataset to DVC
dvc add data/large_dataset.csv
# Commit the pointer file together with the data/.gitignore entry that
# `dvc add` writes, so the ignore rule for the raw CSV is versioned too.
git add data/large_dataset.csv.dvc data/.gitignore
git commit -m "Add reference to large dataset"

# Create a new branch for feature development
git checkout -b feature/new_data_transformation

# Make changes to the code and data
# ... (modify data_preprocessing.py)
# ... (update large_dataset.csv)

# Update DVC tracked file (rewrites data/large_dataset.csv.dvc with the new hash)
dvc add data/large_dataset.csv

# Commit changes
git add data_preprocessing.py data/large_dataset.csv.dvc
git commit -m "Implement new data transformation and update dataset"

# Push changes to remote repository
# NOTE(review): assumes a Git remote named `origin` and a default DVC remote
# are already configured — neither is set up in this cell.
git push origin feature/new_data_transformation
dvc push


In [None]:
# Create a new feature branch
git checkout -b feature/enhanced_feature_engineering

# Make changes to feature engineering code
# NOTE(review): vim is interactive; run this step from a terminal rather than
# a non-interactive notebook cell.
vim feature_engineering.py

# Update dataset with new features
python feature_engineering.py

# Track updated dataset with DVC
dvc add data/enhanced_features.csv

# Commit changes, including the data/.gitignore entry that `dvc add` writes
# so the generated CSV stays out of Git
git add feature_engineering.py data/enhanced_features.csv.dvc data/.gitignore
git commit -m "Implement enhanced feature engineering"

# Push changes to remote repository
git push origin feature/enhanced_feature_engineering
dvc push

In [None]:
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from datetime import datetime, timedelta

# Modular components

def extract_data(**kwargs):
    """Produce the raw payload for the pipeline.

    Keyword arguments are accepted but not used.

    Returns:
        dict: ``{"raw_data": [1, 2, 3, 4, 5]}``.
    """
    # Extract data from source
    return {"raw_data": list(range(1, 6))}

def clean_data(**kwargs):
    """Keep only the even values from the ``extract_data`` task's result.

    Expects a ``ti`` keyword argument exposing ``xcom_pull``.

    Returns:
        dict: ``{"cleaned_data": [...]}`` holding the even entries of
        the pulled ``raw_data`` list, in their original order.
    """
    upstream = kwargs['ti'].xcom_pull(task_ids='extract_data')
    # Clean the data
    evens = list(filter(lambda value: value % 2 == 0, upstream['raw_data']))
    return {"cleaned_data": evens}

def transform_data(**kwargs):
    """Double every value from the ``clean_data`` task's result.

    Expects a ``ti`` keyword argument exposing ``xcom_pull``.

    Returns:
        dict: ``{"transformed_data": [...]}`` with each pulled
        ``cleaned_data`` entry multiplied by two.
    """
    upstream = kwargs['ti'].xcom_pull(task_ids='clean_data')
    # Transform the data
    doubled = list(map(lambda value: value * 2, upstream['cleaned_data']))
    return {"transformed_data": doubled}

def load_data(**kwargs):
    """Print the values produced by the ``transform_data`` task.

    Expects a ``ti`` keyword argument exposing ``xcom_pull``. Returns None.
    """
    upstream = kwargs['ti'].xcom_pull(task_ids='transform_data')
    # Load data (simplified example)
    transformed = upstream['transformed_data']
    print(f"Loading data: {transformed}")

# DAG definition
# Settings handed to the DAG constructor below through ``default_args``.
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': datetime(2023, 1, 1),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

# DAG with id 'modular_data_pipeline', scheduled at a one-day interval.
dag = DAG(
    'modular_data_pipeline',
    default_args=default_args,
    description='A modular data pipeline example',
    schedule_interval=timedelta(days=1),
)

# Task definitions
# Each task_id matches the name the downstream callable passes to xcom_pull.
# NOTE(review): the callables read kwargs['ti']; on Airflow 1.x that presumably
# requires provide_context=True on each operator — confirm the target version.
extract_task = PythonOperator(
    task_id='extract_data',
    python_callable=extract_data,
    dag=dag,
)

clean_task = PythonOperator(
    task_id='clean_data',
    python_callable=clean_data,
    dag=dag,
)

transform_task = PythonOperator(
    task_id='transform_data',
    python_callable=transform_data,
    dag=dag,
)

load_task = PythonOperator(
    task_id='load_data',
    python_callable=load_data,
    dag=dag,
)

# Define task dependencies
# Linear chain: extract -> clean -> transform -> load
extract_task >> clean_task >> transform_task >> load_task

In [None]:
import logging
from tenacity import retry, stop_after_attempt, wait_exponential
import random
import luigi
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pickle
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.operators.bash_operator import BashOperator
from datetime import datetime, timedelta

# Configure root logging at INFO level.
logging.basicConfig(level=logging.INFO)
# Logger named after the current module; used by the pipeline functions below.
logger = logging.getLogger(__name__)

class DataProcessingError(Exception):
    """Raised when the pipeline is handed data it cannot process."""

@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    # Re-raise the last underlying exception instead of tenacity's RetryError,
    # so callers that catch ConnectionError see it once the attempts run out.
    reraise=True,
)
def fetch_data(source):
    """Fetch the sample dataset from ``source``, retrying on failure.

    Up to three attempts are made with exponential back-off between them.

    Args:
        source: Identifier of the data source; only used in the log message.

    Returns:
        list: ``[1, 2, 3, 4, 5]``.

    Raises:
        ConnectionError: if every attempt hits the simulated failure.
    """
    logger.info(f"Fetching data from {source}")
    if random.random() < 0.5:  # Simulate intermittent failure
        raise ConnectionError("Failed to connect to data source")
    return [1, 2, 3, 4, 5]

def process_data(data):
    """Double every element of ``data``.

    Args:
        data: Sequence of values supporting ``* 2``.

    Returns:
        list: Each input element multiplied by two, in order.

    Raises:
        DataProcessingError: if ``data`` is empty or otherwise falsy.
    """
    logger.info("Processing data")
    if data:
        return [item * 2 for item in data]
    raise DataProcessingError("Empty dataset")

def save_data(data):
    """Log the data that would be saved; nothing is actually persisted.

    Args:
        data: Value to report in the log message.
    """
    # Simulate data saving
    message = f"Saving data: {data}"
    logger.info(message)

def run_pipeline(data_source):
    """Run fetch -> process -> save for ``data_source`` and log the outcome.

    Failures are logged rather than propagated: connection errors, data
    processing errors and any other exception each get their own error
    message. The cleanup message is logged on every path.

    Args:
        data_source: Passed through to ``fetch_data``.
    """
    try:
        fetched = fetch_data(data_source)
        doubled = process_data(fetched)
        save_data(doubled)
        logger.info("Pipeline completed successfully")
    except ConnectionError as exc:
        logger.error(f"Failed to fetch data after multiple attempts: {exc}")
    except DataProcessingError as exc:
        logger.error(f"Data processing error: {exc}")
    except Exception as exc:
        logger.error(f"Unexpected error in pipeline: {exc}")
    finally:
        # Perform any necessary cleanup
        logger.info("Cleaning up resources")

if __name__ == "__main__":
    # Run the pipeline once against a placeholder source name.
    run_pipeline("example_source")

# Airflow DAG definition
# Settings handed to the DAG constructor below through ``default_args``.
default_args = {
    'owner': 'dataops_team',
    'depends_on_past': False,
    'start_date': datetime(2023, 1, 1),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

# NOTE(review): only the DAG id and default_args are supplied here (no
# description or schedule), and no tasks are attached to this DAG in this cell.
dag = DAG(
    'simple_etl_pipeline',
    default_args=default_args,)

In [None]:
import logging
from tenacity import retry, stop_after_attempt, wait_exponential
import random

# Configure root logging at INFO level.
logging.basicConfig(level=logging.INFO)
# Logger named after the current module; used by the pipeline functions below.
logger = logging.getLogger(__name__)

class DataProcessingError(Exception):
    """Raised when the pipeline is handed data it cannot process."""

@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    # Re-raise the last underlying exception instead of tenacity's RetryError,
    # so callers that catch ConnectionError see it once the attempts run out.
    reraise=True,
)
def fetch_data(source):
    """Fetch the sample dataset from ``source``, retrying on failure.

    Up to three attempts are made with exponential back-off between them.

    Args:
        source: Identifier of the data source; only used in the log message.

    Returns:
        list: ``[1, 2, 3, 4, 5]``.

    Raises:
        ConnectionError: if every attempt hits the simulated failure.
    """
    logger.info(f"Fetching data from {source}")
    if random.random() < 0.5:  # Simulate intermittent failure
        raise ConnectionError("Failed to connect to data source")
    return [1, 2, 3, 4, 5]

def process_data(data):
    """Double every element of ``data``.

    Args:
        data: Sequence of values supporting ``* 2``.

    Returns:
        list: Each input element multiplied by two, in order.

    Raises:
        DataProcessingError: if ``data`` is empty or otherwise falsy.
    """
    logger.info("Processing data")
    if data:
        return [item * 2 for item in data]
    raise DataProcessingError("Empty dataset")

def save_data(data):
    """Log the data that would be saved; nothing is actually persisted.

    Args:
        data: Value to report in the log message.
    """
    # Simulate data saving
    message = f"Saving data: {data}"
    logger.info(message)

def run_pipeline(data_source):
    """Run fetch -> process -> save for ``data_source`` and log the outcome.

    Failures are logged rather than propagated: connection errors, data
    processing errors and any other exception each get their own error
    message. The cleanup message is logged on every path.

    Args:
        data_source: Passed through to ``fetch_data``.
    """
    try:
        fetched = fetch_data(data_source)
        doubled = process_data(fetched)
        save_data(doubled)
        logger.info("Pipeline completed successfully")
    except ConnectionError as exc:
        logger.error(f"Failed to fetch data after multiple attempts: {exc}")
    except DataProcessingError as exc:
        logger.error(f"Data processing error: {exc}")
    except Exception as exc:
        logger.error(f"Unexpected error in pipeline: {exc}")
    finally:
        # Perform any necessary cleanup
        logger.info("Cleaning up resources")

if __name__ == "__main__":
    # Run the pipeline once against a placeholder source name.
    run_pipeline("example_source")

In [None]:
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.operators.bash_operator import BashOperator
from datetime import datetime, timedelta

# Settings handed to the DAG constructor below through ``default_args``.
default_args = {
    'owner': 'dataops_team',
    'depends_on_past': False,
    'start_date': datetime(2023, 1, 1),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

# DAG with id 'simple_etl_pipeline', scheduled at a one-day interval.
dag = DAG(
    'simple_etl_pipeline',
    default_args=default_args,
    description='A simple ETL pipeline using Apache Airflow',
    schedule_interval=timedelta(days=1),
)

def extract_data(**kwargs):
    """Publish the sample records to XCom under the ``raw_data`` key.

    Expects a ``ti`` keyword argument exposing ``xcom_push``. Returns None.
    """
    # Simulating data extraction
    sample = list(range(1, 6))
    task_instance = kwargs['ti']
    task_instance.xcom_push(key='raw_data', value=sample)

def transform_data(**kwargs):
    """Double the ``raw_data`` values and publish them as ``transformed_data``.

    Pulls ``raw_data`` from the ``extract_task`` task via XCom and pushes the
    doubled list back under the ``transformed_data`` key. Expects a ``ti``
    keyword argument exposing ``xcom_pull`` and ``xcom_push``. Returns None.
    """
    task_instance = kwargs['ti']
    raw_values = task_instance.xcom_pull(key='raw_data', task_ids='extract_task')
    doubled = list(map(lambda value: value * 2, raw_values))
    task_instance.xcom_push(key='transformed_data', value=doubled)

In [None]:
def load_data(**kwargs):
    """Print the ``transformed_data`` value published by ``transform_task``.

    Expects a ``ti`` keyword argument exposing ``xcom_pull``. Returns None.
    """
    transformed = kwargs['ti'].xcom_pull(key='transformed_data', task_ids='transform_task')
    print(f"Loading data: {transformed}")

# Task definitions; each operator is attached to ``dag`` explicitly.
# The task_ids here are the names the callables pass to xcom_pull.
# NOTE(review): the callables read kwargs['ti']; on Airflow 1.x that presumably
# requires provide_context=True on each PythonOperator — confirm the target version.
extract_task = PythonOperator(
    task_id='extract_task',
    python_callable=extract_data,
    dag=dag,
)

transform_task = PythonOperator(
    task_id='transform_task',
    python_callable=transform_data,
    dag=dag,
)

load_task = PythonOperator(
    task_id='load_task',
    python_callable=load_data,
    dag=dag,
)

# Final step only echoes a message; no real cleanup command is run.
cleanup_task = BashOperator(
    task_id='cleanup_task',
    bash_command='echo "Performing cleanup operations"',
    dag=dag,
)

# Linear chain: extract -> transform -> load -> cleanup
extract_task >> transform_task >> load_task >> cleanup_task

In [None]:
import luigi
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pickle

class DownloadData(luigi.Task):
    """Write a synthetic 100-row dataset for ``date`` to ``data/raw_<date>.csv``."""

    # Date the dataset is generated for; becomes part of the output file name.
    date = luigi.DateParameter()

    def output(self):
        """Return the local CSV target whose existence marks the task complete."""
        return luigi.LocalTarget(f"data/raw_{self.date}.csv")

    def run(self):
        """Build the stand-in dataset and store it at the output target."""
        # Simulate downloading data
        df = pd.DataFrame({'feature': range(100), 'target': [0, 1] * 50})
        # temporary_path() creates the missing ``data/`` directory and only
        # moves the file into place when the block exits cleanly, so a failed
        # write never leaves a partial CSV that Luigi would treat as complete.
        with self.output().temporary_path() as tmp_path:
            df.to_csv(tmp_path, index=False)

class PrepareData(luigi.Task):
    """Add a ``feature_squared`` column to the raw CSV for ``date``."""

    # Date of the dataset to prepare; forwarded to the upstream task.
    date = luigi.DateParameter()

    def requires(self):
        """Depend on the raw download for the same date."""
        return DownloadData(date=self.date)

    def output(self):
        """Return the local CSV target for the prepared dataset."""
        return luigi.LocalTarget(f"data/prepared_{self.date}.csv")

    def run(self):
        """Read the raw CSV, derive the squared feature and write the result."""
        df = pd.read_csv(self.input().path)
        # Perform data preparation
        df['feature_squared'] = df['feature'] ** 2
        # Write through a temporary path so the final file only appears once
        # the CSV is complete; a crash mid-write must not look like success.
        with self.output().temporary_path() as tmp_path:
            df.to_csv(tmp_path, index=False)

class TrainModel(luigi.Task):
    """Fit a random-forest classifier on the prepared data and pickle it."""

    # Date of the prepared dataset to train on; forwarded to the upstream task.
    date = luigi.DateParameter()

    def requires(self):
        """Depend on the prepared dataset for the same date."""
        return PrepareData(date=self.date)

    def output(self):
        """Return the local target for the pickled model."""
        # Pickles are bytes. Luigi's default target format is text and rejects
        # byte writes, so request the pass-through (binary) format explicitly.
        return luigi.LocalTarget(
            f"models/model_{self.date}.pkl",
            format=luigi.format.Nop,
        )

    def run(self):
        """Train on 80% of the prepared rows and pickle the fitted model.

        Defined on the class itself: without a ``run`` here the inherited
        default does nothing and the model file is never written.
        """
        df = pd.read_csv(self.input().path)
        X = df[['feature', 'feature_squared']]
        y = df['target']
        # The held-out 20% is not used in this task.
        X_train, _, y_train, _ = train_test_split(X, y, test_size=0.2)
        model = RandomForestClassifier()
        model.fit(X_train, y_train)
        # open() on the target creates ``models/`` and writes atomically.
        with self.output().open('wb') as f:
            pickle.dump(model, f)

In [None]:
def run(self):
    """Train a random forest on ``self.input()`` and pickle it to ``self.output()``.

    NOTE(review): this ``def`` sits at module level in its cell (no enclosing
    class) although it takes ``self``; it reads like a Luigi task's ``run``
    method — confirm where it is meant to live.
    """
    frame = pd.read_csv(self.input().path)
    features = frame[['feature', 'feature_squared']]
    labels = frame['target']
    # Hold out 20% of the rows; only the training portion is used below.
    train_features, _, train_labels, _ = train_test_split(features, labels, test_size=0.2)
    model = RandomForestClassifier()
    model.fit(train_features, train_labels)
    with self.output().open('wb') as sink:
        pickle.dump(model, sink)

class EvaluateModel(luigi.Task):
    """Score the trained model on the prepared data and write a text report."""

    # Date of the dataset/model pair to evaluate; forwarded to both upstream tasks.
    date = luigi.DateParameter()

    def requires(self):
        """Depend on the prepared data and the trained model for the same date."""
        return {
            'data': PrepareData(date=self.date),
            'model': TrainModel(date=self.date)
        }

    def output(self):
        """Return the local target for the evaluation report."""
        return luigi.LocalTarget(f"reports/evaluation_{self.date}.txt")

    def run(self):
        """Load the pickled model, compute its accuracy and write the report."""
        inputs = self.input()
        # Read the pickle as raw bytes from disk: opening it through a target
        # that uses Luigi's default text format would decode it to str, which
        # pickle cannot load.
        # NOTE: pickle.load executes arbitrary code; this assumes the file was
        # produced by this pipeline's own training task.
        with open(inputs['model'].path, 'rb') as f:
            model = pickle.load(f)
        df = pd.read_csv(inputs['data'].path)
        X = df[['feature', 'feature_squared']]
        y = df['target']
        # NOTE(review): scored on the full prepared dataset, which appears to
        # include the rows the model was trained on — confirm this is intended.
        accuracy = model.score(X, y)
        with self.output().open('w') as f:
            f.write(f"Model accuracy: {accuracy}")

if __name__ == '__main__':
    # Build the evaluation task for 2023-05-01 using Luigi's in-process
    # local scheduler.
    luigi.build([EvaluateModel(date=luigi.DateParameter().parse('2023-05-01'))], local_scheduler=True)