# Open-source ETL tools:

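This notebook sets up an Airflow environment, initializes the Airflow metadata database, and defines a simple ETL DAG. The DAG extracts a small sample dataset, transforms it by adding a `value_squared` column, and then "loads" the result (in this demo, by printing it). The three tasks run in sequence and hand data to one another through XCom.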

In [None]:
# Install necessary packages: Airflow is pinned to 2.7.0, pandas is left unpinned.
# The leading "!" is IPython/Colab shell syntax, so this line only works inside a notebook cell.
# NOTE(review): Airflow's installation guide recommends installing with a matching
# --constraint file; confirm whether that is needed for this environment.
!pip install apache-airflow==2.7.0 pandas

In [None]:

# Set up Airflow home directory and environment variables
import os
os.environ['AIRFLOW_HOME'] = '/content/airflow'

# Create necessary directories
os.makedirs('/content/airflow/dags', exist_ok=True)

# Initialize Airflow database
!airflow db init

# Define your DAG script.
# The DAG source is kept as a string so that a later cell can write it into
# the Airflow dags folder. Comments inside the string use "#" (not docstrings)
# so the enclosing triple-quoted literal is never terminated early.
dag_script = """
from datetime import datetime, timedelta
from io import StringIO

import pandas as pd
from airflow import DAG
from airflow.operators.python import PythonOperator

default_args = {
    'owner': 'dataops_team',
    'depends_on_past': False,
    'start_date': datetime(2023, 1, 1),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG(
    'sample_etl_pipeline',
    default_args=default_args,
    description='A simple ETL pipeline using Airflow',
    schedule=timedelta(days=1),
    # start_date lies in the past: without catchup=False the scheduler would
    # create one backfill run per day since 2023-01-01 once the DAG is unpaused.
    catchup=False,
)


def extract_data(**kwargs):
    # Build the sample dataset and publish it to XCom as a JSON string,
    # because XCom values must be serializable.
    data = pd.DataFrame({'id': range(1, 6), 'value': [10, 20, 30, 40, 50]})
    kwargs['ti'].xcom_push(key='extracted_data', value=data.to_json())
    print("Data extracted.")


def transform_data(**kwargs):
    # Pull the extracted JSON, add the squared column, and push the result.
    ti = kwargs['ti']
    data_json = ti.xcom_pull(key='extracted_data', task_ids='extract_task')
    # Wrap in StringIO: passing a literal JSON string to read_json is
    # deprecated in recent pandas releases.
    data = pd.read_json(StringIO(data_json))
    data['value_squared'] = data['value'] ** 2
    ti.xcom_push(key='transformed_data', value=data.to_json())
    print("Data transformed.")


def load_data(**kwargs):
    # Demo "load" step: pull the transformed JSON and print the DataFrame.
    ti = kwargs['ti']
    data_json = ti.xcom_pull(key='transformed_data', task_ids='transform_task')
    data = pd.read_json(StringIO(data_json))
    print("Loading data:", data)


# Airflow 2 passes the task context to the callable's **kwargs automatically,
# so no extra operator flag is needed to receive 'ti'.
extract_task = PythonOperator(
    task_id='extract_task',
    python_callable=extract_data,
    dag=dag,
)

transform_task = PythonOperator(
    task_id='transform_task',
    python_callable=transform_data,
    dag=dag,
)

load_task = PythonOperator(
    task_id='load_task',
    python_callable=load_data,
    dag=dag,
)

extract_task >> transform_task >> load_task
"""



In [3]:
# Write the DAG source into the dags folder under the Airflow home directory
dag_path = '/content/airflow/dags/sample_etl_pipeline.py'
dag_dir = os.path.dirname(dag_path)
os.makedirs(dag_dir, exist_ok=True)  # no-op when the folder already exists
with open(dag_path, mode='w') as dag_file:
    dag_file.write(dag_script)

print("DAG script saved to:", dag_path)

DAG script saved to: /content/airflow/dags/sample_etl_pipeline.py


# Cloud-based ETL services:


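This code mounts Google Drive, initializes a Spark session, creates a synthetic dataset in pandas, converts it from a pandas DataFrame to a Spark DataFrame, applies transformations (type casts, renaming `name` to `full_name`, and dropping rows that contain nulls), and saves the transformed data to Google Drive in Parquet format.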

In [None]:
# Install PySpark (unpinned); the "!" prefix runs pip as a shell command from the notebook.
!pip install pyspark


In [5]:
# Mount Google Drive at /content/drive so paths under it can be used like local files.
# Requires the Colab runtime, which provides the google.colab package.
from google.colab import drive
drive.mount('/content/drive')



# Import necessary libraries
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Start the Spark session for this notebook (an already-running one is reused)
session_builder = SparkSession.builder.appName("DataOpsPipeline")
spark = session_builder.getOrCreate()

# Synthetic input: a handful of people, built in pandas first
data = {
    "id": [1, 2, 3, 4, 5],
    "name": ["Alice", "Bob", "Charlie", "David", "Eve"],
    "age": [25, 30, 35, 40, 45]
}
pdf = pd.DataFrame(data)

# Hand the pandas frame over to Spark
raw_data = spark.createDataFrame(pdf)

# Projection plan: (source column, target type, output column name)
column_plan = [
    ("id", "long", "id"),
    ("name", "string", "full_name"),
    ("age", "long", "age"),
]
projected_columns = [
    col(source).cast(dtype).alias(target)
    for source, dtype, target in column_plan
]

# Cast/rename the columns, then discard any row containing a null
transformed_data = raw_data.select(*projected_columns).na.drop()

# Destination folder inside the mounted Google Drive
output_path = "/content/drive/My Drive/path_to_your_transformed_data/"

# Persist as Parquet, replacing whatever a previous run left at that path
transformed_data.write.mode("overwrite").parquet(output_path)

print("Transformation and save completed.")


Mounted at /content/drive
Transformation and save completed.


# Building custom ETL pipelines:

The pipeline uses a custom class for the ETL process, logs its operations, and handles errors. The cell also creates a sample CSV file, allows an optional file upload to replace it, and demonstrates reading the transformed data back from the SQLite database.

In [7]:
import pandas as pd
from sqlalchemy import create_engine
import logging
import sqlite3
from google.colab import files

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class CustomETLPipeline:
    """Minimal CSV -> transform -> SQL table ETL pipeline.

    The source CSV must provide ``first_name``, ``last_name`` and ``age``
    columns; the result is written to the ``transformed_data`` table of the
    target database, replacing any existing table of that name.
    """

    # Right-inclusive bin edges for ``age_group``. The last edge is infinite
    # so the open-ended '50+' label really covers every age above 50.
    AGE_BINS = [0, 18, 35, 50, float('inf')]
    AGE_LABELS = ['0-18', '19-35', '36-50', '50+']

    def __init__(self, source_file, target_db_url):
        """Store the CSV path and create a SQLAlchemy engine for the target.

        Args:
            source_file: Path of the CSV file to read.
            target_db_url: SQLAlchemy database URL of the destination.
        """
        self.source_file = source_file
        self.target_engine = create_engine(target_db_url)

    def extract(self) -> pd.DataFrame:
        """Read the source CSV into a DataFrame."""
        logger.info("Extracting data from CSV file")
        return pd.read_csv(self.source_file)

    def transform(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of *df* with ``full_name`` and ``age_group`` added.

        The input frame is left untouched. Ages are bucketed into
        0-18, 19-35, 36-50 and 50+; negative or missing ages yield NaN.
        """
        logger.info("Applying transformations")
        # Work on a copy so the caller's DataFrame is not mutated.
        result = df.copy()
        result['full_name'] = result['first_name'] + ' ' + result['last_name']
        result['age_group'] = pd.cut(
            result['age'],
            bins=self.AGE_BINS,
            labels=self.AGE_LABELS,
            include_lowest=True,  # age 0 belongs to '0-18' rather than NaN
        )
        return result

    def load(self, df: pd.DataFrame) -> None:
        """Write *df* to the ``transformed_data`` table, replacing it."""
        logger.info("Loading data into the database")
        df.to_sql('transformed_data', self.target_engine, if_exists='replace', index=False)

    def run(self) -> None:
        """Run extract, transform and load in order.

        Failures are logged with their traceback and swallowed, so the
        calling notebook cell keeps running even if the pipeline fails.
        """
        try:
            data = self.extract()
            transformed_data = self.transform(data)
            self.load(transformed_data)
        except Exception as e:
            # logger.exception logs at ERROR level and appends the traceback.
            logger.exception("ETL process failed: %s", e)
        else:
            logger.info("ETL process completed successfully")

# Build a tiny demo dataset and write it out as the pipeline's default input
csv_file = '/content/sample_data.csv'
data = {
    'first_name': ['John', 'Jane', 'Jim', 'Jill'],
    'last_name': ['Doe', 'Smith', 'Brown', 'Taylor'],
    'age': [28, 34, 45, 23]
}
df = pd.DataFrame.from_dict(data)
# index=False keeps the positional row index out of the CSV
df.to_csv(path_or_buf=csv_file, index=False)

print(f"Sample CSV file created at: {csv_file}")

# Offer an upload; when nothing is uploaded, fall back to the sample CSV
uploaded = files.upload()
source_file = next(iter(uploaded)) if uploaded else csv_file

# SQLite keeps the demo self-contained (database file in the working directory)
target_db_url = 'sqlite:///transformed_data.db'

# Build the pipeline for the chosen input and execute it
etl_pipeline = CustomETLPipeline(
    source_file=source_file,
    target_db_url=target_db_url,
)
etl_pipeline.run()

# Check the content of the SQLite database.
# conn starts as None so that, if sqlite3.connect itself raises, the finally
# block does not fail with a NameError that would mask the original error.
conn = None
try:
    conn = sqlite3.connect('transformed_data.db')
    df_loaded = pd.read_sql_query("SELECT * FROM transformed_data", conn)
    print(df_loaded)
except Exception as e:
    logger.error(f"Failed to read from the database: {str(e)}")
finally:
    # Close only a connection that was actually opened.
    if conn is not None:
        conn.close()


Sample CSV file created at: /content/sample_data.csv


Saving sample_data.csv to sample_data (1).csv
  first_name last_name  age    full_name age_group
0       John       Doe   28     John Doe     19-35
1       Jane     Smith   34   Jane Smith     19-35
2        Jim     Brown   45    Jim Brown     36-50
3       Jill    Taylor   23  Jill Taylor     19-35
