# Lab 4
# Extend ETL pipeline 

#### Imports

In [9]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import os 

#### Loading data

In [10]:
DATA_FILE = 'custom_data.csv' # Your existing data file
LAST_EXTRACTION_FILE = 'last_extraction.txt'
OUTPUT_FULL_TRANSFORMED_FILENAME = 'transformed_full.csv'
OUTPUT_INCREMENTAL_TRANSFORMED_FILENAME = 'transformed_incremental.csv'
print(f"Data file: {DATA_FILE}")

Data file: custom_data.csv


#### Full extraction

In [11]:

df_full_extraction = pd.DataFrame() # Initialize an empty DataFrame

try:
    if os.path.exists(DATA_FILE):
        df_full_extraction = pd.read_csv(DATA_FILE)
        print(f"Number of rows: {df_full_extraction.shape[0]}")
        print(f"Number of columns: {df_full_extraction.shape[1]}")
        print(df_full_extraction.head())
        print(df_full_extraction.info())

        print(f"\nExtracted {df_full_extraction.shape[0]} rows fully.")
    else:
        print(f"Error: The file '{DATA_FILE}' was not found")

except Exception as e:
    print(f"An error occurred during full extraction: {e}")

Number of rows: 338
Number of columns: 10
   record_id transaction_date transaction_timestamp customer_name  \
0          1       2025-04-01   2025-04-01T17:49:57        Costco   
1          2       2025-04-01   2025-04-01T10:36:18         Apple   
2          3       2025-04-01   2025-04-01T17:16:47        Target   
3          4       2025-04-01   2025-04-01T18:15:48        Costco   
4          5       2025-04-01   2025-04-01T02:37:35        Amazon   

  product_category   amount  quantity payment_method last_updated_timestamp  \
0      Electronics   929.94         1         PayPal    2025-04-01T18:49:57   
1        Groceries   130.17         4         PayPal    2025-04-01T12:59:18   
2       Home Goods  1333.76         8    Credit Card    2025-04-01T18:19:47   
3            Tools  1115.54         9         PayPal    2025-04-01T20:41:48   
4         Software   633.14         6    Credit Card    2025-04-01T03:35:35   

      status  
0    Pending  
1  Cancelled  
2    Pending  
3  Cance

### Incremental extraction

In [12]:
def get_last_extraction_timestamp(file_path):
    if os.path.exists(file_path):
        with open(file_path, 'r') as f:
            try:
                timestamp_str = f.read().strip()
                if timestamp_str:
                    return datetime.fromisoformat(timestamp_str)
                else:
                    return datetime.min 
            except ValueError:
                return datetime.min 
    else:
        return datetime.min 

last_extraction_time = get_last_extraction_timestamp(LAST_EXTRACTION_FILE)
print(f"Last extraction timestamp: {last_extraction_time}")

df_incremental_extraction = pd.DataFrame() 

try:
    if not df_full_extraction.empty: 
        df_full_extraction['last_updated_timestamp'] = pd.to_datetime(df_full_extraction['last_updated_timestamp'], errors='coerce')
        df_current_data_valid_timestamps = df_full_extraction.dropna(subset=['last_updated_timestamp'])

        df_incremental_extraction = df_current_data_valid_timestamps[
            df_current_data_valid_timestamps['last_updated_timestamp'] > last_extraction_time
        ].copy() 

        print(f"\nExtracted {df_incremental_extraction.shape[0]} rows incrementally since last check ({last_extraction_time}).")

        if not df_incremental_extraction.empty:
            print("\nIncremental data extracted (first 5 rows):")
            print(df_incremental_extraction.head())
        else:
            print("No new or updated records found.")
    else:
        print("Full extraction DataFrame is empty, cannot perform incremental extraction.")

except Exception as e:
    print(f"An error occurred during incremental extraction: {e}")

Last extraction timestamp: 2025-06-09 20:56:34.567175

Extracted 0 rows incrementally since last check (2025-06-09 20:56:34.567175).
No new or updated records found.


## Transformations

#### cleaning data
- Cleans the dataset by handling missing values and removing duplicates.

In [3]:
def clean_data(df):
    df = df.drop_duplicates()
    df.fillna({
        'quantity': df['quantity'].median(),
        'unit_price': df['unit_price'].mean()
    }, inplace=True)
    return df

#### Enriching data
- adding calculated columns

In [4]:
def enrich_data(df):
    df['total_price'] = df['quantity'] * df['unit_price']
    df['contribution_pct'] = (df['total_price'] / df['total_price'].sum()) * 100
    return df

#### Structuring data
- Standardizing data formats and types.

In [5]:
def structure_data(df):
    df['date'] = pd.to_datetime(df['date'], errors='coerce').dt.strftime('%Y-%m-%d')
    df['quantity'] = df['quantity'].astype(int)
    df.rename(columns={'unit_price': 'price_per_unit'}, inplace=True)
    return df

### Transforming FULL DATA

In [8]:
# Debugging: Print column names to verify
print("Columns in full_data:", full_data.columns)
print("Columns in incremental_data:", incremental_data.columns)

# Adjust column names if needed (case sensitivity or alternate names)
if 'quantity' not in full_data.columns:
    print("Warning: 'quantity' column not found in full_data. Please check the dataset.")
if 'unit_price' not in full_data.columns:
    print("Warning: 'unit_price' column not found in full_data. Please check the dataset.")

# Example of handling missing columns
def clean_data(df):
    """Cleans the dataset by handling missing values and removing duplicates."""
    df = df.drop_duplicates()
    
    # Check if 'quantity' and 'unit_price' exist before processing
    if 'quantity' in df.columns and 'unit_price' in df.columns:
        df.fillna({
            'quantity': df['quantity'].median(),
            'unit_price': df['unit_price'].mean()
        }, inplace=True)
    else:
        print("Required columns for cleaning are missing.")
    
    return df

# Continue with the transformations
try:
    print("Transforming Full Data...")
    transformed_full = clean_data(full_data)
    transformed_full = enrich_data(transformed_full)
    transformed_full = structure_data(transformed_full)
    transformed_full.to_csv('transformed_full.csv', index=False)
    print("Full data transformation completed.")
except Exception as e:
    print(f"Error during full data transformation: {e}")

try:
    print("Transforming Incremental Data...")
    transformed_incremental = clean_data(incremental_data)
    transformed_incremental = enrich_data(transformed_incremental)
    transformed_incremental = structure_data(transformed_incremental)
    transformed_incremental.to_csv('transformed_incremental.csv', index=False)
    print("Incremental data transformation completed.")
except Exception as e:
    print(f"Error during incremental data transformation: {e}")


Columns in full_data: Index(['id', 'customer', 'date', 'amount', 'last_updated'], dtype='object')
Columns in incremental_data: Index(['record_id', 'transaction_date', 'transaction_timestamp',
       'customer_name', 'product_category', 'amount', 'quantity',
       'payment_method', 'last_updated_timestamp', 'status'],
      dtype='object')
Transforming Full Data...
Required columns for cleaning are missing.
Error during full data transformation: 'quantity'
Transforming Incremental Data...
Required columns for cleaning are missing.
Error during incremental data transformation: 'unit_price'


### Transforming INCREMENTAL DATA

In [7]:
print("Transforming Incremental Data...")
transformed_incremental = clean_data(incremental_data)
transformed_incremental = enrich_data(transformed_incremental)
transformed_incremental = structure_data(transformed_incremental)
transformed_incremental.to_csv('transformed_incremental.csv', index=False)


Transforming Incremental Data...


KeyError: 'unit_price'

### Transforming FULL DATA