# Lab 4
# Extend ETL pipeline 

#### Imports

In [9]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import os 

#### Loading data

In [10]:
# --- Pipeline configuration -------------------------------------------------
# Input: the CSV the extraction cells read from.
DATA_FILE = "custom_data.csv"

# State: text file holding the ISO-8601 timestamp of the previous extraction.
LAST_EXTRACTION_FILE = "last_extraction.txt"

# Outputs: where the transformed full / incremental frames are written.
OUTPUT_FULL_TRANSFORMED_FILENAME = "transformed_full.csv"
OUTPUT_INCREMENTAL_TRANSFORMED_FILENAME = "transformed_incremental.csv"

print(f"Data file: {DATA_FILE}")

Data file: custom_data.csv


#### Full extraction

In [20]:
# Full extraction: read the whole source CSV into `full_data_df`.
# The frame starts empty so later cells can test `.empty` even when the
# file is missing or unreadable.
full_data_df = pd.DataFrame()

try:
    if os.path.exists(DATA_FILE):
        full_data_df = pd.read_csv(DATA_FILE)
        print(f"Extracted {full_data_df.shape[0]} rows.")
        print("First 5 rows of full data:\n", full_data_df.head())
        # DataFrame.info() writes its summary to stdout and returns None, so
        # it must be called on its own line; passing it to print() emits the
        # summary before the header and then prints a stray "None".
        print("\nData Types:")
        full_data_df.info()
    else:
        print(f"Error: '{DATA_FILE}' not found.")
except Exception as e:
    # Best-effort cell: report the failure and leave `full_data_df` empty.
    print(f"An error occurred during full extraction: {e}")

Extracted 338 rows.
First 5 rows of full data:
    record_id transaction_date transaction_timestamp customer_name  \
0          1       2025-04-01   2025-04-01T17:49:57        Costco   
1          2       2025-04-01   2025-04-01T10:36:18         Apple   
2          3       2025-04-01   2025-04-01T17:16:47        Target   
3          4       2025-04-01   2025-04-01T18:15:48        Costco   
4          5       2025-04-01   2025-04-01T02:37:35        Amazon   

  product_category   amount  quantity payment_method last_updated_timestamp  \
0      Electronics   929.94         1         PayPal    2025-04-01T18:49:57   
1        Groceries   130.17         4         PayPal    2025-04-01T12:59:18   
2       Home Goods  1333.76         8    Credit Card    2025-04-01T18:19:47   
3            Tools  1115.54         9         PayPal    2025-04-01T20:41:48   
4         Software   633.14         6    Credit Card    2025-04-01T03:35:35   

      status  
0    Pending  
1  Cancelled  
2    Pending  
3 

### Incremental extraction

In [23]:
def get_last_timestamp(file_path):
    """Read the previous extraction checkpoint from ``file_path``.

    Args:
        file_path: Path to a text file containing one ISO-8601 timestamp.

    Returns:
        The parsed ``datetime`` when the file exists and has content;
        ``datetime.min`` when the file is missing or blank.

    Raises:
        ValueError: If the file content is not a valid ISO-8601 timestamp
            (propagated from ``datetime.fromisoformat``).
    """
    if not os.path.exists(file_path):
        return datetime.min

    with open(file_path, 'r') as checkpoint_file:
        stored_value = checkpoint_file.read().strip()

    if not stored_value:
        return datetime.min
    return datetime.fromisoformat(stored_value)

# Incremental extraction: keep only the rows whose `last_updated_timestamp`
# is strictly later than the checkpoint stored by the previous run.
last_extraction_time = get_last_timestamp(LAST_EXTRACTION_FILE)
print(f"Last extraction timestamp: {last_extraction_time}")

incremental_data_df = pd.DataFrame()

try:
    if full_data_df.empty:
        print("No full data to perform incremental extraction from.")
    else:
        # Parse the column in place so the comparison below is datetime-aware;
        # unparseable values become NaT and never compare as newer.
        updated_at = pd.to_datetime(full_data_df['last_updated_timestamp'], errors='coerce')
        full_data_df['last_updated_timestamp'] = updated_at

        is_newer = full_data_df['last_updated_timestamp'] > last_extraction_time
        incremental_data_df = full_data_df[is_newer].copy()

        print(f"Extracted {incremental_data_df.shape[0]} incremental rows.")
        if incremental_data_df.empty:
            print("No new or updated records found.")
        else:
            print("First 5 rows of incremental data:\n", incremental_data_df.head())
except Exception as e:
    print(f"An error occurred during incremental extraction: {e}")

Last extraction timestamp: 2025-06-14 10:30:50.574049
Extracted 0 incremental rows.
No new or updated records found.


## Transformations
- cleaning
- standardization
- type conversion

In [24]:
def apply_transformations(df):
    """Clean, type-convert, enrich and reshape a raw transactions frame.

    Steps, in order:
      1. Drop duplicate ``record_id`` rows, keeping the first occurrence.
      2. Strip surrounding whitespace from the text columns.
      3. Title-case ``status``.
      4. Coerce date/timestamp columns to datetimes and ``amount`` /
         ``quantity`` to numbers; unparseable values become NaT/NaN.
      5. Drop rows with a missing critical value. This runs after step 4 so
         values that failed coercion are removed too, instead of surviving
         as NaN/NaT in the output.
      6. Add ``total_price = quantity * amount``.
      7. Select the output columns and rename them.

    Args:
        df: Raw frame. Must contain ``record_id``, ``transaction_date``,
            ``transaction_timestamp``, ``amount`` and ``quantity``; the other
            columns are processed only when present.

    Returns:
        A new DataFrame with columns renamed to ``transaction_id``,
        ``customer``, ``category`` and ``payment_type``. ``df`` itself is
        not modified.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    def _is_text(column):
        # True for object columns and for pandas' dedicated string dtype.
        return pd.api.types.is_string_dtype(transformed_df[column].dtype)

    # Cleaning: remove duplicates (copy so the caller's frame is untouched).
    transformed_df = df.drop_duplicates(subset=['record_id'], keep='first').copy()

    # Cleaning: strip whitespace from text columns.
    for col in ['customer_name', 'product_category', 'payment_method', 'status']:
        if col in transformed_df.columns and _is_text(col):
            transformed_df[col] = transformed_df[col].str.strip()

    # Cleaning: standardize 'status' casing. Guarded like the strip above so
    # a non-text (e.g. all-NaN) column does not break the .str accessor.
    if 'status' in transformed_df.columns and _is_text('status'):
        transformed_df['status'] = transformed_df['status'].str.title()

    # Structural: data type conversion. Done BEFORE the missing-value drop so
    # that values which fail to parse are treated as missing.
    for col in ['transaction_date', 'transaction_timestamp']:
        transformed_df[col] = pd.to_datetime(transformed_df[col], errors='coerce')
    if 'last_updated_timestamp' in transformed_df.columns:
        transformed_df['last_updated_timestamp'] = pd.to_datetime(
            transformed_df['last_updated_timestamp'], errors='coerce'
        )
    for col in ['amount', 'quantity']:
        transformed_df[col] = pd.to_numeric(transformed_df[col], errors='coerce')

    # Cleaning: drop rows whose critical values are missing or unparseable.
    transformed_df = transformed_df.dropna(
        subset=['amount', 'quantity', 'transaction_date', 'transaction_timestamp']
    )

    # Enrichment: calculate total_price.
    transformed_df['total_price'] = transformed_df['quantity'] * transformed_df['amount']

    # Structural: column selection and renaming.
    selected_cols = [
        'record_id',
        'transaction_date',
        'transaction_timestamp',
        'customer_name',
        'product_category',
        'quantity',
        'amount',
        'total_price',
        'payment_method',
        'status'
    ]
    present_cols = [col for col in selected_cols if col in transformed_df.columns]

    return transformed_df[present_cols].rename(columns={
        'record_id': 'transaction_id',
        'customer_name': 'customer',
        'product_category': 'category',
        'payment_method': 'payment_type'
    })

#### Transform full data

In [25]:
# Transform the fully extracted frame and persist it as CSV.
print("\n--- Transforming Full Data ---")
if not full_data_df.empty:
    transformed_full_df = apply_transformations(full_data_df)
    print(f"Transformed {len(transformed_full_df)} records for full data.")
    print("First 5 rows of transformed full data:\n", transformed_full_df.head())
    # DataFrame.info() prints its own summary and returns None; calling it
    # inside print() would emit the summary before the header plus "None".
    print("\nData Types:")
    transformed_full_df.info()

    try:
        transformed_full_df.to_csv(OUTPUT_FULL_TRANSFORMED_FILENAME, index=False)
        print(f"Transformed full data saved to '{OUTPUT_FULL_TRANSFORMED_FILENAME}'")
    except Exception as e:
        print(f"Error saving transformed full data: {e}")
else:
    print("Full data is empty, skipping transformation.")


--- Transforming Full Data ---
Transformed 338 records for full data.
First 5 rows of transformed full data:
    transaction_id transaction_date transaction_timestamp customer  \
0               1       2025-04-01   2025-04-01 17:49:57   Costco   
1               2       2025-04-01   2025-04-01 10:36:18    Apple   
2               3       2025-04-01   2025-04-01 17:16:47   Target   
3               4       2025-04-01   2025-04-01 18:15:48   Costco   
4               5       2025-04-01   2025-04-01 02:37:35   Amazon   

      category  quantity   amount  total_price payment_type     status  
0  Electronics         1   929.94       929.94       PayPal    Pending  
1    Groceries         4   130.17       520.68       PayPal  Cancelled  
2   Home Goods         8  1333.76     10670.08  Credit Card    Pending  
3        Tools         9  1115.54     10039.86       PayPal  Cancelled  
4     Software         6   633.14      3798.84  Credit Card  Cancelled  
<class 'pandas.core.frame.DataFrame'

#### Incremental Data

In [26]:
# Transform only the rows picked up by the incremental extraction and
# persist them as CSV; nothing is written when there are no such rows.
if not incremental_data_df.empty:
    transformed_incremental_df = apply_transformations(incremental_data_df)
    print(f"Transformed {len(transformed_incremental_df)} records for incremental data.")
    print("First 5 rows of transformed incremental data:\n", transformed_incremental_df.head())
    # DataFrame.info() prints its own summary and returns None; calling it
    # inside print() would emit the summary before the header plus "None".
    print("\nData Types:")
    transformed_incremental_df.info()

    try:
        transformed_incremental_df.to_csv(OUTPUT_INCREMENTAL_TRANSFORMED_FILENAME, index=False)
        print(f"Transformed incremental data saved to '{OUTPUT_INCREMENTAL_TRANSFORMED_FILENAME}'")
    except Exception as e:
        print(f"Error saving transformed incremental data: {e}")
else:
    print("Incremental data is empty, skipping transformation.")

Incremental data is empty, skipping transformation.


### Update Last Extraction

In [15]:
# Persist the checkpoint that the next run's incremental extraction reads.
# NOTE(review): the checkpoint is the wall-clock time of this cell and is
# written unconditionally, even if the extraction cells found no data;
# confirm that is the intended behaviour.
print("\n--- Saving New Timestamp ---")
current_extraction_time = datetime.now()
checkpoint_text = current_extraction_time.isoformat()
try:
    with open(LAST_EXTRACTION_FILE, 'w') as f:
        f.write(checkpoint_text)
    print(f"New extraction timestamp saved: {checkpoint_text}")
except Exception as e:
    print(f"Error saving new timestamp to '{LAST_EXTRACTION_FILE}': {e}")


--- Saving New Timestamp ---
New extraction timestamp saved: 2025-06-14T10:30:50.574049
