# Lab 4
# Extend ETL pipeline 

#### Imports

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import os 

#### Adding more rows to the DataFrame
- This is the code that adds more rows to the DataFrame.
- Without it, there would be no new data for the incremental parts of the assignment.
- This script generates new records for a custom dataset and appends them to an existing CSV file.
- If the file does not exist, it creates a new one with the specified number of records.
- The dataset includes transaction details such as date, customer name, product category, amount, quantity, payment method, and status.

In [3]:
# Defined Variables
# CSV file that accumulates the synthetic transactions (created when missing).
OUTPUT_FILENAME = 'custom_data.csv'
# Number of synthetic records appended each time this cell runs.
NUM_NEW_ROWS = 30
# NOTE(review): not referenced anywhere in this cell -- the loop below emits
# exactly one record per calendar day. Left in place in case other cells use it.
SALES_PER_DAY_RANGE = (3, 8)

# Value pools sampled uniformly at random for the categorical columns.
CUSTOMERS = ['Amazon', 'Walmart', 'Target', 'Costco', 'BestBuy', 'eBay', 'Microsoft', 'Google', 'Apple', 'Meta']
PRODUCT_CATEGORIES = ['Electronics', 'Home Goods', 'Apparel', 'Books', 'Groceries', 'Software', 'Tools', 'Sports']
PAYMENT_METHODS = ['Credit Card', 'Debit Card', 'PayPal', 'Bank Transfer']
STATUS_OPTIONS = ['Completed', 'Pending', 'Cancelled', 'Refunded']

# Announce how many rows are about to be appended.
print(f"Attempting to add {NUM_NEW_ROWS} new records to '{OUTPUT_FILENAME}'...")

# Load whatever is already on disk so the new record IDs continue the sequence.
# try/except is used instead of an explicit os.path existence check.
try:
    existing_df = pd.read_csv(OUTPUT_FILENAME)
except FileNotFoundError:
    print(f"'{OUTPUT_FILENAME}' not found. Creating a new file with new records.")
    existing_df = pd.DataFrame()
except pd.errors.EmptyDataError:
    # A zero-byte file has no header row for pandas to parse; treat it the
    # same as "no existing data" instead of aborting the cell.
    print(f"'{OUTPUT_FILENAME}' is empty. Starting from the first record.")
    existing_df = pd.DataFrame()

# Continue numbering after the highest existing ID, or start at 1.
if existing_df.empty:
    record_id_counter = 1
else:
    record_id_counter = existing_df['record_id'].max() + 1

# Start generating new records from June 1, 2025
START_DATE_FOR_NEW_DATA = datetime(2025, 6, 1)

# Records are collected as plain dicts and turned into a DataFrame once at the end.
new_records = []
num_records_generated = 0
current_date_for_new_data = START_DATE_FOR_NEW_DATA

while num_records_generated < NUM_NEW_ROWS:
    # One record per calendar day: the n-th record falls n days after the start date.
    current_date_for_new_data = START_DATE_FOR_NEW_DATA + timedelta(days=num_records_generated)

    # Random time of day for the transaction itself.
    transaction_time = current_date_for_new_data + timedelta(
        hours=random.randint(0, 23),
        minutes=random.randint(0, 59),
        seconds=random.randint(0, 59)
    )

    # The record is "updated" between 1 and 120 minutes after the transaction.
    last_updated_time = transaction_time + timedelta(
        minutes=random.randint(1, 120)
    )

    # Keep the update on the same calendar day: clamp to 23:59:59 when the
    # random offset would roll over past midnight.
    if last_updated_time.date() > current_date_for_new_data.date():
        last_updated_time = current_date_for_new_data.replace(hour=23, minute=59, second=59)

    new_records.append({
        'record_id': record_id_counter,
        'transaction_date': current_date_for_new_data.date().isoformat(),
        'transaction_timestamp': transaction_time.isoformat(),
        'customer_name': random.choice(CUSTOMERS),
        'product_category': random.choice(PRODUCT_CATEGORIES),
        'amount': round(random.uniform(10.00, 2000.00), 2),
        'quantity': random.randint(1, 10),
        'payment_method': random.choice(PAYMENT_METHODS),
        'last_updated_timestamp': last_updated_time.isoformat(),
        'status': random.choice(STATUS_OPTIONS)
    })
    record_id_counter += 1
    num_records_generated += 1

# Convert the list of new records to a DataFrame and combine with existing data
new_df = pd.DataFrame(new_records)
combined_df = pd.concat([existing_df, new_df], ignore_index=True)
combined_df.to_csv(OUTPUT_FILENAME, index=False)

print(f"Successfully added {NUM_NEW_ROWS} new records to '{OUTPUT_FILENAME}'.")

Attempting to add 30 new records to 'custom_data.csv'...
Successfully added 30 new records to 'custom_data.csv'.


#### Loading Original data
- setting the path to all the csv files and the extraction text file

In [4]:
# Source CSV read by the full and incremental extraction steps.
DATA_FILE = 'custom_data.csv' 
# Text file holding the ISO-format timestamp of the previous extraction run.
LAST_EXTRACTION_FILE = 'last_extraction.txt'
# Destination CSV for the transformed full dataset.
OUTPUT_FULL_TRANSFORMED_FILENAME = 'transformed_full.csv'
# Destination CSV for the transformed incremental (new/updated) rows.
OUTPUT_INCREMENTAL_TRANSFORMED_FILENAME = 'transformed_incremental.csv'

#### Full extraction
- Simply loading the full dataset

In [5]:

# Full extraction: pull every row of the source CSV into a single DataFrame.
full_data_df = pd.read_csv(DATA_FILE)

# Preview the first five rows, then report the (rows, columns) shape.
print(full_data_df.head(5))
print("Shape:", full_data_df.shape)

   record_id transaction_date transaction_timestamp customer_name  \
0          1       2025-04-01   2025-04-01T17:49:57        Costco   
1          2       2025-04-01   2025-04-01T10:36:18         Apple   
2          3       2025-04-01   2025-04-01T17:16:47        Target   
3          4       2025-04-01   2025-04-01T18:15:48        Costco   
4          5       2025-04-01   2025-04-01T02:37:35        Amazon   

  product_category   amount  quantity payment_method last_updated_timestamp  \
0      Electronics   929.94         1         PayPal    2025-04-01T18:49:57   
1        Groceries   130.17         4         PayPal    2025-04-01T12:59:18   
2       Home Goods  1333.76         8    Credit Card    2025-04-01T18:19:47   
3            Tools  1115.54         9         PayPal    2025-04-01T20:41:48   
4         Software   633.14         6    Credit Card    2025-04-01T03:35:35   

      status  
0    Pending  
1  Cancelled  
2    Pending  
3  Cancelled  
4  Cancelled  
Shape: (488, 10)


### Incremental extraction

In [None]:
def get_last_timestamp(file_path):
    """Return the extraction timestamp stored in ``file_path``.

    The file is expected to contain a single ISO-8601 timestamp. When the
    file does not exist, or contains only whitespace, ``datetime.min`` is
    returned as the "never extracted" sentinel. Content that is not a valid
    ISO timestamp makes ``datetime.fromisoformat`` raise ``ValueError``.
    """
    # No watermark file yet: nothing has been extracted before.
    if not os.path.exists(file_path):
        return datetime.min

    with open(file_path, 'r') as handle:
        stored = handle.read().strip()

    # A blank file is treated the same as a missing one.
    return datetime.fromisoformat(stored) if stored else datetime.min
# Read the watermark left by the previous run (datetime.min when there is none).
last_extraction_time = get_last_timestamp(LAST_EXTRACTION_FILE)
print(f"Last extraction timestamp: {last_extraction_time}")

# Default to an empty frame so later cells can always test `.empty`.
incremental_data_df = pd.DataFrame()
# Attempting to extract incremental data based on the last extraction timestamp
try:
    if not full_data_df.empty:
        # Parse into a separate Series instead of overwriting the column on
        # full_data_df, so this cell has no side effect on the full extract
        # and can be re-run safely. Unparseable values become NaT.
        last_updated = pd.to_datetime(full_data_df['last_updated_timestamp'], errors='coerce')

        if last_extraction_time == datetime.min:
            # First run: no watermark yet. datetime.min is outside the range
            # of nanosecond-precision pandas timestamps, so avoid comparing
            # against it (NOTE(review): believed to raise on some pandas
            # versions -- confirm) and take every row with a valid timestamp,
            # which is what the comparison would select anyway.
            is_new = last_updated.notna()
        else:
            # NaT compares False, so rows with unparseable timestamps are excluded.
            is_new = last_updated > last_extraction_time

        # Boolean selection yields a new frame; attach the parsed timestamps
        # to that copy only.
        incremental_data_df = full_data_df.loc[is_new].assign(
            last_updated_timestamp=last_updated[is_new]
        )

        print(f"Extracted {incremental_data_df.shape[0]} incremental rows.")
        if not incremental_data_df.empty:
            print("First 5 rows of incremental data:\n", incremental_data_df.head())
        else:
            print("No new or updated records found.")
    else:
        print("No full data to perform incremental extraction from.")
except Exception as e:
    # Report the problem and leave incremental_data_df empty so the
    # downstream transformation cell skips cleanly.
    print(f"An error occurred during incremental extraction: {e}")

Last extraction timestamp: 2025-06-17 12:29:13.058419
Extracted 68 incremental rows.
First 5 rows of incremental data:
      record_id transaction_date transaction_timestamp customer_name  \
355        356       2025-06-18   2025-06-18T13:03:27        Amazon   
356        357       2025-06-19   2025-06-19T15:37:21        Google   
357        358       2025-06-20   2025-06-20T03:05:25        Target   
358        359       2025-06-21   2025-06-21T19:45:24        Amazon   
359        360       2025-06-22   2025-06-22T17:12:33        Amazon   

    product_category   amount  quantity payment_method last_updated_timestamp  \
355            Books  1435.31         5    Credit Card    2025-06-18 13:36:27   
356       Home Goods    39.00         3  Bank Transfer    2025-06-19 16:16:21   
357      Electronics  1987.26         3         PayPal    2025-06-20 04:32:25   
358        Groceries  1267.45         2  Bank Transfer    2025-06-21 20:57:24   
359           Sports   470.82         4    Credi

## Transformations
Decided on using a function to transform the data, as it avoids duplicating code between the full data and the incremental data.
Performed the following transformations:
- removing duplicates
- stripping whitespace
- standardizing (`status` casing)
- dropping rows with missing values
- Data type conversions
    - string to datetime
    - string to numeric / coercing to numeric to prevent errors
- feature engineering 
- renaming columns for standardization

In [7]:
def apply_transformations(df):
    """Clean, type, and reshape raw transaction rows into the output schema.

    Steps, in order:
      1. Drop duplicate ``record_id`` rows, keeping the first occurrence.
      2. Strip surrounding whitespace from the text columns that are present.
      3. Title-case ``status`` when present.
      4. Convert date/timestamp columns to datetime and ``amount``/``quantity``
         to numeric; values that cannot be parsed become NaT/NaN.
      5. Drop rows missing any of ``amount``, ``quantity``,
         ``transaction_date`` or ``transaction_timestamp``. This runs AFTER the
         conversions so that values which failed to parse are removed too.
      6. Add ``total_price = quantity * amount``.
      7. Select the output columns and rename them.

    Args:
        df: DataFrame of raw transactions. ``record_id``, ``transaction_date``,
            ``transaction_timestamp``, ``last_updated_timestamp``, ``amount``
            and ``quantity`` are indexed directly, so a missing one raises
            ``KeyError``. The input frame is not modified.

    Returns:
        A new DataFrame with (where the source column existed) the columns
        ``transaction_id``, ``transaction_date``, ``transaction_timestamp``,
        ``customer``, ``category``, ``quantity``, ``amount``, ``total_price``,
        ``payment_type`` and ``status``.
    """
    text_columns = ['customer_name', 'product_category', 'payment_method', 'status']
    required_columns = ['amount', 'quantity', 'transaction_date', 'transaction_timestamp']

    # Removing duplicates; .copy() guarantees an independent frame so the
    # column assignments below never touch the caller's data.
    transformed_df = df.drop_duplicates(subset=['record_id'], keep='first').copy()

    # Stripping whitespace from string columns. The dtype test accepts both
    # object columns and pandas string-dtype columns; checking only for
    # 'object' would silently skip the latter.
    for col in text_columns:
        if col in transformed_df.columns and pd.api.types.is_string_dtype(transformed_df[col].dtype):
            transformed_df[col] = transformed_df[col].str.strip()

    # Standardizing 'status' casing
    if 'status' in transformed_df.columns:
        transformed_df['status'] = transformed_df['status'].str.title()

    # Data Type Conversion (errors='coerce' turns unparseable values into NaT/NaN)
    for col in ['transaction_date', 'transaction_timestamp', 'last_updated_timestamp']:
        transformed_df[col] = pd.to_datetime(transformed_df[col], errors='coerce')
    for col in ['amount', 'quantity']:
        transformed_df[col] = pd.to_numeric(transformed_df[col], errors='coerce')

    # Handling missing values in critical columns. Done after the conversion
    # so rows whose values were coerced to NaT/NaN are dropped as well as rows
    # that were missing to begin with.
    transformed_df = transformed_df.dropna(subset=required_columns)

    # Feature engineering
    transformed_df['total_price'] = transformed_df['quantity'] * transformed_df['amount']

    # Column Selection and Renaming
    selected_cols = [
        'record_id',
        'transaction_date',
        'transaction_timestamp',
        'customer_name',
        'product_category',
        'quantity',
        'amount',
        'total_price',
        'payment_method',
        'status'
    ]
    present_cols = [col for col in selected_cols if col in transformed_df.columns]

    return transformed_df[present_cols].rename(columns={
        'record_id': 'transaction_id',
        'customer_name': 'customer',
        'product_category': 'category',
        'payment_method': 'payment_type'
    })

#### Transform full data
- apply the transformation function to the full data

In [None]:
# Transform the full extract and persist it, unless there is nothing to process.
if not full_data_df.empty:
    # Run the shared cleaning/typing/renaming pipeline on the full extract.
    transformed_full_df = apply_transformations(full_data_df)

    # Summarise the result.
    print(f"Transformed {len(transformed_full_df)} records for full data.")
    print("First 5 rows of transformed full data:\n", transformed_full_df.head())
    # DataFrame.info() writes its report to stdout itself and returns None.
    # Passing it to print() made the report appear before the label, followed
    # by a stray "None"; print the label first and call info() on its own.
    print("\nData Types:")
    transformed_full_df.info()

    try:
        # Attempt to save the transformed DataFrame to a CSV file
        transformed_full_df.to_csv(OUTPUT_FULL_TRANSFORMED_FILENAME, index=False)
        print(f"Transformed full data saved to '{OUTPUT_FULL_TRANSFORMED_FILENAME}'")
    except Exception as e:
        # Report the failure instead of aborting the notebook run.
        print(f"Error saving transformed full data: {e}")
else:
    # If the input DataFrame was empty, skip the transformation process
    print("Full data is empty, skipping transformation.")

Transformed 488 records for full data.
First 5 rows of transformed full data:
    transaction_id transaction_date transaction_timestamp customer  \
0               1       2025-04-01   2025-04-01 17:49:57   Costco   
1               2       2025-04-01   2025-04-01 10:36:18    Apple   
2               3       2025-04-01   2025-04-01 17:16:47   Target   
3               4       2025-04-01   2025-04-01 18:15:48   Costco   
4               5       2025-04-01   2025-04-01 02:37:35   Amazon   

      category  quantity   amount  total_price payment_type     status  
0  Electronics         1   929.94       929.94       PayPal    Pending  
1    Groceries         4   130.17       520.68       PayPal  Cancelled  
2   Home Goods         8  1333.76     10670.08  Credit Card    Pending  
3        Tools         9  1115.54     10039.86       PayPal  Cancelled  
4     Software         6   633.14      3798.84  Credit Card  Cancelled  
<class 'pandas.core.frame.DataFrame'>
Int64Index: 488 entries, 0 to 

#### Incremental Data

In [None]:
# Transform the incremental extract and persist it, unless it is empty.
if not incremental_data_df.empty:
    # Apply the same transformation function used for the full data.
    transformed_incremental_df = apply_transformations(incremental_data_df)

    # Summarise the result.
    print(f"Transformed {len(transformed_incremental_df)} records for incremental data.")
    print("First 5 rows of transformed incremental data:\n", transformed_incremental_df.head())
    # DataFrame.info() writes its report to stdout itself and returns None.
    # Passing it to print() made the report appear before the label, followed
    # by a stray "None"; print the label first and call info() on its own.
    print("\nData Types:")
    transformed_incremental_df.info()

    try:
        # Attempt to save the transformed incremental data to a different output file
        transformed_incremental_df.to_csv(OUTPUT_INCREMENTAL_TRANSFORMED_FILENAME, index=False)
        print(f"Transformed incremental data saved to '{OUTPUT_INCREMENTAL_TRANSFORMED_FILENAME}'")
    except Exception as e:
        # Report the failure instead of aborting the notebook run.
        print(f"Error saving transformed incremental data: {e}")
else:
    # If no incremental data was available, skip processing
    print("Incremental data is empty, skipping transformation.")

Transformed 68 records for incremental data.
First 5 rows of transformed incremental data:
      transaction_id transaction_date transaction_timestamp customer  \
355             356       2025-06-18   2025-06-18 13:03:27   Amazon   
356             357       2025-06-19   2025-06-19 15:37:21   Google   
357             358       2025-06-20   2025-06-20 03:05:25   Target   
358             359       2025-06-21   2025-06-21 19:45:24   Amazon   
359             360       2025-06-22   2025-06-22 17:12:33   Amazon   

        category  quantity   amount  total_price   payment_type     status  
355        Books         5  1435.31      7176.55    Credit Card  Completed  
356   Home Goods         3    39.00       117.00  Bank Transfer    Pending  
357  Electronics         3  1987.26      5961.78         PayPal  Cancelled  
358    Groceries         2  1267.45      2534.90  Bank Transfer  Cancelled  
359       Sports         4   470.82      1883.28    Credit Card  Completed  
<class 'pandas.core

### Update Last Extraction

In [None]:
# Record "now" as the watermark for this extraction run.
# NOTE(review): this is wall-clock time taken after extraction, whereas the
# incremental filter compares against each row's last_updated_timestamp --
# confirm this is the intended watermark.
current_extraction_time = datetime.now()

try:
    # Mode 'w' truncates the file, so it only ever holds the latest timestamp.
    with open(LAST_EXTRACTION_FILE, 'w') as watermark_file:
        watermark_file.write(current_extraction_time.isoformat())
except Exception as e:
    # File-level failures (e.g. permissions, full disk, bad path) are reported
    # rather than raised.
    print(f"Error saving new timestamp to '{LAST_EXTRACTION_FILE}': {e}")
else:
    # Reached only when the write succeeded: echo the stored value.
    print(f"New extraction timestamp saved: {current_extraction_time.isoformat()}")

New extraction timestamp saved: 2025-06-18T12:31:32.820503
