# ETL Transformation Notebook

## Libraries

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

## Load extracted Data

In [4]:
# Read the raw and incremental CSV extracts, then report how many rows each one holds.
rawData, incrementalData = map(
    pd.read_csv,
    ('data/raw_data.csv', 'data/incremental_data.csv'),
)
print(f"Loaded {len(rawData)} raw records and {len(incrementalData)} incremental records")


Loaded 100 raw records and 10 incremental records


## Transformation 1: Cleaning missing values

#### Creating copies for Transformation

In [5]:
# Clean on independent copies so the frames loaded from disk stay untouched
# and remain available for before/after comparisons.
rawCleaned, incCleaned = (frame.copy() for frame in (rawData, incrementalData))

#### Before cleaning

In [6]:
# Baseline data-quality counts taken from the untouched frames:
# total null cells per frame, plus fully duplicated rows in the raw frame.
print(
    f"Raw data - Missing values: {rawData.isna().sum().sum()}",
    f"Raw data - Duplicates: {rawData.duplicated().sum()}",
    f"Incremental data - Missing values: {incrementalData.isna().sum().sum()}",
    sep="\n",
)


Raw data - Missing values: 88
Raw data - Duplicates: 1
Incremental data - Missing values: 12


#### Missing customer names
- change the missing names to 'Unknown_Customer' for both raw and incremental

In [7]:
# Replace missing customer names with a placeholder in both frames.
# The result is assigned back to the column: calling fillna(..., inplace=True) on
# df[col] operates on an intermediate Series, which raises a FutureWarning on
# pandas 2.x and leaves the DataFrame unchanged once Copy-on-Write is active.
rawCleaned['customer_name'] = rawCleaned['customer_name'].fillna('Unknown_Customer')
incCleaned['customer_name'] = incCleaned['customer_name'].fillna('Unknown_Customer')

#### Imputing missing quantities for raw data using median
- this is because median values are less affected by outliers
- calculates the median quantity for each product and fills that product's missing quantities with it

In [10]:
# Per-product median imputation, vectorised.
# transform('median') broadcasts each product's median quantity back onto that
# product's rows, and fillna() then touches only the rows where quantity is missing.
# Rows whose product has no observed quantity at all (or no product) stay NaN.
rawCleaned['quantity'] = rawCleaned['quantity'].fillna(
    rawCleaned.groupby('product')['quantity'].transform('median')
)


#### Imputing missing quantities for incremental data using median
- used the same logic as above
- calculates the median quantity for each product (from the incremental data itself) and fills that product's missing quantities with it

In [12]:
# Per-product median imputation for the incremental frame, vectorised.
# Medians are computed from incCleaned itself; transform('median') aligns each
# product's median with its rows and fillna() fills only the missing quantities.
# Rows whose product has no observed quantity in this frame (or no product) stay NaN.
incCleaned['quantity'] = incCleaned['quantity'].fillna(
    incCleaned.groupby('product')['quantity'].transform('median')
)

#### Missing unit prices using median
- first created a price mapping dictionary to hold the median unit price for each product
- then replaced each missing unit price in the raw data with the median price for that product

In [13]:
# Median unit price per product, kept as a plain dict for lookups.
price_map = rawCleaned.groupby('product')['unit_price'].median().to_dict()

# Fill only the missing unit prices with their product's median.
# This is the vectorised equivalent of a row-wise apply(): Series.map() yields NaN
# for a product that is absent from price_map (e.g. a missing product) instead of
# raising KeyError, so such rows are simply left unfilled.
rawCleaned['unit_price'] = rawCleaned['unit_price'].fillna(
    rawCleaned['product'].map(price_map)
)

#### Missing regions
- created a mapping from each customer to their most common region using the mode() function
- used the fillna() function to replace missing regions in the raw data with the most common region for that customer, falling back to 'Unknown' when no region is known
- for the incremental data, missing regions are filled with 'Central'

In [16]:
# Create mapping
region_map = rawCleaned.dropna().groupby('customer_name')['region'].agg(lambda x: x.mode()[0] if not x.mode().empty else 'Unknown').to_dict()
# Fill missing regions
rawCleaned['region'] = rawCleaned['region'].fillna(rawCleaned['customer_name'].map(region_map)).fillna('Unknown')
incCleaned['region'] = incCleaned['region'].fillna('Central')

#### Removing Exact Duplicates
- using the drop_duplicates() function to remove exact duplicates from both raw and incremental data

In [17]:
# Drop rows that are identical across every column, keeping the first occurrence.
rawCleaned = rawCleaned.drop_duplicates()
incCleaned = incCleaned.drop_duplicates()

#### Cleaning verification

In [19]:
# Post-cleaning check on the raw frame: remaining null cells, remaining duplicate
# rows, and the row count before vs. after cleaning.
print(
    f"Raw data - Missing values: {rawCleaned.isna().sum().sum()}",
    f"Raw data - Duplicates: {rawCleaned.duplicated().sum()}",
    f"Raw data - Records: {len(rawData)} → {len(rawCleaned)}",
    sep="\n",
)

Raw data - Missing values: 1
Raw data - Duplicates: 0
Raw data - Records: 100 → 99


## Transformation 2: Data Enrichment

#### Feature Engineering: Adding total price columns to both raw and incremental data