In [5]:
import pandas as pd
import os

# Price-band edges applied to total_price. pd.cut treats these as right-closed
# intervals: (0, 999] -> Low, (999, 4999] -> Medium, (4999, 9999] -> High,
# (9999, inf) -> Very High. The names `bins` / `labels` are kept at module
# level so later cells can still reference them.
bins = [0, 999, 4999, 9999, float('inf')]
labels = ['Low', 'Medium', 'High', 'Very High']


def transform(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned, enriched copy of *df*; the input is not modified.

    Steps, in order:
      1. Drop fully duplicated rows.
      2. Drop rows whose ``customer_name`` is missing.
      3. Fill missing ``quantity`` with 1, missing ``unit_price`` with the
         mean ``unit_price`` of the remaining rows, and missing ``region``
         with ``"Unknown"``.
      4. Add ``total_price = quantity * unit_price``.
      5. Parse ``order_date`` to datetime; unparseable values become NaT.
      6. Add a categorical ``price_band`` derived from ``total_price``.

    Raises:
        KeyError: if any of ``customer_name``, ``quantity``, ``unit_price``,
            ``region`` or ``order_date`` is absent from *df*.
    """
    # .copy() so the column assignments below never write into the caller's frame.
    cleaned = df.drop_duplicates().dropna(subset=['customer_name']).copy()

    cleaned['quantity'] = cleaned['quantity'].fillna(1)
    # The mean is taken after the row drops above, so removed rows do not
    # influence the imputed price.
    cleaned['unit_price'] = cleaned['unit_price'].fillna(cleaned['unit_price'].mean())
    cleaned['region'] = cleaned['region'].fillna("Unknown")

    cleaned['total_price'] = cleaned['quantity'] * cleaned['unit_price']

    cleaned['order_date'] = pd.to_datetime(cleaned['order_date'], errors='coerce')

    # include_lowest=True closes the first interval on the left, so a
    # total_price of exactly 0 is labelled 'Low' instead of silently becoming
    # NaN. Negative totals still fall outside every bin and stay NaN.
    cleaned['price_band'] = pd.cut(
        cleaned['total_price'],
        bins=bins,
        labels=labels,
        include_lowest=True,
    )
    return cleaned


# Driver: runs when executed as a notebook cell or a script (where __name__ is
# "__main__"), but not when this code is merely imported.
if __name__ == "__main__":
    # Step 1-6: Load each dataset and run it through the shared pipeline
    raw = transform(pd.read_csv("data/raw_data.csv"))
    incremental = transform(pd.read_csv("data/incremental_data.csv"))

    # ✅ Step 7: Create folder & Save
    os.makedirs("transformed", exist_ok=True)
    raw.to_csv("transformed/transformed_full.csv", index=False)
    incremental.to_csv("transformed/transformed_incremental.csv", index=False)

    print("✅ Transformed files saved successfully.")


✅ Transformed files saved successfully.
