In [22]:
# -----------------------------
# Customer Segmentation Processing
# -----------------------------

import pandas as pd
import os
from datetime import datetime

# Project root is resolved two directory levels above the current working directory.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), "../.."))

# Input folder (cleaned datasets) and output folder (segmentation artefacts)
CLEANED_DATA_DIR = os.path.join(PROJECT_ROOT, "data", "cleaned")
PROCESSED_DATA_DIR = os.path.join(PROJECT_ROOT, "data", "customer_segmentation")
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

# Provenance log lives next to the processed data
log_file_path = os.path.join(PROCESSED_DATA_DIR, "provenance_log.txt")

# Mode "w" truncates any log left by a previous run before writing the header line.
log_started_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
with open(log_file_path, "w") as log_handle:
    log_handle.write(f"Provenance log started: {log_started_at}\n")

def log_step(description):
    """Append one timestamped line to the provenance log.

    The line has the form ``YYYY-MM-DD HH:MM:SS - <description>`` and is
    appended to the file at the module-level ``log_file_path``.

    Parameters
    ----------
    description : object
        Text to record; it is interpolated into an f-string, so any object
        is written using its string form.
    """
    stamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    with open(log_file_path, "a") as log_handle:
        log_handle.write(f"{stamp} - {description}\n")

# -----------------------------
# Load Cleaned Datasets
# -----------------------------
# File names are listed in the same order as the variables they are unpacked into.
cleaned_csv_names = (
    "hh_demographic_cleaned.csv",
    "transaction_data_cleaned.csv",
    "product_cleaned.csv",
    "campaign_desc_cleaned.csv",
    "coupon_redempt_cleaned.csv",
)
hh_demo, transactions, products, campaign_desc, coupon_redempt = [
    pd.read_csv(os.path.join(CLEANED_DATA_DIR, csv_name))
    for csv_name in cleaned_csv_names
]

log_step("Cleaned datasets loaded: hh_demo, transactions, products, coupon_redempt, campaign_desc.")

In [None]:
# -----------------------------
# Merge Datasets for Customer Segmentation
# -----------------------------

# Attach household demographics to every transaction row. The left join keeps
# transactions whose household has no demographic record (their demographic
# columns become NaN). validate="many_to_one" makes pandas raise a MergeError
# if household_key is duplicated in hh_demo, which would otherwise silently
# duplicate transaction rows and inflate every downstream sum and count.
tx = transactions.merge(hh_demo, how='left', on='household_key', validate='many_to_one')
log_step("Merged household demographics into transactions.")

# Attach product attributes, with the same uniqueness guard on PRODUCT_ID.
tx = tx.merge(products, how='left', on='PRODUCT_ID', validate='many_to_one')
log_step("Merged product info into transactions.")

In [None]:
# -----------------------------
# Save Processed Dataset Ready for Analysis
# -----------------------------
# Persist the merged frame (without the index column) inside the processed-data folder.
PROCESSED_DATA_FILE = os.path.join(PROCESSED_DATA_DIR, "transactions_merged.csv")
tx.to_csv(PROCESSED_DATA_FILE, index=False)

# Record where the file went together with its shape and column names.
merged_shape = tx.shape
merged_columns = list(tx.columns)
log_step(
    f"Saved merged transactions to {PROCESSED_DATA_FILE}. "
    f"Shape: {merged_shape} with  Columns:  {merged_columns}"
)

In [None]:
# -----------------------------
# Reload Merged Transactions
# -----------------------------

# Re-read the merged file from disk so the steps below work from exactly
# what was persisted. This cell only loads; cleaning happens afterwards.
tx = pd.read_csv(PROCESSED_DATA_FILE)

# list(...) keeps the entry a flat list of column names, matching the entry
# logged when the file was saved; formatting the Index object directly writes
# its repr (wrapper text and dtype) into the provenance log instead.
log_step(f"Merged Transactions loaded. Shape: {tx.shape} Columns: {list(tx.columns)}")

In [None]:
# Demographic columns that must be populated, and the strings treated as "no value"
critical_demo_cols = ['MARITAL_STATUS_CODE', 'INCOME_DESC']
missing_placeholders = ["Unknown", "None", "NONE", "None/Unknown", "", "NaN"]

# Row count before filtering, used for the percentage below
initial_rows = len(tx)


def _flag_missing(col):
    """Return a boolean Series: True where the value is NaN or a stripped placeholder."""
    as_text = col.astype(str).str.strip()
    return col.isna() | as_text.isin(missing_placeholders)


# A row is flagged when at least one critical column is missing
mask_missing = tx[critical_demo_cols].apply(_flag_missing).any(axis=1)

# How many rows go, in absolute and relative terms
num_dropped = mask_missing.sum()
per_dropped = num_dropped / initial_rows * 100

# Keep only unflagged rows and renumber the index from zero
tx_clean = tx.loc[~mask_missing].reset_index(drop=True)

# Record the outcome in the provenance log
log_step(f"Dropped {num_dropped} rows with missing/unknown MARITAL_STATUS_CODE or INCOME_DESC ({per_dropped:.2f}% of total).")
log_step(f"Shape after drop: {tx_clean.shape}")

In [None]:
# -----------------------------
# Total Spending per Household
# -----------------------------

log_step("Starting household-level aggregation...")

# Aggregate transactional data per household.
# The input is tx_clean (the frame produced by the row-level cleaning step),
# not the unfiltered tx: the provenance log already records those rows as
# dropped, so aggregating tx would contradict the log.
agg = (
    tx_clean.groupby("household_key")
    .agg({
        "SALES_VALUE": "sum",
        "QUANTITY": "sum",
        "BASKET_ID": pd.Series.nunique,
        # Count transaction rows carrying a non-zero coupon discount. abs()
        # makes the count independent of sign and leaves NaN uncounted.
        # NOTE(review): confirm the sign convention of COUPON_DISC in the
        # cleaned data; if discounts are stored as negative amounts a strict
        # "> 0" test counts nothing.
        "COUPON_DISC": lambda x: (x.abs() > 0).sum()
    })
    .reset_index()
)

# Rename for clarity
agg = agg.rename(columns={
    "SALES_VALUE": "total_spent",
    "QUANTITY": "total_quantity",
    "BASKET_ID": "num_transactions",
    "COUPON_DISC": "num_coupons_redeemed"
})

# Derived metrics (both are per distinct basket)
agg["avg_basket_size"] = agg["total_quantity"] / agg["num_transactions"]
agg["coupon_redemption_rate"] = agg["num_coupons_redeemed"] / agg["num_transactions"]

log_step(f"Aggregated {agg.shape[0]} households. Columns: {list(agg.columns)}")

# -----------------------------
# Merge Demographic Information
# -----------------------------

demographics_cols = [
    "household_key", "AGE_DESC", "MARITAL_STATUS_CODE", "INCOME_DESC",
    "HOMEOWNER_DESC", "HH_COMP_DESC", "HOUSEHOLD_SIZE_DESC", "KID_CATEGORY_DESC"
]

# drop_duplicates collapses the per-transaction repetition of each household's
# demographic values down to the distinct combinations before joining.
agg = agg.merge(tx_clean[demographics_cols].drop_duplicates(), on="household_key", how="left")
log_step(f"Merged demographic data. Shape after merge: {agg.shape}")

# -----------------------------
# Drop Missing/Unknown Demographics
# -----------------------------

# Values are stripped before comparison, so a blank cell has to be listed as
# the empty string; a single-space entry could never match a stripped value.
missing_placeholders = ["Unknown", "NULL", "nan", "None", "NA", "Not Available", ""]

# Flag a household when any demographic column (all but household_key) is
# NaN or equals one of the placeholders.
mask_missing_all = agg[demographics_cols[1:]].apply(
    lambda col: col.isna() | col.astype(str).str.strip().isin(missing_placeholders)
).any(axis=1)

num_dropped = mask_missing_all.sum()
# Guard the percentage against an empty aggregate (no households left).
per_dropped = num_dropped / agg.shape[0] * 100 if agg.shape[0] else 0.0

agg = agg[~mask_missing_all].reset_index(drop=True)

log_step(f"Dropped {num_dropped} households with missing/unknown demographic values ({per_dropped:.2f}%).")
log_step(f"Final aggregated dataset shape: {agg.shape}")


log_step("Sample of aggregated data:")
# to_string() writes every column; the default DataFrame text form is subject
# to pandas display options and can elide columns of a wide frame.
log_step(agg.head().to_string())

In [None]:
# -----------------------------
# Demographic Distribution - Pie Charts (Saved)
# -----------------------------
import matplotlib.pyplot as plt
import os

# Output folder for the saved charts (created on first use)
figures_path_parts = (PROJECT_ROOT, "data", "customer_segmentation", "figures")
FIGURES_DIR = os.path.join(*figures_path_parts)
os.makedirs(FIGURES_DIR, exist_ok=True)

log_step("Starting demographic distribution pie chart generation...")

# Demographic columns that each get their own pie chart
demo_cols = [
    'AGE_DESC',
    'MARITAL_STATUS_CODE',
    'INCOME_DESC',
    'HOMEOWNER_DESC',
    'HH_COMP_DESC',
    'HOUSEHOLD_SIZE_DESC',
    'KID_CATEGORY_DESC',
]

# Function to generate and save pie chart for a column
def save_pie_chart(col_name):
    """Save the value distribution of one column of ``agg`` as a pie chart PNG.

    Counts the values of ``agg[col_name]`` (NaN is kept as its own slice),
    draws a 6x6 inch pie chart with percentage labels, writes it to
    ``<FIGURES_DIR>/<col_name lowercased>_piechart.png`` and logs the path.

    Parameters
    ----------
    col_name : str
        Name of a column in the module-level ``agg`` DataFrame.

    Raises
    ------
    KeyError
        If ``col_name`` is not a column of ``agg``.
    """
    # Look the column up before opening a figure so a bad name leaves no
    # figure behind.
    data = agg[col_name].value_counts(dropna=False)

    fig, ax = plt.subplots(figsize=(6, 6))
    try:
        ax.pie(data.values, labels=data.index, autopct='%1.1f%%', startangle=140)
        ax.set_title(f"Distribution of {col_name.replace('_', ' ')}", fontsize=12)
        fig.tight_layout()

        # Save figure
        fig_path = os.path.join(FIGURES_DIR, f"{col_name.lower()}_piechart.png")
        fig.savefig(fig_path)
    finally:
        # Always release the figure, even when drawing or saving fails;
        # pyplot otherwise keeps it open for the rest of the session.
        plt.close(fig)

    log_step(f"Saved pie chart for {col_name} to {fig_path}")

# One chart per demographic column, in the order listed in demo_cols
for demo_col in demo_cols:
    save_pie_chart(demo_col)

# Final provenance entry once every chart has been written
log_step(f"Completed saving all demographic pie charts to {FIGURES_DIR}")
