In [2]:
"""
=================================================================
PHASE 2 & 3: DATA CLEANING & FEATURE ENGINEERING
=================================================================
This notebook handles:
1. Loading and exploring the raw data
2. Cleaning & handling missing values (logical decisions documented)
3. Removing duplicates & anomalies
4. Creating engineered features for analysis
=================================================================
"""

import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings('ignore')

# Plot styling applied through seaborn / matplotlib global settings
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)

print("=" * 80)
print("STEP 1: LOAD & EXPLORE DATA")
print("=" * 80)

# Read the raw CSV, decoding it as latin-1
df = pd.read_csv('../data/raw/superstore.csv', encoding='latin-1')

print(f"\nüìä Dataset Shape: {df.shape}")
print(f"\nüìã Column Names & Types:")
print(df.dtypes)
print(f"\nüîç First few rows:")
print(df.head())
print(f"\nüìà Data Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
df.info()
print(f"\n‚ùì Missing Values:")
# Per-column null counts; only columns that actually have nulls are shown
missing = df.isnull().sum()
print(missing[missing > 0] if missing.sum() > 0 else "No missing values")

print("\n" + "=" * 80)
print("STEP 2: DATA CLEANING - DECISION LOG")
print("=" * 80)

# Running list of human-readable cleaning decisions; printed in the summary
decisions = []

# ===== DECISION 1: Handle Missing Values =====
# The log entry is derived from the data instead of being hard-coded, so it
# cannot claim "no missing values" for a file that actually contains nulls.
null_counts = df.isnull().sum()
null_counts = null_counts[null_counts > 0]
if null_counts.empty:
    decision = "MISSING VALUES: No missing values found in dataset"
else:
    per_column = ", ".join(f"{col}={count}" for col, count in null_counts.items())
    decision = (
        f"MISSING VALUES: Found {int(null_counts.sum())} missing values "
        f"({per_column}) - left as-is, no imputation applied"
    )
print(f"\n‚úÖ {decision}")
decisions.append(decision)

# ===== DECISION 2: Remove Duplicates =====
# Count fully identical rows before dropping them so the log states how many
# were removed; the second count is taken after the drop as a sanity value.
duplicates_before = df.duplicated().sum()
df = df.drop_duplicates()
duplicates_after = df.duplicated().sum()
decision = f"DUPLICATES: Removed {duplicates_before} duplicate rows"
print(f"‚úÖ {decision}")
decisions.append(decision)

# ===== DECISION 3: Parse Date Columns =====
print("\nüîÑ Parsing date columns...")
# A column is treated as a date column when its name contains "date".
# (Matching on "order" as well would wrongly report identifier columns such
# as 'Order ID' as dates, and the parse step never used that criterion.)
date_columns = [col for col in df.columns if 'date' in col.lower()]
print(f"Detected date columns: {date_columns}")

# Parse exactly the columns that were reported above.
# errors='coerce' turns unparseable values into NaT instead of raising.
for col in date_columns:
    df[col] = pd.to_datetime(df[col], errors='coerce')
    decision = f"DATE PARSING: Converted '{col}' to datetime"
    print(f"‚úÖ {decision}")
    decisions.append(decision)

# ===== DECISION 4: Handle Inconsistent Categories =====
print("\nüè∑Ô∏è Checking categorical columns...")
categorical_cols = list(df.select_dtypes(include=['object']).columns)
for col in categorical_cols:
    # Cardinality is reported on the values as they are before trimming
    print(f"  {col}: {df[col].nunique()} unique values")
    # Every column in this list was selected as object dtype, so the string
    # accessor is applied directly to trim leading/trailing whitespace
    df[col] = df[col].str.strip()

decision = "CATEGORIES: Stripped whitespace from all categorical columns"
print(f"‚úÖ {decision}")
decisions.append(decision)

# ===== DECISION 5: Handle Negative/Invalid Values =====
print("\n‚ö†Ô∏è Checking for invalid/negative values...")

# Negative profit: counted and logged only, the rows are kept
negative_profit = (df['Profit'] < 0).sum() if 'Profit' in df.columns else 0
if negative_profit > 0:
    print(f"  Found {negative_profit} rows with negative profit (KEPT - some products DO lose money)")
    decision = f"NEGATIVE PROFIT: Kept {negative_profit} negative profit rows (domain logic: some orders lose money)"
    decisions.append(decision)

# Negative quantity: these rows are removed
negative_qty = (df['Quantity'] < 0).sum() if 'Quantity' in df.columns else 0
if negative_qty > 0:
    print(f"  Found {negative_qty} rows with negative quantity - REMOVING (returns/cancellations)")
    # Drop exactly the rows that were counted above. Filtering on "> 0" would
    # also discard zero and missing quantities, which the log does not report.
    # .copy() gives later column assignments an independent frame to write to.
    df = df[~(df['Quantity'] < 0)].copy()
    decision = f"NEGATIVE QUANTITY: Removed {negative_qty} negative quantity rows (cancelled orders)"
    decisions.append(decision)

# Negative discount: counted and logged only, the rows are kept
negative_discount = (df['Discount'] < 0).sum() if 'Discount' in df.columns else 0
if negative_discount > 0:
    print(f"  Found {negative_discount} rows with negative discount - KEEPING (markup possible)")
    decision = f"NEGATIVE DISCOUNT: Kept {negative_discount} negative discount rows (markup/special pricing)"
    decisions.append(decision)

# ===== DECISION 6: Flag Quantity Outliers (counted, not removed) =====
print("\nüìä Checking for statistical outliers...")

# Quantity outliers are identified with the 1.5 * IQR fence rule; the rows
# are only counted here and stay in the dataset.
if 'Quantity' in df.columns:
    quartiles = df['Quantity'].quantile([0.25, 0.75])
    Q1 = quartiles.loc[0.25]
    Q3 = quartiles.loc[0.75]
    IQR = Q3 - Q1
    lower_fence = Q1 - 1.5*IQR
    upper_fence = Q3 + 1.5*IQR
    below_fence = df['Quantity'] < lower_fence
    above_fence = df['Quantity'] > upper_fence
    outliers = (below_fence | above_fence).sum()
    print(f"  Quantity outliers (IQR method): {outliers} rows")
    decision = f"QUANTITY OUTLIERS: Kept {outliers} outliers (high-volume orders are valid)"
    decisions.append(decision)

# ===== DECISION 7: Handle Invalid Dates =====
# Null entries in any datetime column are treated as invalid dates and the
# affected rows are dropped, one column at a time.
invalid_dates = 0
datetime_cols = [
    col for col in df.columns
    if pd.api.types.is_datetime64_any_dtype(df[col])
]
for col in datetime_cols:
    invalid_count = df[col].isnull().sum()
    if invalid_count == 0:
        continue
    print(f"  Invalid dates in {col}: {invalid_count}")
    invalid_dates += invalid_count
    # Later columns are counted on the frame that remains after this drop
    df = df.dropna(subset=[col])
    decision = f"INVALID DATES in '{col}': Removed {invalid_count} rows"
    decisions.append(decision)

print("\n" + "=" * 80)
print("STEP 3: FEATURE ENGINEERING")
print("=" * 80)

# ===== FEATURE 1: Order Date Features =====
print("\nüî® Creating temporal features...")
if 'Order Date' in df.columns:
    order_dt = df['Order Date'].dt
    # Calendar parts that map one-to-one onto a datetime accessor attribute
    for feature_name, dt_attr in (
        ('order_year', 'year'),
        ('order_month', 'month'),
        ('order_quarter', 'quarter'),
        ('order_day_of_week', 'dayofweek'),
    ):
        df[feature_name] = getattr(order_dt, dt_attr)
    # The week number comes from the ISO calendar view of the date
    df['order_week_of_year'] = order_dt.isocalendar().week
    print("‚úÖ Created: order_year, order_month, order_quarter, order_day_of_week, order_week_of_year")

# ===== FEATURE 2: Profit Metrics =====
print("\nüí∞ Creating profit features...")
if 'Sales' in df.columns and 'Profit' in df.columns:
    # Profit as a percentage of sales, rounded to two decimals.
    # Rows with zero sales get a margin of 0. They are handled with an
    # explicit mask because replacing +/-inf alone misses the 0/0 case,
    # which evaluates to NaN rather than inf.
    zero_sales = df['Sales'] == 0
    margin = df['Profit'].div(df['Sales'].mask(zero_sales)) * 100
    df['profit_margin'] = margin.round(2).mask(zero_sales, 0)
    print("‚úÖ Created: profit_margin (%)")

# ===== FEATURE 3: Discount Flag =====
print("\nüè∑Ô∏è Creating discount features...")
if 'Discount' in df.columns:
    discount = df['Discount']
    # 1 when any discount was applied, else 0
    df['has_discount'] = discount.gt(0).astype(int)
    # 1 when the discount is strictly above the column median, else 0
    df['high_discount'] = discount.gt(discount.median()).astype(int)
    print("‚úÖ Created: has_discount, high_discount")

# ===== FEATURE 4: Customer Type (New vs Returning) =====
print("\nüë• Creating customer features...")
if 'Customer ID' in df.columns and 'Order Date' in df.columns:
    # Earliest order date per customer, attached to every row of that customer
    customer_first_order = (
        df.groupby('Customer ID')['Order Date']
        .min()
        .rename('First Order Date')
        .reset_index()
    )
    df = df.merge(customer_first_order, on='Customer ID', how='left')

    # A row is 'New' when it falls on the customer's first order date and
    # 'Returning' otherwise. The comparison is vectorised rather than applied
    # row by row, which gives the same labels and also works on an empty frame.
    is_first_order_date = df['Order Date'] == df['First Order Date']
    df['customer_type'] = np.where(is_first_order_date, 'New', 'Returning')
    print("‚úÖ Created: customer_type (New/Returning)")

# ===== FEATURE 5: Customer Order Frequency & AOV =====
print("\nüìä Creating customer aggregation features...")
# Every column the aggregation reads must be present, not just the group key
customer_stat_inputs = ['Customer ID', 'Order ID', 'Sales', 'Profit', 'Order Date']
if all(col in df.columns for col in customer_stat_inputs):
    customer_stats = df.groupby('Customer ID').agg(
        # An Order ID can appear on several rows, so orders are counted as
        # distinct IDs; a plain row count would overstate the number of orders.
        order_frequency=('Order ID', 'nunique'),
        total_customer_sales=('Sales', 'sum'),
        total_customer_profit=('Profit', 'sum'),
        customer_first_order=('Order Date', 'min'),
    ).reset_index()

    # Average order value = total sales / number of distinct orders.
    # A customer with no countable Order ID gets NaN instead of a division by zero.
    customer_stats['avg_order_value'] = (
        customer_stats['total_customer_sales']
        / customer_stats['order_frequency'].replace(0, np.nan)
    )

    # Keep the column order the rest of the notebook was written against
    customer_stats = customer_stats[[
        'Customer ID', 'order_frequency', 'total_customer_sales',
        'avg_order_value', 'total_customer_profit', 'customer_first_order',
    ]]

    df = df.merge(customer_stats, on='Customer ID', how='left')
    print("‚úÖ Created: order_frequency, avg_order_value (per customer)")

# ===== FEATURE 6: Delivery Delay Flag =====
print("\n‚è±Ô∏è Creating delivery features...")
if {'Ship Date', 'Order Date'}.issubset(df.columns):
    # Whole days between the order date and the ship date
    ship_lag = df['Ship Date'] - df['Order Date']
    df['delivery_days'] = ship_lag.dt.days
    # 1 when the lag is strictly above the median lag, else 0
    median_delivery_days = df['delivery_days'].median()
    df['delivery_delay_flag'] = df['delivery_days'].gt(median_delivery_days).astype(int)
    print("‚úÖ Created: delivery_days, delivery_delay_flag")

# ===== FEATURE 7: Revenue Segment =====
print("\nüíé Creating customer value segmentation...")
if 'Sales' in df.columns:
    segment_labels = ['Low', 'Medium', 'High']
    try:
        # Tercile split on the raw Sales values of each row
        df['revenue_segment'] = pd.qcut(df['Sales'], q=3, labels=segment_labels, duplicates='drop')
    except ValueError:
        # Heavily tied Sales values can collapse the tercile edges; with
        # duplicates='drop' that leaves fewer bins than labels and qcut raises.
        # Ranking with method='first' breaks ties by row order so that three
        # bins can still be formed.
        df['revenue_segment'] = pd.qcut(
            df['Sales'].rank(method='first'), q=3, labels=segment_labels
        )
    print("‚úÖ Created: revenue_segment (Low/Medium/High)")

banner = "=" * 80
print("\n" + banner)
print("DATA QUALITY SUMMARY")
print(banner)
print(f"\n‚ú® Final Dataset Shape: {df.shape}")
print(f"\nüéØ New Features Created:")
# Static overview of the engineered feature groups
for feature_line in (
    "  - Temporal: order_year, order_month, order_quarter, order_day_of_week, order_week_of_year",
    "  - Financial: profit_margin, has_discount, high_discount, revenue_segment",
    "  - Customer: customer_type, order_frequency, avg_order_value, total_customer_sales",
    "  - Delivery: delivery_days, delivery_delay_flag",
):
    print(feature_line)

# Numbered replay of every cleaning decision recorded so far
print(f"\nüìã Decision Log:")
for i, decision in enumerate(decisions, start=1):
    print(f"  {i}. {decision}")

print("\n" + "=" * 80)
print("SAVING CLEANED DATA")
print("=" * 80)

# Save cleaned data
cleaned_csv_path = Path('../data/processed/superstore_cleaned.csv')
# to_csv does not create missing parent directories, so make sure the
# output folder exists before writing.
cleaned_csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(cleaned_csv_path, index=False)
print("\n‚úÖ Cleaned data saved to: data/processed/superstore_cleaned.csv")
print(f"   Total rows: {len(df)}")
print(f"   Total columns: {len(df.columns)}")

# Display final dataframe
print(f"\nüìä Sample of cleaned data:")
print(df.head())

STEP 1: LOAD & EXPLORE DATA

üìä Dataset Shape: (9994, 21)

üìã Column Names & Types:
Row ID             int64
Order ID          object
Order Date        object
Ship Date         object
Ship Mode         object
Customer ID       object
Customer Name     object
Segment           object
Country           object
City              object
State             object
Postal Code        int64
Region            object
Product ID        object
Category          object
Sub-Category      object
Product Name      object
Sales            float64
Quantity           int64
Discount         float64
Profit           float64
dtype: object

üîç First few rows:
   Row ID        Order ID  Order Date   Ship Date       Ship Mode Customer ID  \
0       1  CA-2016-152156   11/8/2016  11/11/2016    Second Class    CG-12520   
1       2  CA-2016-152156   11/8/2016  11/11/2016    Second Class    CG-12520   
2       3  CA-2016-138688   6/12/2016   6/16/2016    Second Class    DV-13045   
3       4  US-2015-108966  

In [None]:
# Verify cleaned data
# DataFrame.info() prints its own report and returns None, so it is called
# directly; wrapping it in print() added a stray "None" line to the output.
df.info()
print("\n‚úÖ Data cleaning and feature engineering complete!")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 39 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   Row ID                 9994 non-null   int64         
 1   Order ID               9994 non-null   object        
 2   Order Date             9994 non-null   datetime64[ns]
 3   Ship Date              9994 non-null   datetime64[ns]
 4   Ship Mode              9994 non-null   object        
 5   Customer ID            9994 non-null   object        
 6   Customer Name          9994 non-null   object        
 7   Segment                9994 non-null   object        
 8   Country                9994 non-null   object        
 9   City                   9994 non-null   object        
 10  State                  9994 non-null   object        
 11  Postal Code            9994 non-null   int64         
 12  Region                 9994 non-null   object        
 13  Pro