# 📊 Data Exploration - Retail Customer Analytics

This notebook provides a comprehensive exploratory data analysis (EDA) of the retail customer dataset.

## Objectives:
- Load and examine the raw customer data
- Understand data structure, types, and quality
- Identify patterns, trends, and anomalies
- Generate initial insights for business stakeholders
- Prepare a data quality assessment to guide preprocessing


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

# Plotting defaults: stock matplotlib style with seaborn's 'husl' palette
plt.style.use('default')
sns.set_palette('husl')

# Lift the pandas display limits on column count and output width
for display_option in ('display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

print("📚 Libraries imported successfully!")

## 1. Data Loading and Initial Inspection

In [None]:
# Load the dataset
# Option 1: Download the Kaggle dataset (manually or through the Kaggle API)
# and place the CSV under ../data/raw/. NOTE: the dataset page URL
# https://www.kaggle.com/datasets/iamsouravbanerjee/customer-shopping-trends-dataset
# is a web page rather than a raw CSV file, so it should not be passed
# directly to pd.read_csv.

# Option 2: Load from local file, falling back to generated sample data
try:
    df = pd.read_csv('../data/raw/customer_shopping_data.csv')
    print("✅ Dataset loaded successfully!")
except FileNotFoundError:
    print("⚠️ Dataset file not found. Using sample data generator...")
    # Make ../src importable so that utils.common can be found
    import sys
    if '../src' not in sys.path:  # avoid stacking duplicates when the cell is re-run
        sys.path.append('../src')
    from utils.common import load_sample_data
    df = load_sample_data(n_customers=2000)
    print("✅ Sample dataset generated successfully!")

# Reported once, whichever source produced df
print(f"📏 Dataset shape: {df.shape}")

In [None]:
# Basic dataset information: size, width and in-memory footprint
n_records, n_features = df.shape
memory_mb = df.memory_usage(deep=True).sum() / 1024**2

print("📋 DATASET OVERVIEW")
print("=" * 50)
print(f"Number of records: {n_records:,}")
print(f"Number of features: {n_features}")
print(f"Memory usage: {memory_mb:.2f} MB")
print("\n📊 COLUMN INFORMATION:")
df.info()

In [None]:
# Preview both ends of the table
row_previews = (
    ("🔍 FIRST 5 ROWS:", df.head()),
    ("\n🔍 LAST 5 ROWS:", df.tail()),
)
for heading, preview in row_previews:
    print(heading)
    display(preview)

In [None]:
# Statistical summary
print("📈 STATISTICAL SUMMARY - NUMERICAL FEATURES:")
display(df.describe())

print("\n📝 STATISTICAL SUMMARY - CATEGORICAL FEATURES:")
# describe(include=['object']) raises ValueError when the frame has no
# object columns, so only call it when at least one is present.
if df.select_dtypes(include=['object']).shape[1] > 0:
    display(df.describe(include=['object']))
else:
    print("ℹ️ No categorical (object) columns to summarise.")

## 2. Data Quality Assessment

In [None]:
# Per-column null counts and their share of all rows
missing_values = df.isnull().sum()
missing_percentage = (missing_values / len(df)) * 100

# One row per column, worst offenders first
missing_df = pd.DataFrame(
    {
        'Column': missing_values.index,
        'Missing Count': missing_values.values,
        'Missing Percentage': missing_percentage.values,
    }
)
missing_df = missing_df.sort_values('Missing Count', ascending=False)

print("❓ MISSING VALUES ANALYSIS:")
has_missing = missing_df['Missing Count'] > 0
display(missing_df[has_missing])

# Counts are non-negative, so "no row above zero" is the same as "sum is zero"
if not has_missing.any():
    print("✅ No missing values found in the dataset!")

In [None]:
# Count fully duplicated rows (first occurrences are not counted)
duplicates = df.duplicated().sum()
print(f"🔄 DUPLICATE RECORDS: {duplicates}")

if duplicates == 0:
    print("✅ No duplicate records found!")
else:
    duplicate_share = duplicates / len(df) * 100
    print(f"⚠️ Found {duplicates} duplicate records ({duplicate_share:.2f}% of data)")
    # keep=False marks every copy, so the originals are shown with their duplicates
    print("\nExamples of duplicate records:")
    every_copy = df.duplicated(keep=False)
    display(df[every_copy].head())

In [None]:
# Data type analysis: how many columns share each dtype
dtype_summary = df.dtypes.value_counts()
print("🏷️ DATA TYPES SUMMARY:")
for dtype, count in dtype_summary.items():
    print(f"  {dtype}: {count} columns")

# Identify potential data type issues (only object columns can be mis-typed here)
print("\n🔍 POTENTIAL DATA TYPE ISSUES:")
for col in df.columns:
    if df[col].dtype != 'object':
        continue

    # Check if numeric data is stored as object. All-null columns are skipped:
    # they convert without error and would otherwise be flagged spuriously.
    if df[col].notna().any():
        try:
            pd.to_numeric(df[col])
        except (ValueError, TypeError):
            # Not parseable as numbers: the expected outcome for real text columns
            pass
        else:
            print(f"  ⚠️ {col}: Stored as object but appears to be numeric")

    # Check for date columns (by name only; values are not parsed)
    if 'date' in col.lower():
        print(f"  📅 {col}: Appears to be a date column but stored as object")

## 3. Univariate Analysis

In [None]:
# Numerical features distribution
numerical_cols = df.select_dtypes(include=[np.number]).columns
print(f"📊 ANALYZING {len(numerical_cols)} NUMERICAL FEATURES:")
print(list(numerical_cols))

# One histogram per numeric column, laid out on a grid at most 3 wide
if len(numerical_cols) > 0:
    n_cols = min(3, len(numerical_cols))
    n_rows = (len(numerical_cols) + n_cols - 1) // n_cols  # ceiling division

    # squeeze=False always yields a 2-D array of Axes, even for a 1x1 grid
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows), squeeze=False)
    axes = axes.flatten()

    for ax, col in zip(axes, numerical_cols):
        df[col].hist(bins=30, ax=ax, alpha=0.7, edgecolor='black')
        ax.set_title(f'Distribution of {col}')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')

    # Hide the grid cells left over after the last column
    for spare_ax in axes[len(numerical_cols):]:
        spare_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
# Categorical features analysis: cardinality and most common values per object column
categorical_cols = df.select_dtypes(include=['object']).columns
print(f"📝 ANALYZING {len(categorical_cols)} CATEGORICAL FEATURES:")
print(list(categorical_cols))

for col in categorical_cols:
    print(f"\n🏷️ {col}:")
    value_counts = df[col].value_counts()
    n_unique = df[col].nunique()
    print(f"  Unique values: {n_unique}")

    # value_counts() drops nulls, so an all-null column yields an empty result
    # and indexing its first entry would raise IndexError.
    if value_counts.empty:
        print("  No non-null values in this column")
        continue

    print(f"  Most frequent: {value_counts.index[0]} ({value_counts.iloc[0]} occurrences)")

    if n_unique <= 10:  # Show all values if <= 10 unique
        print("  Value distribution:")
        for val, count in value_counts.items():
            # Share of all rows, including rows where this column is null
            percentage = (count / len(df)) * 100
            print(f"    {val}: {count} ({percentage:.1f}%)")
    else:
        print(f"  Top 5 values: {list(value_counts.head().index)}")

In [None]:
# Interactive bar charts for the first 4 categorical columns
for col in categorical_cols[:4]:
    n_categories = df[col].nunique()

    # Charts with more than 20 bars are unreadable, so report and move on
    if n_categories > 20:
        print(f"⚠️ Skipping {col} - too many categories ({n_categories})")
        continue

    category_counts = df[col].value_counts()
    fig = px.bar(
        x=category_counts.index,
        y=category_counts.values,
        title=f'Distribution of {col}',
        labels={'x': col, 'y': 'Count'},
    )
    fig.update_layout(height=400)
    fig.show()

## 4. Key Business Insights

In [None]:
# Generate key business insights
print("💡 KEY BUSINESS INSIGHTS")
print("=" * 50)

insights = []


def _first_column(keywords, numeric_only=False, whole_word_first=False):
    """Return the first column of df whose name matches a keyword, else None.

    keywords: lowercase strings searched for in the lowercased column name.
    numeric_only: skip columns that do not have a numeric dtype, so the
        caller can safely sum/average the result.
    whole_word_first: first look for a keyword that is a whole word of the
        name (split on whitespace and underscores) and only then fall back
        to substring matching. This keeps a short keyword such as 'age'
        from picking 'Percentage' when an 'Age' column exists.
    """
    candidates = [
        col for col in df.columns
        if not numeric_only or pd.api.types.is_numeric_dtype(df[col])
    ]
    if whole_word_first:
        for col in candidates:
            words = str(col).lower().replace('_', ' ').split()
            if any(keyword in words for keyword in keywords):
                return col
    for col in candidates:
        name = str(col).lower()
        if any(keyword in name for keyword in keywords):
            return col
    return None


# Find key columns
customer_col = 'Customer ID' if 'Customer ID' in df.columns else None
amount_col = _first_column(('amount', 'price'), numeric_only=True)
category_col = _first_column(('category',))
age_col = _first_column(('age',), numeric_only=True, whole_word_first=True)

# Customer insights
if customer_col:
    unique_customers = df[customer_col].nunique()
    total_transactions = len(df)
    insights.append(f"👥 Customer Base: {unique_customers:,} unique customers")

    # An empty or all-null ID column has no customers to average over
    if unique_customers > 0:
        avg_transactions_per_customer = total_transactions / unique_customers
        insights.append(f"🛒 Average transactions per customer: {avg_transactions_per_customer:.1f}")

# Revenue insights
if amount_col:
    total_revenue = df[amount_col].sum()
    avg_transaction_value = df[amount_col].mean()
    median_transaction_value = df[amount_col].median()

    insights.append(f"💰 Total revenue: ${total_revenue:,.2f}")
    insights.append(f"💳 Average transaction value: ${avg_transaction_value:.2f}")
    insights.append(f"💳 Median transaction value: ${median_transaction_value:.2f}")

# Category insights
if category_col:
    category_counts = df[category_col].value_counts()

    # value_counts() drops nulls; an all-null column has no "top" category
    if not category_counts.empty:
        top_category = category_counts.index[0]
        top_category_count = category_counts.iloc[0]
        category_percentage = (top_category_count / len(df)) * 100
        insights.append(f"🏆 Most popular category: {top_category} ({category_percentage:.1f}% of transactions)")

    insights.append(f"📝 Total categories: {df[category_col].nunique()}")

# Age insights (if available)
if age_col:
    avg_age = df[age_col].mean()
    median_age = df[age_col].median()
    age_range = df[age_col].max() - df[age_col].min()

    insights.append(f"👶 Customer age - Average: {avg_age:.1f}, Median: {median_age:.1f}, Range: {age_range}")

# Data quality insights: share of non-null cells across the whole table
completeness = (1 - df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100
insights.append(f"✅ Data completeness: {completeness:.1f}%")

# Print insights
for i, insight in enumerate(insights, 1):
    print(f"{i:2d}. {insight}")

print("\n" + "=" * 50)
print("📋 RECOMMENDATIONS FOR NEXT STEPS:")
print("1. 🧹 Data Preprocessing: Address missing values and duplicates")
print("2. 🔧 Feature Engineering: Create RFM features, customer segments")
print("3. 📊 Advanced Analytics: Customer segmentation, churn prediction")
print("4. 🤖 Machine Learning: Recommendation systems, CLV prediction")
print("5. 📈 Business Intelligence: Dashboard creation, KPI monitoring")

In [None]:
# Save exploration results (best effort: a failure is reported, not raised)
from pathlib import Path

report_dir = Path('../reports/analysis')
try:
    # to_csv does not create missing parent directories, so make sure the
    # output folder exists before writing.
    report_dir.mkdir(parents=True, exist_ok=True)

    # Create summary statistics file
    summary_stats = df.describe(include='all')
    summary_stats.to_csv(report_dir / '01_exploration_summary.csv')

    # Save data quality report: one row per column
    missing_counts = df.isnull().sum()
    quality_report = pd.DataFrame({
        'Column': df.columns,
        'Data_Type': df.dtypes,
        'Missing_Count': missing_counts,
        'Missing_Percentage': (missing_counts / len(df)) * 100,
        'Unique_Values': [df[col].nunique() for col in df.columns]
    })
    quality_report.to_csv(report_dir / '01_data_quality_report.csv', index=False)

    print("💾 Reports saved to ../reports/analysis/")
    print("   - 01_exploration_summary.csv")
    print("   - 01_data_quality_report.csv")
except Exception as e:
    print(f"⚠️ Could not save reports: {e}")

print("\n✅ DATA EXPLORATION COMPLETED!")
print("\n🚀 Ready for next notebook: 02_eda_insights.ipynb")