In [1]:
# Zeotap Assignment
# Ecommerce Data analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
from scipy.spatial.distance import cdist
import faiss
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Task 1: EDA
# Read the three raw CSV files, in order: customers, products, transactions.
customers_df, products_df, transactions_df = [
    pd.read_csv(csv_path)
    for csv_path in ('Customers Data.csv', 'Products.csv', 'Transactions.csv')
]

In [None]:
from scipy import stats

def _apply_seaborn_style():
    """Activate matplotlib's bundled seaborn style sheet under the name this version knows.

    matplotlib 3.6 renamed the sheet from 'seaborn' to 'seaborn-v0_8' and 3.8
    removed the old name, so plt.style.use('seaborn') raises OSError there.
    """
    try:
        plt.style.use('seaborn-v0_8')
    except OSError:
        # matplotlib < 3.6 only ships the sheet under its original name.
        plt.style.use('seaborn')


def _iqr_outliers(frame, column, whisker=1.5):
    """Return the rows of `frame` whose `column` value falls outside the IQR fences."""
    q1 = frame[column].quantile(0.25)
    q3 = frame[column].quantile(0.75)
    iqr = q3 - q1
    below = frame[column] < (q1 - whisker * iqr)
    above = frame[column] > (q3 + whisker * iqr)
    return frame[below | above]


def plot_numerical_distributions(transactions_df, products_df):
    """Plot distributions of numerical variables with outlier detection.

    Draws a 2x2 figure: box plot and histogram (with KDE) of 'TotalValue' on
    the top row, and the same pair for 'Quantity' on the bottom row. The
    'TotalValue' histogram title reports how many rows fall outside the
    1.5 * IQR fences, as a count and as a percentage of all rows.

    Parameters
    ----------
    transactions_df : pandas.DataFrame
        Must contain the 'TotalValue' and 'Quantity' columns.
    products_df : pandas.DataFrame
        Accepted for call compatibility; not used by this function.

    Returns
    -------
    None
        The figure is shown with plt.show(). The matplotlib style is changed
        globally as a side effect.
    """
    _apply_seaborn_style()

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # 1. Transaction value: box plot
    sns.boxplot(y=transactions_df['TotalValue'], ax=axes[0, 0])
    axes[0, 0].set_title('Transaction Values Distribution')

    # Outliers by the IQR rule; guard the percentage so an empty frame
    # does not raise ZeroDivisionError.
    outliers = _iqr_outliers(transactions_df, 'TotalValue')
    outlier_count = len(outliers)
    row_count = len(transactions_df)
    outlier_pct = outlier_count / row_count * 100 if row_count else 0.0

    # Transaction value: histogram with KDE
    sns.histplot(transactions_df['TotalValue'], kde=True, ax=axes[0, 1])
    axes[0, 1].set_title(
        f'Transaction Values Histogram\nOutliers: {outlier_count} ({outlier_pct:.1f}%)'
    )

    # 2. Quantity: box plot and histogram
    sns.boxplot(y=transactions_df['Quantity'], ax=axes[1, 0])
    axes[1, 0].set_title('Purchase Quantity Distribution')

    sns.histplot(transactions_df['Quantity'], kde=True, ax=axes[1, 1])
    axes[1, 1].set_title('Purchase Quantity Histogram')

    plt.tight_layout()
    plt.show()

def analyze_customer_behavior(transactions_df):
    """Summarise purchases per customer and plot how the summaries are distributed.

    Groups `transactions_df` by 'CustomerID' and derives five columns:
    'Purchase_Frequency' (count of 'TransactionID'), 'Total_Spend' and
    'Avg_Transaction' (sum and mean of 'TotalValue'), 'Total_Items' and
    'Avg_Items' (sum and mean of 'Quantity'). Four of them are drawn as
    histograms with KDE on a 2x2 figure; 'Total_Items' is not plotted.

    Returns
    -------
    pandas.DataFrame
        One row per customer, indexed by 'CustomerID', with the five columns above.
    """
    # Named aggregation produces the flat column names directly.
    customer_metrics = transactions_df.groupby('CustomerID').agg(
        Purchase_Frequency=('TransactionID', 'count'),
        Total_Spend=('TotalValue', 'sum'),
        Avg_Transaction=('TotalValue', 'mean'),
        Total_Items=('Quantity', 'sum'),
        Avg_Items=('Quantity', 'mean'),
    )

    # (grid position, metric column, panel title) for each histogram
    panels = (
        ((0, 0), 'Purchase_Frequency', 'Customer Purchase Frequency'),
        ((0, 1), 'Total_Spend', 'Customer Total Spend'),
        ((1, 0), 'Avg_Transaction', 'Customer Average Transaction Value'),
        ((1, 1), 'Avg_Items', 'Customer Average Items per Transaction'),
    )

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    for position, metric, title in panels:
        panel_ax = axes[position]
        sns.histplot(customer_metrics[metric], kde=True, ax=panel_ax)
        panel_ax.set_title(title)

    plt.tight_layout()
    plt.show()

    return customer_metrics

def analyze_temporal_patterns(transactions_df):
    """Plot sales by month, by weekday and by hour, plus a weekday-by-hour heatmap.

    Side effect: `transactions_df` is modified in place. 'TransactionDate' is
    converted with pd.to_datetime, and 'Year', 'Month', 'DayOfWeek' and 'Hour'
    columns are added from it. 'Month' is the month number only, so the
    monthly panel pools the same month across different years.

    Returns
    -------
    None
        The 2x2 figure is shown with plt.show().
    """
    transactions_df['TransactionDate'] = pd.to_datetime(transactions_df['TransactionDate'])

    # Derive the calendar parts from the parsed timestamps (new column -> .dt attribute).
    date_parts = transactions_df['TransactionDate'].dt
    for new_column, dt_attribute in (('Year', 'year'),
                                     ('Month', 'month'),
                                     ('DayOfWeek', 'dayofweek'),
                                     ('Hour', 'hour')):
        transactions_df[new_column] = getattr(date_parts, dt_attribute)

    # Compute every aggregate first, then draw the panels.
    monthly_total = transactions_df.groupby('Month')['TotalValue'].sum()
    weekday_mean = transactions_df.groupby('DayOfWeek')['TotalValue'].mean()
    hourly_mean = transactions_df.groupby('Hour')['TotalValue'].mean()
    weekday_hour_mean = transactions_df.pivot_table(
        values='TotalValue', index='DayOfWeek', columns='Hour', aggfunc='mean'
    )

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # Top-left: total sales per month
    sns.lineplot(x=monthly_total.index, y=monthly_total.values, ax=axes[0, 0], marker='o')
    axes[0, 0].set_title('Monthly Sales Pattern')

    # Top-right: mean transaction value per weekday (0 = Monday)
    sns.barplot(x=weekday_mean.index, y=weekday_mean.values, ax=axes[0, 1])
    axes[0, 1].set_title('Average Sales by Day of Week')

    # Bottom-left: mean transaction value per hour of day
    sns.lineplot(x=hourly_mean.index, y=hourly_mean.values, ax=axes[1, 0], marker='o')
    axes[1, 0].set_title('Average Sales by Hour')

    # Bottom-right: mean transaction value for each weekday/hour cell
    sns.heatmap(weekday_hour_mean, ax=axes[1, 1], cmap='YlOrRd')
    axes[1, 1].set_title('Sales Heatmap: Day vs Hour')

    plt.tight_layout()
    plt.show()

def analyze_category_performance(transactions_df, products_df):
    """Compare product categories by transaction count, revenue and basket size.

    Joins `transactions_df` to the 'ProductID'/'Category' columns of
    `products_df` on 'ProductID' (default inner merge, so transactions whose
    product is not in `products_df` drop out), then draws a 2x2 figure:
    transaction count, total revenue and mean transaction value as bar
    charts, and revenue share as a pie chart.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'Category', with two-level columns: ('TransactionID', 'count'),
        ('TotalValue', 'sum'/'mean') and ('Quantity', 'sum'/'mean').
    """
    # Attach each transaction's category.
    category_lookup = products_df[['ProductID', 'Category']]
    category_data = transactions_df.merge(category_lookup, on='ProductID')

    # One groupby object serves the summary table and every chart.
    by_category = category_data.groupby('Category')
    category_metrics = by_category.agg({
        'TransactionID': 'count',
        'TotalValue': ['sum', 'mean'],
        'Quantity': ['sum', 'mean']
    })

    revenue_by_category = by_category['TotalValue']

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # (axis, series to draw, panel title) for the three bar charts
    bar_panels = (
        (axes[0, 0], by_category['TransactionID'].count(), 'Number of Transactions by Category'),
        (axes[0, 1], revenue_by_category.sum(), 'Total Revenue by Category'),
        (axes[1, 0], revenue_by_category.mean(), 'Average Transaction Value by Category'),
    )
    for panel_ax, series, title in bar_panels:
        series.plot(kind='bar', ax=panel_ax)
        panel_ax.set_title(title)

    # Revenue share of each category
    revenue_by_category.sum().plot(kind='pie', ax=axes[1, 1], autopct='%1.1f%%')
    axes[1, 1].set_title('Category Revenue Share')

    plt.tight_layout()
    plt.show()

    return category_metrics

def main(transactions=None, products=None):
    """Run comprehensive visual EDA.

    Runs the four analysis steps in order: numerical distributions, customer
    behaviour, temporal patterns, category performance. Every step receives
    the same transactions frame object, and analyze_temporal_patterns adds
    date-part columns to it in place before the category step runs.

    Parameters
    ----------
    transactions : pandas.DataFrame, optional
        Transactions to analyse. Defaults to the notebook-level `transactions_df`.
    products : pandas.DataFrame, optional
        Product catalogue. Defaults to the notebook-level `products_df`.

    Returns
    -------
    tuple
        (customer_metrics, category_metrics) as returned by
        analyze_customer_behavior and analyze_category_performance.
    """
    # Fall back to the notebook globals so a bare main() behaves as before.
    if transactions is None:
        transactions = transactions_df
    if products is None:
        products = products_df

    # 1. Analyze numerical distributions and outliers
    print("Analyzing numerical distributions and outliers...")
    plot_numerical_distributions(transactions, products)

    # 2. Analyze customer behavior
    print("\nAnalyzing customer behavior...")
    customer_metrics = analyze_customer_behavior(transactions)

    # 3. Analyze temporal patterns
    print("\nAnalyzing temporal patterns...")
    analyze_temporal_patterns(transactions)

    # 4. Analyze category performance
    print("\nAnalyzing category performance...")
    category_metrics = analyze_category_performance(transactions, products)

    return customer_metrics, category_metrics

# Run the analysis: executes all four EDA steps (printing a progress line before
# each one) and binds the two returned summary tables to notebook-level names.
customer_metrics, category_metrics = main()