## 1. Import Libraries

In [None]:
# Data manipulation
import pandas as pd
import numpy as np
import os
from datetime import datetime

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Statistical analysis
from scipy import stats
from scipy.stats import pearsonr, spearmanr

# Configure visualization settings: dark-grid theme, 'husl' colour cycle,
# and default figure size / font size for every chart in the notebook.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 10

# Display settings: show every column, cap printed rows at 100, and render
# floats with two decimals.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# Suppress warnings
# NOTE: this silences every warning category for the rest of the session,
# including deprecation notices from pandas / matplotlib / seaborn.
import warnings
warnings.filterwarnings('ignore')

# Every chart and report produced below is written to ../results. Neither
# plt.savefig nor DataFrame.to_csv creates missing directories, so make sure
# the folder exists before any later cell tries to write into it.
os.makedirs(os.path.join('..', 'results'), exist_ok=True)

print("✓ Libraries imported successfully")

## 2. Load Data

In [None]:
# Read the cleaned transactions CSV, parsing the 'Date' column as datetimes
data_path = os.path.join('..', 'data', 'retail_transactions_clean.csv')
df = pd.read_csv(data_path, parse_dates=['Date'])

# Quick sanity check of what was loaded: size, covered period, total revenue
print(
    f"Dataset Shape: {df.shape}",
    f"Date Range: {df['Date'].min()} to {df['Date'].max()}",
    f"Total Revenue: ${df['NETAMT'].sum():,.2f}",
    "\nFirst few rows:",
    sep="\n",
)
df.head()

## 3. Data Overview

In [None]:
# Structural overview of the frame as reported by DataFrame.info()
divider = "=" * 60
print("Dataset Information:")
print(divider)
df.info()

# Null counts per column; only columns that actually contain gaps are listed
print("\n" + divider)
print("Missing Values:")
print(divider)
missing = df.isnull().sum()
if missing.any():
    print(missing[missing.gt(0)])
else:
    print("No missing values found ✓")

In [None]:
# pandas' default describe() summary of the dataset, shown as the cell output
print("Statistical Summary:")
summary_stats = df.describe()
summary_stats

## 4. Univariate Analysis

In [None]:
# Four-panel overview of the core transaction measures:
# revenue, quantity, discount percentage and conversion rate.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
ax_revenue, ax_quantity, ax_discount, ax_conversion = axes.ravel()
title_style = {'fontsize': 14, 'fontweight': 'bold'}

# Top-left: revenue histogram with the mean marked by a dashed red line
revenue_mean = df['NETAMT'].mean()
ax_revenue.hist(df['NETAMT'], bins=50, edgecolor='black', alpha=0.7)
ax_revenue.axvline(revenue_mean, color='red', linestyle='--',
                   label=f"Mean: ${revenue_mean:.2f}")
ax_revenue.set_title('Distribution of Transaction Revenue', **title_style)
ax_revenue.set_xlabel('Revenue ($)')
ax_revenue.set_ylabel('Frequency')
ax_revenue.legend()

# Top-right: one bar per distinct quantity value, ordered by quantity
qty_counts = df['Qty'].value_counts().sort_index()
ax_quantity.bar(qty_counts.index, qty_counts.values, edgecolor='black', alpha=0.7)
ax_quantity.set_title('Distribution of Quantity Sold', **title_style)
ax_quantity.set_xlabel('Quantity')
ax_quantity.set_ylabel('Frequency')

# Bottom row: the two percentage measures share the same layout
# (30-bin histogram plus a dashed mean line), differing only in colour/labels.
percentage_panels = (
    (ax_discount, 'DiscountPct', 'coral',
     'Distribution of Discount Percentage', 'Discount %'),
    (ax_conversion, 'ConversionPct', 'lightgreen',
     'Distribution of Conversion Rate', 'Conversion %'),
)
for panel, column, fill, heading, x_caption in percentage_panels:
    column_mean = df[column].mean()
    panel.hist(df[column], bins=30, edgecolor='black', alpha=0.7, color=fill)
    panel.axvline(column_mean, color='red', linestyle='--',
                  label=f"Mean: {column_mean:.2f}%")
    panel.set_title(heading, **title_style)
    panel.set_xlabel(x_caption)
    panel.set_ylabel('Frequency')
    panel.legend()

plt.tight_layout()
plt.savefig('../results/univariate_distributions.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Chart saved to results/univariate_distributions.png")

## 5. Category Analysis

In [None]:
# Total revenue per category, largest first
category_revenue = df.groupby('Category')['NETAMT'].sum().sort_values(ascending=False)

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
bar_ax, pie_ax = axes

# Left panel: bars annotated with revenue in millions
bar_ax.bar(category_revenue.index, category_revenue.values, edgecolor='black', alpha=0.8)
bar_ax.set_title('Total Revenue by Category', fontsize=14, fontweight='bold')
bar_ax.set_xlabel('Category')
bar_ax.set_ylabel('Revenue ($)')
bar_ax.tick_params(axis='x', rotation=45)
for position, amount in enumerate(category_revenue.values):
    # The label sits just above the bar it describes
    bar_ax.text(position, amount, f'${amount/1e6:.1f}M', ha='center', va='bottom')

# Right panel: the same totals as shares of the whole, one colour per category
colors = sns.color_palette('husl', len(category_revenue))
pie_ax.pie(
    category_revenue.values,
    labels=category_revenue.index,
    autopct='%1.1f%%',
    startangle=90,
    colors=colors,
    textprops={'fontsize': 11},
)
pie_ax.set_title('Revenue Share by Category', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.savefig('../results/category_revenue_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

# Text version of the chart, formatted as currency
print("\nCategory Revenue Summary:")
print(category_revenue.apply('${:,.2f}'.format))
print("\n✓ Chart saved to results/category_revenue_analysis.png")

In [None]:
# Revenue and units per (Category, SubCategory) pair
subcategory_data = (
    df.groupby(['Category', 'SubCategory'])
      .agg(NETAMT=('NETAMT', 'sum'), Qty=('Qty', 'sum'))
      .reset_index()
)

# Keep the ten pairs with the highest revenue
top_subcategories = subcategory_data.nlargest(10, 'NETAMT')
bar_positions = range(len(top_subcategories))
bar_revenues = top_subcategories['NETAMT'].values
tick_labels = [
    f"{row['Category']}-{row['SubCategory']}"
    for _, row in top_subcategories.iterrows()
]

# Horizontal bars; the y-axis is inverted so the largest entry is on top
plt.figure(figsize=(12, 6))
bars = plt.barh(bar_positions, bar_revenues, alpha=0.8)
plt.yticks(bar_positions, tick_labels)
plt.xlabel('Revenue ($)', fontsize=12)
plt.title('Top 10 SubCategories by Revenue', fontsize=14, fontweight='bold')
plt.gca().invert_yaxis()

# Annotate each bar's end with its revenue in millions
for position, amount in zip(bar_positions, bar_revenues):
    plt.text(amount, position, f' ${amount/1e6:.2f}M', va='center', fontsize=10)

plt.tight_layout()
plt.savefig('../results/top_subcategories.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Chart saved to results/top_subcategories.png")

## 6. Time Series Analysis

In [None]:
# Daily revenue trend: total NETAMT per calendar date.
daily_revenue = df.groupby('Date')['NETAMT'].sum()

fig, axes = plt.subplots(2, 1, figsize=(15, 10))

# Daily trend (thin line) with a smoothed overlay (red line).
# NOTE(review): rolling(window=7) averages the last 7 *rows*, i.e. the last 7
# dates present in the data. That is a true 7-day average only if no calendar
# dates are missing from the dataset - confirm.
axes[0].plot(daily_revenue.index, daily_revenue.values, linewidth=1, alpha=0.6)
axes[0].plot(daily_revenue.index, daily_revenue.rolling(window=7).mean(),
             linewidth=2, color='red', label='7-Day Moving Average')
axes[0].set_title('Daily Revenue Trend with 7-Day Moving Average', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Date')
axes[0].set_ylabel('Revenue ($)')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Monthly revenue: one bar per (Year, Month).
# Sort on Year first and Month second so the bars stay chronological when the
# data spans more than one year; sorting on Month alone would interleave the
# same month of different years.
# Assumes 'Month' holds the month number matching 'MonthName' - TODO confirm.
monthly_revenue = (
    df.groupby(['Year', 'MonthName', 'Month'])
      .agg({'NETAMT': 'sum'})
      .reset_index()
      .sort_values(['Year', 'Month'])
)

# Month names alone are ambiguous once several years are plotted, so append
# the year in that case; single-year data keeps the plain month names.
if monthly_revenue['Year'].nunique() > 1:
    month_labels = [
        f"{month_name} {year}"
        for month_name, year in zip(monthly_revenue['MonthName'], monthly_revenue['Year'])
    ]
else:
    month_labels = monthly_revenue['MonthName'].tolist()

month_positions = range(len(monthly_revenue))
axes[1].bar(month_positions, monthly_revenue['NETAMT'].values,
            edgecolor='black', alpha=0.8)
axes[1].set_xticks(month_positions)
axes[1].set_xticklabels(month_labels, rotation=45)
axes[1].set_title('Monthly Revenue Performance', fontsize=14, fontweight='bold')
axes[1].set_ylabel('Revenue ($)')
axes[1].grid(True, alpha=0.3, axis='y')

# Add value labels (revenue in millions) on top of each bar
for i, v in enumerate(monthly_revenue['NETAMT'].values):
    axes[1].text(i, v, f'${v/1e6:.1f}M', ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.savefig('../results/time_series_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print("✓ Chart saved to results/time_series_analysis.png")

In [None]:
# Revenue total, average ticket and transaction count per weekday name
dow_analysis = (
    df.groupby('DayName')
      .agg(
          Total_Revenue=('NETAMT', 'sum'),
          Avg_Transaction=('NETAMT', 'mean'),
          Transaction_Count=('NETAMT', 'count'),
      )
      .reset_index()
)

# Turn the day names into an ordered categorical so sorting follows the
# calendar week (Monday first) instead of the alphabet
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
dow_analysis['DayName'] = pd.Categorical(dow_analysis['DayName'], categories=day_order, ordered=True)
dow_analysis = dow_analysis.sort_values('DayName')

# Weekend bars are drawn red, weekday bars teal; both panels share the colours
weekend_names = ['Saturday', 'Sunday']
bar_colors = [
    '#FF6B6B' if day in weekend_names else '#4ECDC4'
    for day in dow_analysis['DayName']
]

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
dow_panels = (
    (axes[0], 'Total_Revenue', 'Revenue by Day of Week', 'Total Revenue ($)'),
    (axes[1], 'Transaction_Count', 'Transaction Count by Day of Week', 'Number of Transactions'),
)
for panel, column, heading, y_caption in dow_panels:
    panel.bar(dow_analysis['DayName'], dow_analysis[column],
              color=bar_colors, edgecolor='black', alpha=0.8)
    panel.set_title(heading, fontsize=14, fontweight='bold')
    panel.set_ylabel(y_caption)
    panel.tick_params(axis='x', rotation=45)
    panel.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('../results/day_of_week_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nDay of Week Analysis:")
print(dow_analysis)
print("\n✓ Chart saved to results/day_of_week_analysis.png")

## 7. Store Performance Analysis

In [None]:
# One row per store: revenue total and mean, units sold, mean conversion and
# mean discount, ranked by total revenue (highest first)
store_metrics = (
    df.groupby('Store')
      .agg(
          Total_Revenue=('NETAMT', 'sum'),
          Avg_Transaction=('NETAMT', 'mean'),
          Total_Units=('Qty', 'sum'),
          Avg_Conversion=('ConversionPct', 'mean'),
          Avg_Discount=('DiscountPct', 'mean'),
      )
      .reset_index()
      .sort_values('Total_Revenue', ascending=False)
)

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
heading_style = {'fontsize': 14, 'fontweight': 'bold'}

# Top-left: total revenue, each bar annotated in millions.
# NOTE(review): the labels are placed at x = 0, 1, 2, ..., which lines up with
# the bars only when 'Store' holds text labels (categorical axis) - confirm.
revenue_ax = axes[0, 0]
revenue_ax.bar(store_metrics['Store'], store_metrics['Total_Revenue'], edgecolor='black', alpha=0.8)
revenue_ax.set_title('Total Revenue by Store', **heading_style)
revenue_ax.set_ylabel('Revenue ($)')
revenue_ax.tick_params(axis='x', rotation=45)
for slot, revenue in enumerate(store_metrics['Total_Revenue'].values):
    revenue_ax.text(slot, revenue, f'${revenue/1e6:.1f}M', ha='center', va='bottom')

# Remaining panels share one layout and differ only in metric, colour and captions
metric_panels = (
    (axes[0, 1], 'Avg_Transaction', 'coral',
     'Average Transaction Value by Store', 'Avg Transaction ($)'),
    (axes[1, 0], 'Avg_Conversion', 'lightgreen',
     'Average Conversion Rate by Store', 'Conversion Rate (%)'),
    (axes[1, 1], 'Avg_Discount', 'gold',
     'Average Discount by Store', 'Discount (%)'),
)
for panel, column, fill, heading, y_caption in metric_panels:
    panel.bar(store_metrics['Store'], store_metrics[column],
              color=fill, edgecolor='black', alpha=0.8)
    panel.set_title(heading, **heading_style)
    panel.set_ylabel(y_caption)
    panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('../results/store_performance_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nStore Performance Metrics:")
print(store_metrics)
print("\n✓ Chart saved to results/store_performance_analysis.png")

## 8. Discount Impact Analysis

In [None]:
# Revenue, quantity and mean discount aggregated per discount segment
discount_analysis = (
    df.groupby('DiscountSegment')
      .agg(
          Total_Revenue=('NETAMT', 'sum'),
          Avg_Transaction=('NETAMT', 'mean'),
          Total_Qty=('Qty', 'sum'),
          Avg_Qty=('Qty', 'mean'),
          Avg_Discount=('DiscountPct', 'mean'),
      )
      .reset_index()
)

# Sort segments from no discount to high discount rather than alphabetically
segment_order = ['No Discount', 'Low Discount', 'Medium Discount', 'High Discount']
discount_analysis['DiscountSegment'] = pd.Categorical(
    discount_analysis['DiscountSegment'],
    categories=segment_order,
    ordered=True,
)
discount_analysis = discount_analysis.sort_values('DiscountSegment')

fig, axes = plt.subplots(1, 2, figsize=(16, 6))
revenue_ax, quantity_ax = axes
segments = discount_analysis['DiscountSegment']

# Left: total revenue per segment (default bar colour)
revenue_ax.bar(segments, discount_analysis['Total_Revenue'], edgecolor='black', alpha=0.8)
revenue_ax.set_title('Total Revenue by Discount Segment', fontsize=14, fontweight='bold')
revenue_ax.set_ylabel('Revenue ($)')

# Right: mean quantity per transaction in each segment
quantity_ax.bar(segments, discount_analysis['Avg_Qty'],
                color='coral', edgecolor='black', alpha=0.8)
quantity_ax.set_title('Average Quantity Sold by Discount Segment', fontsize=14, fontweight='bold')
quantity_ax.set_ylabel('Average Quantity')

# Shared cosmetics: slanted tick labels and light horizontal grid lines
for panel in (revenue_ax, quantity_ax):
    panel.tick_params(axis='x', rotation=45)
    panel.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.savefig('../results/discount_impact_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nDiscount Impact Analysis:")
print(discount_analysis)
print("\n✓ Chart saved to results/discount_impact_analysis.png")

## 9. Correlation Analysis

In [None]:
# Pairwise correlations (DataFrame.corr defaults) between the key numeric measures
numeric_cols = [
    'MRP', 'SellingPrice', 'Qty', 'NETAMT',
    'DiscountPct', 'ConversionPct', 'Footfall', 'AvgUnitPrice',
]
correlation_matrix = df[numeric_cols].corr()

# Annotated heatmap on a diverging palette centred at zero
heatmap_options = {
    'annot': True,
    'fmt': '.2f',
    'cmap': 'coolwarm',
    'center': 0,
    'square': True,
    'linewidths': 1,
    'cbar_kws': {'shrink': 0.8},
}
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Matrix of Key Metrics', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig('../results/correlation_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

# Rank every other metric by its correlation with revenue, strongest positive
# first; revenue's own (trivial) entry is left out of the printout
print("\nKey Correlations with Revenue (NETAMT):")
revenue_corr = correlation_matrix['NETAMT'].sort_values(ascending=False)
is_other_metric = revenue_corr.index != 'NETAMT'
print(revenue_corr[is_other_metric])
print("\n✓ Chart saved to results/correlation_matrix.png")

## 10. Price Segment Analysis

In [None]:
# Revenue, transaction count, units and mean discount per price segment
price_segment_analysis = (
    df.groupby('PriceSegment')
      .agg(
          Total_Revenue=('NETAMT', 'sum'),
          Avg_Transaction=('NETAMT', 'mean'),
          Transaction_Count=('NETAMT', 'count'),
          Total_Qty=('Qty', 'sum'),
          Avg_Discount=('DiscountPct', 'mean'),
      )
      .reset_index()
)

# Sort segments from cheapest to most expensive tier rather than alphabetically
segment_order = ['Budget', 'Economy', 'Mid-Range', 'Premium']
price_segment_analysis['PriceSegment'] = pd.Categorical(
    price_segment_analysis['PriceSegment'],
    categories=segment_order,
    ordered=True,
)
price_segment_analysis = price_segment_analysis.sort_values('PriceSegment')

fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Each panel: (axes, metric column, title, y-label, extra bar styling).
# The first panel keeps matplotlib's default bar colour.
segment_panels = (
    (axes[0, 0], 'Total_Revenue',
     'Total Revenue by Price Segment', 'Revenue ($)', {}),
    (axes[0, 1], 'Transaction_Count',
     'Transaction Count by Price Segment', 'Number of Transactions', {'color': 'coral'}),
    (axes[1, 0], 'Avg_Transaction',
     'Average Transaction Value by Price Segment', 'Avg Transaction ($)', {'color': 'lightgreen'}),
    (axes[1, 1], 'Avg_Discount',
     'Average Discount by Price Segment', 'Discount (%)', {'color': 'gold'}),
)
for panel, column, heading, y_caption, extra_style in segment_panels:
    panel.bar(price_segment_analysis['PriceSegment'], price_segment_analysis[column],
              edgecolor='black', alpha=0.8, **extra_style)
    panel.set_title(heading, fontsize=14, fontweight='bold')
    panel.set_ylabel(y_caption)
    panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.savefig('../results/price_segment_analysis.png', dpi=300, bbox_inches='tight')
plt.show()

print("\nPrice Segment Analysis:")
print(price_segment_analysis)
print("\n✓ Chart saved to results/price_segment_analysis.png")

## 11. Key Insights & Recommendations

In [None]:
# Generate key insights: print headline findings computed from `df`, followed
# by a fixed list of recommendations. The variables best_category, best_store,
# best_day, best_month and the weekend/weekday figures are left in the
# notebook namespace for the summary-export cell.
print("=" * 80)
print(" " * 20 + "KEY BUSINESS INSIGHTS")
print("=" * 80)

# Grand total used for every "share of revenue" figure below
total_revenue = df['NETAMT'].sum()

# 1. Best performing category (each groupby is computed once and reused)
category_totals = df.groupby('Category')['NETAMT'].sum()
best_category = category_totals.idxmax()
best_category_revenue = category_totals.max()
print(f"\n1. TOP CATEGORY: {best_category}")
print(f"   Revenue: ${best_category_revenue:,.2f}")
print(f"   Share: {best_category_revenue / total_revenue * 100:.1f}% of total revenue")

# 2. Best performing store
store_totals = df.groupby('Store')['NETAMT'].sum()
best_store = store_totals.idxmax()
best_store_revenue = store_totals.max()
print(f"\n2. TOP STORE: {best_store}")
print(f"   Revenue: ${best_store_revenue:,.2f}")
print(f"   Share: {best_store_revenue / total_revenue * 100:.1f}% of total revenue")

# 3. Best day of week
day_totals = df.groupby('DayName')['NETAMT'].sum()
best_day = day_totals.idxmax()
best_day_revenue = day_totals.max()
# Average over the number of distinct dates that actually fall on this weekday
# instead of assuming exactly 52 weeks of data; this mirrors how the
# weekend/weekday averages are computed further down.
best_day_occurrences = df.loc[df['DayName'] == best_day, 'Date'].nunique()
print(f"\n3. BEST DAY: {best_day}")
print(f"   Average Daily Revenue: ${best_day_revenue / best_day_occurrences:,.2f}")

# 4. Best month
# NOTE(review): grouping on MonthName pools the same month across years if the
# data spans more than one year - confirm that is intended.
month_totals = df.groupby('MonthName')['NETAMT'].sum()
best_month = month_totals.idxmax()
best_month_revenue = month_totals.max()
print(f"\n4. BEST MONTH: {best_month}")
print(f"   Revenue: ${best_month_revenue:,.2f}")

# 5. Discount effectiveness: mean quantity above vs. at-or-below a 20% discount
is_high_discount = df['DiscountPct'] > 20
high_discount = df.loc[is_high_discount, 'Qty'].mean()
low_discount = df.loc[df['DiscountPct'] <= 20, 'Qty'].mean()
print(f"\n5. DISCOUNT IMPACT:")
print(f"   High Discount (>20%) - Avg Qty: {high_discount:.2f} units")
print(f"   Low Discount (≤20%) - Avg Qty: {low_discount:.2f} units")
print(f"   Lift: {(high_discount / low_discount - 1) * 100:.1f}%")

# 6. Weekend vs Weekday: revenue per distinct calendar date in each group
weekend_rows = df[df['IsWeekend'] == 1]
weekday_rows = df[df['IsWeekend'] == 0]
weekend_revenue = weekend_rows['NETAMT'].sum()
weekday_revenue = weekday_rows['NETAMT'].sum()
weekend_days = weekend_rows['Date'].nunique()
weekday_days = weekday_rows['Date'].nunique()
weekend_daily_avg = weekend_revenue / weekend_days
weekday_daily_avg = weekday_revenue / weekday_days
print(f"\n6. WEEKEND vs WEEKDAY:")
print(f"   Weekend Avg Daily Revenue: ${weekend_daily_avg:,.2f}")
print(f"   Weekday Avg Daily Revenue: ${weekday_daily_avg:,.2f}")
print(f"   Weekend Premium: {weekend_daily_avg / weekday_daily_avg - 1:.1%}")

# 7. Price segment insights
premium_rows = df[df['PriceSegment'] == 'Premium']
premium_share = premium_rows['NETAMT'].sum() / total_revenue * 100
print(f"\n7. PREMIUM SEGMENT:")
print(f"   Revenue Share: {premium_share:.1f}%")
print(f"   Transactions: {len(premium_rows):,}")

# 8. Conversion insights
avg_conversion = df['ConversionPct'].mean()
store_conversion = df.groupby('Store')['ConversionPct'].mean()
best_conversion_store = store_conversion.idxmax()
best_conversion_rate = store_conversion.max()
print(f"\n8. CONVERSION RATE:")
print(f"   Overall Average: {avg_conversion:.2f}%")
print(f"   Best Store: {best_conversion_store} ({best_conversion_rate:.2f}%)")

# The recommendations are static text; they are not derived from the figures
# printed above.
print("\n" + "=" * 80)
print(" " * 25 + "RECOMMENDATIONS")
print("=" * 80)
print("\n1. Focus marketing efforts on weekend promotions for maximum impact")
print("2. Increase inventory for top-performing categories during peak months")
print("3. Optimize discount strategy - higher discounts drive quantity but may reduce margins")
print("4. Study best-performing store practices and replicate across network")
print("5. Target conversion rate improvement in underperforming stores")
print("6. Develop premium segment offerings for higher margins")
print("7. Plan inventory and staffing based on day-of-week patterns")
print("8. Focus on festival/promotional periods for maximum revenue capture")
print("=" * 80)

## 12. Export Summary Report

In [None]:
# Headline figures as (metric, formatted value) pairs. best_category,
# best_store, best_day, best_month and the weekend/weekday totals come from
# the key-insights cell, which must have been run first.
weekend_daily = weekend_revenue / weekend_days
weekday_daily = weekday_revenue / weekday_days
summary_rows = [
    ('Total Revenue', f"${df['NETAMT'].sum():,.2f}"),
    ('Total Transactions', f"{len(df):,}"),
    ('Total Units Sold', f"{df['Qty'].sum():,}"),
    ('Average Transaction Value', f"${df['NETAMT'].mean():,.2f}"),
    ('Average Discount %', f"{df['DiscountPct'].mean():.2f}%"),
    ('Average Conversion Rate', f"{df['ConversionPct'].mean():.2f}%"),
    ('Top Category', best_category),
    ('Top Store', best_store),
    ('Best Day of Week', best_day),
    ('Best Month', best_month),
    ('Weekend Premium', f"{weekend_daily / weekday_daily - 1:.1%}"),
]

# Split the pairs into the two-column layout used for the CSV
summary_report = {
    'Metric': [metric for metric, _ in summary_rows],
    'Value': [value for _, value in summary_rows],
}

summary_df = pd.DataFrame(summary_report)
summary_df.to_csv('../results/executive_summary.csv', index=False)

print("Executive Summary:")
print(summary_df.to_string(index=False))
print("\n✓ Summary report saved to results/executive_summary.csv")

# Closing banner
banner = "=" * 80
print("\n" + banner)
print(" " * 25 + "ANALYSIS COMPLETE")
print(banner)
print("\nAll visualizations and reports have been saved to the 'results' folder.")
print("Dashboard-ready data is available in the 'dashboard' folder.")