# ShopSphere E-Commerce Data Visualizations
## Customer Behavior, Retention & RFM Analysis

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Set style
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 10

In [None]:
# Load and prepare data
# Read the raw transaction export, then derive the columns the rest of the
# notebook relies on in a single assign step.
df = pd.read_csv("ShopSphere_Dataset.csv")
df = df.assign(
    InvoiceDate=pd.to_datetime(df['InvoiceDate']),
    CustomerID=df['CustomerID'].astype('object'),
    Revenue=df['Quantity'] * df['UnitPrice'],  # revenue of each row
)

# Customer metrics
# One row per CustomerID. Named aggregation produces the final column names
# directly, so no separate column-renaming step is needed.
customer_metrics = (
    df.groupby('CustomerID')
    .agg(
        Orders=('InvoiceNo', 'nunique'),
        Total_Revenue=('Revenue', 'sum'),
        Avg_Revenue=('Revenue', 'mean'),  # mean over rows, not over invoices
        Total_Items=('Quantity', 'sum'),
        First_Purchase=('InvoiceDate', 'min'),
        Last_Purchase=('InvoiceDate', 'max'),
        Product_Variety=('Description', 'nunique'),
    )
    .round(2)
)

# Recency is measured against the latest invoice date present in the data
analysis_date = df['InvoiceDate'].max()
elapsed_since_last = analysis_date - customer_metrics['Last_Purchase']
customer_metrics['Days_Since_Last_Purchase'] = elapsed_since_last.dt.days

# Customer segmentation
def categorize_customer(row):
    """Return the behavior segment label for one customer row.

    Reads ``row['Orders']`` and ``row['Total_Revenue']``. The tiers are
    checked from most to least demanding, so the first match wins:

    * ``'VIP'``: at least 10 orders and at least 5000 revenue
    * ``'Loyal'``: at least 5 orders and at least 2000 revenue
    * ``'Regular'``: at least 3 orders (any revenue)
    * ``'New/Occasional'``: everything else
    """
    # (label, minimum orders, minimum revenue), strictest tier first
    revenue_tiers = (
        ('VIP', 10, 5000),
        ('Loyal', 5, 2000),
    )
    for label, min_orders, min_revenue in revenue_tiers:
        if row['Orders'] >= min_orders and row['Total_Revenue'] >= min_revenue:
            return label

    # Below the revenue-gated tiers only the order count matters
    return 'Regular' if row['Orders'] >= 3 else 'New/Occasional'

# Tag every customer row with its behavior segment, then report dataset size
assigned_segments = customer_metrics.apply(categorize_customer, axis=1)
customer_metrics['Customer_Segment'] = assigned_segments

print(f"Data prepared: {len(df):,} transactions, {df['CustomerID'].nunique():,} customers")

## 1. Customer Behavior Segmentation Visualizations

In [None]:
# Customer Segmentation Visualization
# 2x2 figure: segment share (pie), total revenue per segment, average order
# value per segment, and the spread of order counts per segment (box plot).
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Customer Behavior Segmentation Analysis', fontsize=16, fontweight='bold')

# Each panel sorts its categories differently (by count, by revenue, by AOV,
# or in fixed order). Colors are therefore looked up by segment name rather
# than by position, so a segment keeps the same color in every panel.
segment_order = ['VIP', 'Loyal', 'Regular', 'New/Occasional']
colors = ['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4']
segment_colors = dict(zip(segment_order, colors))

# 1. Segment Distribution (Pie Chart)
segment_counts = customer_metrics['Customer_Segment'].value_counts()
axes[0,0].pie(segment_counts.values, labels=segment_counts.index, autopct='%1.1f%%',
              colors=[segment_colors[seg] for seg in segment_counts.index], startangle=90)
axes[0,0].set_title('Customer Segment Distribution', fontweight='bold')

# 2. Revenue by Segment (Bar Chart)
segment_revenue = customer_metrics.groupby('Customer_Segment')['Total_Revenue'].sum().sort_values(ascending=False)
bars = axes[0,1].bar(segment_revenue.index, segment_revenue.values,
                     color=[segment_colors[seg] for seg in segment_revenue.index])
axes[0,1].set_title('Total Revenue by Customer Segment', fontweight='bold')
axes[0,1].set_ylabel('Total Revenue ($)')
axes[0,1].tick_params(axis='x', rotation=45)
# Add value labels on bars
for bar in bars:
    height = bar.get_height()
    axes[0,1].text(bar.get_x() + bar.get_width()/2., height,
                   f'${height:,.0f}', ha='center', va='bottom')

# 3. Average Order Value by Segment
# AOV is segment revenue divided by segment order count. The Avg_Revenue
# column is a mean over individual rows of the transaction table, which
# understates order value whenever an invoice spans several rows.
segment_totals = customer_metrics.groupby('Customer_Segment')[['Total_Revenue', 'Orders']].sum()
segment_aov = (segment_totals['Total_Revenue'] / segment_totals['Orders']).sort_values(ascending=False)
bars = axes[1,0].bar(segment_aov.index, segment_aov.values,
                     color=[segment_colors[seg] for seg in segment_aov.index])
axes[1,0].set_title('Average Order Value by Segment', fontweight='bold')
axes[1,0].set_ylabel('Average Order Value ($)')
axes[1,0].tick_params(axis='x', rotation=45)
for bar in bars:
    height = bar.get_height()
    axes[1,0].text(bar.get_x() + bar.get_width()/2., height,
                   f'${height:.0f}', ha='center', va='bottom')

# 4. Orders Distribution by Segment (Box Plot)
# Only segments that actually occur are drawn, in the fixed tier order.
present_segments = set(customer_metrics['Customer_Segment'].unique())
segment_labels = [seg for seg in segment_order if seg in present_segments]
segment_data = [customer_metrics[customer_metrics['Customer_Segment'] == seg]['Orders'].values
                for seg in segment_labels]

box_plot = axes[1,1].boxplot(segment_data, labels=segment_labels, patch_artist=True)
for patch, seg in zip(box_plot['boxes'], segment_labels):
    patch.set_facecolor(segment_colors[seg])
axes[1,1].set_title('Order Frequency Distribution by Segment', fontweight='bold')
axes[1,1].set_ylabel('Number of Orders')
axes[1,1].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 2. Top Countries by Unique Customers

In [None]:
# Top Countries Analysis
# All per-country aggregates are computed up front; the plotting section
# below only draws them.
top_countries = df.groupby('Country')['CustomerID'].nunique().sort_values(ascending=False)
country_metrics = df.groupby('Country').agg({
    'Revenue': 'sum',
    'CustomerID': 'nunique'
})
country_metrics['Revenue_Per_Customer'] = country_metrics['Revenue'].div(country_metrics['CustomerID'])
revenue_per_customer = country_metrics['Revenue_Per_Customer'].sort_values(ascending=False)
colors = ['#FF6B6B', '#4ECDC4']

fig, axes = plt.subplots(1, 2, figsize=(15, 6))
fig.suptitle('Geographic Analysis: Top Countries Performance', fontsize=16, fontweight='bold')
ax_customers, ax_revenue = axes

# 1. Unique Customers by Country
bars1 = ax_customers.bar(top_countries.index, top_countries.values, color=colors)
ax_customers.set_title('Unique Customers by Country', fontweight='bold')
ax_customers.set_ylabel('Number of Unique Customers')
ax_customers.tick_params(axis='x', rotation=45)
for bar in bars1:
    # Count label centered above each bar
    height = bar.get_height()
    ax_customers.text(bar.get_x() + bar.get_width()/2., height,
                      f'{height:,}', ha='center', va='bottom')

# 2. Revenue per Customer by Country
bars2 = ax_revenue.bar(revenue_per_customer.index, revenue_per_customer.values, color=colors)
ax_revenue.set_title('Revenue per Customer by Country', fontweight='bold')
ax_revenue.set_ylabel('Revenue per Customer ($)')
ax_revenue.tick_params(axis='x', rotation=45)
for bar in bars2:
    # Dollar label centered above each bar
    height = bar.get_height()
    ax_revenue.text(bar.get_x() + bar.get_width()/2., height,
                    f'${height:,.0f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

# Summary statistics, listed in descending order of customer count
print("\nCountry Performance Summary:")
print("-" * 40)
for country, customers in top_countries.items():
    revenue_per_cust = revenue_per_customer[country]
    total_revenue = country_metrics.loc[country, 'Revenue']
    print(f"{country}:")
    print(f"  • Customers: {customers:,}")
    print(f"  • Revenue per Customer: ${revenue_per_cust:,.2f}")
    print(f"  • Total Revenue: ${total_revenue:,.2f}")
    print()

## 3. Retention Analysis Visualizations

In [None]:
# Retention Analysis
# Customer lifetime = whole days between a customer's first and last purchase.
customer_metrics['Customer_Lifetime_Days'] = (
    customer_metrics['Last_Purchase'] - customer_metrics['First_Purchase']
).dt.days
# Customers whose first and last purchase fall on the same day are left out
# of the lifetime panel.
repeat_customers = customer_metrics.loc[customer_metrics['Customer_Lifetime_Days'].gt(0)]

fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Customer Retention Analysis', fontsize=16, fontweight='bold')

# 1. Days Since Last Purchase Distribution
ax_recency = axes[0,0]
days_since_last = customer_metrics['Days_Since_Last_Purchase']
ax_recency.hist(days_since_last, bins=50, color='#FF6B6B', alpha=0.7, edgecolor='black')
ax_recency.set_title('Days Since Last Purchase Distribution', fontweight='bold')
ax_recency.set_xlabel('Days Since Last Purchase')
ax_recency.set_ylabel('Number of Customers')
# Dashed marker at the mean recency
ax_recency.axvline(days_since_last.mean(), color='red', linestyle='--',
                   label=f'Mean: {customer_metrics["Days_Since_Last_Purchase"].mean():.0f} days')
ax_recency.legend()

# 2. Customer Lifetime (Days between first and last purchase)
ax_lifetime = axes[0,1]
lifetime_days = repeat_customers['Customer_Lifetime_Days']
ax_lifetime.hist(lifetime_days, bins=50, color='#4ECDC4', alpha=0.7, edgecolor='black')
ax_lifetime.set_title('Customer Lifetime Distribution (Repeat Customers)', fontweight='bold')
ax_lifetime.set_xlabel('Customer Lifetime (Days)')
ax_lifetime.set_ylabel('Number of Customers')
# Dashed marker at the mean lifetime
ax_lifetime.axvline(lifetime_days.mean(), color='blue', linestyle='--',
                    label=f'Mean: {repeat_customers["Customer_Lifetime_Days"].mean():.0f} days')
ax_lifetime.legend()

# 3. Retention Risk Categories
def retention_risk(days):
    """Map a days-since-last-purchase value to a retention risk label.

    Buckets are inclusive at their upper bound: up to 30 days is active,
    up to 90 is at risk, up to 180 is high risk, and anything that matches
    none of those bounds is labeled lost.
    """
    # (inclusive upper bound in days, label), checked in ascending order
    buckets = (
        (30, 'Active (≤30 days)'),
        (90, 'At Risk (31-90 days)'),
        (180, 'High Risk (91-180 days)'),
    )
    for upper_bound, label in buckets:
        if days <= upper_bound:
            return label
    return 'Lost (>180 days)'

# Bucket every customer by recency and count the buckets in a fixed,
# least-to-most-lapsed order (buckets with no customers are omitted).
customer_metrics['Retention_Risk'] = customer_metrics['Days_Since_Last_Purchase'].apply(retention_risk)
risk_order = ['Active (≤30 days)', 'At Risk (31-90 days)', 'High Risk (91-180 days)', 'Lost (>180 days)']
risk_counts = customer_metrics['Retention_Risk'].value_counts()
observed_risks = [cat for cat in risk_order if cat in risk_counts.index]
risk_counts = risk_counts.reindex(observed_risks)

colors_risk = ['#2ECC71', '#F39C12', '#E74C3C', '#95A5A6']
ax_risk = axes[1,0]
bar_positions = range(len(risk_counts))
bars = ax_risk.bar(bar_positions, risk_counts.values, color=colors_risk[:len(risk_counts)])
ax_risk.set_title('Customer Retention Risk Categories', fontweight='bold')
ax_risk.set_ylabel('Number of Customers')
ax_risk.set_xticks(bar_positions)
ax_risk.set_xticklabels(risk_counts.index, rotation=45, ha='right')

# Add percentage labels
total_customers = risk_counts.sum()
for bar in bars:
    height = bar.get_height()
    percentage = (height / total_customers) * 100
    ax_risk.text(bar.get_x() + bar.get_width()/2., height,
                 f'{height:,}\n({percentage:.1f}%)', ha='center', va='bottom')

# 4. Retention by Customer Segment
# Mean recency per behavior segment, most recently active segment first
ax_segment = axes[1,1]
segment_retention = customer_metrics.groupby('Customer_Segment')['Days_Since_Last_Purchase'].mean().sort_values()
bars = ax_segment.bar(segment_retention.index, segment_retention.values,
                      color=['#96CEB4', '#45B7D1', '#4ECDC4', '#FF6B6B'])
ax_segment.set_title('Average Days Since Last Purchase by Segment', fontweight='bold')
ax_segment.set_ylabel('Average Days Since Last Purchase')
ax_segment.tick_params(axis='x', rotation=45)

for bar in bars:
    height = bar.get_height()
    ax_segment.text(bar.get_x() + bar.get_width()/2., height,
                    f'{height:.0f}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

# Print retention insights
print("\nRetention Analysis Insights:")
print("-" * 40)
print(f"• {risk_counts.get('Active (≤30 days)', 0):,} customers are active (recent purchase)")
print(f"• {risk_counts.get('At Risk (31-90 days)', 0):,} customers are at risk of churning")
print(f"• {risk_counts.get('High Risk (91-180 days)', 0):,} customers are high churn risk")
print(f"• {risk_counts.get('Lost (>180 days)', 0):,} customers appear to be lost")
print(f"\n• Average customer lifetime: {repeat_customers['Customer_Lifetime_Days'].mean():.0f} days")
print(f"• {len(repeat_customers):,} customers made repeat purchases ({len(repeat_customers)/len(customer_metrics)*100:.1f}%)")

## 4. RFM Analysis Visualizations

In [None]:
# RFM Analysis
# One row per customer: last purchase date, number of distinct invoices and
# total revenue. Named aggregation sets the final column names directly.
rfm_data = (
    df.groupby('CustomerID')
    .agg(
        LastPurchase=('InvoiceDate', 'max'),
        Frequency=('InvoiceNo', 'nunique'),
        Monetary=('Revenue', 'sum'),
    )
    .reset_index()
)
rfm_data['Recency'] = (analysis_date - rfm_data['LastPurchase']).dt.days

# Create RFM scores
# Each metric is ranked (ties broken by order of appearance) and cut into
# quintiles. Recency labels run 5..1 so that a smaller day count scores
# higher; frequency and monetary labels run 1..5.
score_specs = (
    ('R_Score', 'Recency', [5, 4, 3, 2, 1]),
    ('F_Score', 'Frequency', [1, 2, 3, 4, 5]),
    ('M_Score', 'Monetary', [1, 2, 3, 4, 5]),
)
for score_column, metric_column, quintile_labels in score_specs:
    ranked_metric = rfm_data[metric_column].rank(method='first')
    rfm_data[score_column] = pd.qcut(ranked_metric, 5, labels=quintile_labels)

# RFM Segmentation
def rfm_segment(row):
    """Return the RFM segment name for one customer row.

    Reads ``row['R_Score']``, ``row['F_Score']`` and ``row['M_Score']``,
    converting each to ``int``. Rules are evaluated top to bottom and the
    first match wins; rows matching no rule are labeled ``'Others'``.
    """
    r, f, m = int(row['R_Score']), int(row['F_Score']), int(row['M_Score'])
    if r >= 4 and f >= 4 and m >= 4:
        return 'Champions'
    if r >= 3 and f >= 3 and m >= 3:
        return 'Loyal Customers'
    if r >= 4 and f <= 2:
        return 'New Customers'
    # 'Cannot Lose Them' must be tested before 'At Risk': its condition is a
    # strict subset of the 'At Risk' condition, so in the opposite order it
    # could never be returned.
    if r <= 2 and f >= 4 and m >= 4:
        return 'Cannot Lose Them'
    if r <= 2 and f >= 3 and m >= 3:
        return 'At Risk'
    if r >= 3 and f <= 3 and m <= 3:
        return 'Potential Loyalists'
    return 'Others'

# Label every customer with an RFM segment
rfm_data['Segment'] = rfm_data.apply(rfm_segment, axis=1)

# Create RFM visualizations
# 2x2 figure: segment share, score distributions, revenue per segment and a
# frequency-vs-monetary scatter colored by recency.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('RFM Analysis: Customer Segmentation', fontsize=16, fontweight='bold')
ax_share, ax_scores = axes[0,0], axes[0,1]
ax_value, ax_scatter = axes[1,0], axes[1,1]

# 1. RFM Segment Distribution
segment_counts = rfm_data['Segment'].value_counts()
colors_rfm = plt.cm.Set3(np.linspace(0, 1, len(segment_counts)))
wedges, texts, autotexts = ax_share.pie(segment_counts.values, labels=segment_counts.index,
                                        autopct='%1.1f%%', colors=colors_rfm, startangle=90)
ax_share.set_title('RFM Customer Segments Distribution', fontweight='bold')

# 2. RFM Scores Distribution
# One cluster of narrow bars per metric, one bar per score value
rfm_scores = ['R_Score', 'F_Score', 'M_Score']
rfm_labels = ['Recency', 'Frequency', 'Monetary']
x_pos = np.arange(len(rfm_labels))

for cluster_center, score, label in zip(x_pos, rfm_scores, rfm_labels):
    score_dist = rfm_data[score].value_counts().sort_index()
    bar_offsets = np.arange(len(score_dist))*0.1 - 0.2
    ax_scores.bar(cluster_center + bar_offsets, score_dist.values,
                  width=0.08, label=label, alpha=0.7)

ax_scores.set_title('RFM Scores Distribution', fontweight='bold')
ax_scores.set_xlabel('Score (1-5)')
ax_scores.set_ylabel('Number of Customers')
ax_scores.legend()
ax_scores.set_xticks([0, 1, 2])
ax_scores.set_xticklabels(rfm_labels)

# 3. Segment Value Analysis
# Per-segment means (kept as a lookup table; not drawn in this figure)
segment_value = rfm_data.groupby('Segment').agg({
    'Monetary': 'mean',
    'Frequency': 'mean',
    'Recency': 'mean'
}).round(2)

segment_revenue = rfm_data.groupby('Segment')['Monetary'].sum().sort_values(ascending=False)
bars = ax_value.bar(segment_revenue.index, segment_revenue.values, color=colors_rfm[:len(segment_revenue)])
ax_value.set_title('Total Revenue by RFM Segment', fontweight='bold')
ax_value.set_ylabel('Total Revenue ($)')
ax_value.tick_params(axis='x', rotation=45)

for bar in bars:
    # Vertical dollar label above each bar
    height = bar.get_height()
    ax_value.text(bar.get_x() + bar.get_width()/2., height,
                  f'${height:,.0f}', ha='center', va='bottom', rotation=90)

# 4. RFM Scatter Plot (Frequency vs Monetary, colored by Recency)
scatter = ax_scatter.scatter(rfm_data['Frequency'], rfm_data['Monetary'],
                             c=rfm_data['Recency'], cmap='RdYlBu_r', alpha=0.6, s=50)
ax_scatter.set_title('Customer Distribution: Frequency vs Monetary Value', fontweight='bold')
ax_scatter.set_xlabel('Frequency (Number of Orders)')
ax_scatter.set_ylabel('Monetary Value ($)')
cbar = plt.colorbar(scatter, ax=ax_scatter)
cbar.set_label('Recency (Days)', rotation=270, labelpad=15)

plt.tight_layout()
plt.show()

# RFM Summary Statistics
# Per-segment customer count, mean recency/frequency/monetary value and the
# segment's share of all customers, printed one segment at a time.
print("\nRFM Segment Analysis:")
print("-" * 50)
segment_summary = rfm_data.groupby('Segment').agg({
    'CustomerID': 'count',
    'Recency': 'mean',
    'Frequency': 'mean',
    'Monetary': 'mean'
}).round(2)

segment_summary.columns = ['Count', 'Avg_Recency', 'Avg_Frequency', 'Avg_Monetary']
segment_summary['Percentage'] = (segment_summary['Count'] / segment_summary['Count'].sum() * 100).round(1)

for segment, data in segment_summary.iterrows():
    # iterrows() returns each row as a single-dtype Series, so the integer
    # Count arrives as a float; convert back so it prints "1,234" rather
    # than "1,234.0".
    customer_count = int(data['Count'])
    print(f"{segment}:")
    print(f"  • Customers: {customer_count:,} ({data['Percentage']}%)")
    print(f"  • Avg Recency: {data['Avg_Recency']:.0f} days")
    print(f"  • Avg Frequency: {data['Avg_Frequency']:.1f} orders")
    print(f"  • Avg Monetary: ${data['Avg_Monetary']:,.2f}")
    print()

## 5. Summary Dashboard

In [None]:
# Create a summary dashboard
# Layout: a 3x4 grid. Row 0 holds four key-metric cards; rows 1 and 2 each
# hold two bar charts spanning two grid columns.
fig = plt.figure(figsize=(16, 10))
fig.suptitle('ShopSphere E-Commerce: Executive Dashboard', fontsize=18, fontweight='bold')

# Create a grid layout
gs = fig.add_gridspec(3, 4, hspace=0.3, wspace=0.3)

# Key Metrics (Top row)
ax1 = fig.add_subplot(gs[0, 0])
ax2 = fig.add_subplot(gs[0, 1])
ax3 = fig.add_subplot(gs[0, 2])
ax4 = fig.add_subplot(gs[0, 3])

# Key metrics cards
total_customers = df['CustomerID'].nunique()
total_revenue = df['Revenue'].sum()
# Order value = revenue summed per invoice, averaged over invoices
avg_order_value = df.groupby('InvoiceNo')['Revenue'].sum().mean()
repeat_customer_rate = len(repeat_customers) / len(customer_metrics) * 100

# (value, card label, background color)
metrics = [
    (total_customers, 'Total\nCustomers', '#FF6B6B'),
    (total_revenue, 'Total\nRevenue ($)', '#4ECDC4'),
    (avg_order_value, 'Avg Order\nValue ($)', '#45B7D1'),
    (repeat_customer_rate, 'Repeat Customer\nRate (%)', '#96CEB4')
]

axes_metrics = [ax1, ax2, ax3, ax4]
for ax, (value, label, color) in zip(axes_metrics, metrics):
    # The rate card is shown as a percentage; the others as whole numbers
    ax.text(0.5, 0.7, f'{value:,.0f}' if 'Rate' not in label else f'{value:.1f}%',
            ha='center', va='center', fontsize=20, fontweight='bold', transform=ax.transAxes)
    ax.text(0.5, 0.3, label, ha='center', va='center', fontsize=12, transform=ax.transAxes)
    ax.set_facecolor(color)
    ax.set_alpha(0.3)
    ax.set_xticks([])
    ax.set_yticks([])

# Customer Segments (Middle left)
# Recount from customer_metrics instead of reusing `segment_counts`: the RFM
# cell reassigns that name to the RFM segment counts, which would make this
# panel a duplicate of the RFM panel next to it.
ax5 = fig.add_subplot(gs[1, :2])
behavior_segment_counts = customer_metrics['Customer_Segment'].value_counts()
behavior_segment_counts.plot(kind='bar', ax=ax5, color=['#FF6B6B', '#4ECDC4', '#45B7D1', '#96CEB4'])
ax5.set_title('Customer Segments', fontweight='bold')
ax5.set_ylabel('Number of Customers')
ax5.tick_params(axis='x', rotation=45)

# RFM Segments (Middle right)
ax6 = fig.add_subplot(gs[1, 2:])
top_rfm_segments = rfm_data['Segment'].value_counts().head(6)
top_rfm_segments.plot(kind='bar', ax=ax6, color=plt.cm.Set3(np.linspace(0, 1, len(top_rfm_segments))))
ax6.set_title('Top RFM Segments', fontweight='bold')
ax6.set_ylabel('Number of Customers')
ax6.tick_params(axis='x', rotation=45)

# Retention Risk (Bottom left)
ax7 = fig.add_subplot(gs[2, :2])
risk_counts.plot(kind='bar', ax=ax7, color=['#2ECC71', '#F39C12', '#E74C3C', '#95A5A6'])
ax7.set_title('Customer Retention Risk', fontweight='bold')
ax7.set_ylabel('Number of Customers')
ax7.tick_params(axis='x', rotation=45)

# Country Performance (Bottom right)
ax8 = fig.add_subplot(gs[2, 2:])
top_countries.plot(kind='bar', ax=ax8, color=['#FF6B6B', '#4ECDC4'])
ax8.set_title('Customers by Country', fontweight='bold')
ax8.set_ylabel('Number of Customers')
ax8.tick_params(axis='x', rotation=45)

plt.show()

# Print executive summary
# Headline counts are computed first so the report section below is output only.
is_vip = customer_metrics['Customer_Segment'] == 'VIP'
vip_customers = int(is_vip.sum())
at_risk_customers = risk_counts.get('At Risk (31-90 days)', 0)

print("\n" + "="*60)
print("EXECUTIVE SUMMARY")
print("="*60)

# Overall size of the business
print(f"📊 Business Overview:")
print(f"   • {total_customers:,} total customers across {df['Country'].nunique()} countries")
print(f"   • ${total_revenue:,.2f} total revenue generated")
print(f"   • ${avg_order_value:.2f} average order value")
print(f"   • {repeat_customer_rate:.1f}% repeat customer rate")

# Segment and market highlights
print(f"\n🎯 Customer Insights:")
print(f"   • {vip_customers:,} VIP customers driving premium revenue")
print(f"   • {at_risk_customers:,} customers at risk of churning")
print(f"   • Top market: {top_countries.index[0]} with {top_countries.iloc[0]:,} customers")

# Suggested follow-ups
print(f"\n🚀 Action Items:")
print(f"   • Target {at_risk_customers:,} at-risk customers with retention campaigns")
print(f"   • Expand successful strategies from {top_countries.index[0]} to other markets")
print(f"   • Focus on converting {len(customer_metrics[customer_metrics['Customer_Segment'] == 'Regular']):,} regular customers to loyal status")