# Machine Learning Results Visualization

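Visualize the results of the churn prediction models: model performance metrics, feature importance, churn by customer segment, and overall business KPIs, all read from the `gold` Hive tables.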

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Build (or reuse) a Hive-enabled Spark session named "MLResults".
# The warehouse directory and metastore URI are the literal values below.
spark = (
    SparkSession.builder
    .appName("MLResults")
    .config("spark.sql.warehouse.dir", "hdfs://namenode:9000/user/hive/warehouse")
    .config("hive.metastore.uris", "thrift://hive-metastore:9083")
    .enableHiveSupport()
    .getOrCreate()
)

print("✅ Spark session initialized!")

## 1. Model Performance Comparison

In [None]:
# Load model metrics into pandas for plotting.
# NOTE(review): presumably one row per model, keyed by `model_name` - confirm
# against the job that writes gold.model_metrics.
metrics_df = spark.table("gold.model_metrics").toPandas()

print("Model Performance Metrics:")
print(metrics_df)

# Visualize metrics: one bar chart per metric on a 2x3 grid.
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
fig.suptitle('Model Performance Comparison', fontsize=16, fontweight='bold')

metrics = ['auc_roc', 'auc_pr', 'accuracy', 'precision', 'recall', 'f1_score']
titles = ['AUC-ROC', 'AUC-PR', 'Accuracy', 'Precision', 'Recall', 'F1-Score']
colors = ['skyblue', 'lightcoral']

for ax, metric, title in zip(axes.flat, metrics, titles):
    # Coerce to float so the comparison and formatting below work however the
    # column arrived from Spark (a no-op for columns that are already float).
    scores = metrics_df[metric].astype(float)

    bars = ax.bar(metrics_df['model_name'], scores, color=colors)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_ylabel('Score')

    # Keep the zoomed-in 0.9-1.01 window only when every score fits inside it.
    # Otherwise start the axis at zero: with a fixed 0.9 floor, any bar below
    # 0.9 would be drawn entirely outside the visible range.
    y_floor = 0.9 if scores.min() >= 0.9 else 0.0
    ax.set_ylim([y_floor, 1.01])
    ax.tick_params(axis='x', rotation=45)

    # Add value labels. They are anchored to the drawn bars and offset in
    # points (not data units), so the gap is the same for either y-range.
    for bar, v in zip(bars, scores):
        ax.annotate(
            f'{v:.4f}',
            xy=(bar.get_x() + bar.get_width() / 2, v),
            xytext=(0, 3),
            textcoords='offset points',
            ha='center',
            va='bottom',
            fontsize=10,
        )

plt.tight_layout()
plt.show()

## 2. Feature Importance Analysis

In [None]:
# Fetch the 15 rows with the largest `importance` value, sorted descending.
feature_imp = (
    spark.table("gold.feature_importance")
    .orderBy(F.col("importance").desc())
    .limit(15)
    .toPandas()
)

print("Top 15 Most Important Features:")
print(feature_imp)

# Horizontal bar chart of the importance scores.
fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(feature_imp['feature'], feature_imp['importance'], color='steelblue')
ax.set_xlabel('Importance Score', fontsize=12)
ax.set_title('Top 15 Feature Importance (Random Forest)', fontsize=14, fontweight='bold')
ax.invert_yaxis()  # first (largest) row ends up at the top of the chart
ax.grid(axis='x', alpha=0.3)

# Print each score just to the right of its bar.
for position, score in enumerate(feature_imp['importance']):
    ax.text(score + 0.005, position, f'{score:.4f}', va='center')

fig.tight_layout()
plt.show()

## 3. Churn Analysis by Segments

In [None]:
# Load churn KPIs per customer segment into pandas.
churn_segments = spark.table("gold.kpi_churn_by_segments").toPandas()

print("Churn Rate by Customer Segments:")
print(churn_segments)

# Create segment labels from the two flag columns (truthy -> High / Repeat).
# Built column-wise rather than with DataFrame.apply(axis=1), which returns an
# empty DataFrame (not a Series) for an empty table and breaks the assignment.
churn_segments['segment'] = [
    f"{'High' if high_value else 'Low'} Value, {'Repeat' if repeat else 'New'}"
    for high_value, repeat in zip(
        churn_segments['high_value_session'], churn_segments['repeat_customer']
    )
]

# Visualize churn by segments: rate on the left, session volume on the right.
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Label placement shared by both panels. The gap above each bar is given in
# points instead of data units, so it is correct whether the axis spans a few
# percent or millions of sessions.
label_style = dict(
    xytext=(0, 3),
    textcoords='offset points',
    ha='center',
    va='bottom',
    fontweight='bold',
)

# Churn rate by segment (the code scales churn_rate by 100 for display)
churn_pct = churn_segments['churn_rate'] * 100
rate_bars = axes[0].bar(churn_segments['segment'], churn_pct, color=['coral', 'lightgreen'])
axes[0].set_title('Churn Rate by Customer Segment', fontsize=14, fontweight='bold')
axes[0].set_ylabel('Churn Rate (%)')
axes[0].tick_params(axis='x', rotation=45)
for bar, v in zip(rate_bars, churn_pct):
    axes[0].annotate(f'{v:.1f}%', xy=(bar.get_x() + bar.get_width() / 2, v), **label_style)

# Session distribution by segment
session_bars = axes[1].bar(
    churn_segments['segment'], churn_segments['total_sessions'], color=['skyblue', 'orange']
)
axes[1].set_title('Total Sessions by Segment', fontsize=14, fontweight='bold')
axes[1].set_ylabel('Number of Sessions')
axes[1].tick_params(axis='x', rotation=45)
for bar, v in zip(session_bars, churn_segments['total_sessions']):
    axes[1].annotate(f'{v:,}', xy=(bar.get_x() + bar.get_width() / 2, v), **label_style)

plt.tight_layout()
plt.show()

## 4. Overall Business Summary

In [None]:
# Load overall summary into pandas; only its first row is used below.
summary = spark.table("gold.kpi_overall_summary_sample").toPandas()

print("Overall Business Summary:")
print(summary)

# Fail with a clear message instead of an opaque IndexError from .iloc[0].
if summary.empty:
    raise ValueError("gold.kpi_overall_summary_sample returned no rows; nothing to plot")

# Extract metrics. Each value is read from its own column (not from a row
# Series) so integer counts keep their dtype and format without a trailing ".0".
total_users = summary['unique_users_in_sample'].iloc[0]
total_sessions = summary['total_sessions_in_sample'].iloc[0]
total_purchases = summary['total_purchases_in_sample'].iloc[0]
total_revenue = summary['total_revenue_in_sample'].iloc[0]
churn_rate = summary['overall_churn_rate_in_sample'].iloc[0]


def _label_bars(ax, bars, texts):
    """Write one bold, centered label just above each bar in `bars`.

    The gap between bar and label is given in points rather than data units,
    so the placement does not depend on the magnitude of the plotted values.
    """
    for bar, text in zip(bars, texts):
        ax.annotate(
            text,
            xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()),
            xytext=(0, 3),
            textcoords='offset points',
            ha='center',
            va='bottom',
            fontweight='bold',
        )


# Create summary visualization (2x2 dashboard)
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Business Performance Dashboard', fontsize=16, fontweight='bold')

# Metric 1: User & Session Stats
bars = axes[0, 0].bar(['Users', 'Sessions'], [total_users, total_sessions], color=['steelblue', 'coral'])
axes[0, 0].set_title('Users & Sessions', fontsize=12, fontweight='bold')
axes[0, 0].set_ylabel('Count')
_label_bars(axes[0, 0], bars, [f'{total_users:,}', f'{total_sessions:,}'])

# Metric 2: Revenue & Purchases
# NOTE(review): the two bars share one axis but carry different units
# (currency vs. count), so their heights are not directly comparable.
bars = axes[0, 1].bar(['Total Revenue ($)', 'Total Purchases'], [total_revenue, total_purchases],
                      color=['green', 'orange'])
axes[0, 1].set_title('Revenue & Purchases', fontsize=12, fontweight='bold')
axes[0, 1].set_ylabel('Value')
_label_bars(axes[0, 1], bars, [f'${total_revenue:,.0f}', f'{total_purchases:,}'])

# Metric 3: Churn Rate (churn_rate is used as a fraction: the rest is 1 - churn_rate)
axes[1, 0].pie([churn_rate, 1-churn_rate], labels=['Churned', 'Active'],
               autopct='%1.1f%%', colors=['red', 'lightgreen'], startangle=90)
axes[1, 0].set_title('Churn Rate Distribution', fontsize=12, fontweight='bold')

# Metric 4: Key Ratios (guarded against division by zero)
purchase_rate = (total_purchases / total_sessions * 100) if total_sessions > 0 else 0
revenue_per_user = total_revenue / total_users if total_users > 0 else 0
bars = axes[1, 1].bar(['Purchase Rate (%)', 'Revenue/User ($)'],
                      [purchase_rate, revenue_per_user], color=['purple', 'gold'])
axes[1, 1].set_title('Key Performance Ratios', fontsize=12, fontweight='bold')
axes[1, 1].set_ylabel('Value')
_label_bars(axes[1, 1], bars, [f'{purchase_rate:.2f}%', f'${revenue_per_user:.2f}'])

plt.tight_layout()
plt.show()

## 5. Key Insights & Recommendations

In [None]:
# This cell reuses `metrics_df` and `churn_rate` assigned by earlier cells.

# Get best model: the row with the highest AUC-ROC.
best_model = metrics_df.loc[metrics_df['auc_roc'].idxmax()]

# Get top 3 features by importance
top_features = (
    spark.table("gold.feature_importance")
    .orderBy(F.col("importance").desc())
    .limit(3)
    .toPandas()
)

print("=" * 70)
print("KEY INSIGHTS & BUSINESS RECOMMENDATIONS")
print("=" * 70)

print("\n🎯 MODEL PERFORMANCE:")
print(f"   • Best Model: {best_model['model_name']}")
# Only call the score perfect when it actually displays as 1.0000; the
# remark used to be printed unconditionally, whatever the AUC was.
auc_note = " (Perfect Score!)" if best_model['auc_roc'] >= 0.99995 else ""
print(f"   • AUC-ROC: {best_model['auc_roc']:.4f}{auc_note}")
print(f"   • Accuracy: {best_model['accuracy']*100:.2f}%")
print(f"   • F1-Score: {best_model['f1_score']:.4f}")

print("\n📊 TOP PREDICTIVE FEATURES:")
# Rank by position in the sorted result rather than by the DataFrame index.
for rank, (feature, importance) in enumerate(
    zip(top_features['feature'], top_features['importance']), start=1
):
    print(f"   {rank}. {feature}: {importance:.4f} ({importance*100:.1f}%)")

print("\n💡 BUSINESS INSIGHTS:")
print(f"   • Overall Churn Rate: {churn_rate*100:.1f}% - Very High!")
# NOTE(review): the next bullet and the session/spending bullet below are
# static text, not derived from the tables loaded in this notebook - confirm
# they still match the data.
print("   • High-value customers have 0% churn - Focus on creating value")
# Name the strongest predictor from the loaded ranking instead of hard-coding it.
if not top_features.empty:
    print(f"   • {top_features['feature'].iloc[0]} is the strongest churn predictor")
print("   • Session efficiency and spending patterns are critical")

print("\n🎯 IMMEDIATE ACTIONS:")
print(f"   1. Deploy {best_model['model_name']} for real-time churn prediction")
print("   2. Set up alerts for users with >70% churn probability")
print("   3. Create targeted campaigns for high-risk users")
print("   4. Implement cart abandonment recovery workflows")

# NOTE(review): the feature names in this list are hard-coded; verify them
# against the top features printed above.
print("\n📈 STRATEGIC INITIATIVES:")
print("   1. Increase purchase_rate through personalized recommendations")
print("   2. Improve session_efficiency with better UX")
print("   3. Encourage higher spending to create high-value sessions")
print("   4. A/B test interventions on predicted churners")

print("=" * 70)