In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from datetime import datetime, timedelta
import warnings
# Notebook-wide setup: silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

# Plot defaults: seaborn grid style and a wide default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (16, 8)})

print("Libraries loaded successfully ‚úì")

## 1. Load Data and Models

In [None]:
# Read the operator performance table that every later cell works from.
performance_path = 'warehouse/data/operator_performance.parquet'
df = pd.read_parquet(performance_path)

# Quick orientation: table size, covered period and number of distinct operators.
first_day, last_day = df['date'].min(), df['date'].max()
print(f"Operator performance data loaded: {df.shape}")
print(f"Date range: {first_day} to {last_day}")
print(f"Operators: {df['operator'].nunique()}")

# Number of distinct operators in each tier.
print(f"\nTier distribution:")
operators_per_tier = df.groupby('operator_tier')['operator'].nunique()
print(operators_per_tier)

# Preview of the first rows (rendered as the cell output).
df.head()

In [None]:
# Load tier-specific models
# Maps the key used in each model file name to the tier label under which the
# model is stored in `models`. The dict order is the load order.
tier_name_mapping = {
    'top_tier': 'Top Tier',
    'largeplus': 'Large+',
    'large': 'Large',
    'mediumplus': 'Medium+',
    'medium': 'Medium'
}
# Derived from the mapping so the two can never get out of sync.
main_tiers = list(tier_name_mapping)

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code.
# Only load model artifacts from a trusted location.
models = {}
for tier_key in main_tiers:
    tier_label = tier_name_mapping[tier_key]
    model_path = f'warehouse/data/models/stake_model_{tier_key}.pkl'
    try:
        models[tier_label] = joblib.load(model_path)
    except Exception as e:
        # Best effort: a tier whose model cannot be loaded is reported and skipped.
        print(f"‚úó Failed to load model for {tier_key}: {str(e)}")
    else:
        print(f"‚úì Loaded model for {tier_label}")

print(f"\nTotal models loaded: {len(models)}")

# Without any model the prediction step has nothing to work with; stop here with
# a clear message instead of failing later with an unrelated-looking error.
if not models:
    raise RuntimeError(
        "No tier models could be loaded from 'warehouse/data/models/'; "
        "tier predictions cannot be generated."
    )

## 2. Calculate Operator Historical Shares

Each operator's historical share is its all-time total stake divided by its tier's all-time total stake.

In [None]:
# All-time stake of every operator, kept separate per tier.
operator_totals = (
    df.groupby(['operator', 'operator_tier'])['total_stake']
      .sum()
      .rename('operator_total_stake')
      .reset_index()
)

# All-time stake of every tier.
tier_totals = (
    df.groupby('operator_tier')['total_stake']
      .sum()
      .rename('tier_total_stake')
      .reset_index()
)

# Put the tier total next to each operator total; the share is their ratio.
operator_shares = operator_totals.merge(tier_totals, on='operator_tier')
operator_shares['historical_share'] = operator_shares['operator_total_stake'].div(
    operator_shares['tier_total_stake']
)

share_columns = ['operator', 'operator_tier', 'operator_total_stake', 'historical_share']
print("Operator Historical Shares (Top 10 by total stake):")
print("="*80)
display(operator_shares.nlargest(10, 'operator_total_stake').loc[:, share_columns])

print(f"\n‚úì Calculated historical shares for {len(operator_shares)} operators")

## 3. Prepare Features for Prediction

In [None]:
def create_ts_features(tier_data):
    """
    Build lagged, rolling and calendar features for one daily series.

    The input is copied, ordered by ``date`` and re-indexed from zero, so the
    caller's frame is never modified. Per the original note these features are
    meant to match the ones used when the models were trained (the training
    code is not visible here).

    Parameters
    ----------
    tier_data : pandas.DataFrame
        Must contain ``date``, ``total_stake`` and ``total_bets`` columns.

    Returns
    -------
    pandas.DataFrame
        The ordered copy with these columns added: ``stake_lag1``,
        ``stake_lag7``, ``stake_lag14``, ``stake_ma7``, ``stake_std7``,
        ``bets_current``, ``bets_lag1``, ``bets_ma7``, ``day_of_week``,
        ``is_weekend`` and ``month``. Lag and rolling columns contain NaN
        where there is not enough history.
    """
    ordered = tier_data.copy().sort_values('date').reset_index(drop=True)

    stake = ordered['total_stake']
    bets = ordered['total_bets']

    # Rolling statistics are computed on the series shifted by one row, so a
    # row's own value never enters its window.
    stake_prev = stake.shift(1)
    bets_prev = bets.shift(1)

    # Parse the dates once and reuse the result for all calendar features.
    calendar = pd.to_datetime(ordered['date']).dt
    weekday = calendar.dayofweek

    return ordered.assign(
        # Autoregressive features
        stake_lag1=stake_prev,
        stake_lag7=stake.shift(7),
        stake_lag14=stake.shift(14),
        # Rolling statistics over up to 7 previous rows
        stake_ma7=stake_prev.rolling(window=7, min_periods=1).mean(),
        stake_std7=stake_prev.rolling(window=7, min_periods=1).std(),
        # Bet volume features
        bets_current=bets,
        bets_lag1=bets_prev,
        bets_ma7=bets_prev.rolling(window=7, min_periods=1).mean(),
        # Temporal features (dayofweek values 5 and 6 are flagged as weekend)
        day_of_week=weekday,
        is_weekend=(weekday >= 5).astype(int),
        month=calendar.month,
    )

# Collapse the data to one row per (date, tier) by summing the volume columns,
# then order it by tier and date.
volume_columns = ['total_stake', 'total_bets', 'total_payout', 'GGR']
tier_daily = (
    df.groupby(['date', 'operator_tier'], as_index=False)[volume_columns]
      .sum()
      .sort_values(['operator_tier', 'date'])
)

# One feature frame per tier, keyed by the tier label found in the data.
tier_features = {
    tier_label: create_ts_features(tier_daily[tier_daily['operator_tier'] == tier_label])
    for tier_label in tier_daily['operator_tier'].unique()
}

print("‚úì Features prepared for all tiers")

## 4. Generate Tier-Expected Predictions

In [None]:
# Generate predictions for each tier
tier_predictions = []

for tier, model_info in models.items():
    # A model may exist for a tier that has no feature frame (no rows in the
    # data, or a label that does not match operator_tier). Skip it instead of
    # aborting the whole run with a KeyError.
    if tier not in tier_features:
        print(f"Skipping {tier}: no feature data available for this tier")
        continue

    print(f"Generating predictions for {tier}...")

    # Keep only rows without missing values; this removes the first rows of the
    # series, where lag and rolling features are not yet defined.
    tier_data = tier_features[tier].dropna().copy()
    if tier_data.empty:
        # Nothing to score; an empty frame must not reach the scaler.
        print(f"Skipping {tier}: no complete feature rows")
        continue

    # Each loaded artifact is a mapping holding the estimator, the scaler
    # applied to its inputs, and the list of feature columns it expects.
    model = model_info['model']
    scaler = model_info['scaler']
    features = model_info['features']

    # Prepare features
    X = tier_data[features]
    X_scaled = scaler.transform(X)

    # Store the tier-level prediction next to the observed tier total.
    tier_data['tier_expected_stake'] = model.predict(X_scaled)
    tier_data['tier_actual_stake'] = tier_data['total_stake']

    tier_predictions.append(tier_data[['date', 'operator_tier', 'tier_actual_stake', 'tier_expected_stake']])

# pd.concat rejects an empty list with a generic message; explain the likely cause.
if not tier_predictions:
    raise ValueError(
        "No tier predictions were generated; check that the loaded model tier "
        "labels match the values in 'operator_tier'."
    )

# Combine all tier predictions
tier_preds_df = pd.concat(tier_predictions, ignore_index=True)

print(f"\n‚úì Generated predictions for {len(tier_preds_df)} tier-days")
print(f"\nSample predictions:")
display(tier_preds_df.head(10))

## 5. Calculate Operator Efficiency Scores

In [None]:
# Aggregate to operator-daily: one row per (date, operator, tier), summing stake,
# bets and GGR over all finer-grained rows (e.g. game categories).
operator_daily = (
    df.groupby(['date', 'operator', 'operator_tier'], as_index=False)
      .agg({'total_stake': 'sum', 'total_bets': 'sum', 'GGR': 'sum'})
)

# Attach the tier-level prediction for the same day. The inner join keeps only
# tier-days that actually received a prediction.
operator_daily = operator_daily.merge(tier_preds_df,
                                      on=['date', 'operator_tier'],
                                      how='inner')

# Attach the operator's historical share. Shares are computed per
# (operator, tier), so both keys are needed: joining on 'operator' alone would
# duplicate rows for any operator that appears in more than one tier.
operator_daily = operator_daily.merge(
    operator_shares[['operator', 'operator_tier', 'historical_share']],
    on=['operator', 'operator_tier'],
    how='left',
    validate='many_to_one',
)

# Operator-expected stake = tier prediction scaled by the operator's share.
operator_daily['operator_expected_stake'] = operator_daily['tier_expected_stake'] * operator_daily['historical_share']

# A ratio against a zero or negative expectation is meaningless (it would give
# inf or flip the sign), so such rows get NaN instead of a score.
expected_stake = operator_daily['operator_expected_stake']
positive_expected = expected_stake.where(expected_stake > 0)
n_undefined = int(positive_expected.isna().sum())
if n_undefined:
    print(f"Note: {n_undefined} operator-days have no positive expected stake; "
          f"their efficiency score is left as NaN")

# Efficiency score: actual stake relative to expected stake (1.0 = as expected).
operator_daily['efficiency_score'] = operator_daily['total_stake'] / positive_expected

# Absolute and percentage deviation from the expectation.
operator_daily['stake_deviation'] = operator_daily['total_stake'] - expected_stake
operator_daily['stake_deviation_pct'] = (operator_daily['stake_deviation'] / positive_expected) * 100

print(f"‚úì Calculated efficiency scores for {len(operator_daily)} operator-days")
print(f"\nSample efficiency scores:")
display(operator_daily[[
    'date', 'operator', 'operator_tier', 'total_stake',
    'operator_expected_stake', 'efficiency_score'
]].head(10))

## 6. Aggregate Efficiency Metrics per Operator

In [None]:
# Roll the daily table up to one row per (operator, tier). Each keyword below
# names an output column and gives the (source column, aggregation) behind it.
operator_efficiency = (
    operator_daily
    .groupby(['operator', 'operator_tier'], as_index=False)
    .agg(
        efficiency_mean=('efficiency_score', 'mean'),
        efficiency_median=('efficiency_score', 'median'),
        efficiency_std=('efficiency_score', 'std'),
        efficiency_min=('efficiency_score', 'min'),
        efficiency_max=('efficiency_score', 'max'),
        total_stake_sum=('total_stake', 'sum'),
        expected_stake_sum=('operator_expected_stake', 'sum'),
        stake_deviation_sum=('stake_deviation', 'sum'),
        total_ggr=('GGR', 'sum'),
        n_days=('date', 'count'),  # number of daily rows
    )
)

# Cumulative efficiency: total actual stake over total expected stake.
expected_total = operator_efficiency['expected_stake_sum']
operator_efficiency['overall_efficiency'] = operator_efficiency['total_stake_sum'].div(expected_total)

# Total deviation as a percentage of total expected stake.
operator_efficiency['deviation_pct'] = operator_efficiency['stake_deviation_sum'].div(expected_total).mul(100)

# Best overall efficiency first.
operator_efficiency = operator_efficiency.sort_values('overall_efficiency', ascending=False)

heavy_rule = "="*100
light_rule = "-" * 100
ranking_columns = [
    'operator', 'operator_tier', 'overall_efficiency', 'efficiency_mean',
    'deviation_pct', 'total_stake_sum', 'total_ggr'
]

print(heavy_rule)
print("OPERATOR EFFICIENCY RANKINGS")
print(heavy_rule)
print("\nTop 10 Most Efficient Operators (Overall Efficiency Score):")
print(light_rule)
display(operator_efficiency.head(10)[ranking_columns].round(3))

print("\n" + heavy_rule)
print("Bottom 10 Least Efficient Operators:")
print(light_rule)
display(operator_efficiency.tail(10)[ranking_columns].round(3))

## 7. Visualize Efficiency by Tier

In [None]:
# Plot efficiency distribution by tier
fig, axes = plt.subplots(2, 1, figsize=(16, 10))

# Preferred display order of the tiers.
tier_order = ['Top Tier', 'Large+', 'Large', 'Medium+', 'Medium', 'Small', 'Micro']

# Plot only tiers that actually have results, in the preferred order. A tier
# label missing from tier_order is appended at the end; forcing the fixed list
# as categories would turn it into NaN and silently drop it from both plots,
# while listed tiers without results would show up as empty slots.
present_tiers = operator_efficiency['operator_tier'].dropna().unique().tolist()
plot_order = [t for t in tier_order if t in present_tiers]
plot_order += [t for t in present_tiers if t not in tier_order]

operator_efficiency_sorted = operator_efficiency.copy()
operator_efficiency_sorted['operator_tier'] = pd.Categorical(
    operator_efficiency_sorted['operator_tier'],
    categories=plot_order,
    ordered=True
)
operator_efficiency_sorted = operator_efficiency_sorted.sort_values('operator_tier')

# Box plot of efficiency scores by tier; the dashed line marks "as expected".
sns.boxplot(data=operator_efficiency_sorted, x='operator_tier', y='overall_efficiency', ax=axes[0])
axes[0].axhline(y=1.0, color='red', linestyle='--', linewidth=2, label='Expected (1.0)')
axes[0].set_title('Operator Efficiency Distribution by Tier', fontsize=14, fontweight='bold')
axes[0].set_xlabel('Tier')
axes[0].set_ylabel('Overall Efficiency Score')
axes[0].legend()
axes[0].grid(True, alpha=0.3, axis='y')
axes[0].tick_params(axis='x', rotation=45)

# Scatter plot: Efficiency vs Total Stake, one series per tier so each gets its
# own legend entry.
for tier in plot_order:
    tier_data = operator_efficiency_sorted[operator_efficiency_sorted['operator_tier'] == tier]
    axes[1].scatter(tier_data['total_stake_sum'], tier_data['overall_efficiency'],
                   label=tier, s=100, alpha=0.6, edgecolors='black')

axes[1].axhline(y=1.0, color='red', linestyle='--', linewidth=2, label='Expected (1.0)')
axes[1].set_title('Efficiency vs Total Stake (Size vs Performance)', fontsize=14, fontweight='bold')
axes[1].set_xlabel('Total Stake (UGX)')
axes[1].set_ylabel('Overall Efficiency Score')
axes[1].set_xscale('log')  # total stake is shown on a logarithmic axis
axes[1].legend(title='Tier', bbox_to_anchor=(1.05, 1), loc='upper left')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\n‚úì Efficiency visualization complete")

## 8. Identify Outlier Operators (High/Low Efficiency)

In [None]:
def _tier_zscore(values):
    """Standardise one tier's efficiencies: (value - tier mean) / tier std."""
    return (values - values.mean()) / values.std()


# Z-score of each operator's overall efficiency relative to its own tier.
operator_efficiency['efficiency_zscore'] = (
    operator_efficiency
    .groupby('operator_tier')['overall_efficiency']
    .transform(_tier_zscore)
)

# Outliers are operators more than two standard deviations from their tier mean.
zscores = operator_efficiency['efficiency_zscore']
high_performers = operator_efficiency.loc[zscores > 2].copy()
low_performers = operator_efficiency.loc[zscores < -2].copy()

print("="*100)
print("OUTLIER OPERATORS (z-score > 2 or < -2 within tier)")
print("="*100)

outlier_columns = [
    'operator', 'operator_tier', 'overall_efficiency', 'efficiency_zscore',
    'deviation_pct', 'total_stake_sum', 'total_ggr'
]

# Each entry: (outlier frame, heading when it has rows, message when it is empty).
outlier_reports = [
    (
        high_performers,
        f"\nüî• HIGH PERFORMERS ({len(high_performers)} operators):",
        "\nüî• HIGH PERFORMERS: None (no operators with z-score > 2)",
    ),
    (
        low_performers,
        f"\n‚ö†Ô∏è  LOW PERFORMERS ({len(low_performers)} operators):",
        "\n‚ö†Ô∏è  LOW PERFORMERS: None (no operators with z-score < -2)",
    ),
]

for outliers, found_heading, none_message in outlier_reports:
    if outliers.empty:
        print(none_message)
        continue
    print(found_heading)
    print("-" * 100)
    display(outliers[outlier_columns].round(3))

print("\n" + "="*100)

## 9. Time Series of Efficiency (Recent Trend)

In [None]:
# The five operators with the largest total stake are the ones whose trend is shown.
top_operators = operator_efficiency.nlargest(5, 'total_stake_sum')['operator'].tolist()

# Daily rows of those operators, in date order.
is_top_operator = operator_daily['operator'].isin(top_operators)
top_ops_daily = operator_daily[is_top_operator].copy().sort_values('date')

# One line per operator, drawn in ranking order.
fig, ax = plt.subplots(figsize=(18, 6))
line_style = dict(linewidth=2, alpha=0.7, marker='o', markersize=2)

for operator_name in top_operators:
    operator_rows = top_ops_daily.loc[top_ops_daily['operator'] == operator_name]
    ax.plot(operator_rows['date'], operator_rows['efficiency_score'],
            label=operator_name, **line_style)

# Dashed reference line at the "as expected" level.
ax.axhline(y=1.0, color='red', linestyle='--', linewidth=2, label='Expected (1.0)')
ax.set_title('Efficiency Score Trend - Top 5 Operators by Stake', fontsize=14, fontweight='bold')
ax.set_xlabel('Date')
ax.set_ylabel('Efficiency Score')
ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True, alpha=0.3)
ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

print("\n‚úì Efficiency trends visualized for top 5 operators")

## 10. Save Results

In [None]:
# Write the three result tables to parquet (without the row index), in this
# order: operator efficiency summary, daily efficiency data, tier predictions.
result_tables = [
    (operator_efficiency, 'warehouse/data/operator_efficiency.parquet'),
    (operator_daily, 'warehouse/data/operator_efficiency_daily.parquet'),
    (tier_preds_df, 'warehouse/data/tier_predictions.parquet'),
]

for table, target_path in result_tables:
    table.to_parquet(target_path, index=False)
    print(f"‚úì Saved: {target_path}")

banner = "="*100
print("\n" + banner)
print("ALL RESULTS SAVED")
print(banner)

## 11. Summary Report

In [None]:
# Text summary of the analysis; everything below only reads the result tables.
heavy_rule = "="*100
light_rule = "-" * 100
overall = operator_efficiency['overall_efficiency']
n_operators = len(operator_efficiency)

print("\n" + heavy_rule)
print("OPERATOR EFFICIENCY ANALYSIS - SUMMARY")
print(heavy_rule)

print("\nüìä KEY STATISTICS")
print(light_rule)
print(f"Total operators analyzed: {n_operators}")
print(f"Total operator-days: {len(operator_daily):,}")
print(f"Date range: {operator_daily['date'].min()} to {operator_daily['date'].max()}")

print("\nüìà EFFICIENCY DISTRIBUTION")
print(light_rule)
print(f"Mean efficiency score: {overall.mean():.3f}")
print(f"Median efficiency score: {overall.median():.3f}")
print(f"Std deviation: {overall.std():.3f}")
# The operator name is looked up through the index label of the extreme value.
print(f"Min efficiency: {overall.min():.3f} ({operator_efficiency.loc[overall.idxmin(), 'operator']})")
print(f"Max efficiency: {overall.max():.3f} ({operator_efficiency.loc[overall.idxmax(), 'operator']})")

print("\nüéØ EFFICIENCY CATEGORIES")
print(light_rule)
# Buckets: >1.2, (1.0, 1.2], [0.8, 1.0], <0.8
highly_efficient = operator_efficiency[overall > 1.2]
efficient = operator_efficiency[overall.gt(1.0) & overall.le(1.2)]
average = operator_efficiency[overall.between(0.8, 1.0)]
inefficient = operator_efficiency[overall < 0.8]

efficiency_buckets = [
    ("Highly Efficient (>1.2)", highly_efficient),
    ("Efficient (1.0-1.2)", efficient),
    ("Average (0.8-1.0)", average),
    ("Inefficient (<0.8)", inefficient),
]
for bucket_label, bucket in efficiency_buckets:
    print(f"{bucket_label}: {len(bucket)} operators ({len(bucket)/n_operators*100:.1f}%)")

# operator_efficiency is read from the top (head) and bottom (tail) as ranked.
ranked_sections = [
    ("\nüèÜ TOP 5 MOST EFFICIENT OPERATORS", operator_efficiency.head(5)),
    ("\n‚ö†Ô∏è  BOTTOM 5 LEAST EFFICIENT OPERATORS", operator_efficiency.tail(5)),
]
for section_title, ranked in ranked_sections:
    print(section_title)
    print(light_rule)
    for _, row in ranked.iterrows():
        print(f"{row['operator']:8s} ({row['operator_tier']:10s}): {row['overall_efficiency']:.3f} "
              f"({row['deviation_pct']:+.1f}% vs expected, GGR: UGX {row['total_ggr']:,.0f})")

print("\n" + heavy_rule)
print("üí° REGULATORY INSIGHTS")
print(heavy_rule)

# Static commentary: a heading followed by its bullet lines.
insights = [
    ("\n1. EFFICIENCY vs TIER DESIGN VALIDATED:", [
        "   ‚Ä¢ Tiers based on operational scale (movement_wager_amt)",
        "   ‚Ä¢ Efficiency measures performance within scale tier",
        "   ‚Ä¢ Successfully separates 'how big' from 'how well performing'",
    ]),
    ("\n2. FAIR PEER COMPARISON ENABLED:", [
        "   ‚Ä¢ Operators compared to tier-expected performance",
        "   ‚Ä¢ Small efficient operators identified (punching above weight)",
        "   ‚Ä¢ Large inefficient operators identified (underperforming)",
    ]),
    ("\n3. ACTIONABLE FOR REGULATION:", [
        "   ‚Ä¢ Efficiency score > 1.2: Investigate for best practices",
        "   ‚Ä¢ Efficiency score < 0.8: Investigate for operational issues",
        "   ‚Ä¢ Sudden efficiency drops: Potential fraud/technical issues",
        "   ‚Ä¢ Consistent high efficiency: Reward/certify operators",
    ]),
]
for insight_heading, bullet_lines in insights:
    print(insight_heading)
    for bullet_line in bullet_lines:
        print(bullet_line)

print("\n" + heavy_rule)
print("‚úÖ OPERATOR EFFICIENCY ANALYSIS COMPLETE")
print(heavy_rule)