# Machine Learning Model Training

This notebook trains and evaluates machine learning models for customer churn prediction.

## Objectives
1. Load features from Snowflake
2. Prepare data for modeling
3. Train multiple models
4. Compare model performance
5. Analyze feature importance
6. Save best model and predictions to Snowflake

In [None]:
# Import libraries
import sys
sys.path.append('/home/ubuntu/snowflake_ds_project')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

from sklearn.metrics import roc_curve, auc, confusion_matrix

from src.snowflake_connector import SnowflakeConnector
from src.model_training import ChurnPredictor
from config import config

# Set visualization style
# Apply seaborn's 'whitegrid' theme and make 12x6 the default figure size for
# any plot in this notebook that does not pass its own figsize.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

# Reached only if every import above succeeded.
print("Libraries imported successfully!")

## 1. Connect to Snowflake

In [None]:
# Validate configuration
# Run the config object's own validation before a connection is attempted.
# NOTE(review): presumably raises on missing/invalid settings - confirm in config.validate().
config.validate()

# Create connector
# Build the connector from the parameters supplied by config and open the session.
# `connector` is reused by every later cell and is closed in the cleanup cell.
connector = SnowflakeConnector(config.get_connection_params())
connector.connect()

print("Connected to Snowflake successfully!")

## 2. Prepare Data

In [None]:
# Create predictor
# This instance prepares the data; its feature_names and scaler are copied onto
# the per-model predictors in the training cell.
predictor = ChurnPredictor(connector)

# Prepare data
# Returns train/test features and targets. test_size=0.2 presumably requests a
# 20% hold-out (usual scikit-learn meaning - confirm in prepare_data).
# NOTE(review): balance_data=True presumably resamples the classes - confirm it
# is applied to the training split only; otherwise the test metrics below are
# computed on rebalanced data and will not reflect the real churn rate.
X_train, X_test, y_train, y_test = predictor.prepare_data(
    test_size=0.2,
    balance_data=True
)

# Shapes are printed as a quick sanity check of the split.
print(f"Training set shape: {X_train.shape}")
print(f"Test set shape: {X_test.shape}")

## 3. Train Multiple Models

In [None]:
# Results keyed by model type; each entry holds the fitted predictor and its metrics
model_results = {}

# Candidate algorithms, trained one after another on the same train/test split
model_types = ['logistic', 'random_forest', 'gradient_boosting', 'xgboost']

# Separator line reused for the per-model banner
banner = '=' * 60

for model_type in model_types:
    print(f"\n{banner}")
    print(f"Training {model_type.upper()} model")
    print(banner)

    # Fresh predictor per model, sharing the feature names and scaler of the
    # predictor that prepared the data
    temp_predictor = ChurnPredictor(connector)
    temp_predictor.feature_names = predictor.feature_names
    temp_predictor.scaler = predictor.scaler

    # Fit on the training split with hyperparameter tuning switched off
    temp_predictor.train_model(
        model_type=model_type,
        X_train=X_train,
        y_train=y_train,
        tune_hyperparameters=False,
    )

    # Score on the held-out test split
    metrics = temp_predictor.evaluate_model(X_test, y_test)

    # Keep both the predictor and its metrics for the comparison cells
    model_results[model_type] = dict(predictor=temp_predictor, metrics=metrics)

print("\nAll models trained successfully!")

## 4. Compare Model Performance

In [None]:
# Collect each model's metrics, then lay them out with one row per model
metrics_by_model = {
    name: entry['metrics'] for name, entry in model_results.items()
}
comparison_df = pd.DataFrame(metrics_by_model).transpose().round(4)

print("\nModel Performance Comparison:")
print(comparison_df)

In [None]:
# Visualize model comparison: every metric on the left, key metrics on the right
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Focus on key metrics
key_metrics = ['accuracy', 'f1_score', 'roc_auc']

# (axis, column subset or None for every metric, title)
panels = [
    (axes[0], None, 'Model Performance Comparison - All Metrics'),
    (axes[1], key_metrics, 'Model Performance Comparison - Key Metrics'),
]

# Both panels share the same bar-chart styling
for panel_ax, metric_columns, panel_title in panels:
    panel_data = comparison_df if metric_columns is None else comparison_df[metric_columns]
    panel_data.plot(kind='bar', ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Model')
    panel_ax.set_ylabel('Score')
    panel_ax.legend(loc='lower right')
    panel_ax.set_xticklabels(panel_ax.get_xticklabels(), rotation=45, ha='right')
    panel_ax.set_ylim([0, 1])
    panel_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# ROC curves for all models, drawn on one shared set of axes
roc_ax = plt.figure(figsize=(10, 8)).add_subplot()

for model_name, results in model_results.items():
    model = results['predictor'].model
    # Column 1 of predict_proba is used as the positive-class score
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    roc_auc = auc(fpr, tpr)

    curve_label = f'{model_name} (AUC = {roc_auc:.4f})'
    roc_ax.plot(fpr, tpr, label=curve_label, linewidth=2)

# Diagonal reference line for a classifier with no skill
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Classifier', linewidth=1)
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('ROC Curves - Model Comparison')
roc_ax.legend(loc='lower right')
roc_ax.grid(True, alpha=0.3)
plt.show()

## 5. Select Best Model and Analyze

In [None]:
# Rank the models by ROC AUC and keep the predictor of the top scorer
roc_auc_by_model = comparison_df['roc_auc']
best_model_name = roc_auc_by_model.idxmax()
best_predictor = model_results[best_model_name]['predictor']

print(f"Best Model: {best_model_name.upper()}")
print(f"ROC AUC: {roc_auc_by_model.loc[best_model_name]:.4f}")

In [None]:
# Confusion matrix for best model
y_pred = best_predictor.model.predict(X_test)

# Pin the label order so the matrix is always 2x2 with row/column 0 = Active
# and 1 = Churned. Without `labels`, scikit-learn infers the classes from the
# data, so a test split or prediction set containing a single class would give
# a 1x1 matrix: the tick labels would be wrong and the lookups below would
# raise IndexError.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Active', 'Churned'],
            yticklabels=['Active', 'Churned'])
plt.title(f'Confusion Matrix - {best_model_name.upper()}')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# Rows are actual classes, columns are predicted classes; flattening row by
# row yields TN, FP, FN, TP in that order.
tn, fp, fn, tp = cm.ravel()

print(f"\nTrue Negatives: {tn}")
print(f"False Positives: {fp}")
print(f"False Negatives: {fn}")
print(f"True Positives: {tp}")

## 6. Feature Importance Analysis

In [None]:
# Ask the best predictor for its feature importances (None when unavailable)
feature_importance_df = best_predictor.get_feature_importance()

if feature_importance_df is None:
    print("Feature importance not available for this model")
else:
    print("\nFeature Importance:")
    print(feature_importance_df)

    # Horizontal bar chart, one bar per feature
    importance_ax = plt.figure(figsize=(10, 8)).add_subplot()
    importance_ax.barh(
        feature_importance_df['feature'],
        feature_importance_df['importance'],
    )
    importance_ax.set_xlabel('Importance')
    importance_ax.set_ylabel('Feature')
    importance_ax.set_title(f'Feature Importance - {best_model_name.upper()}')
    # Flip the y-axis so the first row of the DataFrame is drawn at the top
    importance_ax.invert_yaxis()
    plt.tight_layout()
    plt.show()

## 7. Prediction Distribution Analysis

In [None]:
# Get prediction probabilities and hard labels from the best model.
# Both are computed in this cell so it does not depend on a `y_pred` left over
# from another cell, which would be missing if that cell was skipped and stale
# if the best model was re-selected without re-running it.
y_pred_proba = best_predictor.model.predict_proba(X_test)[:, 1]
y_pred = best_predictor.model.predict(X_test)

# Create DataFrame for analysis: one row per test sample
pred_df = pd.DataFrame({
    'actual': y_test,
    'predicted_proba': y_pred_proba,
    'predicted': y_pred
})

# Plot prediction probability distribution
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Distribution by actual class (0 = Active, 1 = Churned), overlaid on one axis
for actual_class in [0, 1]:
    data = pred_df[pred_df['actual'] == actual_class]['predicted_proba']
    axes[0].hist(data, bins=50, alpha=0.6,
                 label=f'Actual: {"Churned" if actual_class == 1 else "Active"}')

axes[0].set_xlabel('Predicted Churn Probability')
axes[0].set_ylabel('Frequency')
axes[0].set_title('Prediction Probability Distribution by Actual Class')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Overall distribution, with a reference line drawn at probability 0.5
axes[1].hist(pred_df['predicted_proba'], bins=50, edgecolor='black', alpha=0.7)
axes[1].axvline(0.5, color='red', linestyle='--', label='Decision Threshold (0.5)')
axes[1].set_xlabel('Predicted Churn Probability')
axes[1].set_ylabel('Frequency')
axes[1].set_title('Overall Prediction Probability Distribution')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 8. Save Model and Predictions to Snowflake

In [None]:
# Save model metadata to Snowflake
# The model is registered under the name "<BEST_MODEL>_Churn_Predictor".
# The returned id is reused below to write predictions and to filter the
# verification queries.
model_id = best_predictor.save_model_to_snowflake(
    model_name=f'{best_model_name.upper()}_Churn_Predictor'
)

print(f"Model saved with ID: {model_id}")

In [None]:
# Generate and save predictions for all customers
# Predictions are written under the model_id returned when the model was saved.
# NOTE(review): presumably scores the full customer table, not just the test
# split - confirm in ChurnPredictor.predict_and_save.
best_predictor.predict_and_save(model_id)

print("Predictions saved to Snowflake successfully!")

## 9. Verify Saved Data in Snowflake

In [None]:
# Check model metadata
# Fetch the five most recently trained models from the metadata table to
# confirm that the model saved earlier in this notebook was registered.
model_query = """
SELECT *
FROM ECOMMERCE_DB.ML_MODELS.MODEL_METADATA
ORDER BY training_date DESC
LIMIT 5
"""

models_df = connector.execute_query_to_df(model_query)
print("\nRecent Models in Snowflake:")
# Bare expression on the last line so the notebook renders the DataFrame.
models_df

In [None]:
# Check predictions
# Summarise this model's stored predictions: customer count and mean churn
# probability for each predicted class.
# NOTE(review): model_id is interpolated straight into the SQL text. That is
# fine while it only ever comes from save_model_to_snowflake; switch to bind
# parameters if the connector supports them and the id can come from elsewhere.
predictions_query = f"""
SELECT 
    predicted_churn,
    COUNT(*) as customer_count,
    AVG(churn_probability) as avg_probability
FROM ECOMMERCE_DB.ML_MODELS.PREDICTIONS
WHERE model_id = '{model_id}'
GROUP BY predicted_churn
"""

pred_summary_df = connector.execute_query_to_df(predictions_query)
print("\nPrediction Summary:")
# Bare expression on the last line so the notebook renders the DataFrame.
pred_summary_df

In [None]:
# Sample predictions
# List the ten customers with the highest stored churn probability for this model.
# NOTE(review): model_id is interpolated straight into the SQL text; prefer
# bind parameters if the connector supports them.
sample_predictions_query = f"""
SELECT 
    customer_id,
    churn_probability,
    predicted_churn
FROM ECOMMERCE_DB.ML_MODELS.PREDICTIONS
WHERE model_id = '{model_id}'
ORDER BY churn_probability DESC
LIMIT 10
"""

sample_preds_df = connector.execute_query_to_df(sample_predictions_query)
print("\nTop 10 Customers at Risk of Churn:")
# Bare expression on the last line so the notebook renders the DataFrame.
sample_preds_df

## 10. Summary

In [None]:
# Closing report: best model, its metrics, top features and the saved model id
rule = "=" * 60

print(rule)
print("MODEL TRAINING SUMMARY")
print(rule)
print(f"\nBest Model: {best_model_name.upper()}")
print("\nPerformance Metrics:")
best_metrics = model_results[best_model_name]['metrics']
for metric, value in best_metrics.items():
    print(f"  {metric.capitalize()}: {value:.4f}")

# Feature importances are listed only when the best model exposes them
if feature_importance_df is not None:
    print("\nTop 5 Most Important Features:")
    top_features = feature_importance_df.head(5)
    for _, entry in top_features.iterrows():
        print(f"  {entry['feature']}: {entry['importance']:.4f}")

print(f"\nModel ID: {model_id}")
print("\nModel and predictions saved to Snowflake successfully!")
print(rule)

## 11. Cleanup

In [None]:
# Disconnect from Snowflake
# Closes the connection opened at the top of the notebook. Cells that use
# `connector` cannot be re-run after this without reconnecting first.
connector.disconnect()
print("Disconnected from Snowflake")