# Risk Predictor - Model Development and Training

This notebook focuses on building and training machine learning models for risk prediction.

## Objectives:
- Data preprocessing and feature engineering
- Model selection and comparison
- Hyperparameter tuning
- Model training and validation
- Performance evaluation

## 1. Import Libraries and Load Data

In [None]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Scikit-learn components
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Machine Learning Models
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Evaluation metrics
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Model persistence
import joblib
import pickle

# Suppress warnings
import warnings
warnings.filterwarnings('ignore')

print('Libraries imported successfully!')

In [None]:
# Load the processed dataset; when the file is absent, synthesise a sample one
data_path = '/home/jovyan/data/processed/'

try:
    df = pd.read_csv(f'{data_path}processed_risk_data.csv')
    print('Processed data loaded successfully!')
except FileNotFoundError:
    print('Creating sample dataset for model training...')
    np.random.seed(42)  # fixed seed so the synthetic data is identical on every run
    n_samples = 5000

    # Draw the feature columns one at a time. Every draw consumes the global
    # NumPy random stream, so the order of these statements fixes the values.
    sample_columns = {}
    sample_columns['age'] = np.random.randint(18, 80, n_samples)
    sample_columns['income'] = np.random.lognormal(10.5, 0.8, n_samples)
    sample_columns['credit_score'] = np.random.normal(650, 100, n_samples).clip(300, 850)
    sample_columns['employment_length'] = np.random.exponential(5, n_samples).clip(0, 40)
    sample_columns['loan_amount'] = np.random.lognormal(10, 0.7, n_samples)
    sample_columns['debt_to_income'] = np.random.beta(2, 5, n_samples) * 100
    sample_columns['previous_defaults'] = np.random.poisson(0.3, n_samples)
    sample_columns['education_level'] = np.random.choice(
        ['High School', 'Bachelor', 'Master', 'PhD'],
        n_samples,
        p=[0.4, 0.35, 0.2, 0.05],
    )
    df = pd.DataFrame(sample_columns)

    # Linear risk score: weight * feature, accumulated left to right in this
    # order, with Gaussian noise added last.
    risk_weights = [
        ('age', -0.01),
        ('income', -0.00001),
        ('credit_score', -0.005),
        ('employment_length', -0.02),
        ('loan_amount', 0.00002),
        ('debt_to_income', 0.02),
        ('previous_defaults', 0.5),
    ]
    risk_score = None
    for column, weight in risk_weights:
        term = weight * df[column]
        risk_score = term if risk_score is None else risk_score + term
    risk_score = risk_score + np.random.normal(0, 0.5, n_samples)

    # Bucket the score into three ordered classes.
    # NOTE(review): with these weights the score looks centred well below -0.5,
    # so most rows probably fall in 'Low' - confirm the class balance is intended.
    score_edges = [-np.inf, -0.5, 0.5, np.inf]
    df['risk_level'] = pd.cut(risk_score, bins=score_edges, labels=['Low', 'Medium', 'High'])

print(f'Dataset shape: {df.shape}')
df.head()

## 2. Data Preprocessing

In [None]:
# Separate features and target
X = df.drop('risk_level', axis=1)
y = df['risk_level']

# Identify column types. Every feature must end up in exactly one of the two
# lists: the ColumnTransformer built from them drops unlisted columns by default
# (remainder='drop'). Selecting only `object` dtype would leave out columns typed
# as `category`, `bool` or a pandas string dtype, so those features would be
# silently discarded. Instead, treat every non-numeric column as categorical.
numerical_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = [col for col in X.columns if col not in numerical_features]

print(f'Numerical features: {numerical_features}')
print(f'Categorical features: {categorical_features}')
print(f'Target distribution:\n{y.value_counts()}')

In [None]:
# Create preprocessing pipeline: standardise numeric columns and one-hot encode
# categorical ones (dropping the first level of each to avoid redundant columns).
#
# The encoder is left at its default (sparse) output and the ColumnTransformer
# is told to always densify via sparse_threshold=0. This replaces the former
# OneHotEncoder(sparse=False): that keyword was renamed to `sparse_output` in
# scikit-learn 1.2 and removed in 1.4, so it raises TypeError on current
# releases, while `sparse_output` does not exist on older ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(drop='first'), categorical_features)
    ],
    sparse_threshold=0
)

# Encode target variable as integer codes (classes_ holds the sorted labels)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print(f'Target encoding: {dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))}')

In [None]:
# Hold out 20% of the rows for testing; stratifying keeps the class
# proportions of the encoded target the same in both parts.
split_arrays = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)
X_train, X_test, y_train, y_test = split_arrays

# Report the split shapes first, then the per-class counts (index = encoded label)
for split_label, split_features in (('Training', X_train), ('Test', X_test)):
    print(f'{split_label} set size: {split_features.shape}')
for split_label, split_targets in (('Training', y_train), ('Test', y_test)):
    print(f'{split_label} target distribution: {np.bincount(split_targets)}')

## 3. Model Selection and Comparison

In [None]:
# Candidate classifiers, keyed by the display name used in reports and plots.
# Insertion order is the order in which they are evaluated and plotted.
models = {}
models['Logistic Regression'] = LogisticRegression(random_state=42, max_iter=1000)
models['Random Forest'] = RandomForestClassifier(random_state=42, n_estimators=100)
models['Gradient Boosting'] = GradientBoostingClassifier(random_state=42)
models['SVM'] = SVC(random_state=42, probability=True)  # probability=True enables predict_proba
models['KNN'] = KNeighborsClassifier()

# Filled by the cross-validation loop: one entry per model name
model_results = {}

print('Comparing models using cross-validation...')
print('=' * 50)

In [None]:
# Score every candidate with 5-fold cross-validated accuracy on the training split
for model_name, estimator in models.items():
    # Preprocessing is part of the pipeline, so it is fitted inside each fold
    # rather than on the full training set.
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', estimator),
    ])

    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')
    mean_accuracy = cv_scores.mean()
    accuracy_std = cv_scores.std()

    # Keep the (unfitted) pipeline so the winner can be tuned later
    model_results[model_name] = dict(
        cv_mean=mean_accuracy,
        cv_std=accuracy_std,
        pipeline=pipeline,
    )

    print(f'{model_name}:')
    print(f'  CV Accuracy: {mean_accuracy:.4f} (+/- {accuracy_std * 2:.4f})')
    print()

In [None]:
# Gather the cross-validation summary in evaluation order
model_names = list(model_results)
cv_means = [result['cv_mean'] for result in model_results.values()]
cv_stds = [result['cv_std'] for result in model_results.values()]

# Bar chart of mean CV accuracy with one standard deviation as error bars
fig, ax = plt.subplots(figsize=(10, 6))
bars = ax.bar(model_names, cv_means, yerr=cv_stds, capsize=5, alpha=0.7)
ax.set_title('Model Comparison - Cross Validation Accuracy')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Models')
plt.xticks(rotation=45)
ax.grid(axis='y', alpha=0.3)

# Annotate each bar with its mean, placed just above the bar top
for bar, mean in zip(bars, cv_means):
    label_x = bar.get_x() + bar.get_width() / 2
    label_y = bar.get_height() + 0.001
    ax.text(label_x, label_y, f'{mean:.3f}', ha='center', va='bottom')

fig.tight_layout()
plt.show()

# The winner is the model with the highest mean CV accuracy
best_model_name = max(model_results, key=lambda candidate: model_results[candidate]['cv_mean'])
print(f'Best performing model: {best_model_name}')

## 4. Hyperparameter Tuning

In [None]:
# Pick up the pipeline of the winning model for tuning
best_pipeline = model_results[best_model_name]['pipeline']

# Search space per model name. Keys carry the `classifier__` prefix because the
# estimator is the 'classifier' step of the pipeline.
# NOTE(review): the Logistic Regression grid includes the 'liblinear' solver;
# newer scikit-learn releases appear to deprecate it for multiclass targets -
# confirm against the installed version.
param_grids = {
    'Random Forest': {
        'classifier__n_estimators': [100, 200, 300],
        'classifier__max_depth': [10, 20, None],
        'classifier__min_samples_split': [2, 5, 10],
    },
    'Gradient Boosting': {
        'classifier__n_estimators': [100, 200],
        'classifier__learning_rate': [0.05, 0.1, 0.15],
        'classifier__max_depth': [3, 5, 7],
    },
    'Logistic Regression': {
        'classifier__C': [0.1, 1, 10, 100],
        'classifier__penalty': ['l1', 'l2'],
        'classifier__solver': ['liblinear', 'saga'],
    },
}

# Models without an entry (SVM, KNN) get an empty grid, i.e. no tuning
param_grid = param_grids.get(best_model_name, {})

print(f'Tuning hyperparameters for {best_model_name}...')

In [None]:
# Tune with an exhaustive grid search when a grid exists; otherwise simply fit
# the winning pipeline with its default hyperparameters.
if not param_grid:
    print('Using default parameters for this model')
    final_model = best_pipeline
    final_model.fit(X_train, y_train)
else:
    grid_search = GridSearchCV(
        estimator=best_pipeline,
        param_grid=param_grid,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,  # use all available cores
    )
    grid_search.fit(X_train, y_train)

    print(f'Best parameters: {grid_search.best_params_}')
    print(f'Best CV score: {grid_search.best_score_:.4f}')

    # best_estimator_ is the pipeline refitted on all of X_train with the best parameters
    final_model = grid_search.best_estimator_

## 5. Model Evaluation

In [None]:
# Predict hard labels and class probabilities for the held-out test set
y_pred = final_model.predict(X_test)
y_pred_proba = final_model.predict_proba(X_test)

# Overall accuracy plus support-weighted precision, recall and F1
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

print('=== MODEL PERFORMANCE ===')
metric_rows = (
    ('Accuracy', accuracy),
    ('Precision', precision),
    ('Recall', recall),
    ('F1-Score', f1),
)
for metric_label, metric_value in metric_rows:
    print(f'{metric_label}: {metric_value:.4f}')

In [None]:
# Detailed classification report
target_names = label_encoder.classes_

# LabelEncoder maps classes_ to the codes 0..n_classes-1, in order. Passing the
# full label list keeps the report aligned with target_names even when a rare
# class is missing from both y_test and y_pred; without it scikit-learn infers
# the labels from the data and raises ValueError on the length mismatch.
class_labels = np.arange(len(target_names))

print('\nDetailed Classification Report:')
print(classification_report(y_test, y_pred, labels=class_labels, target_names=target_names))

In [None]:
# Confusion matrix
# Pass every encoded label explicitly (LabelEncoder uses 0..n_classes-1) so the
# matrix always has one row/column per class. Otherwise a class missing from
# both y_test and y_pred shrinks the matrix while the tick labels below still
# list all classes, mislabelling the heatmap.
cm = confusion_matrix(y_test, y_pred, labels=np.arange(len(target_names)))
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=target_names, yticklabels=target_names)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')  # columns are predicted classes
plt.ylabel('Actual')     # rows are true classes
plt.show()

## 6. Feature Importance Analysis

In [None]:
# Feature importance (only for classifiers exposing `feature_importances_`,
# i.e. the tree-based models)
classifier = final_model.named_steps['classifier']
if hasattr(classifier, 'feature_importances_'):
    # Rebuild the column names after preprocessing, in ColumnTransformer output
    # order: scaled numeric columns first, then the one-hot encoded columns.
    feature_names = list(numerical_features)
    if categorical_features:
        # Only query the encoder when it was given columns: with an empty column
        # list the ColumnTransformer never fits it, and get_feature_names_out
        # would raise NotFittedError.
        encoder = final_model.named_steps['preprocessor'].named_transformers_['cat']
        feature_names += list(encoder.get_feature_names_out(categorical_features))
    
    importances = classifier.feature_importances_
    
    # Create feature importance dataframe, most important first
    feature_imp_df = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    }).sort_values('importance', ascending=False)
    
    # Plot feature importance
    plt.figure(figsize=(10, 8))
    sns.barplot(data=feature_imp_df.head(15), y='feature', x='importance')
    plt.title('Top 15 Feature Importances')
    plt.xlabel('Importance')
    plt.tight_layout()
    plt.show()
    
    print('Top 10 Most Important Features:')
    print(feature_imp_df.head(10))
else:
    print('Feature importance not available for this model type')

## 7. Model Saving

In [None]:
# Persist the fitted pipeline, the label encoder and a JSON metadata summary
import json
import os

model_path = '/home/jovyan/data/models/'
os.makedirs(model_path, exist_ok=True)  # no-op when the directory already exists

# Output locations, built once and reused for saving and reporting
model_file = f'{model_path}risk_predictor_model.pkl'
encoder_file = f'{model_path}label_encoder.pkl'
metadata_file = f'{model_path}model_metadata.json'

# The pipeline bundles preprocessing and classifier; the encoder is needed to
# map predicted integer codes back to class names.
joblib.dump(final_model, model_file)
joblib.dump(label_encoder, encoder_file)

# Test-set metrics plus the raw input columns and class names
model_metadata = dict(
    model_type=best_model_name,
    accuracy=accuracy,
    precision=precision,
    recall=recall,
    f1_score=f1,
    feature_names=list(X.columns),
    target_classes=list(target_names),
)

with open(metadata_file, 'w') as metadata_handle:
    json.dump(model_metadata, metadata_handle, indent=2)

print('Model saved successfully!')
print(f'Model file: {model_file}')
print(f'Label encoder: {encoder_file}')
print(f'Metadata: {metadata_file}')

## 8. Model Summary and Next Steps

In [None]:
# Recap of the run: chosen model, test accuracy and dataset dimensions
summary_lines = [
    f'Best Model: {best_model_name}',
    f'Final Accuracy: {accuracy:.4f}',
    f'Training Samples: {len(X_train)}',
    f'Test Samples: {len(X_test)}',
    f'Number of Features: {X.shape[1]}',
    f'Number of Classes: {len(target_names)}',
]
print('=== MODEL DEVELOPMENT SUMMARY ===')
for summary_line in summary_lines:
    print(summary_line)

# Follow-up work, printed as a numbered list
next_steps = [
    'Deploy model to production API',
    'Set up model monitoring and logging',
    'Create model validation pipeline',
    'Implement A/B testing framework',
    'Schedule model retraining',
]
print('\n=== NEXT STEPS ===')
for step_number, step in enumerate(next_steps, start=1):
    print(f'{step_number}. {step}')