# Baseline Random Forest Classifier

This notebook builds and evaluates a baseline Random Forest classifier for soccer match prediction: Over/Under 2.5 goals (binary classification).

## Structure:
- **#0**: Setup and Data Loading
- **#1**: Baseline Model Training
- **#2**: Hyperparameter Tuning
- **#3**: Feature Importance Analysis
- **#4**: Learning Curves
- **#5**: Final Evaluation on Test Set

## #0: Setup and Data Loading

### #0.1: Import Required Libraries
Import all necessary libraries for model training, evaluation, and visualization.

In [None]:
import pickle
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import seaborn as sns
from optuna.samplers import TPESampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, confusion_matrix, classification_report,
                             roc_curve, auc)
from sklearn.model_selection import learning_curve, GridSearchCV
# Silence every warning category for the remainder of the session.
# NOTE(review): this blanket filter also hides library deprecation warnings —
# consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

# Set random seed for reproducibility
# This seeds only NumPy's legacy global RNG; the estimators and the Optuna
# sampler used later in the notebook are given their own explicit seeds.
np.random.seed(42)

### #0.2: Load Preprocessed Data
Load the baseline preprocessed data containing training, validation, and test sets.

In [None]:
# Read the dictionary of preprocessed splits produced by the preprocessing step.
# NOTE: pickle.load can execute arbitrary code; only load files this project created.
with open('./processed/baseline_preprocessed.pkl', 'rb') as file:
    data = pickle.load(file)

# Unpack each split into its feature matrix and its target vector.
X_train, y_train = data['X_train'], data['y_train']
X_val, y_val = data['X_val'], data['y_val']
X_test, y_test = data['X_test'], data['y_test']

In [None]:
# Report the dimensions of the features and targets for every split.
for split_label, split_X, split_y in (
    ("Training", X_train, y_train),
    ("Validation", X_val, y_val),
    ("Test", X_test, y_test),
):
    print(f"{split_label} set: X={split_X.shape}, y={split_y.shape}")

# Relative frequency of each target class in the training labels,
# listed in ascending label order (0 first, then 1).
print("\nClass distribution in training set:")
class_names = {0: 'Under 2.5 goals', 1: 'Over 2.5 goals'}
class_dist = pd.Series(y_train).value_counts(normalize=True).sort_index()
for label, share in zip(class_dist.index, class_dist.to_numpy()):
    print(f"{class_names[label]}: {share:.4f}")

## #1: Baseline Model Training

### #1.1: Train Baseline Random Forest
Train a reference Random Forest classifier on the training set using fixed, hand-picked hyperparameters (no tuning; note that these differ from scikit-learn's defaults).

In [None]:
# Fixed, hand-picked hyperparameters for the reference model. These are NOT
# scikit-learn's defaults: depth is capped and split/leaf sizes are raised.
baseline_params = dict(
    n_estimators=100,
    max_depth=10,
    min_samples_split=20,
    min_samples_leaf=10,
    random_state=42,  # fixed seed so the forest is reproducible
    n_jobs=-1,        # use all available cores
    verbose=1,
)
baseline_rf = RandomForestClassifier(**baseline_params)

print("Training baseline Random Forest...")
baseline_rf.fit(X_train, y_train)
print("Training completed!")

### #1.2: Evaluate Baseline Model on Validation Set
Generate predictions and calculate performance metrics on the validation set.

In [None]:
def evaluate_model(model, X, y, dataset_name="Dataset"):
    """
    Evaluate a fitted binary classifier, print a report, and return its metrics.

    Parameters
    ----------
    model : fitted estimator exposing ``predict`` and ``predict_proba``.
    X : feature matrix to score.
    y : true labels aligned with ``X``.
    dataset_name : str
        Label used in the header of the printed report.

    Returns
    -------
    dict
        ``accuracy``, ``precision``, ``recall`` and ``f1`` (the last three are
        weighted averages), ``roc_auc`` (``None`` when it cannot be computed),
        ``confusion_matrix``, ``y_pred`` and ``y_pred_proba``.
    """
    # Predictions
    y_pred = model.predict(X)
    y_pred_proba = model.predict_proba(X)

    # Calculate metrics
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred, average='weighted', zero_division=0)
    recall = recall_score(y, y_pred, average='weighted', zero_division=0)
    f1 = f1_score(y, y_pred, average='weighted', zero_division=0)

    # ROC-AUC for binary classification, using the probability of class 1.
    # Only the expected failure modes are tolerated: ValueError (e.g. ROC-AUC is
    # undefined for the given labels) and IndexError (predict_proba has no
    # second column). Anything else, including KeyboardInterrupt, propagates.
    try:
        roc_auc = roc_auc_score(y, y_pred_proba[:, 1])
    except (ValueError, IndexError):
        roc_auc = None

    print(f"\n{'='*60}")
    print(f"Performance Metrics - {dataset_name}")
    print(f"{'='*60}")
    print(f"Accuracy:  {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall:    {recall:.4f}")
    print(f"F1-Score:  {f1:.4f}")
    # Compare against None explicitly so a legitimate score of 0.0 is still shown.
    if roc_auc is not None:
        print(f"ROC-AUC:   {roc_auc:.4f}")

    # Confusion Matrix
    cm = confusion_matrix(y, y_pred)
    print(f"\nConfusion Matrix:")
    print(cm)

    # Classification Report
    print(f"\nClassification Report:")
    print(classification_report(y, y_pred, target_names=['Under 2.5', 'Over 2.5'], zero_division=0))

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'roc_auc': roc_auc,
        'confusion_matrix': cm,
        'y_pred': y_pred,
        'y_pred_proba': y_pred_proba
    }

# Evaluate on training set
baseline_train_results = evaluate_model(baseline_rf, X_train, y_train, "Baseline - Training Set")

In [None]:
# Score the baseline model on the validation split.
baseline_val_results = evaluate_model(
    baseline_rf, X_val, y_val, dataset_name="Baseline - Validation Set"
)

### #1.3: Visualize Confusion Matrix
Create a heatmap visualization of the confusion matrix for the validation set.

In [None]:
def plot_confusion_matrix(cm, title="Confusion Matrix"):
    """
    Draw a confusion matrix as an annotated heatmap.

    ``cm`` holds integer counts (rows are labelled as the true class, columns
    as the predicted class); ``title`` is shown above the plot.
    """
    class_labels = ['Under 2.5', 'Over 2.5']
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels, yticklabels=class_labels, ax=ax)
    ax.set_title(title)
    ax.set_ylabel('True Label')
    ax.set_xlabel('Predicted Label')
    fig.tight_layout()
    plt.show()

plot_confusion_matrix(
    baseline_val_results['confusion_matrix'],
    title="Baseline Model - Validation Set Confusion Matrix",
)

## #2: Hyperparameter Tuning

### #2.1: Bayesian Optimization with Optuna
Use Optuna to perform Bayesian optimization to find optimal hyperparameters.

In [None]:
def objective(trial):
    """
    Optuna objective: fit a Random Forest with the hyperparameters sampled for
    this trial on the training split and return its weighted F1 score on the
    validation split (the study maximizes this value).
    """
    # Hyperparameters sampled for this trial
    sampled = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 500),
        'max_depth': trial.suggest_int('max_depth', 5, 30),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
    }

    # Fixed settings shared by every trial: seeded forest, all cores
    candidate = RandomForestClassifier(random_state=42, n_jobs=-1, **sampled)
    candidate.fit(X_train, y_train)

    # Score the candidate on the validation split
    return f1_score(y_val, candidate.predict(X_val), average='weighted')

# Run a seeded TPE search that maximizes the objective's validation F1.
print("Starting Bayesian Optimization with Optuna...")
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=50, show_progress_bar=True)

# Report the winning trial and the hyperparameters it used.
winning_trial = study.best_trial
print("\nBest trial:")
print(f"  Value (F1-Score): {winning_trial.value:.4f}")
print("  Params: ")
for param_name, param_value in winning_trial.params.items():
    print(f"    {param_name}: {param_value}")

### #2.2: Visualize Optuna Optimization History
Plot the optimization history to understand how the search progressed.

In [None]:
# One row per trial; 'number' is the trial index and 'value' its objective score.
trials_df = study.trials_dataframe()
trial_numbers = trials_df['number']
trial_scores = trials_df['value']

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: score of every trial plus the running best.
ax1.plot(trial_numbers, trial_scores, 'b-', alpha=0.3, label='All trials')
ax1.plot(trial_numbers, trial_scores.cummax(), 'r-', linewidth=2, label='Best so far')
ax1.set(xlabel='Trial Number', ylabel='F1-Score', title='Optuna Optimization History')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Right panel: the ten highest-scoring trials.
top_trials = trials_df.nlargest(10, 'value')
bar_positions = range(len(top_trials))
ax2.barh(bar_positions, top_trials['value'])
ax2.set_yticks(bar_positions)
ax2.set_yticklabels([f"Trial {int(number)}" for number in top_trials['number']])
ax2.set(xlabel='F1-Score', title='Top 10 Trials')
ax2.grid(True, alpha=0.3, axis='x')

fig.tight_layout()
plt.show()

### #2.3: Grid Search Refinement
Perform a focused grid search around the best parameters from Optuna to fine-tune.

In [None]:
# Get best parameters from Optuna
best_params = study.best_trial.params


def _neighbourhood(center, step, lower, upper=None):
    """
    Return the sorted, de-duplicated candidates around ``center``:
    ``center - step`` (not below ``lower``), ``center`` itself, and
    ``center + step`` (not above ``upper`` when one is given).

    De-duplication matters when ``center`` sits on a bound: without it the
    clamped neighbour equals ``center`` and GridSearchCV would cross-validate
    the very same configuration more than once.
    """
    below = max(lower, center - step)
    above = center + step if upper is None else min(upper, center + step)
    return sorted({below, center, above})


# Define refined grid around best parameters
param_grid = {
    'n_estimators': _neighbourhood(best_params['n_estimators'], 50, lower=100),
    'max_depth': _neighbourhood(best_params['max_depth'], 2, lower=5, upper=30),
    'min_samples_split': _neighbourhood(best_params['min_samples_split'], 2, lower=2),
    'min_samples_leaf': _neighbourhood(best_params['min_samples_leaf'], 1, lower=1),
    # Categorical choices are kept at the values Optuna selected.
    'max_features': [best_params['max_features']],
    'bootstrap': [best_params['bootstrap']],
    # Single-value entries: they do not enlarge the grid, but they make the
    # seed and parallelism part of best_params_ for later reuse.
    'random_state': [42],
    'n_jobs': [-1]
}

print("Performing Grid Search refinement...")
# Product over every list (length-1 lists contribute a factor of 1); int() keeps
# the printed count an integer.
n_combinations = int(np.prod([len(values) for values in param_grid.values()]))
print(f"Grid search space size: {n_combinations} combinations")

# Combine train and validation for grid search (since we're using cv)
# NOTE: the refit best estimator therefore sees the validation rows during
# training, so its score on X_val is not a held-out estimate.
X_train_val = pd.concat([X_train, X_val])
y_train_val = pd.concat([pd.Series(y_train), pd.Series(y_val)]).values

grid_search = GridSearchCV(
    RandomForestClassifier(),
    param_grid,
    cv=5,
    scoring='f1_weighted',
    n_jobs=-1,
    verbose=2
)

grid_search.fit(X_train_val, y_train_val)

print(f"\nBest Grid Search F1-Score: {grid_search.best_score_:.4f}")
print(f"Best Grid Search Parameters:")
for key, value in grid_search.best_params_.items():
    print(f"  {key}: {value}")

### #2.4: Train Final Tuned Model
Retrieve the tuned Random Forest: the best configuration found by grid search, which `GridSearchCV` has already refit on the combined training and validation data.

In [None]:
# GridSearchCV exposes the winning configuration, already refit on the full
# data passed to grid_search.fit, as best_estimator_.
tuned_rf = grid_search.best_estimator_

print(
    "Tuned Random Forest model trained successfully!",
    f"Best parameters: {grid_search.best_params_}",
    sep="\n",
)

### #2.5: Compare Baseline vs Tuned Models
Evaluate and compare performance of baseline and tuned models on the validation set.

In [None]:
# Evaluate the tuned configuration on the validation set.
# tuned_rf itself was refit by GridSearchCV on train+validation, so scoring it
# on X_val would measure performance on rows it was trained on and overstate
# the gain over the baseline (which was fit on X_train only). For a
# like-for-like comparison, refit the same best hyperparameters on X_train
# alone and score that model instead. tuned_rf is left untouched for the test
# set evaluation.
# NOTE: the hyperparameters were still selected using validation data, so this
# score remains somewhat optimistic; the test set is the unbiased estimate.
tuned_rf_train_only = RandomForestClassifier(**grid_search.best_params_)
tuned_rf_train_only.fit(X_train, y_train)
tuned_val_results = evaluate_model(tuned_rf_train_only, X_val, y_val, "Tuned - Validation Set")

# Create comparison dataframe: display name -> key in the results dictionaries
metric_keys = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1-Score': 'f1',
    'ROC-AUC': 'roc_auc',
}
comparison_df = pd.DataFrame({
    'Metric': list(metric_keys),
    'Baseline': [baseline_val_results[key] for key in metric_keys.values()],
    'Tuned': [tuned_val_results[key] for key in metric_keys.values()],
})

# Absolute gain, and gain relative to the baseline score in percent
comparison_df['Improvement'] = comparison_df['Tuned'] - comparison_df['Baseline']
comparison_df['Improvement %'] = (comparison_df['Improvement'] / comparison_df['Baseline'] * 100).round(2)

print("\n" + "="*80)
print("Baseline vs Tuned Model Comparison (Validation Set)")
print("="*80)
print(comparison_df.to_string(index=False))
print("="*80)

In [None]:
# Side-by-side view of the validation comparison: raw scores and relative gain.
fig, axes = plt.subplots(1, 2, figsize=(15, 5))
score_ax, gain_ax = axes

metrics = comparison_df['Metric'].tolist()
x = np.arange(len(metrics))
width = 0.35

# Left: grouped bars, baseline shifted left and tuned shifted right of each tick.
for offset, column in ((-width / 2, 'Baseline'), (width / 2, 'Tuned')):
    score_ax.bar(x + offset, comparison_df[column], width, label=column, alpha=0.8)
score_ax.set_ylabel('Score')
score_ax.set_title('Baseline vs Tuned Model Performance')
score_ax.set_xticks(x)
score_ax.set_xticklabels(metrics, rotation=45, ha='right')
score_ax.legend()
score_ax.grid(True, alpha=0.3, axis='y')

# Right: percentage change per metric; green for gains, red otherwise.
colors = ['green' if pct > 0 else 'red' for pct in comparison_df['Improvement %']]
gain_ax.barh(metrics, comparison_df['Improvement %'], color=colors, alpha=0.7)
gain_ax.set_xlabel('Improvement (%)')
gain_ax.set_title('Performance Improvement')
gain_ax.axvline(x=0, color='black', linestyle='--', linewidth=0.8)
gain_ax.grid(True, alpha=0.3, axis='x')

fig.tight_layout()
plt.show()

## #3: Feature Importance Analysis

### #3.1: Calculate and Visualize Feature Importance
Analyze which features are most important for the tuned model's predictions.

In [None]:
# Pair every feature name with the tuned forest's importance score and rank
# them from most to least important.
feature_importance = (
    pd.DataFrame({
        'feature': X_train.columns,
        'importance': tuned_rf.feature_importances_,
    })
    .sort_values('importance', ascending=False)
)

print("Top 20 Most Important Features:")
print(feature_importance.head(20).to_string(index=False))

# Horizontal bar chart of the strongest features, most important at the top.
top_n = 30
top_features = feature_importance.head(top_n)
bar_slots = range(len(top_features))

fig, ax = plt.subplots(figsize=(12, 10))
ax.barh(bar_slots, top_features['importance'], alpha=0.8)
ax.set_yticks(bar_slots)
ax.set_yticklabels(top_features['feature'])
ax.set_xlabel('Importance')
ax.set_title(f'Top {top_n} Feature Importances - Tuned Random Forest')
ax.invert_yaxis()  # first (largest) bar on top
ax.grid(True, alpha=0.3, axis='x')
fig.tight_layout()
plt.show()

### #3.2: Feature Importance Distribution Analysis
Analyze the distribution of feature importances and cumulative importance.

In [None]:
# Running total of importance over the current row order of feature_importance.
importances = feature_importance['importance']
feature_importance['cumulative_importance'] = importances.cumsum()
cumulative = feature_importance['cumulative_importance']
mean_importance = importances.mean()

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Left: histogram of individual importances with the mean marked.
ax1.hist(importances, bins=50, alpha=0.7, edgecolor='black')
ax1.axvline(mean_importance, color='red', linestyle='--',
            linewidth=2, label=f'Mean: {mean_importance:.4f}')
ax1.set_xlabel('Feature Importance')
ax1.set_ylabel('Frequency')
ax1.set_title('Distribution of Feature Importances')
ax1.legend()
ax1.grid(True, alpha=0.3)

# Right: cumulative importance curve with the 95% and 90% reference lines.
ax2.plot(range(len(feature_importance)), cumulative, 'b-', linewidth=2)
for level, line_color, line_label in (
    (0.95, 'r', '95% threshold'),
    (0.90, 'orange', '90% threshold'),
):
    ax2.axhline(y=level, color=line_color, linestyle='--', label=line_label)
ax2.set_xlabel('Number of Features')
ax2.set_ylabel('Cumulative Importance')
ax2.set_title('Cumulative Feature Importance')
ax2.legend()
ax2.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

# Features needed per threshold: count the rows whose running total is still
# at or below the threshold, plus the one row that crosses it.
n_features_90 = (cumulative <= 0.90).sum() + 1
n_features_95 = (cumulative <= 0.95).sum() + 1

print("\nFeature Importance Summary:")
print(f"  Total features: {len(feature_importance)}")
print(f"  Features capturing 90% importance: {n_features_90} ({n_features_90/len(feature_importance)*100:.1f}%)")
print(f"  Features capturing 95% importance: {n_features_95} ({n_features_95/len(feature_importance)*100:.1f}%)")

## #4: Learning Curves

### #4.1: Generate Learning Curves for Both Models
Plot learning curves to analyze training vs validation performance and identify overfitting.

In [None]:
def plot_learning_curve(model, X, y, title, cv=5):
    """
    Compute cross-validated learning curves for ``model`` and plot them.

    The model is scored with weighted F1 at ten training-set fractions from
    10% to 100% of ``X``/``y`` using ``cv``-fold cross-validation. The plot
    shows the mean score across folds with a one-standard-deviation band.

    Returns
    -------
    tuple
        ``(train_scores_mean, val_scores_mean)``: per-size mean scores on the
        training folds and on the held-out folds.
    """
    # NOTE(review): random_state appears to matter only if shuffle is enabled,
    # which is left at its default here — confirm against the sklearn docs.
    train_sizes, train_scores, val_scores = learning_curve(
        model, X, y,
        cv=cv,
        scoring='f1_weighted',
        n_jobs=-1,
        train_sizes=np.linspace(0.1, 1.0, 10),
        random_state=42
    )

    # Aggregate over axis 1 so there is one mean/std per training size.
    train_scores_mean = train_scores.mean(axis=1)
    train_scores_std = train_scores.std(axis=1)
    val_scores_mean = val_scores.mean(axis=1)
    val_scores_std = val_scores.std(axis=1)

    curves = (
        (train_scores_mean, train_scores_std, "r", "Training score"),
        (val_scores_mean, val_scores_std, "g", "Cross-validation score"),
    )

    fig, ax = plt.subplots(figsize=(10, 6))
    # Shaded +/- one standard deviation bands first, then the mean lines.
    for mean, std, curve_color, _ in curves:
        ax.fill_between(train_sizes, mean - std, mean + std, alpha=0.1, color=curve_color)
    for mean, _, curve_color, curve_label in curves:
        ax.plot(train_sizes, mean, 'o-', color=curve_color, label=curve_label)

    ax.set_xlabel("Training Examples")
    ax.set_ylabel("F1-Score")
    ax.set_title(title)
    ax.legend(loc="best")
    ax.grid(True, alpha=0.3)
    fig.tight_layout()
    plt.show()

    return train_scores_mean, val_scores_mean

# Learning curves for the baseline configuration on the combined train+validation data
print("Generating learning curves for baseline model...")
baseline_train_lc, baseline_val_lc = plot_learning_curve(
    baseline_rf, X_train_val, y_train_val,
    title="Learning Curve - Baseline Random Forest",
)

In [None]:
# Learning curves for the tuned configuration on the combined train+validation data
print("Generating learning curves for tuned model...")
tuned_train_lc, tuned_val_lc = plot_learning_curve(
    tuned_rf, X_train_val, y_train_val,
    title="Learning Curve - Tuned Random Forest",
)

## #5: Final Evaluation on Test Set

### #5.1: Evaluate Baseline Model on Test Set
Test the baseline model on the held-out test set to get final performance metrics.

In [None]:
# Score the baseline model on the test split.
baseline_test_results = evaluate_model(
    baseline_rf, X_test, y_test, dataset_name="Baseline - Test Set"
)

### #5.2: Evaluate Tuned Model on Test Set
Test the tuned model on the held-out test set to get final performance metrics.

In [None]:
# Score the tuned model on the test split.
tuned_test_results = evaluate_model(
    tuned_rf, X_test, y_test, dataset_name="Tuned - Test Set"
)

### #5.3: Final Comparison - Test Set Performance
Compare baseline and tuned model performance on the test set with visualizations.

In [None]:
# Test-set comparison table: display name -> key in the results dictionaries
test_metric_keys = {
    'Accuracy': 'accuracy',
    'Precision': 'precision',
    'Recall': 'recall',
    'F1-Score': 'f1',
    'ROC-AUC': 'roc_auc',
}
test_comparison_df = pd.DataFrame({
    'Metric': list(test_metric_keys),
    'Baseline': [baseline_test_results[key] for key in test_metric_keys.values()],
    'Tuned': [tuned_test_results[key] for key in test_metric_keys.values()],
})

# Absolute gain, and gain relative to the baseline score in percent
test_comparison_df['Improvement'] = test_comparison_df['Tuned'] - test_comparison_df['Baseline']
test_comparison_df['Improvement %'] = (
    test_comparison_df['Improvement'] / test_comparison_df['Baseline'] * 100
).round(2)

banner = "=" * 80
print("\n" + banner)
print("FINAL COMPARISON: Baseline vs Tuned Model (Test Set)")
print(banner)
print(test_comparison_df.to_string(index=False))
print(banner)

In [None]:
# 2x2 dashboard for the test set: scores, relative gain, and both confusion matrices.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
score_ax, gain_ax = axes[0, 0], axes[0, 1]

metrics = test_comparison_df['Metric'].tolist()
x = np.arange(len(metrics))
width = 0.35

# 1. Grouped bars: baseline left of each tick, tuned right of it
for offset, column in ((-width / 2, 'Baseline'), (width / 2, 'Tuned')):
    score_ax.bar(x + offset, test_comparison_df[column], width, label=column, alpha=0.8)
score_ax.set_ylabel('Score')
score_ax.set_title('Test Set: Baseline vs Tuned Model Performance')
score_ax.set_xticks(x)
score_ax.set_xticklabels(metrics, rotation=45, ha='right')
score_ax.legend()
score_ax.grid(True, alpha=0.3, axis='y')

# 2. Percentage change per metric; green for gains, red otherwise
colors = ['green' if pct > 0 else 'red' for pct in test_comparison_df['Improvement %']]
gain_ax.barh(metrics, test_comparison_df['Improvement %'], color=colors, alpha=0.7)
gain_ax.set_xlabel('Improvement (%)')
gain_ax.set_title('Test Set: Performance Improvement')
gain_ax.axvline(x=0, color='black', linestyle='--', linewidth=0.8)
gain_ax.grid(True, alpha=0.3, axis='x')

# 3 & 4. Confusion matrices: baseline bottom-left, tuned bottom-right
class_labels = ['Under 2.5', 'Over 2.5']
for cm_ax, results, model_label in (
    (axes[1, 0], baseline_test_results, 'Baseline'),
    (axes[1, 1], tuned_test_results, 'Tuned'),
):
    sns.heatmap(results['confusion_matrix'], annot=True, fmt='d', cmap='Blues',
                xticklabels=class_labels, yticklabels=class_labels,
                ax=cm_ax)
    cm_ax.set_title(f'{model_label} Model - Test Set Confusion Matrix')
    cm_ax.set_ylabel('True Label')
    cm_ax.set_xlabel('Predicted Label')

fig.tight_layout()
plt.show()

### #5.4: Performance Summary Across All Sets
Compare model performance across training, validation, and test sets to check for overfitting.

In [None]:
# Collect the headline metrics of both models on every split into one table.
datasets = ['Training', 'Validation', 'Test']
baseline_runs = [baseline_train_results, baseline_val_results, baseline_test_results]
# The tuned model was fit on train+validation, so it has no training-only entry.
tuned_runs = [None, tuned_val_results, tuned_test_results]


def _collect(runs, key):
    """Return ``run[key]`` for each results dict, keeping ``None`` where a run is missing."""
    return [None if run is None else run[key] for run in runs]


summary_data = {
    'Dataset': datasets * 2,
    'Model': ['Baseline'] * 3 + ['Tuned'] * 3,
}
for column, key in (('Accuracy', 'accuracy'), ('F1-Score', 'f1'), ('ROC-AUC', 'roc_auc')):
    summary_data[column] = _collect(baseline_runs, key) + _collect(tuned_runs, key)

summary_df = pd.DataFrame(summary_data)

banner = "=" * 80
print("\n" + banner)
print("PERFORMANCE SUMMARY ACROSS ALL DATASETS")
print(banner)
print(summary_df.to_string(index=False))
print(banner)

# Line plots of F1 and accuracy across the splits, one panel per metric.
baseline_f1 = _collect(baseline_runs, 'f1')
tuned_f1 = _collect(tuned_runs, 'f1')
baseline_acc = _collect(baseline_runs, 'accuracy')
tuned_acc = _collect(tuned_runs, 'accuracy')

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

for panel, metric_label, baseline_scores, tuned_scores in (
    (ax1, 'F1-Score', baseline_f1, tuned_f1),
    (ax2, 'Accuracy', baseline_acc, tuned_acc),
):
    panel.plot(datasets, baseline_scores, 'o-', linewidth=2, markersize=8,
               label='Baseline', color='blue')
    # The tuned curve starts at 'Validation': its missing training entry is dropped.
    panel.plot(datasets[1:], [score for score in tuned_scores if score is not None],
               'o-', linewidth=2, markersize=8, label='Tuned', color='green')
    panel.set_ylabel(metric_label)
    panel.set_title(f'{metric_label} Across Datasets')
    panel.legend()
    panel.grid(True, alpha=0.3)

fig.tight_layout()
plt.show()

### #5.5: Save Best Model and Results
Save the tuned model and best parameters for future use.

In [None]:
# Persist the tuned estimator, its hyperparameters, and the collected results.
# joblib is imported in the notebook's import cell rather than here, so all
# dependencies are declared in one place.

# Save the tuned model
model_save_path = './processed/tuned_random_forest_model.pkl'
joblib.dump(tuned_rf, model_save_path)
print(f"Tuned model saved to: {model_save_path}")

# Save the best parameters
params_save_path = './processed/best_rf_parameters.pkl'
best_params_final = grid_search.best_params_
joblib.dump(best_params_final, params_save_path)
print(f"Best parameters saved to: {params_save_path}")

# Save all results (metrics for both models, the selected parameters, the
# feature-importance table, and the full Optuna study object)
results_save_path = './processed/baseline_rf_results.pkl'
results_dict = {
    'baseline_val': baseline_val_results,
    'baseline_test': baseline_test_results,
    'tuned_val': tuned_val_results,
    'tuned_test': tuned_test_results,
    'best_params': best_params_final,
    'feature_importance': feature_importance,
    'optuna_study': study
}
joblib.dump(results_dict, results_save_path)
print(f"All results saved to: {results_save_path}")

print("\n" + "="*80)
print("BEST MODEL CONFIGURATION")
print("="*80)
for key, value in best_params_final.items():
    print(f"  {key}: {value}")
print("="*80)

### #5.6: Final Summary and Conclusions
Display key findings and insights from the baseline Random Forest analysis.

In [None]:
def _fmt_metric(value):
    """Format a metric with four decimals, or 'N/A' when the metric is None."""
    return "N/A" if value is None else f"{value:.4f}"


print("\n" + "="*80)
print("BASELINE RANDOM FOREST - FINAL SUMMARY")
print("="*80)
print("\n1. MODEL PERFORMANCE ON TEST SET:")
for lead, model_label, results in (
    ("", "Baseline Model", baseline_test_results),
    ("\n", "Tuned Model", tuned_test_results),
):
    print(f"{lead}   {model_label}:")
    print(f"     - Accuracy:  {results['accuracy']:.4f}")
    print(f"     - F1-Score:  {results['f1']:.4f}")
    # roc_auc may be None when it could not be computed; formatting None with
    # ':.4f' would raise a TypeError.
    print(f"     - ROC-AUC:   {_fmt_metric(results['roc_auc'])}")

# Relative change of the tuned model versus the baseline, in percent
improvement_f1 = ((tuned_test_results['f1'] - baseline_test_results['f1']) / baseline_test_results['f1'] * 100)
improvement_acc = ((tuned_test_results['accuracy'] - baseline_test_results['accuracy']) / baseline_test_results['accuracy'] * 100)

print(f"\n2. IMPROVEMENT THROUGH HYPERPARAMETER TUNING:")
print(f"   - Accuracy improved by:  {improvement_acc:+.2f}%")
print(f"   - F1-Score improved by:  {improvement_f1:+.2f}%")

print(f"\n3. TOP 5 MOST IMPORTANT FEATURES:")
# Number the rows by their position in the sorted table. The DataFrame index
# still holds each feature's original column position after sort_values, so it
# must not be used as the rank.
for rank, (_, row) in enumerate(feature_importance.head(5).iterrows(), start=1):
    print(f"   {rank}. {row['feature']}: {row['importance']:.4f}")

print(f"\n4. HYPERPARAMETER OPTIMIZATION:")
# Report the number of trials actually recorded in the study instead of a
# hard-coded figure that can drift from the n_trials setting.
print(f"   - Optuna trials: {len(study.trials)}")
print(f"   - Best Optuna F1-Score: {study.best_trial.value:.4f}")
print(f"   - Grid search refinement completed")
print(f"   - Final model trained on combined train+val set")

print(f"\n5. MODEL SAVED:")
print(f"   - Model: {model_save_path}")
print(f"   - Parameters: {params_save_path}")
print(f"   - Results: {results_save_path}")

print("\n" + "="*80)
print("Analysis Complete!")
print("="*80)