In [2]:
import math
import re
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_selection import f_classif

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

from sklearn.metrics import (accuracy_score, f1_score, recall_score, precision_score, 
                             roc_curve, roc_auc_score, confusion_matrix, 
                             classification_report, ConfusionMatrixDisplay)

In [3]:
"""
Cardiovascular Disease Prediction Model
Achieves 83% accuracy using Decision Tree and Random Forest models
"""

# ============================================================================
# LIBRARIES IMPORT
# ============================================================================
import math
import re
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import f_classif

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier

from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_curve, roc_auc_score

# ============================================================================
# DATASET IMPORT
# ============================================================================
df = pd.read_csv("../CarePulse/new data/cardio_train.csv", sep=";")

# ============================================================================
# CONFIGURATION
# ============================================================================
TARGET = 'cardio'
INPUT_FEATURES = ['id', 'age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo',
                  'cholesterol', 'gluc', 'smoke', 'alco', 'active']

CACHE_MODELS_DIR_NAME = 'models_cache'
os.makedirs(CACHE_MODELS_DIR_NAME, exist_ok=True)

plt.style.use(plt.style.available[12])

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================
def drop_unwanted_features(df, features_to_drop):
    """Drop unwanted features from dataframe and INPUT_FEATURES list"""
    print(f"Dropping features: {features_to_drop}")
    
    if features_to_drop in list(df.columns):
        df = df.drop(columns=features_to_drop)
    
    if features_to_drop in INPUT_FEATURES:
        INPUT_FEATURES.remove(features_to_drop)
    return df

def IQR(series):
    """Apply IQR method to clip outliers"""
    Q1 = series.quantile(0.25)
    Q3 = series.quantile(0.75)
    IQR_val = Q3 - Q1
    min_v = Q1 - 1.5 * IQR_val
    max_v = Q3 + 1.5 * IQR_val
    return series.clip(lower=min_v, upper=max_v)

# ============================================================================
# CUSTOM MODEL CLASS
# ============================================================================
class CustomModel:
    def __init__(self, name, model, extra_train_param=None):
        self.name = str(name)
        self.model = model
        self.extra_train_param = extra_train_param
        self.y_train_hat = None
        self.y_test_hat = None
        self.feature_names = None
        self.load()

    def fit(self, x_train, y_train):
        if not getattr(self, "trained", False):
            if self.extra_train_param is None:
                self.model.fit(x_train, y_train)
            else:
                self.model.fit(x_train, y_train, **self.extra_train_param)
            
            self.feature_names = list(x_train.columns)
            self.trained = True
            self.save()

    def _align_features(self, X):
        """Ensure X has the same columns as training data"""
        if self.feature_names is not None:
            missing_cols = set(self.feature_names) - set(X.columns)
            extra_cols = set(X.columns) - set(self.feature_names)
            
            if missing_cols:
                raise ValueError(f"Missing columns in input: {missing_cols}")
            if extra_cols:
                X = X[self.feature_names]
            else:
                X = X[self.feature_names]
        return X

    def predict_on_train(self, x_train):
        if self.y_train_hat is None:
            x_train = self._align_features(x_train)
            self.y_train_hat = self.model.predict(x_train)

    def predict_on_test(self, x_test):
        if self.y_test_hat is None:
            x_test = self._align_features(x_test)
            self.y_test_hat = self.model.predict(x_test)

    def save(self):
        file_name = re.sub(r'\W+', '_', str(self.name).lower())
        file_path = CACHE_MODELS_DIR_NAME + '/' + file_name + '.pickle'
        with open(file_path, 'wb') as f:
            pickle.dump({
                "model": self.model,
                "trained": self.trained,
                "feature_names": self.feature_names
            }, f)

    def load(self):
        file_name = re.sub(r'\W+', '_', str(self.name).lower())
        file_path = CACHE_MODELS_DIR_NAME + '/' + file_name + '.pickle'
        
        if os.path.exists(file_path):
            with open(file_path, 'rb') as f:
                data = pickle.load(f)
                self.model = data["model"]
                self.trained = data["trained"]
                self.feature_names = data.get("feature_names", None)
        else:
            self.trained = False

# ============================================================================
# DATA PREPROCESSING
# ============================================================================
# Drop ID column
df = drop_unwanted_features(df, 'id')

# Handle outliers using IQR method
exclude = ['gluc', 'alco', 'smoke', 'colesterol', 'active']
for num_feature in [f for f in INPUT_FEATURES if f not in exclude]:
    for gender_category in df[TARGET].unique():
        mask = df[TARGET] == gender_category
        df.loc[mask, num_feature] = IQR(df.loc[mask, num_feature])

# ============================================================================
# FEATURE ENGINEERING
# ============================================================================
# Convert age from days to years
df['age_years'] = (df['age'] / 365).round().astype(int)

# Convert height from cm to meters
df['height'] = df['height'] / 100

# Calculate BMI
df['bmi'] = df['weight'] / ((df['height'] / 100) ** 2)

# Calculate pulse pressure
df['pulse_pressure'] = df['ap_hi'] - df['ap_lo']

# Create health index
df['health_index'] = (df['active'] * 1) - (df['smoke'] * 0.5) - (df['alco'] * 0.5)

# Create interaction feature
df['cholesterol_gluc_interaction'] = df['cholesterol'] * df['gluc']

# Update input features list
new_features = [
    'age_years',
    'bmi',
    'pulse_pressure',
    'health_index',
    'cholesterol_gluc_interaction'
]
INPUT_FEATURES = INPUT_FEATURES + [f for f in new_features if f not in INPUT_FEATURES]

# Drop original age and height columns
df = drop_unwanted_features(df, 'age')
df = drop_unwanted_features(df, 'height')

# Final feature list
INPUT_FEATURES = ['gender', 'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc', 
                  'smoke', 'alco', 'active', 'age_years', 'bmi', 'pulse_pressure',
                  'health_index', 'cholesterol_gluc_interaction']

# ============================================================================
# FEATURE SELECTION (F-test)
# ============================================================================
f_values, p_values = f_classif(df[INPUT_FEATURES], df[TARGET])

print("\nFeature Importance (F-test):")
print("=" * 60)
for i in range(len(INPUT_FEATURES)):
    print(f"{INPUT_FEATURES[i]:35s}: F={f_values[i]:8.3f}, p={p_values[i]:.3f}")

# ============================================================================
# TRAIN-TEST SPLIT
# ============================================================================
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(TARGET, axis=1),
    df.loc[:, TARGET],
    test_size=0.25,
    random_state=42,
    stratify=df.loc[:, TARGET],
)

print(f"\nDataset Split:")
print(f"X_train shape: {X_train.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"X_test shape:  {X_test.shape}")
print(f"y_test shape:  {y_test.shape}")

Dropping features: id
Dropping features: age
Dropping features: height

Feature Importance (F-test):
gender                             : F=   4.604, p=0.032
weight                             : F=2702.000, p=0.000
ap_hi                              : F=21922.892, p=0.000
ap_lo                              : F=11575.413, p=0.000
cholesterol                        : F=15554.452, p=0.000
gluc                               : F= 562.773, p=0.000
smoke                              : F=  16.791, p=0.000
alco                               : F=   3.761, p=0.052
active                             : F=  89.091, p=0.000
age_years                          : F=4193.662, p=0.000
bmi                                : F=3015.623, p=0.000
pulse_pressure                     : F=11607.067, p=0.000
health_index                       : F=  44.285, p=0.000
cholesterol_gluc_interaction       : F=6925.072, p=0.000

Dataset Split:
X_train shape: (52500, 14)
y_train shape: (52500,)
X_test shape:  (17500, 14)
y_t

In [4]:
# Create visualization directory
VIZ_DIR = 'visualizations'
os.makedirs(VIZ_DIR, exist_ok=True)

In [5]:
# ============================================================================
# VISUALIZATION 1: CORRELATION MATRIX
# ============================================================================
print("\n[4] GENERATING VISUALIZATIONS")
print("-" * 80)

plt.figure(figsize=(14, 12))
correlation_matrix = df[INPUT_FEATURES + [TARGET]].corr()
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
sns.heatmap(correlation_matrix, mask=mask, annot=True, fmt='.2f', 
            cmap='coolwarm', center=0, square=True, linewidths=1,
            cbar_kws={"shrink": 0.8})
plt.title('Correlation Matrix of Features', fontsize=16, fontweight='bold', pad=20)
plt.tight_layout()
plt.savefig(f'{VIZ_DIR}/01_correlation_matrix.png', dpi=300, bbox_inches='tight')
print("‚úì Saved: Correlation Matrix")
plt.close()

# ============================================================================
# VISUALIZATION 2: FEATURE DISTRIBUTIONS
# ============================================================================
fig, axes = plt.subplots(4, 4, figsize=(16, 14))
axes = axes.ravel()

for idx, feature in enumerate(INPUT_FEATURES):
    if idx < len(axes):
        for target_val in [0, 1]:
            data = df[df[TARGET] == target_val][feature]
            axes[idx].hist(data, alpha=0.6, bins=30, 
                          label=f'Cardio={target_val}', density=True)
        axes[idx].set_title(feature, fontweight='bold')
        axes[idx].legend()
        axes[idx].grid(alpha=0.3)

plt.suptitle('Feature Distributions by Target Class', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.savefig(f'{VIZ_DIR}/02_feature_distributions.png', dpi=300, bbox_inches='tight')
print("‚úì Saved: Feature Distributions")
plt.close()


[4] GENERATING VISUALIZATIONS
--------------------------------------------------------------------------------
‚úì Saved: Correlation Matrix
‚úì Saved: Feature Distributions


In [6]:
# ============================================================================
# MODEL TRAINING - BASELINE MODELS (Before Tuning)
# ============================================================================
print("\n[5] TRAINING BASELINE MODELS (Before Hyperparameter Tuning)")
print("-" * 80)

baseline_models = {
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(n_estimators=105, random_state=42),
    'XGBoost': XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss'),
    'CatBoost': CatBoostClassifier(iterations=100, random_state=42, verbose=0)
}

baseline_results = {}

for name, model in baseline_models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    
    baseline_results[name] = {
        'model': model,
        'accuracy': accuracy,
        'f1_score': f1,
        'precision': precision,
        'recall': recall,
        'y_pred': y_pred
    }
    
    print(f"  Accuracy:  {accuracy:.4f}")
    print(f"  F1 Score:  {f1:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall:    {recall:.4f}")



[5] TRAINING BASELINE MODELS (Before Hyperparameter Tuning)
--------------------------------------------------------------------------------

Training Decision Tree...
  Accuracy:  0.7788
  F1 Score:  0.7765
  Precision: 0.7841
  Recall:    0.7691

Training Random Forest...
  Accuracy:  0.8098
  F1 Score:  0.7975
  Precision: 0.8519
  Recall:    0.7497

Training Gradient Boosting...
  Accuracy:  0.8306
  F1 Score:  0.8052
  Precision: 0.9465
  Recall:    0.7005

Training XGBoost...
  Accuracy:  0.8310
  F1 Score:  0.8119
  Precision: 0.9144
  Recall:    0.7301

Training CatBoost...
  Accuracy:  0.8311
  F1 Score:  0.8115
  Precision: 0.9176
  Recall:    0.7274


In [7]:
# # ============================================================================
# # HYPERPARAMETER TUNING
# # ============================================================================
# print("\n[6] HYPERPARAMETER TUNING")
# print("-" * 80)

# # Decision Tree Parameters
# dt_param_grid = {
#     'max_depth': [5, 10, 15, 20, None],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'criterion': ['gini', 'entropy']
# }

# # Random Forest Parameters
# rf_param_grid = {
#     'bootstrap': True,
#   'criterion': 'entropy',
#   'max_depth': 25,
#   'max_features': 0.4,
#   'min_impurity_decrease': 0.0,
#   'min_samples_leaf': 4,
#   'min_samples_split': 12,
#   'n_estimators': 100
# }

# # Gradient Boosting Parameters
# gb_param_grid = {
#     'n_estimators': [100, 150, 200],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'max_depth': [3, 5, 7],
#     'min_samples_split': [2, 5],
#     'subsample': [0.8, 0.9, 1.0]
# }

# # XGBoost Parameters
# xgb_param_grid = {
#     'n_estimators': [100, 200, 300],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'max_depth': [3, 5, 7],
#     'subsample': [0.8, 0.9, 1.0],
#     'colsample_bytree': [0.8, 0.9, 1.0]
# }

# # CatBoost Parameters
# catboost_param_grid = {
#     'iterations': [100, 200, 300],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'depth': [4, 6, 8],
#     'l2_leaf_reg': [1, 3, 5]
# }

# tuned_models = {}
# param_grids = {
#     'Decision Tree': dt_param_grid,
#     'Random Forest': rf_param_grid,
#     'Gradient Boosting': gb_param_grid,
#     'XGBoost': xgb_param_grid,
#     'CatBoost': catboost_param_grid
# }

# for name, model in baseline_models.items():
#     print(f"\nTuning {name}...")
    
#     if name in ['Random Forest', 'XGBoost']:
#         # Use RandomizedSearchCV for larger parameter spaces
#         search = RandomizedSearchCV(
#             model, param_grids[name], n_iter=20, cv=3, 
#             scoring='f1', random_state=42, n_jobs=-1, verbose=0
#         )
#     else:
#         # Use GridSearchCV for smaller parameter spaces
#         search = GridSearchCV(
#             model, param_grids[name], cv=3, 
#             scoring='f1', n_jobs=-1, verbose=0
#         )
    
#     search.fit(X_train, y_train)
    
#     best_model = search.best_estimator_
#     y_pred = best_model.predict(X_test)
    
#     accuracy = accuracy_score(y_test, y_pred)
#     f1 = f1_score(y_test, y_pred)
#     precision = precision_score(y_test, y_pred)
#     recall = recall_score(y_test, y_pred)
    
#     tuned_models[name] = {
#         'model': best_model,
#         'accuracy': accuracy,
#         'f1_score': f1,
#         'precision': precision,
#         'recall': recall,
#         'y_pred': y_pred,
#         'best_params': search.best_params_
#     }
    
#     print(f"  Best Parameters: {search.best_params_}")
#     print(f"  Accuracy:  {accuracy:.4f} (Œî {accuracy - baseline_results[name]['accuracy']:+.4f})")
#     print(f"  F1 Score:  {f1:.4f} (Œî {f1 - baseline_results[name]['f1_score']:+.4f})")


In [8]:
# ============================================================================
# HYPERPARAMETER TUNING + INTEGRATION OF PRE-TUNED RANDOM FOREST
# ============================================================================

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier

print("\n[6] HYPERPARAMETER TUNING")
print("-" * 80)

# --------------------------
# PARAMETER GRIDS
# --------------------------

dt_param_grid = {
    'max_depth': [5, 10, 15, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'criterion': ['gini', 'entropy']
}

gb_param_grid = {
    'n_estimators': [100, 150, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5],
    'subsample': [0.8, 0.9, 1.0]
}

xgb_param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

catboost_param_grid = {
    'iterations': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5]
}

param_grids = {
    'Decision Tree': dt_param_grid,
    'Gradient Boosting': gb_param_grid,
    'XGBoost': xgb_param_grid,
    'CatBoost': catboost_param_grid
}

# --------------------------
# PRE-TUNED RANDOM FOREST PARAMS
# --------------------------

rf_best_params = {
    'bootstrap': True,
    'criterion': 'entropy',
    'max_depth': 25,
    'max_features': 0.4,
    'min_impurity_decrease': 0.0,
    'min_samples_leaf': 4,
    'min_samples_split': 12,
    'n_estimators': 100
}

# --------------------------
# TUNING LOOP
# --------------------------

tuned_models = {}

for name, model in baseline_models.items():
    print(f"\nTuning {name}...")

    # ===== RANDOM FOREST (ALREADY TUNED) =====
    if name == 'Random Forest':

        best_model = RandomForestClassifier(
            **rf_best_params,
            n_jobs=-1,
            random_state=42
        )

        best_model.fit(X_train, y_train)
        y_pred = best_model.predict(X_test)

        tuned_models[name] = {
            'model': best_model,
            'accuracy': accuracy_score(y_test, y_pred),
            'f1_score': f1_score(y_test, y_pred),
            'precision': precision_score(y_test, y_pred),
            'recall': recall_score(y_test, y_pred),
            'y_pred': y_pred,
            'best_params': rf_best_params
        }

        print("  Best Parameters: (pre-tuned & fixed)")
        print(f"  Accuracy: {tuned_models[name]['accuracy']:.4f} "
              f"(Œî {tuned_models[name]['accuracy'] - baseline_results[name]['accuracy']:+.4f})")
        print(f"  F1 Score: {tuned_models[name]['f1_score']:.4f} "
              f"(Œî {tuned_models[name]['f1_score'] - baseline_results[name]['f1_score']:+.4f})")

        continue

    # ===== OTHER MODELS (SEARCH REQUIRED) =====

    if name == 'XGBoost':
        search = RandomizedSearchCV(
            model,
            param_grids[name],
            n_iter=20,
            cv=3,
            scoring='f1',
            random_state=42,
            n_jobs=-1,
            verbose=0
        )
    else:
        search = GridSearchCV(
            model,
            param_grids[name],
            cv=3,
            scoring='f1',
            n_jobs=-1,
            verbose=0
        )

    search.fit(X_train, y_train)
    best_model = search.best_estimator_
    y_pred = best_model.predict(X_test)

    tuned_models[name] = {
        'model': best_model,
        'accuracy': accuracy_score(y_test, y_pred),
        'f1_score': f1_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred),
        'recall': recall_score(y_test, y_pred),
        'y_pred': y_pred,
        'best_params': search.best_params_
    }

    print(f"  Best Parameters: {search.best_params_}")
    print(f"  Accuracy: {tuned_models[name]['accuracy']:.4f} "
          f"(Œî {tuned_models[name]['accuracy'] - baseline_results[name]['accuracy']:+.4f})")
    print(f"  F1 Score: {tuned_models[name]['f1_score']:.4f} "
          f"(Œî {tuned_models[name]['f1_score'] - baseline_results[name]['f1_score']:+.4f})")



[6] HYPERPARAMETER TUNING
--------------------------------------------------------------------------------

Tuning Decision Tree...
  Best Parameters: {'criterion': 'entropy', 'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 2}
  Accuracy: 0.8273 (Œî +0.0485)
  F1 Score: 0.8054 (Œî +0.0289)

Tuning Random Forest...
  Best Parameters: (pre-tuned & fixed)
  Accuracy: 0.8299 (Œî +0.0201)
  F1 Score: 0.8103 (Œî +0.0128)

Tuning Gradient Boosting...
  Best Parameters: {'learning_rate': 0.05, 'max_depth': 7, 'min_samples_split': 5, 'n_estimators': 150, 'subsample': 0.9}
  Accuracy: 0.8332 (Œî +0.0026)
  F1 Score: 0.8121 (Œî +0.0069)

Tuning XGBoost...
  Best Parameters: {'subsample': 0.9, 'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.1, 'colsample_bytree': 1.0}
  Accuracy: 0.8335 (Œî +0.0026)
  F1 Score: 0.8141 (Œî +0.0022)

Tuning CatBoost...
  Best Parameters: {'depth': 4, 'iterations': 300, 'l2_leaf_reg': 1, 'learning_rate': 0.1}
  Accuracy: 0.8326 (Œî +0.0014)
  F1

In [9]:
# ============================================================================
# VISUALIZATION 3: BEFORE vs AFTER COMPARISON
# ============================================================================
print("\n[7] GENERATING COMPARISON VISUALIZATIONS")
print("-" * 80)

comparison_data = []
for name in baseline_models.keys():
    comparison_data.append({
        'Model': name,
        'Metric': 'Accuracy',
        'Before': baseline_results[name]['accuracy'],
        'After': tuned_models[name]['accuracy']
    })
    comparison_data.append({
        'Model': name,
        'Metric': 'F1 Score',
        'Before': baseline_results[name]['f1_score'],
        'After': tuned_models[name]['f1_score']
    })

comparison_df = pd.DataFrame(comparison_data)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# Accuracy Comparison
accuracy_df = comparison_df[comparison_df['Metric'] == 'Accuracy']
x = np.arange(len(baseline_models))
width = 0.35

ax1.bar(x - width/2, accuracy_df['Before'], width, label='Before Tuning', alpha=0.8)
ax1.bar(x + width/2, accuracy_df['After'], width, label='After Tuning', alpha=0.8)
ax1.set_xlabel('Model', fontsize=12, fontweight='bold')
ax1.set_ylabel('Accuracy', fontsize=12, fontweight='bold')
ax1.set_title('Accuracy: Before vs After Hyperparameter Tuning', fontsize=14, fontweight='bold')
ax1.set_xticks(x)
ax1.set_xticklabels(baseline_models.keys(), rotation=45, ha='right')
ax1.legend()
ax1.grid(axis='y', alpha=0.3)
ax1.set_ylim([0.7, 0.85])

# F1 Score Comparison
f1_df = comparison_df[comparison_df['Metric'] == 'F1 Score']
ax2.bar(x - width/2, f1_df['Before'], width, label='Before Tuning', alpha=0.8)
ax2.bar(x + width/2, f1_df['After'], width, label='After Tuning', alpha=0.8)
ax2.set_xlabel('Model', fontsize=12, fontweight='bold')
ax2.set_ylabel('F1 Score', fontsize=12, fontweight='bold')
ax2.set_title('F1 Score: Before vs After Hyperparameter Tuning', fontsize=14, fontweight='bold')
ax2.set_xticks(x)
ax2.set_xticklabels(baseline_models.keys(), rotation=45, ha='right')
ax2.legend()
ax2.grid(axis='y', alpha=0.3)
ax2.set_ylim([0.7, 0.85])

plt.tight_layout()
plt.savefig(f'{VIZ_DIR}/03_before_after_comparison.png', dpi=300, bbox_inches='tight')
print("‚úì Saved: Before vs After Comparison")
plt.close()

# ============================================================================
# VISUALIZATION 4: CONFUSION MATRICES FOR ALL MODELS (AFTER TUNING)
# ============================================================================
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.ravel()

for idx, (name, results) in enumerate(tuned_models.items()):
    cm = confusion_matrix(y_test, results['y_pred'])
    
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['No Disease', 'Disease'])
    disp.plot(ax=axes[idx], cmap='Blues', values_format='d')
    axes[idx].set_title(f'{name}\nAccuracy: {results["accuracy"]:.4f} | F1: {results["f1_score"]:.4f}', 
                       fontsize=11, fontweight='bold')
    axes[idx].grid(False)

# Remove empty subplot
axes[5].axis('off')

plt.suptitle('Confusion Matrices - Tuned Models', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.savefig(f'{VIZ_DIR}/04_confusion_matrices_tuned.png', dpi=300, bbox_inches='tight')
print("‚úì Saved: Confusion Matrices (Tuned Models)")
plt.close()

# ============================================================================
# VISUALIZATION 5: FEATURE IMPORTANCE (TOP MODELS)
# ============================================================================
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
axes = axes.ravel()

models_with_importance = ['Decision Tree', 'Random Forest', 'Gradient Boosting', 'XGBoost']

for idx, name in enumerate(models_with_importance):
    model = tuned_models[name]['model']
    
    if hasattr(model, 'feature_importances_'):
        importances = model.feature_importances_
        indices = np.argsort(importances)[::-1][:10]  # Top 10 features
        
        axes[idx].barh(range(len(indices)), importances[indices], alpha=0.8)
        axes[idx].set_yticks(range(len(indices)))
        axes[idx].set_yticklabels([INPUT_FEATURES[i] for i in indices])
        axes[idx].set_xlabel('Importance', fontweight='bold')
        axes[idx].set_title(f'{name} - Top 10 Features', fontsize=12, fontweight='bold')
        axes[idx].grid(axis='x', alpha=0.3)
        axes[idx].invert_yaxis()

plt.suptitle('Feature Importance Analysis', fontsize=16, fontweight='bold')
plt.tight_layout()
plt.savefig(f'{VIZ_DIR}/05_feature_importance.png', dpi=300, bbox_inches='tight')
print("‚úì Saved: Feature Importance")
plt.close()

# ============================================================================
# VISUALIZATION 6: ROC CURVES
# ============================================================================
plt.figure(figsize=(12, 8))

for name, results in tuned_models.items():
    model = results['model']
    
    if hasattr(model, 'predict_proba'):
        y_proba = model.predict_proba(X_test)[:, 1]
    else:
        y_proba = model.decision_function(X_test)
    
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    auc = roc_auc_score(y_test, y_proba)
    
    plt.plot(fpr, tpr, linewidth=2, label=f'{name} (AUC = {auc:.4f})')

plt.plot([0, 1], [0, 1], 'k--', linewidth=2, label='Random Classifier')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate', fontsize=12, fontweight='bold')
plt.ylabel('True Positive Rate', fontsize=12, fontweight='bold')
plt.title('ROC Curves - Tuned Models', fontsize=14, fontweight='bold')
plt.legend(loc="lower right", fontsize=10)
plt.grid(alpha=0.3)
plt.tight_layout()
plt.savefig(f'{VIZ_DIR}/06_roc_curves.png', dpi=300, bbox_inches='tight')
print("‚úì Saved: ROC Curves")
plt.close()

# ============================================================================
# VISUALIZATION 7: DETAILED CONFUSION MATRIX FOR GRADIENT BOOSTING
# ============================================================================
plt.figure(figsize=(10, 8))

gb_results = tuned_models['Gradient Boosting']
cm = confusion_matrix(y_test, gb_results['y_pred'])

sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=True,
            xticklabels=['No Disease', 'Disease'],
            yticklabels=['No Disease', 'Disease'],
            linewidths=2, linecolor='black',
            annot_kws={'size': 16, 'weight': 'bold'})

plt.title('Gradient Boosting - Detailed Confusion Matrix\n' + 
          f'Accuracy: {gb_results["accuracy"]:.4f} | F1: {gb_results["f1_score"]:.4f} | ' +
          f'Precision: {gb_results["precision"]:.4f} | Recall: {gb_results["recall"]:.4f}',
          fontsize=14, fontweight='bold', pad=20)
plt.ylabel('True Label', fontsize=12, fontweight='bold')
plt.xlabel('Predicted Label', fontsize=12, fontweight='bold')
plt.tight_layout()
plt.savefig(f'{VIZ_DIR}/07_gb_confusion_matrix_detailed.png', dpi=300, bbox_inches='tight')
print("‚úì Saved: Gradient Boosting Detailed Confusion Matrix")
plt.close()

# ============================================================================
# FINAL RESULTS SUMMARY
# ============================================================================
print("\n" + "="*80)
print("FINAL RESULTS SUMMARY")
print("="*80)

print("\nüìä MODEL PERFORMANCE COMPARISON (After Tuning)")
print("-" * 80)

summary_df = pd.DataFrame({
    'Model': list(tuned_models.keys()),
    'Accuracy': [tuned_models[m]['accuracy'] for m in tuned_models.keys()],
    'F1 Score': [tuned_models[m]['f1_score'] for m in tuned_models.keys()],
    'Precision': [tuned_models[m]['precision'] for m in tuned_models.keys()],
    'Recall': [tuned_models[m]['recall'] for m in tuned_models.keys()]
}).sort_values('F1 Score', ascending=False)

print(summary_df.to_string(index=False))

best_model_name = summary_df.iloc[0]['Model']
best_model_f1 = summary_df.iloc[0]['F1 Score']

print(f"\nüèÜ BEST MODEL: {best_model_name}")
print(f"   F1 Score: {best_model_f1:.4f}")
print(f"   Best Parameters: {tuned_models[best_model_name]['best_params']}")

print("\n" + "="*80)
print("‚úÖ ANALYSIS COMPLETE!")
print(f"   All visualizations saved in '{VIZ_DIR}/' directory")
print("="*80)

# ============================================================================
# SAVE BEST MODEL
# ============================================================================
best_model = tuned_models[best_model_name]['model']
with open(f'{CACHE_MODELS_DIR_NAME}/best_model_{best_model_name.lower().replace(" ", "_")}.pkl', 'wb') as f:
    pickle.dump(best_model, f)

print(f"\nüíæ Best model saved: {best_model_name}")


[7] GENERATING COMPARISON VISUALIZATIONS
--------------------------------------------------------------------------------
‚úì Saved: Before vs After Comparison
‚úì Saved: Confusion Matrices (Tuned Models)
‚úì Saved: Feature Importance
‚úì Saved: ROC Curves
‚úì Saved: Gradient Boosting Detailed Confusion Matrix

FINAL RESULTS SUMMARY

üìä MODEL PERFORMANCE COMPARISON (After Tuning)
--------------------------------------------------------------------------------
            Model  Accuracy  F1 Score  Precision   Recall
          XGBoost  0.833543  0.814139   0.920901 0.729560
Gradient Boosting  0.833200  0.812053   0.929266 0.721098
         CatBoost  0.832571  0.811479   0.927762 0.721098
    Random Forest  0.829886  0.810346   0.914845 0.727273
    Decision Tree  0.827257  0.805432   0.921231 0.715495

üèÜ BEST MODEL: XGBoost
   F1 Score: 0.8141
   Best Parameters: {'subsample': 0.9, 'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.1, 'colsample_bytree': 1.0}

‚úÖ ANALYSIS CO

In [20]:
tuned_models['Random Forest']['model']

0,1,2
,n_estimators,100
,criterion,'entropy'
,max_depth,25
,min_samples_split,12
,min_samples_leaf,4
,min_weight_fraction_leaf,0.0
,max_features,0.4
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [24]:
"""
Save Random Forest Model and All Required Metadata
Creates 4 pickle files: model, feature_info, label_encoders, and model_metadata
"""

import pickle
import pandas as pd
from datetime import datetime
import os

# ============================================================================
# CONFIGURATION
# ============================================================================
SAVE_DIR = 'saved_models'
os.makedirs(SAVE_DIR, exist_ok=True)

# Generate timestamp for file naming
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

print("="*80)
print("SAVING RANDOM FOREST MODEL AND METADATA")
print("="*80)
print(f"Timestamp: {timestamp}")
print(f"Save Directory: {SAVE_DIR}/")

# ============================================================================
# 1. SAVE THE TRAINED MODEL (Random Forest)
# ============================================================================
print("\n[1] Saving Heart Disease Model (Tuned Random Forest)...")

# Extract the tuned Random Forest model from tuned_models dictionary
best_rf_model = tuned_models['Random Forest']['model']

# Save the model
model_filename = f'heart_disease_model_{timestamp}.pkl'
model_path = os.path.join(SAVE_DIR, model_filename)

with open(model_path, 'wb') as f:
    pickle.dump(best_rf_model, f)

print(f"‚úì Model saved: {model_filename}")
print(f"  Model Type: {type(best_rf_model).__name__}")
print(f"  Number of estimators: {best_rf_model.n_estimators}")
print(f"  Max depth: {best_rf_model.max_depth}")
print(f"  Criterion: {best_rf_model.criterion}")

# ============================================================================
# 2. SAVE FEATURE INFORMATION
# ============================================================================
print("\n[2] Saving Feature Information...")

# Create comprehensive feature information
feature_info = {
    'feature_names': list(X_train.columns),
    'n_features': len(X_train.columns),
    'feature_types': {
        'numerical': ['age_years', 'weight', 'ap_hi', 'ap_lo', 'bmi', 
                     'pulse_pressure', 'health_index', 'cholesterol_gluc_interaction'],
        'categorical': ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']
    },
    'feature_descriptions': {
        'gender': 'Gender (1: Female, 2: Male)',
        'age_years': 'Age in years',
        'weight': 'Weight in kg',
        'ap_hi': 'Systolic blood pressure',
        'ap_lo': 'Diastolic blood pressure',
        'cholesterol': 'Cholesterol level (1: Normal, 2: Above normal, 3: Well above normal)',
        'gluc': 'Glucose level (1: Normal, 2: Above normal, 3: Well above normal)',
        'smoke': 'Smoking status (0: No, 1: Yes)',
        'alco': 'Alcohol intake (0: No, 1: Yes)',
        'active': 'Physical activity (0: No, 1: Yes)',
        'bmi': 'Body Mass Index (calculated)',
        'pulse_pressure': 'Pulse pressure (ap_hi - ap_lo)',
        'health_index': 'Health index (active - 0.5*smoke - 0.5*alco)',
        'cholesterol_gluc_interaction': 'Interaction between cholesterol and glucose'
    },
    'feature_ranges': {
        'age_years': {'min': int(df['age_years'].min()), 'max': int(df['age_years'].max())},
        'weight': {'min': float(df['weight'].min()), 'max': float(df['weight'].max())},
        'ap_hi': {'min': float(df['ap_hi'].min()), 'max': float(df['ap_hi'].max())},
        'ap_lo': {'min': float(df['ap_lo'].min()), 'max': float(df['ap_lo'].max())},
        'bmi': {'min': float(df['bmi'].min()), 'max': float(df['bmi'].max())},
        'pulse_pressure': {'min': float(df['pulse_pressure'].min()), 'max': float(df['pulse_pressure'].max())}
    },
    'engineered_features': {
        'bmi': 'weight / (height_m ** 2)',
        'pulse_pressure': 'ap_hi - ap_lo',
        'health_index': '(active * 1) - (smoke * 0.5) - (alco * 0.5)',
        'cholesterol_gluc_interaction': 'cholesterol * gluc'
    },
    'input_requirements': {
        'required_raw_inputs': ['age_days', 'gender', 'height_cm', 'weight', 
                                'ap_hi', 'ap_lo', 'cholesterol', 'gluc', 
                                'smoke', 'alco', 'active'],
        'preprocessing_steps': [
            '1. Convert age from days to years: age_years = (age_days / 365).round()',
            '2. Convert height to meters: height_m = height_cm / 100',
            '3. Calculate BMI: bmi = weight / (height_m ** 2)',
            '4. Calculate pulse_pressure: pulse_pressure = ap_hi - ap_lo',
            '5. Calculate health_index: health_index = (active * 1) - (smoke * 0.5) - (alco * 0.5)',
            '6. Calculate interaction: cholesterol_gluc_interaction = cholesterol * gluc'
        ]
    }
}

feature_info_filename = f'feature_info_{timestamp}.pkl'
feature_info_path = os.path.join(SAVE_DIR, feature_info_filename)

with open(feature_info_path, 'wb') as f:
    pickle.dump(feature_info, f)

print(f"‚úì Feature info saved: {feature_info_filename}")
print(f"  Total features: {feature_info['n_features']}")
print(f"  Numerical features: {len(feature_info['feature_types']['numerical'])}")
print(f"  Categorical features: {len(feature_info['feature_types']['categorical'])}")

# ============================================================================
# 3. SAVE LABEL ENCODERS (for categorical variables if needed)
# ============================================================================
print("\n[3] Saving Label Encoders...")

# Since most categorical variables are already numeric, we'll save the mapping
label_encoders = {
    'gender': {
        'mapping': {1: 'Female', 2: 'Male'},
        'inverse_mapping': {'Female': 1, 'Male': 2}
    },
    'cholesterol': {
        'mapping': {1: 'Normal', 2: 'Above Normal', 3: 'Well Above Normal'},
        'inverse_mapping': {'Normal': 1, 'Above Normal': 2, 'Well Above Normal': 3}
    },
    'gluc': {
        'mapping': {1: 'Normal', 2: 'Above Normal', 3: 'Well Above Normal'},
        'inverse_mapping': {'Normal': 1, 'Above Normal': 2, 'Well Above Normal': 3}
    },
    'smoke': {
        'mapping': {0: 'No', 1: 'Yes'},
        'inverse_mapping': {'No': 0, 'Yes': 1}
    },
    'alco': {
        'mapping': {0: 'No', 1: 'Yes'},
        'inverse_mapping': {'No': 0, 'Yes': 1}
    },
    'active': {
        'mapping': {0: 'No', 1: 'Yes'},
        'inverse_mapping': {'No': 0, 'Yes': 1}
    },
    'target': {
        'mapping': {0: 'No Disease', 1: 'Disease'},
        'inverse_mapping': {'No Disease': 0, 'Disease': 1}
    }
}

label_encoders_filename = f'label_encoders_{timestamp}.pkl'
label_encoders_path = os.path.join(SAVE_DIR, label_encoders_filename)

with open(label_encoders_path, 'wb') as f:
    pickle.dump(label_encoders, f)

print(f"‚úì Label encoders saved: {label_encoders_filename}")
print(f"  Encoded variables: {list(label_encoders.keys())}")

# ============================================================================
# 4. SAVE MODEL METADATA
# ============================================================================
print("\n[4] Saving Model Metadata...")

# Get Random Forest specific parameters and metrics
rf_model = tuned_models['Random Forest']['model']
rf_params = tuned_models['Random Forest']['best_params']
rf_metrics = {
    'accuracy': tuned_models['Random Forest']['accuracy'],
    'f1_score': tuned_models['Random Forest']['f1_score'],
    'precision': tuned_models['Random Forest']['precision'],
    'recall': tuned_models['Random Forest']['recall']
}

# Get baseline metrics for comparison
rf_baseline = {
    'accuracy': baseline_results['Random Forest']['accuracy'],
    'f1_score': baseline_results['Random Forest']['f1_score'],
    'precision': baseline_results['Random Forest']['precision'],
    'recall': baseline_results['Random Forest']['recall']
}

# Collect all performance metrics and model information
model_metadata = {
    'model_info': {
        'model_name': 'Random Forest Classifier',
        'model_type': 'RandomForestClassifier',
        'library': 'scikit-learn',
        'training_date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'model_version': '1.0',
        'best_parameters': rf_params,
        'n_estimators': rf_model.n_estimators,
        'max_depth': rf_model.max_depth,
        'min_samples_split': rf_model.min_samples_split,
        'min_samples_leaf': rf_model.min_samples_leaf,
        'max_features': rf_model.max_features,
        'criterion': rf_model.criterion,
        'bootstrap': rf_model.bootstrap
    },
    'training_info': {
        'dataset_size': len(df),
        'train_size': len(X_train),
        'test_size': len(X_test),
        'train_test_split': 0.75,
        'random_state': 42,
        'cv_folds': 3,
        'class_distribution': {
            'train': dict(y_train.value_counts().to_dict()),
            'test': dict(y_test.value_counts().to_dict())
        }
    },
    'performance_metrics': {
        'test_set': {
            'accuracy': float(rf_metrics['accuracy']),
            'f1_score': float(rf_metrics['f1_score']),
            'precision': float(rf_metrics['precision']),
            'recall': float(rf_metrics['recall'])
        },
        'baseline_comparison': {
            'baseline_accuracy': float(rf_baseline['accuracy']),
            'baseline_f1_score': float(rf_baseline['f1_score']),
            'improvement_accuracy': float(rf_metrics['accuracy'] - rf_baseline['accuracy']),
            'improvement_f1_score': float(rf_metrics['f1_score'] - rf_baseline['f1_score'])
        }
    },
    'feature_importance': {
        feature_name: float(importance) 
        for feature_name, importance in zip(X_train.columns, rf_model.feature_importances_)
    },
    'top_features': {
        'names': [X_train.columns[i] for i in rf_model.feature_importances_.argsort()[::-1][:10]],
        'importances': [float(rf_model.feature_importances_[i]) 
                       for i in rf_model.feature_importances_.argsort()[::-1][:10]]
    },
    'usage_instructions': {
        'prediction_example': """
# Load the model
import pickle
with open('heart_disease_model_*.pkl', 'rb') as f:
    model = pickle.load(f)

# Prepare input data (14 features in this order)
input_data = {
    'gender': 2,  # Male
    'weight': 75,
    'ap_hi': 120,
    'ap_lo': 80,
    'cholesterol': 1,
    'gluc': 1,
    'smoke': 0,
    'alco': 0,
    'active': 1,
    'age_years': 50,
    'bmi': 24.5,
    'pulse_pressure': 40,
    'health_index': 1.0,
    'cholesterol_gluc_interaction': 1
}

# Convert to DataFrame
import pandas as pd
input_df = pd.DataFrame([input_data])

# Make prediction
prediction = model.predict(input_df)
probability = model.predict_proba(input_df)

print(f"Prediction: {prediction[0]}")  # 0: No Disease, 1: Disease
print(f"Probability: {probability[0]}")  # [prob_no_disease, prob_disease]
        """,
        'required_preprocessing': 'See feature_info.pkl for detailed preprocessing steps'
    },
    'file_references': {
        'model_file': model_filename,
        'feature_info_file': feature_info_filename,
        'label_encoders_file': label_encoders_filename,
        'metadata_file': f'model_metadata_{timestamp}.pkl'
    }
}

model_metadata_filename = f'model_metadata_{timestamp}.pkl'
model_metadata_path = os.path.join(SAVE_DIR, model_metadata_filename)

with open(model_metadata_path, 'wb') as f:
    pickle.dump(model_metadata, f)

print(f"‚úì Model metadata saved: {model_metadata_filename}")

# ============================================================================
# SUMMARY
# ============================================================================
print("\n" + "="*80)
print("SAVE SUMMARY")
print("="*80)

print(f"\n‚úÖ All files saved successfully in '{SAVE_DIR}/' directory:")
print(f"\n1. üì¶ {model_filename}")
print(f"   - Trained Random Forest model")
print(f"   - Size: {os.path.getsize(model_path) / 1024:.2f} KB")

print(f"\n2. üìä {feature_info_filename}")
print(f"   - Feature names, types, descriptions")
print(f"   - Preprocessing requirements")
print(f"   - Size: {os.path.getsize(feature_info_path) / 1024:.2f} KB")

print(f"\n3. üè∑Ô∏è  {label_encoders_filename}")
print(f"   - Categorical variable mappings")
print(f"   - Target variable encoding")
print(f"   - Size: {os.path.getsize(label_encoders_path) / 1024:.2f} KB")

print(f"\n4. üìù {model_metadata_filename}")
print(f"   - Model parameters and performance")
print(f"   - Training information")
print(f"   - Usage instructions")
print(f"   - Size: {os.path.getsize(model_metadata_path) / 1024:.2f} KB")

print("\n" + "="*80)
print("MODEL PERFORMANCE SUMMARY")
print("="*80)
print(f"Test Set Accuracy:  {model_metadata['performance_metrics']['test_set']['accuracy']:.6f}")
print(f"Test Set F1 Score:  {model_metadata['performance_metrics']['test_set']['f1_score']:.6f}")
print(f"Test Set Precision: {model_metadata['performance_metrics']['test_set']['precision']:.6f}")
print(f"Test Set Recall:    {model_metadata['performance_metrics']['test_set']['recall']:.6f}")


print("\nüèÜ Top 5 Most Important Features:")
for i, (name, importance) in enumerate(zip(
    model_metadata['top_features']['names'][:5],
    model_metadata['top_features']['importances'][:5]
), 1):
    print(f"  {i}. {name:30s}: {importance:.4f}")

print("\n" + "="*80)
print("‚úÖ MODEL SAVING COMPLETE!")
print("="*80)

# ============================================================================
# BONUS: CREATE A QUICK LOAD FUNCTION
# ============================================================================
print("\nüí° Quick Load Function (save this for later use):")
print("-" * 80)
print("""
def load_heart_disease_model(timestamp):
    '''Load the complete model package'''
    import pickle
    
    # Load all components
    with open(f'saved_models/heart_disease_model_{timestamp}.pkl', 'rb') as f:
        model = pickle.load(f)
    
    with open(f'saved_models/feature_info_{timestamp}.pkl', 'rb') as f:
        feature_info = pickle.load(f)
    
    with open(f'saved_models/label_encoders_{timestamp}.pkl', 'rb') as f:
        label_encoders = pickle.load(f)
    
    with open(f'saved_models/model_metadata_{timestamp}.pkl', 'rb') as f:
        metadata = pickle.load(f)
    
    return {
        'model': model,
        'feature_info': feature_info,
        'label_encoders': label_encoders,
        'metadata': metadata
    }

# Usage:
# package = load_heart_disease_model('""" + timestamp + """')
# model = package['model']
# predictions = model.predict(your_data)
""")

SAVING RANDOM FOREST MODEL AND METADATA
Timestamp: 20251231_170752
Save Directory: saved_models/

[1] Saving Heart Disease Model (Tuned Random Forest)...
‚úì Model saved: heart_disease_model_20251231_170752.pkl
  Model Type: RandomForestClassifier
  Number of estimators: 100
  Max depth: 25
  Criterion: entropy

[2] Saving Feature Information...
‚úì Feature info saved: feature_info_20251231_170752.pkl
  Total features: 14
  Numerical features: 8
  Categorical features: 6

[3] Saving Label Encoders...
‚úì Label encoders saved: label_encoders_20251231_170752.pkl
  Encoded variables: ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'target']

[4] Saving Model Metadata...
‚úì Model metadata saved: model_metadata_20251231_170752.pkl

SAVE SUMMARY

‚úÖ All files saved successfully in 'saved_models/' directory:

1. üì¶ heart_disease_model_20251231_170752.pkl
   - Trained Random Forest model
   - Size: 23737.08 KB

2. üìä feature_info_20251231_170752.pkl
   - Feature names, types

In [25]:
X_train

Unnamed: 0,gender,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,age_years,bmi,pulse_pressure,health_index,cholesterol_gluc_interaction
9696,1,67.0,110,70,1,1,0,0,1,58,268386.476526,40,1.0,1
44942,1,73.0,95,60,1,1,0,0,1,55,296158.059150,35,1.0,1
54675,1,50.0,100,70,1,3,0,0,1,54,192893.792678,30,1.0,3
2344,1,76.0,120,80,1,1,0,0,1,52,289590.001524,40,1.0,1
39922,1,88.0,135,90,1,3,0,1,1,63,361604.207758,45,0.5,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12738,1,80.0,120,80,1,1,0,0,1,60,283446.712018,40,1.0,1
56810,2,80.0,150,90,1,1,0,0,1,54,276816.608997,60,1.0,1
56100,2,68.0,120,80,1,3,0,1,1,60,238086.901719,40,0.5,3
31228,1,85.0,150,105,3,1,0,0,1,52,377777.777778,45,1.0,3


In [16]:
# """
# Random Forest - Extensive Hyperparameter Tuning with GridSearchCV
# Testing comprehensive parameter combinations for optimal performance
# """

# # ============================================================================
# # LIBRARIES IMPORT
# # ============================================================================
# import numpy as np
# import pandas as pd
# import matplotlib.pyplot as plt
# import seaborn as sns
# import pickle
# import time
# from datetime import datetime
# import warnings
# warnings.filterwarnings('ignore')

# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import GridSearchCV, cross_val_score
# from sklearn.metrics import (accuracy_score, f1_score, recall_score, 
#                              precision_score, roc_auc_score, confusion_matrix,
#                              classification_report, ConfusionMatrixDisplay,
#                              make_scorer)

# # ============================================================================
# # LOAD PREPROCESSED DATA
# # ============================================================================
# # Assuming X_train, X_test, y_train, y_test are already available
# # If not, run the preprocessing code first

# print("="*80)
# print("RANDOM FOREST - EXTENSIVE HYPERPARAMETER TUNING")
# print("="*80)
# print(f"Start Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
# print(f"\nDataset Info:")
# print(f"  Training samples: {len(X_train)}")
# print(f"  Testing samples:  {len(X_test)}")
# print(f"  Number of features: {X_train.shape[1]}")

# # ============================================================================
# # BASELINE RANDOM FOREST
# # ============================================================================
# print("\n" + "="*80)
# print("[1] BASELINE RANDOM FOREST (Your Best Parameters So Far)")
# print("="*80)

# # Using your best parameters as baseline
# baseline_rf = RandomForestClassifier(
#     n_estimators=100,
#     max_depth=30,
#     min_samples_split=10,
#     min_samples_leaf=4,
#     max_features='log2',
#     random_state=42,
#     n_jobs=-1
# )

# print("\nüìã Baseline Parameters:")
# print(f"  n_estimators:      100")
# print(f"  max_depth:         30")
# print(f"  min_samples_split: 10")
# print(f"  min_samples_leaf:  4")
# print(f"  max_features:      log2")

# print("\nTraining baseline model...")
# start_time = time.time()
# baseline_rf.fit(X_train, y_train)
# training_time = time.time() - start_time

# y_pred_baseline = baseline_rf.predict(X_test)
# y_proba_baseline = baseline_rf.predict_proba(X_test)[:, 1]

# baseline_metrics = {
#     'accuracy': accuracy_score(y_test, y_pred_baseline),
#     'f1_score': f1_score(y_test, y_pred_baseline),
#     'precision': precision_score(y_test, y_pred_baseline),
#     'recall': recall_score(y_test, y_pred_baseline),
#     'roc_auc': roc_auc_score(y_test, y_proba_baseline),
#     'training_time': training_time
# }

# print(f"\nüìä Baseline Performance:")
# print(f"  Accuracy:      {baseline_metrics['accuracy']:.6f}")
# print(f"  F1 Score:      {baseline_metrics['f1_score']:.6f}")
# print(f"  Precision:     {baseline_metrics['precision']:.6f}")
# print(f"  Recall:        {baseline_metrics['recall']:.6f}")
# print(f"  ROC AUC:       {baseline_metrics['roc_auc']:.6f}")
# print(f"  Training Time: {baseline_metrics['training_time']:.2f} seconds")

# # ============================================================================
# # EXPANDED PARAMETER GRID
# # ============================================================================
# print("\n" + "="*80)
# print("[2] EXPANDED PARAMETER GRID FOR GRIDSEARCHCV")
# print("="*80)

# # Comprehensive parameter grid with more options
# rf_param_grid_expanded = {
#     # Number of trees in the forest
#     'n_estimators': [50, 100, 150, 200, 250, 300, 400, 500],
    
#     # Maximum depth of each tree
#     'max_depth': [5, 10, 15, 20, 25, 30, 40, 50, None],
    
#     # Minimum samples required to split an internal node
#     'min_samples_split': [2, 5, 10, 15, 20, 25],
    
#     # Minimum samples required at a leaf node
#     'min_samples_leaf': [1, 2, 4, 6, 8, 10],
    
#     # Number of features to consider at every split
#     'max_features': ['sqrt', 'log2', None, 0.3, 0.5, 0.7],
    
#     # Bootstrap samples when building trees
#     'bootstrap': [True, False],
    
#     # Criterion to measure split quality
#     'criterion': ['gini', 'entropy', 'log_loss'],
    
#     # Minimum weighted fraction of samples required at leaf
#     'min_weight_fraction_leaf': [0.0, 0.1, 0.2],
    
#     # Maximum number of leaf nodes
#     'max_leaf_nodes': [None, 10, 20, 50, 100],
    
#     # Minimum impurity decrease for split
#     'min_impurity_decrease': [0.0, 0.01, 0.05, 0.1]
# }

# print("\nüìã Parameter Grid Details:")
# print(f"  n_estimators: {rf_param_grid_expanded['n_estimators']}")
# print(f"  max_depth: {rf_param_grid_expanded['max_depth']}")
# print(f"  min_samples_split: {rf_param_grid_expanded['min_samples_split']}")
# print(f"  min_samples_leaf: {rf_param_grid_expanded['min_samples_leaf']}")
# print(f"  max_features: {rf_param_grid_expanded['max_features']}")
# print(f"  bootstrap: {rf_param_grid_expanded['bootstrap']}")
# print(f"  criterion: {rf_param_grid_expanded['criterion']}")
# print(f"  min_weight_fraction_leaf: {rf_param_grid_expanded['min_weight_fraction_leaf']}")
# print(f"  max_leaf_nodes: {rf_param_grid_expanded['max_leaf_nodes']}")
# print(f"  min_impurity_decrease: {rf_param_grid_expanded['min_impurity_decrease']}")

# total_combinations = np.prod([len(v) for v in rf_param_grid_expanded.values()])
# print(f"\n‚ö†Ô∏è  Total possible combinations: {total_combinations:,}")
# print(f"    This is computationally expensive. Consider using a subset.")

# # ============================================================================
# # PRACTICAL PARAMETER GRID (RECOMMENDED)
# # ============================================================================
# print("\n" + "="*80)
# print("[3] PRACTICAL PARAMETER GRID (RECOMMENDED FOR GRIDSEARCHCV)")
# print("="*80)

# # More practical grid with most important parameters
# rf_param_grid_practical = {
#     'n_estimators': [100, 200, 300, 400, 500],
#     'max_depth': [10, 20, 30, 40, None],
#     'min_samples_split': [2, 5, 10, 15],
#     'min_samples_leaf': [1, 2, 4, 8],
#     'max_features': ['sqrt', 'log2', 0.5],
#     'bootstrap': [True, False],
#     'criterion': ['gini', 'entropy'],
#     'min_impurity_decrease': [0.0, 0.01]
# }

# practical_combinations = np.prod([len(v) for v in rf_param_grid_practical.values()])
# print(f"\n‚úÖ Practical combinations: {practical_combinations:,}")
# print(f"   With 3-fold CV: {practical_combinations * 3:,} model fits")

# print("\nüìã Practical Grid Details:")
# for param, values in rf_param_grid_practical.items():
#     print(f"  {param}: {values}")

# # ============================================================================
# # GRIDSEARCHCV - STAGE 1: FAST COARSE SEARCH (Around Your Best Parameters)
# # ============================================================================
# print("\n" + "="*80)
# print("[4] STAGE 1: FAST COARSE SEARCH (Around Your Best Parameters)")
# print("="*80)

# # Much smaller, focused search centered around your best parameters
# rf_param_grid_coarse = {
#     'n_estimators': [100, 150, 200],
#     'max_depth': [25, 30, 35],
#     'min_samples_split': [8, 10, 12],
#     'min_samples_leaf': [3, 4, 5],
#     'max_features': ['log2', 0.4],
#     'criterion': ['gini', 'entropy']
# }

# print("\nüîç Starting Fast Coarse Grid Search...")
# print(f"   Combinations to test: {np.prod([len(v) for v in rf_param_grid_coarse.values()])}")
# print(f"   Cross-validation folds: 3")
# print(f"   Scoring metric: F1 Score")
# print(f"   Parallel jobs: All available CPUs")
# print(f"   Estimated time: ~5-10 minutes")

# grid_search_coarse = GridSearchCV(
#     estimator=RandomForestClassifier(random_state=42, n_jobs=-1),
#     param_grid=rf_param_grid_coarse,
#     cv=3,  # Reduced from 5 to 3 for speed
#     scoring='f1',
#     n_jobs=-1,
#     verbose=1,  # Reduced verbosity
#     return_train_score=True
# )

# start_time = time.time()
# grid_search_coarse.fit(X_train, y_train)
# coarse_time = time.time() - start_time

# print(f"\n‚úÖ Coarse search completed in {coarse_time/60:.2f} minutes")
# print(f"\nüèÜ Best Parameters (Coarse):")
# for param, value in grid_search_coarse.best_params_.items():
#     print(f"  {param}: {value}")
# print(f"\nüìä Best Cross-Validation F1 Score: {grid_search_coarse.best_score_:.6f}")

# # Test on test set
# y_pred_coarse = grid_search_coarse.best_estimator_.predict(X_test)
# y_proba_coarse = grid_search_coarse.best_estimator_.predict_proba(X_test)[:, 1]

# coarse_metrics = {
#     'accuracy': accuracy_score(y_test, y_pred_coarse),
#     'f1_score': f1_score(y_test, y_pred_coarse),
#     'precision': precision_score(y_test, y_pred_coarse),
#     'recall': recall_score(y_test, y_pred_coarse),
#     'roc_auc': roc_auc_score(y_test, y_proba_coarse)
# }

# print(f"\nüìä Test Set Performance (Coarse):")
# print(f"  Accuracy:  {coarse_metrics['accuracy']:.6f}")
# print(f"  F1 Score:  {coarse_metrics['f1_score']:.6f}")
# print(f"  Precision: {coarse_metrics['precision']:.6f}")
# print(f"  Recall:    {coarse_metrics['recall']:.6f}")
# print(f"  ROC AUC:   {coarse_metrics['roc_auc']:.6f}")

# # ============================================================================
# # GRIDSEARCHCV - STAGE 2: FOCUSED FINE-TUNING
# # ============================================================================
# print("\n" + "="*80)
# print("[5] STAGE 2: FOCUSED FINE-TUNING AROUND BEST PARAMETERS")
# print("="*80)

# # Build a smaller fine-tuning grid around best parameters
# best_params = grid_search_coarse.best_params_

# # Create tight fine-tuning ranges
# rf_param_grid_fine = {}

# # n_estimators: narrow range around best
# best_n_est = best_params['n_estimators']
# rf_param_grid_fine['n_estimators'] = [
#     max(50, best_n_est - 25),
#     best_n_est,
#     best_n_est + 25
# ]

# # max_depth: narrow range around best
# best_max_depth = best_params['max_depth']
# rf_param_grid_fine['max_depth'] = [
#     best_max_depth - 3,
#     best_max_depth,
#     best_max_depth + 3
# ]

# # min_samples_split: narrow range
# best_mss = best_params['min_samples_split']
# rf_param_grid_fine['min_samples_split'] = [
#     max(2, best_mss - 1),
#     best_mss,
#     best_mss + 1
# ]

# # min_samples_leaf: narrow range
# best_msl = best_params['min_samples_leaf']
# rf_param_grid_fine['min_samples_leaf'] = [
#     max(1, best_msl - 1),
#     best_msl,
#     best_msl + 1
# ]

# # max_features: keep best + 1 alternative
# rf_param_grid_fine['max_features'] = [best_params['max_features'], 0.5]

# # criterion: keep best only
# rf_param_grid_fine['criterion'] = [best_params['criterion']]

# # Additional quick-test parameters
# rf_param_grid_fine['bootstrap'] = [True]
# rf_param_grid_fine['min_impurity_decrease'] = [0.0, 0.001]

# print(f"\nüîç Starting Focused Fine-Tuning...")
# print(f"   Combinations to test: {np.prod([len(v) for v in rf_param_grid_fine.values()])}")
# print(f"   Cross-validation folds: 3")
# print(f"   Estimated time: ~3-5 minutes")

# grid_search_fine = GridSearchCV(
#     estimator=RandomForestClassifier(random_state=42, n_jobs=-1),
#     param_grid=rf_param_grid_fine,
#     cv=3,
#     scoring='f1',
#     n_jobs=-1,
#     verbose=1,
#     return_train_score=True
# )

# start_time = time.time()
# grid_search_fine.fit(X_train, y_train)
# fine_time = time.time() - start_time

# print(f"\n‚úÖ Fine-tuning completed in {fine_time/60:.2f} minutes")
# print(f"\nüèÜ Best Parameters (Fine-Tuned):")
# for param, value in grid_search_fine.best_params_.items():
#     print(f"  {param}: {value}")
# print(f"\nüìä Best Cross-Validation F1 Score: {grid_search_fine.best_score_:.6f}")

# # Test on test set
# y_pred_fine = grid_search_fine.best_estimator_.predict(X_test)
# y_proba_fine = grid_search_fine.best_estimator_.predict_proba(X_test)[:, 1]

# fine_metrics = {
#     'accuracy': accuracy_score(y_test, y_pred_fine),
#     'f1_score': f1_score(y_test, y_pred_fine),
#     'precision': precision_score(y_test, y_pred_fine),
#     'recall': recall_score(y_test, y_pred_fine),
#     'roc_auc': roc_auc_score(y_test, y_proba_fine)
# }

# print(f"\nüìä Test Set Performance (Fine-Tuned):")
# print(f"  Accuracy:  {fine_metrics['accuracy']:.6f}")
# print(f"  F1 Score:  {fine_metrics['f1_score']:.6f}")
# print(f"  Precision: {fine_metrics['precision']:.6f}")
# print(f"  Recall:    {fine_metrics['recall']:.6f}")
# print(f"  ROC AUC:   {fine_metrics['roc_auc']:.6f}")

# # ============================================================================
# # PERFORMANCE COMPARISON
# # ============================================================================
# print("\n" + "="*80)
# print("[6] PERFORMANCE COMPARISON - ALL STAGES")
# print("="*80)

# comparison_df = pd.DataFrame({
#     'Stage': ['Baseline', 'Coarse Search', 'Fine-Tuned'],
#     'Accuracy': [baseline_metrics['accuracy'], coarse_metrics['accuracy'], fine_metrics['accuracy']],
#     'F1 Score': [baseline_metrics['f1_score'], coarse_metrics['f1_score'], fine_metrics['f1_score']],
#     'Precision': [baseline_metrics['precision'], coarse_metrics['precision'], fine_metrics['precision']],
#     'Recall': [baseline_metrics['recall'], coarse_metrics['recall'], fine_metrics['recall']],
#     'ROC AUC': [baseline_metrics['roc_auc'], coarse_metrics['roc_auc'], fine_metrics['roc_auc']]
# })

# print("\n" + comparison_df.to_string(index=False))

# # Calculate improvements
# print(f"\nüìà Improvement from Baseline to Fine-Tuned:")
# print(f"  Accuracy:  {(fine_metrics['accuracy'] - baseline_metrics['accuracy'])*100:+.4f}%")
# print(f"  F1 Score:  {(fine_metrics['f1_score'] - baseline_metrics['f1_score'])*100:+.4f}%")
# print(f"  Precision: {(fine_metrics['precision'] - baseline_metrics['precision'])*100:+.4f}%")
# print(f"  Recall:    {(fine_metrics['recall'] - baseline_metrics['recall'])*100:+.4f}%")
# print(f"  ROC AUC:   {(fine_metrics['roc_auc'] - baseline_metrics['roc_auc'])*100:+.4f}%")

# # ============================================================================
# # VISUALIZATIONS
# # ============================================================================
# print("\n" + "="*80)
# print("[7] GENERATING VISUALIZATIONS")
# print("="*80)

# # Create output directory
# import os
# VIZ_DIR = 'rf_visualizations'
# os.makedirs(VIZ_DIR, exist_ok=True)

# # 1. Performance Comparison Bar Plot
# fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# metrics = ['Accuracy', 'F1 Score', 'Precision', 'Recall', 'ROC AUC']
# stages = comparison_df['Stage'].values
# x = np.arange(len(metrics))
# width = 0.25

# for ax, metric_subset in [(axes[0], ['Accuracy', 'F1 Score', 'ROC AUC']), 
#                            (axes[1], ['Precision', 'Recall'])]:
#     for idx, stage in enumerate(stages):
#         values = [comparison_df[comparison_df['Stage'] == stage][m].values[0] 
#                  for m in metric_subset]
#         positions = np.arange(len(metric_subset)) + (idx - 1) * width
#         ax.bar(positions, values, width, label=stage, alpha=0.8)
    
#     ax.set_ylabel('Score', fontsize=12, fontweight='bold')
#     ax.set_title(f'Performance Comparison - {", ".join(metric_subset)}', 
#                 fontsize=13, fontweight='bold')
#     ax.set_xticks(np.arange(len(metric_subset)))
#     ax.set_xticklabels(metric_subset, rotation=0)
#     ax.legend()
#     ax.grid(axis='y', alpha=0.3)
#     ax.set_ylim([0.65, 0.95])

# plt.tight_layout()
# plt.savefig(f'{VIZ_DIR}/01_performance_comparison.png', dpi=300, bbox_inches='tight')
# print("‚úì Saved: Performance Comparison")
# plt.close()

# # 2. Confusion Matrices Comparison
# fig, axes = plt.subplots(1, 3, figsize=(18, 5))

# predictions = [y_pred_baseline, y_pred_coarse, y_pred_fine]
# titles = ['Baseline', 'Coarse Search', 'Fine-Tuned']
# f1_scores = [baseline_metrics['f1_score'], coarse_metrics['f1_score'], fine_metrics['f1_score']]

# for idx, (pred, title, f1) in enumerate(zip(predictions, titles, f1_scores)):
#     cm = confusion_matrix(y_test, pred)
#     disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['No Disease', 'Disease'])
#     disp.plot(ax=axes[idx], cmap='Blues', values_format='d')
#     axes[idx].set_title(f'{title}\nF1 Score: {f1:.6f}', fontsize=12, fontweight='bold')
#     axes[idx].grid(False)

# plt.suptitle('Random Forest - Confusion Matrices Comparison', fontsize=15, fontweight='bold')
# plt.tight_layout()
# plt.savefig(f'{VIZ_DIR}/02_confusion_matrices.png', dpi=300, bbox_inches='tight')
# print("‚úì Saved: Confusion Matrices")
# plt.close()

# # 3. Feature Importance - Fine-Tuned Model
# plt.figure(figsize=(12, 8))
# importances = grid_search_fine.best_estimator_.feature_importances_
# indices = np.argsort(importances)[::-1]

# plt.barh(range(len(indices)), importances[indices], alpha=0.8, color='steelblue')
# plt.yticks(range(len(indices)), [X_train.columns[i] for i in indices])
# plt.xlabel('Feature Importance', fontsize=12, fontweight='bold')
# plt.title('Random Forest - Feature Importance (Fine-Tuned Model)', 
#          fontsize=14, fontweight='bold')
# plt.grid(axis='x', alpha=0.3)
# plt.gca().invert_yaxis()
# plt.tight_layout()
# plt.savefig(f'{VIZ_DIR}/03_feature_importance.png', dpi=300, bbox_inches='tight')
# print("‚úì Saved: Feature Importance")
# plt.close()

# # 4. GridSearchCV Results Heatmap (Top parameters)
# cv_results = pd.DataFrame(grid_search_fine.cv_results_)
# top_10 = cv_results.nlargest(10, 'mean_test_score')[
#     ['param_n_estimators', 'param_max_depth', 'param_min_samples_split', 
#      'mean_test_score', 'std_test_score']
# ]

# fig, ax = plt.subplots(figsize=(12, 8))
# top_10_display = top_10.copy()
# top_10_display.columns = ['N Estimators', 'Max Depth', 'Min Samples Split', 
#                           'Mean F1', 'Std F1']
# top_10_display = top_10_display.reset_index(drop=True)

# # Create text display
# cell_text = []
# for idx, row in top_10_display.iterrows():
#     cell_text.append([
#         str(row['N Estimators']),
#         str(row['Max Depth']),
#         str(row['Min Samples Split']),
#         f"{row['Mean F1']:.6f}",
#         f"{row['Std F1']:.6f}"
#     ])

# table = ax.table(cellText=cell_text, colLabels=top_10_display.columns,
#                 cellLoc='center', loc='center',
#                 colWidths=[0.15, 0.15, 0.2, 0.15, 0.15])
# table.auto_set_font_size(False)
# table.set_fontsize(10)
# table.scale(1, 2)

# # Color the best row
# for i in range(len(top_10_display.columns)):
#     table[(1, i)].set_facecolor('#90EE90')
#     table[(1, i)].set_text_props(weight='bold')

# ax.axis('off')
# ax.set_title('Top 10 Parameter Combinations (Fine-Tuned GridSearch)', 
#             fontsize=14, fontweight='bold', pad=20)
# plt.tight_layout()
# plt.savefig(f'{VIZ_DIR}/04_top_parameters.png', dpi=300, bbox_inches='tight')
# print("‚úì Saved: Top Parameters Table")
# plt.close()

# # 5. Quick Learning Curve - Compare n_estimators
# print("\nüìä Generating quick learning curve...")
# n_estimators_range = [100, 150, 200, 250]  # Reduced range for speed
# train_scores = []
# test_scores = []

# # Use best parameters from fine-tuning, vary only n_estimators
# best_params_fine = grid_search_fine.best_params_
# for n_est in n_estimators_range:
#     rf_temp = RandomForestClassifier(
#         n_estimators=n_est,
#         max_depth=best_params_fine.get('max_depth', 30),
#         min_samples_split=best_params_fine.get('min_samples_split', 10),
#         min_samples_leaf=best_params_fine.get('min_samples_leaf', 4),
#         max_features=best_params_fine.get('max_features', 'log2'),
#         criterion=best_params_fine.get('criterion', 'gini'),
#         bootstrap=best_params_fine.get('bootstrap', True),
#         random_state=42,
#         n_jobs=-1
#     )
#     rf_temp.fit(X_train, y_train)
#     train_scores.append(f1_score(y_train, rf_temp.predict(X_train)))
#     test_scores.append(f1_score(y_test, rf_temp.predict(X_test)))

# plt.figure(figsize=(10, 6))
# plt.plot(n_estimators_range, train_scores, 'o-', label='Training F1', linewidth=2, markersize=8)
# plt.plot(n_estimators_range, test_scores, 's-', label='Test F1', linewidth=2, markersize=8)
# plt.xlabel('Number of Estimators', fontsize=12, fontweight='bold')
# plt.ylabel('F1 Score', fontsize=12, fontweight='bold')
# plt.title('Learning Curve - Impact of n_estimators', fontsize=14, fontweight='bold')
# plt.legend(fontsize=11)
# plt.grid(alpha=0.3)
# plt.tight_layout()
# plt.savefig(f'{VIZ_DIR}/05_learning_curve.png', dpi=300, bbox_inches='tight')
# print("‚úì Saved: Learning Curve")
# plt.close()

# # ============================================================================
# # SAVE FINAL MODEL
# # ============================================================================
# print("\n" + "="*80)
# print("[8] SAVING FINAL MODEL")
# print("="*80)

# final_model = grid_search_fine.best_estimator_
# model_path = 'models_cache/random_forest_final_tuned.pkl'

# with open(model_path, 'wb') as f:
#     pickle.dump({
#         'model': final_model,
#         'best_params': grid_search_fine.best_params_,
#         'best_score': grid_search_fine.best_score_,
#         'test_metrics': fine_metrics,
#         'feature_names': list(X_train.columns)
#     }, f)

# print(f"‚úÖ Model saved to: {model_path}")

# # ============================================================================
# # FINAL SUMMARY
# # ============================================================================
# print("\n" + "="*80)
# print("FINAL SUMMARY")
# print("="*80)

# print("\nüèÜ BEST RANDOM FOREST MODEL:")
# print(f"  Cross-Validation F1: {grid_search_fine.best_score_:.6f}")
# print(f"  Test Set F1:         {fine_metrics['f1_score']:.6f}")
# print(f"  Test Set Accuracy:   {fine_metrics['accuracy']:.6f}")

# print("\nüìã Optimal Hyperparameters:")
# for param, value in grid_search_fine.best_params_.items():
#     print(f"  {param}: {value}")

# print(f"\n‚è±Ô∏è  Total Tuning Time: {(coarse_time + fine_time)/60:.2f} minutes")
# print(f"  - Coarse Search: {coarse_time/60:.2f} minutes")
# print(f"  - Fine Tuning:   {fine_time/60:.2f} minutes")

# print(f"\nüìÅ Outputs:")
# print(f"  Visualizations: {VIZ_DIR}/")
# print(f"  Model file: {model_path}")

# print("\n" + "="*80)
# print("‚úÖ RANDOM FOREST TUNING COMPLETE!")
# print("="*80)
# print(f"End Time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

In [17]:
# rr = RandomForestClassifier(
#     bootstrap=True,
#     criterion="entropy",
#     max_depth=25,
#     max_features=0.4,
#     min_impurity_decrease=0.0,
#     min_samples_leaf=4,
#     min_samples_split=12,
#     n_estimators=100,
#     n_jobs=-1,
#     random_state=42
# )