# Cross-Validation and Hyperparameter Tuning for All Models

In [3]:
import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

In [5]:
# Load the preprocessed bundle, then unpack it into the train/test splits and
# the scaler object. The file is expected to hold exactly five items.
# NOTE: joblib files are pickles -- only load files from a trusted source.
preprocessed_bundle = joblib.load("data_preprocessed.pkl")
X_train, X_test, y_train, y_test, scaler = preprocessed_bundle

In [7]:
# Announce the start of the grid-search stage.
start_banner = "\n🔍 Performing GridSearchCV on all models with 5-fold cross-validation...\n"
print(start_banner)


🔍 Performing GridSearchCV on all models with 5-fold cross-validation...



In [9]:
# 1. Logistic Regression
# Search space: four values of the inverse regularisation strength C,
# with the solver fixed to liblinear.
param_grid_lr = dict(
    C=[0.01, 0.1, 1, 10],
    solver=['liblinear'],
)

In [11]:
# Grid search for logistic regression: 5-fold CV, model selection by F1.
# random_state is pinned because scikit-learn documents it as used by the
# liblinear solver (the only solver in param_grid_lr); this also matches the
# seed used for the tree and forest estimators in this notebook.
grid_lr = GridSearchCV(
    estimator=LogisticRegression(random_state=42),
    param_grid=param_grid_lr,
    scoring='f1',
    cv=5,
    verbose=1,
)
grid_lr.fit(X_train, y_train)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


In [13]:
# 2. Decision Tree
# Search space: four depth limits crossed with two split criteria (8 candidates).
param_grid_dt = dict(
    max_depth=[3, 5, 7, 10],
    criterion=['gini', 'entropy'],
)

In [15]:
# Grid search for the decision tree: 5-fold CV, model selection by F1.
# The tree is seeded so repeated runs give the same result.
grid_dt = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid_dt,
    scoring='f1',
    cv=5,
    verbose=1,
)
grid_dt.fit(X_train, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [17]:
# 3. Random Forest
# Search space: forest size x depth limit x split criterion (3 * 3 * 2 = 18 candidates).
param_grid_rf = dict(
    n_estimators=[50, 100, 150],
    max_depth=[5, 10, 15],
    criterion=['gini', 'entropy'],
)

In [19]:
# Grid search for the random forest: 5-fold CV, model selection by F1.
# The forest is seeded so repeated runs give the same result.
grid_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid_rf,
    scoring='f1',
    cv=5,
    verbose=1,
)
grid_rf.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [20]:
# 4. SVM
# Search space: three values of C crossed with two gamma settings,
# with the kernel fixed to RBF (6 candidates).
param_grid_svm = dict(
    C=[0.1, 1, 10],
    gamma=['scale', 'auto'],
    kernel=['rbf'],
)

In [21]:
# Grid search for the SVM: 5-fold CV, model selection by F1.
# probability=True makes SVC fit an extra, internally cross-validated
# probability calibration, which shuffles the data randomly. Pinning
# random_state keeps those probability estimates reproducible across runs
# and matches the seed used for the tree and forest estimators.
grid_svm = GridSearchCV(
    estimator=SVC(probability=True, random_state=42),
    param_grid=param_grid_svm,
    scoring='f1',
    cv=5,
    verbose=1,
)
grid_svm.fit(X_train, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


In [25]:
# Collect the best estimator found by each grid search, keyed by the display
# name used when reporting test-set metrics.
tuned_searches = [
    ("Logistic Regression", grid_lr),
    ("Decision Tree", grid_dt),
    ("Random Forest", grid_rf),
    ("SVM", grid_svm),
]
models = {label: search.best_estimator_ for label, search in tuned_searches}

In [27]:
# Mark the end of the grid-search stage.
done_banner = "\n✅ Cross-validation complete.\n"
print(done_banner)


✅ Cross-validation complete.



In [29]:
# Report held-out test metrics for every tuned model.
# Each entry pairs the printed label with the scikit-learn scorer that
# produces it; the order here is the order of the printed lines.
metric_fns = (
    ("Accuracy", accuracy_score),
    ("Precision", precision_score),
    ("Recall", recall_score),
    ("F1 Score", f1_score),
)

for name, model in models.items():
    print(f"\n📊 {name} Evaluation:")
    y_pred = model.predict(X_test)
    for metric_label, metric_fn in metric_fns:
        print(f"{metric_label}: {metric_fn(y_test, y_pred):.4f}")


📊 Logistic Regression Evaluation:
Accuracy: 0.9839
Precision: 0.9836
Recall: 1.0000
F1 Score: 0.9917

📊 Decision Tree Evaluation:
Accuracy: 0.9355
Precision: 0.9828
Recall: 0.9500
F1 Score: 0.9661

📊 Random Forest Evaluation:
Accuracy: 0.9677
Precision: 0.9833
Recall: 0.9833
F1 Score: 0.9833

📊 SVM Evaluation:
Accuracy: 0.9677
Precision: 0.9833
Recall: 0.9833
F1 Score: 0.9833


In [31]:
# Save best models
# Each tuned estimator is written to its own file in the working directory.
best_model_files = {
    'logistic_regression_best.pkl': grid_lr.best_estimator_,
    'decision_tree_best.pkl': grid_dt.best_estimator_,
    'random_forest_best.pkl': grid_rf.best_estimator_,
    'svm_best.pkl': grid_svm.best_estimator_,
}
written_paths = [
    joblib.dump(estimator, filename)
    for filename, estimator in best_model_files.items()
]
# joblib.dump returns the list of files it wrote; leave the last one as the
# cell's result so the notebook shows the same output as before.
written_paths[-1]

['svm_best.pkl']