In [1]:
# Import Libraries
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
import joblib
import os

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")


In [2]:
# Load Preprocessed Data
X_train = pd.read_csv("X_train.csv")
X_test = pd.read_csv("X_test.csv")
y_train = pd.read_csv("y_train.csv").values.ravel()
y_test = pd.read_csv("y_test.csv").values.ravel()

In [3]:
# Define Models and Param Grids
models = [
    ('LogisticRegression', LogisticRegression(), {
        'C': [0.1, 1, 10],
        'solver': ['liblinear']
    }),
    ('RandomForest', RandomForestClassifier(), {
        'n_estimators': [50, 100],
        'max_depth': [4, 8, 16]
    }),
    ('SVM', SVC(probability=True), {
        'C': [0.1, 1, 10],
        'kernel': ['rbf', 'linear']
    }),
    ('KNN', KNeighborsClassifier(), {
        'n_neighbors': [3, 5, 7]
    })
]

In [5]:
# Train & Tune Models Using GridSearchCV
best_model = None
best_score = 0
best_model_name = ""
results = []

for name, model, params in models:
    print(f"Training {name}...")
    grid = GridSearchCV(model, params, cv=5, scoring='accuracy')
    grid.fit(X_train, y_train)
    
    y_pred = grid.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    print(f"{name} Accuracy: {score:.4f}")
    print(classification_report(y_test, y_pred))

    results.append((name, score, grid.best_params_))
    
    if score > best_score:
        best_score = score
        best_model = grid.best_estimator_
        best_model_name = name

Training LogisticRegression...
LogisticRegression Accuracy: 0.8696
              precision    recall  f1-score   support

           0       0.90      0.79      0.84        82
           1       0.85      0.93      0.89       102

    accuracy                           0.87       184
   macro avg       0.88      0.86      0.87       184
weighted avg       0.87      0.87      0.87       184

Training RandomForest...
RandomForest Accuracy: 0.9022
              precision    recall  f1-score   support

           0       0.91      0.87      0.89        82
           1       0.90      0.93      0.91       102

    accuracy                           0.90       184
   macro avg       0.90      0.90      0.90       184
weighted avg       0.90      0.90      0.90       184

Training SVM...
SVM Accuracy: 0.8913
              precision    recall  f1-score   support

           0       0.92      0.83      0.87        82
           1       0.87      0.94      0.91       102

    accuracy           

In [6]:
# Save Best Model
# Ensure directory exists
os.makedirs("models", exist_ok=True)

joblib.dump(best_model, "models/best_model.pkl")
print(f"Best model ({best_model_name}) saved with accuracy {best_score:.4f}")

Best model (RandomForest) saved with accuracy 0.9022


In [8]:
# Print Summary of All Models
results_df = pd.DataFrame(results, columns=['Model', 'Accuracy', 'Best Params'])
print("\n Model Comparison:")
print(results_df.sort_values(by='Accuracy', ascending=False))


 Model Comparison:
                Model  Accuracy                           Best Params
1        RandomForest  0.902174  {'max_depth': 4, 'n_estimators': 50}
2                 SVM  0.891304             {'C': 1, 'kernel': 'rbf'}
0  LogisticRegression  0.869565      {'C': 10, 'solver': 'liblinear'}
3                 KNN  0.869565                    {'n_neighbors': 5}
