In [1]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# 1. Load dataset
# Read the feature table; the "num" column holds the label, everything else
# is used as a predictor.
df = pd.read_csv("../data/selected_features.csv")
X = df.drop(columns=["num"])
# Collapse the label to binary: any value above zero becomes 1, otherwise 0.
y = df["num"].gt(0).astype(int)

# Hold out 20% of the rows for testing. The split is stratified on the binary
# label and uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# 2. Define models & parameter grids
# Hyper-parameter candidates per model. Every key here must also exist in
# `models` below, because the tuning loop looks the grid up by model name.
param_grids = {
    "Logistic Regression": {
        "C": [0.01, 0.1, 1, 10],
        "solver": ["liblinear", "lbfgs"]
    },
    "Decision Tree": {
        "max_depth": [3, 5, 7, None],
        "min_samples_split": [2, 5, 10]
    },
    "Random Forest": {
        "n_estimators": [50, 100, 200],
        "max_depth": [5, 10, None],
        "min_samples_split": [2, 5, 10]
    },
    "SVM": {
        "C": [0.1, 1, 10],
        "kernel": ["linear", "rbf"],
        # gamma has no effect on the linear kernel, so the linear candidates
        # are fitted once per gamma value with identical results.
        "gamma": ["scale", "auto"]
    }
}

# Base estimators to tune. Each one is seeded with the same value used for the
# train/test split, so a fresh-kernel re-run reproduces the same search results
# and the same saved model. Without a seed, the tree-based models, the
# liblinear solver, and SVC's probability calibration all draw from the global
# RNG and can change between runs.
# NOTE(review): Logistic Regression and SVM are sensitive to feature scale and
# no scaling is applied in this cell — confirm the CSV is already standardized.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(probability=True, random_state=42)
}

# 3. Run GridSearchCV for each model
# For every candidate model, run a 5-fold grid search on the training split
# only, and remember both the refit best estimator and its mean CV accuracy.
best_models = {}
cv_scores = {}
for name, model in models.items():
    print(f"\nTuning {name}...")
    grid = GridSearchCV(model, param_grids[name], cv=5, scoring="accuracy", n_jobs=-1)
    grid.fit(X_train, y_train)

    print("Best parameters:", grid.best_params_)
    print("Best CV score:", grid.best_score_)

    # Report hold-out performance. This is for information only: the test set
    # must not influence which model is chosen.
    y_pred = grid.predict(X_test)
    print("Test set performance:")
    print(classification_report(y_test, y_pred))

    best_models[name] = grid.best_estimator_
    cv_scores[name] = grid.best_score_

# Select the model with the highest cross-validated accuracy. Choosing by test
# accuracy would leak the hold-out set into model selection and make the test
# metrics printed above optimistically biased.
best_model_name = max(cv_scores, key=cv_scores.get)
final_model = best_models[best_model_name]

# Create the output directory first so a missing folder does not discard the
# whole search at the final step.
model_path = "../models/final_model.pkl"
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
joblib.dump(final_model, model_path)
print(f"\nFinal best model saved as final_model.pkl ({best_model_name})")



Tuning Logistic Regression...
Best parameters: {'C': 1, 'solver': 'lbfgs'}
Best CV score: 0.8511054421768707
Test set performance:
              precision    recall  f1-score   support

           0       0.93      0.85      0.89        33
           1       0.84      0.93      0.88        28

    accuracy                           0.89        61
   macro avg       0.89      0.89      0.89        61
weighted avg       0.89      0.89      0.89        61


Tuning Decision Tree...
Best parameters: {'max_depth': 3, 'min_samples_split': 2}
Best CV score: 0.7850340136054421
Test set performance:
              precision    recall  f1-score   support

           0       0.82      0.85      0.84        33
           1       0.81      0.79      0.80        28

    accuracy                           0.82        61
   macro avg       0.82      0.82      0.82        61
weighted avg       0.82      0.82      0.82        61


Tuning Random Forest...
Best parameters: {'max_depth': 10, 'min_samples_sp