In [5]:
from pathlib import Path

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# Seed for the stochastic parts of this cell so that re-running it yields the same scores.
RANDOM_STATE = 42

#Load Data splits
# NOTE(review): joblib.load unpickles the file, so only point it at trusted artifacts.
# The stored object is unpacked as (X_train, X_test, y_train, y_test), in that order.
X_train, X_test, y_train, y_test = joblib.load("../data/processed/train_test_split.pkl")

#Logistic Regression Grid
log_reg_grid = {
    "C":[0.01,0.1,1,10], # inverse of regularization strength (smaller C = stronger penalty)
    "penalty":["l2"], # 'l1' can be used with saga solver
    "solver":["lbfgs","saga"]
}

# random_state pins the data shuffling performed by the saga solver, which would
# otherwise make the cross-validation scores vary slightly between runs.
log_reg = LogisticRegression(max_iter=1000, random_state=RANDOM_STATE)
# scoring is spelled out so the metric matches the "accuracy" label printed below;
# it is the same metric GridSearchCV falls back to for this classifier by default.
grid_search = GridSearchCV(log_reg, log_reg_grid, scoring="accuracy", cv=5, n_jobs=-1)
grid_search.fit(X_train,y_train)

print("Best Logistic Regression params:", grid_search.best_params_)
print("Best CV accuracy: ", grid_search.best_score_)

#Evaluate on test
# grid_search.predict delegates to the best estimator found by the search
# (refit on the full training split, which is GridSearchCV's default behavior).
y_pred = grid_search.predict(X_test)
print("Test Accuracy (Logistic): ",accuracy_score(y_test, y_pred))

Best Logistic Regression params: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
Best CV accuracy:  0.7440766073871409
Test Accuracy (Logistic):  0.7663551401869159


In [7]:
#Save the best estimator from grid search
model_path = "../models/best_model.pkl"
# Create the target directory first so the dump does not fail with
# FileNotFoundError when the notebook runs on a fresh checkout.
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
# Kept as the last expression so the cell still displays the list of written files.
joblib.dump(grid_search.best_estimator_, model_path)

['../models/best_model.pkl']