In [2]:
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import OrdinalEncoder
from sklearn.utils import resample
from sklearn.preprocessing import OrdinalEncoder
from sklearn.utils import resample
import pandas as pd

from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
bank_marketing = fetch_ucirepo(id=222) 
  
# data (as pandas dataframes) 
x = bank_marketing.data.features 
y = bank_marketing.data.targets 




def prepare_data(x,y):
  # Get list of numeric columns
  numeric_columns = x.select_dtypes(include=['number']).columns.tolist()
  # Get list of categorical (non-numeric) columns
  categorical_columns = x.select_dtypes(exclude=['number']).columns.tolist()
  x_cat = x[categorical_columns]
  x_num = x[numeric_columns]

  #Categorical Columns Preprocessing
  x_cat_dfs = []

  # 1. Binary columns
  binary_cols = ['default', 'housing', 'loan']
  x_bin = x_cat[binary_cols]
  x_bin = x_bin.replace({'yes': 1, 'no': 0})
  x_cat_dfs.append(x_bin)

  # 2. job column
  x_job = x_cat['job']
  x_job = x_job.fillna('no job data')
  x_job = pd.get_dummies(x_job, columns=['job'], prefix='job')
  x_cat_dfs.append(x_job)

  #3. Marital Column
  x_mar = x_cat['marital']
  x_mar = pd.get_dummies(x_mar, columns=['marital'], prefix='marital')
  x_cat_dfs.append(x_mar)

  #4 Education Column
  x_ed = x_cat["education"]
  x_ed = x_ed.fillna("No education data")
  x_ed = x_ed.replace({'No education data':0, 'primary':1, 'secondary':2, 'tertiary':3})
  x_cat_dfs.append(x_ed)

  # 5. Contact Column
  x_con = x_cat['contact']
  x_con = x_con.fillna('No contact data')
  x_con = pd.get_dummies(x_con, columns=['contact'], prefix='contact')
  x_cat_dfs.append(x_con)

  #6 Month Column
  x_mon = x_cat['month']
  x_mon = x_mon.replace({'may':5, 'jun':6, 'jul':7, 'aug':8, 'oct':10, 'nov':11, 'dec':12, 'jan':1, 'feb':2,'mar':3, 'apr':4, 'sep':9})
  x_cat_dfs.append(x_mon)
  # Final categorical columns dataframe
  x_cat_final = pd.concat(x_cat_dfs,axis=1)

  # Numerical Columns Preprocessing


  x_num_final = x_num
  #Final DF:
  x = pd.concat([x_cat_final, x_num_final],axis=1)
  x.isna().sum()

  # Changing the target column
  y = y.replace({'no': 0, 'yes': 1})
  del x['default']


  # Combine x and y into one DataFrame
  df = pd.concat([x, y], axis=1)

  # Separate classes
  df_majority = df[df['y'] == 0]
  df_minority = df[df['y'] == 1]

  # Upsample minority
  df_minority_upsampled = resample(df_minority, replace=True, random_state = 1,
                                  n_samples=len(df_majority),  # match class 0
                                   )

  # Combine and shuffle
  df_upsampled = pd.concat([df_majority, df_minority_upsampled]).sample(frac=1)

  # Split again
  x_ups = df_upsampled.drop('y', axis=1)
  y_ups = df_upsampled['y']
  x_ups = x_ups.replace({True:1,False:0})
  return x_ups,y_ups

x_ups,y_ups = prepare_data(x,y)

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, precision_score, recall_score, f1_score

xtrain,xtest,ytrain,ytest = train_test_split(x_ups,y_ups,test_size=0.25,random_state=20)

model = LogisticRegression()
model.fit(xtrain,ytrain)
ypred = model.predict(xtest)
print(accuracy_score(ytest,ypred))
print(precision_score(ytest,ypred))
print(recall_score(ytest,ypred))
print(f1_score(ytest,ypred))
print(confusion_matrix(ytest,ypred))

0.7679975953108562
0.7743506493506493
0.7602350831756151
0.7672279467202815
[[7698 2224]
 [2407 7632]]


In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(max_iter=1000))
])
param_grids = [
    {
        "logreg__solver": ["lbfgs", "newton-cg", "newton-cholesky", "sag"],
        "logreg__penalty": ["l2", "none"],
        "logreg__C": [0.001, 0.01, 0.1, 1, 10],
        "logreg__fit_intercept": [True, False],
        "logreg__class_weight": [None, "balanced"],
        "logreg__tol": [1e-4, 1e-3],
    },
    {
        "logreg__solver": ["liblinear"],
        "logreg__penalty": ["l1", "l2"],
        "logreg__C": [0.001, 0.01, 0.1, 1, 10],
        "logreg__fit_intercept": [True, False],
        "logreg__class_weight": [None, "balanced"],
        "logreg__tol": [1e-4, 1e-3],
    },
    {
        "logreg__solver": ["saga"],
        "logreg__penalty": ["l1", "l2", "elasticnet", "none"],
        "logreg__C": [0.001, 0.01, 0.1, 1, 10],
        "logreg__l1_ratio": [None, 0.1, 0.5, 0.9, 1.0],
        "logreg__fit_intercept": [True, False],
        "logreg__class_weight": [None, "balanced"],
        "logreg__tol": [1e-4, 1e-3],
    }
]
logreg = LogisticRegression()
scoring = {
    'accuracy': 'accuracy',
    'f1': 'f1',
    'precision': 'precision',
    'recall': 'recall',
    'balanced_accuracy': 'balanced_accuracy'
}
# Grid search
grid_search = GridSearchCV(
    pipe,
    param_grid=param_grids,
    cv=5,
    scoring=scoring,
    refit='f1',   # model refit based on F1 score
    n_jobs=-1,
    verbose=2
)
grid_search.fit(xtrain, ytrain)
results_df = pd.DataFrame(grid_search.cv_results_)
metrics_cols = [
    'mean_test_accuracy', 'mean_test_f1', 'mean_test_precision',
    'mean_test_recall', 'mean_test_balanced_accuracy'
]
param_cols = [col for col in results_df.columns if col.startswith('param_')]

final_results_df = results_df[param_cols + metrics_cols].sort_values('mean_test_f1', ascending=False).reset_index(drop=True)



Fitting 5 folds for each of 1200 candidates, totalling 6000 fits


In [6]:
final_results_df

Unnamed: 0,param_logreg__C,param_logreg__class_weight,param_logreg__fit_intercept,param_logreg__penalty,param_logreg__solver,param_logreg__tol,param_logreg__l1_ratio,mean_test_accuracy,mean_test_f1,mean_test_precision,mean_test_recall,mean_test_balanced_accuracy
0,0.1,balanced,True,elasticnet,saga,0.0010,1.0,0.808226,0.807622,0.808607,0.806646,0.808223
1,0.1,balanced,True,l1,saga,0.0010,0.1,0.808193,0.807594,0.808552,0.806646,0.808190
2,1.0,balanced,True,elasticnet,saga,0.0010,0.5,0.808159,0.807580,0.808455,0.806713,0.808156
3,10.0,balanced,True,l1,saga,0.0010,,0.808142,0.807567,0.808429,0.806713,0.808140
4,10.0,balanced,True,l1,saga,0.0010,0.1,0.808142,0.807560,0.808449,0.806679,0.808140
...,...,...,...,...,...,...,...,...,...,...,...,...
1195,10.0,balanced,False,none,saga,0.0010,0.5,,,,,
1196,10.0,balanced,False,none,saga,0.0001,0.9,,,,,
1197,10.0,balanced,False,none,saga,0.0010,0.9,,,,,
1198,10.0,balanced,False,none,saga,0.0001,1.0,,,,,


In [7]:
final_results_df.sort_values(by="mean_test_balanced_accuracy",ascending=False)

Unnamed: 0,param_logreg__C,param_logreg__class_weight,param_logreg__fit_intercept,param_logreg__penalty,param_logreg__solver,param_logreg__tol,param_logreg__l1_ratio,mean_test_accuracy,mean_test_f1,mean_test_precision,mean_test_recall,mean_test_balanced_accuracy
118,0.01,balanced,True,elasticnet,saga,0.0001,0.9,0.808243,0.807368,0.809478,0.805274,0.808237
0,0.10,balanced,True,elasticnet,saga,0.0010,1.0,0.808226,0.807622,0.808607,0.806646,0.808223
1,0.10,balanced,True,l1,saga,0.0010,0.1,0.808193,0.807594,0.808552,0.806646,0.808190
69,0.01,balanced,True,elasticnet,saga,0.0001,0.5,0.808193,0.807445,0.809024,0.805876,0.808188
138,0.01,balanced,True,l1,saga,0.0001,0.9,0.808176,0.807308,0.809391,0.805240,0.808170
...,...,...,...,...,...,...,...,...,...,...,...,...
1195,10.00,balanced,False,none,saga,0.0010,0.5,,,,,
1196,10.00,balanced,False,none,saga,0.0001,0.9,,,,,
1197,10.00,balanced,False,none,saga,0.0010,0.9,,,,,
1198,10.00,balanced,False,none,saga,0.0001,1.0,,,,,
