In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import pandas as pd
import numpy as np

# Train and Test Data Loading

In [25]:
# Read both splits from Excel and switch every column to pandas' best-fitting
# nullable dtype (string / Int64 / ...) in one chained step per split.
train_data = pd.read_excel('../Data/train_data.xlsx').convert_dtypes()
test_data = pd.read_excel('../Data/test_data.xlsx').convert_dtypes()

In [26]:
# Preview the first rows of the training split.
display(train_data.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emits a stray "None" line.
train_data.info()
# Last expression of the cell: (rows, columns) of each split.
train_data.shape, test_data.shape

Unnamed: 0,y,age,job,marital,education,default,balance,housing,loan,contact,day_of_month,month,duration,campaign,pdays,previous,poutcome
0,yes,66,retired,married,secondary,no,2048,no,no,cellular,27,aug,212,1,-1,0,Not Specified
1,no,49,admin.,single,primary,no,181,yes,no,Not Specified,8,may,161,3,-1,0,Not Specified
2,no,41,blue-collar,divorced,primary,no,-129,yes,no,cellular,18,may,176,1,-1,0,Not Specified
3,no,42,Not Specified,single,Not Specified,no,1316,no,no,Not Specified,5,jun,285,1,-1,0,Not Specified
4,no,45,services,married,secondary,no,1621,no,no,cellular,18,aug,122,8,-1,0,Not Specified


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36168 entries, 0 to 36167
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   y             36168 non-null  string
 1   age           36168 non-null  Int64 
 2   job           36168 non-null  string
 3   marital       36168 non-null  string
 4   education     36168 non-null  string
 5   default       36168 non-null  string
 6   balance       36168 non-null  Int64 
 7   housing       36168 non-null  string
 8   loan          36168 non-null  string
 9   contact       36168 non-null  string
 10  day_of_month  36168 non-null  Int64 
 11  month         36168 non-null  string
 12  duration      36168 non-null  Int64 
 13  campaign      36168 non-null  Int64 
 14  pdays         36168 non-null  Int64 
 15  previous      36168 non-null  Int64 
 16  poutcome      36168 non-null  string
dtypes: Int64(7), string(10)
memory usage: 4.9 MB
None


((36168, 17), (9043, 17))

In [27]:
# Columns that convert_dtypes() typed as `string` are treated as categorical.
categorical_cols = train_data.select_dtypes(include='string').columns

# Cast those columns to `category` in both splits (same column list for each).
for split_frame in (train_data, test_data):
    split_frame[categorical_cols] = split_frame[categorical_cols].astype('category')

# Independent copies holding only the categorical columns (target included).
cat_train_data, cat_test_data = (
    split[categorical_cols].copy() for split in (train_data, test_data)
)

In [28]:
# Sanity-check the categorical-only training frame (first five rows).
cat_train_data.head(n=5)

Unnamed: 0,y,job,marital,education,default,housing,loan,contact,month,poutcome
0,yes,retired,married,secondary,no,no,no,cellular,aug,Not Specified
1,no,admin.,single,primary,no,yes,no,Not Specified,may,Not Specified
2,no,blue-collar,divorced,primary,no,yes,no,cellular,may,Not Specified
3,no,Not Specified,single,Not Specified,no,no,no,Not Specified,jun,Not Specified
4,no,services,married,secondary,no,no,no,cellular,aug,Not Specified


# Categorical Modeling

In [29]:
# Name of the response column in the categorical frame.
target_col = "y"
# Every categorical column except the target is a model feature. The target is
# removed by name rather than by slicing off position 0, so the selection stays
# correct if the column order of the input changes (and raises KeyError if the
# target column is missing instead of silently dropping a real feature).
cat_cols = cat_train_data.columns.drop(target_col)

In [30]:
# Evaluation helper
def print_metrics(y_true, y_pred, y_prob):
    """Print a confusion matrix, a classification report and the ROC AUC.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Hard label predictions; used for the confusion matrix and the
        classification report.
    y_prob : array-like
        Scores / probabilities; used only for the ROC AUC.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    matrix = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:\n", matrix)

    # digits=3 -> three decimals in the per-class precision/recall/F1 table.
    report = classification_report(y_true, y_pred, digits=3)
    print("\nClassification Report:\n", report)

    auc = roc_auc_score(y_true, y_prob)
    print(f"ROC AUC: {auc:.4f}")

In [31]:
# One-hot encode categorical features.
# The encoder is fitted on the training split only and then re-used for the
# test split, so both matrices have the same columns in the same order.
# Fitting a second encoder on the test data would derive the column layout
# from the test categories, which silently misaligns the features with the
# model's coefficients whenever the two splits do not contain exactly the same
# category levels.
cat_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
X_train = cat_encoder.fit_transform(cat_train_data[cat_cols])
# Binary target: 1 for "yes", 0 otherwise.
y_train = (cat_train_data['y'] == "yes").astype(int).values

# transform() only: levels unseen during fit become all zeros for that
# feature's indicator columns (handle_unknown='ignore').
X_test = cat_encoder.transform(cat_test_data[cat_cols])
y_test = (cat_test_data['y'] == "yes").astype(int).values

In [32]:
# Fit a logistic regression on the one-hot features. class_weight='balanced'
# re-weights the classes to offset the imbalance between "no" and "yes".
cat_model = LogisticRegression(class_weight='balanced', max_iter=1000).fit(X_train, y_train)

# Score the test split: class-1 probabilities first, then hard labels.
y_prob = cat_model.predict_proba(X_test)[:, 1]
y_pred = cat_model.predict(X_test)
print_metrics(y_test, y_pred, y_prob)

Confusion Matrix:
 [[6324 1662]
 [ 413  644]]

Classification Report:
               precision    recall  f1-score   support

           0      0.939     0.792     0.859      7986
           1      0.279     0.609     0.383      1057

    accuracy                          0.771      9043
   macro avg      0.609     0.701     0.621      9043
weighted avg      0.862     0.771     0.803      9043

ROC AUC: 0.7625


## Continuous Covariates

In [33]:
# Numeric covariates used by the second-stage model.
numeric_cols = [
    "age",
    "balance",
    "day_of_month",
    "duration",
    "campaign",
    "pdays",
    "previous",
]
# Independent copies so that adding columns later does not touch the raw frames.
num_train_data, num_test_data = (
    split[numeric_cols].copy() for split in (train_data, test_data)
)

In [34]:
# Summarise all categorical information as one numeric feature: the class-1
# probability from the categorical logistic regression. Only probabilities are
# needed, so the hard-label predict() calls are dropped, and the evaluation
# variables y_pred / y_prob are no longer reused as scratch names.
# NOTE(review): the training-split probabilities are in-sample (cat_model was
# fitted on these same rows), so cat_prob may look more informative on train
# than on new data. Consider out-of-fold predictions, e.g.
# sklearn.model_selection.cross_val_predict(..., method="predict_proba").
num_train_data['cat_prob'] = cat_model.predict_proba(X_train)[:, 1]
num_test_data['cat_prob'] = cat_model.predict_proba(X_test)[:, 1]

In [35]:
# Check that cat_prob was appended next to the numeric covariates.
num_train_data.head(n=5)

Unnamed: 0,age,balance,day_of_month,duration,campaign,pdays,previous,cat_prob
0,66,2048,27,212,1,-1,0,0.539934
1,49,181,8,161,3,-1,0,0.205093
2,41,-129,18,176,1,-1,0,0.423813
3,42,1316,5,285,1,-1,0,0.387346
4,45,1621,18,122,8,-1,0,0.39686


In [36]:
# Disabled experiment (kept for reference, not executed): RBF-kernel SVM on the
# standardised numeric features + cat_prob, evaluated with print_metrics.
# # Build a pipeline with scaling + SVM
# svm_clf = Pipeline([
#     ("scaler", StandardScaler()),
#     ("svm", SVC(kernel="rbf", probability=True))   # kernel='linear' also possible
# ])

# # Fit the SVM model
# svm_clf.fit(num_train_data, y_train)

# # Predictions
# y_pred = svm_clf.predict(num_test_data)
# y_prob = svm_clf.predict_proba(num_test_data)[:, 1]  # probability of class 1

# # Evaluation
# print_metrics(y_test, y_pred, y_prob)

In [37]:
# Disabled experiment (kept for reference, not executed): successive-halving
# grid search over the SVM's C / gamma, scored on recall. It refers to
# `svm_clf`, which is itself only defined in commented-out code, so that
# definition must be re-enabled first if this cell is ever revived.
# param_grid = {
#     "svm__C": [0.1, 1, 5, 10],
#     "svm__gamma": ["scale", 0.1, 0.01, 0.001],
#     "svm__kernel": ["rbf"]   # you can test 'linear' also
# }

# from sklearn.experimental import enable_halving_search_cv
# from sklearn.model_selection import HalvingGridSearchCV
# grid = HalvingGridSearchCV(
#     estimator=svm_clf,
#     param_grid=param_grid,
#     scoring="recall",
#     cv=5,
#     factor=2,       # reduction speed — 2 is safe
#     n_jobs=-1,
#     verbose=1
# )

# grid.fit(num_train_data, y_train)

# best_svm = grid.best_estimator_

# print("Best Parameters:", grid.best_params_)
# print("Best CV Score:", grid.best_score_)

# # Predict using best model
# y_pred = best_svm.predict(num_test_data)
# y_prob = best_svm.predict_proba(num_test_data)[:, 1]

# # Evaluate
# print_metrics(y_test, y_pred, y_prob)

In [38]:
# Second-stage model: standardise the numeric covariates (plus cat_prob) and
# fit a logistic regression on them. The step names are referenced later via
# named_steps, so they are kept as "scaler" and "logreg".
log_clf = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("logreg", LogisticRegression(max_iter=500)),
    ]
).fit(num_train_data, y_train)

# Score the test split: class-1 probabilities and hard labels.
y_prob = log_clf.predict_proba(num_test_data)[:, 1]
y_pred = log_clf.predict(num_test_data)

# Report confusion matrix, classification report and ROC AUC.
print_metrics(y_test, y_pred, y_prob)

Confusion Matrix:
 [[7810  176]
 [ 670  387]]

Classification Report:
               precision    recall  f1-score   support

           0      0.921     0.978     0.949      7986
           1      0.687     0.366     0.478      1057

    accuracy                          0.906      9043
   macro avg      0.804     0.672     0.713      9043
weighted avg      0.894     0.906     0.894      9043

ROC AUC: 0.9055


In [44]:
# Pull the fitted LogisticRegression step out of the pipeline by its step name.
log_model = log_clf.named_steps["logreg"]

# Take the first (here: only) row of coef_ and the first entry of intercept_.
# The model sits behind the StandardScaler step, so these values refer to the
# standardised features, not the raw units.
coefs, intercept = log_model.coef_[0], log_model.intercept_[0]

print("Intercept:", intercept)
print("Coefficients:", coefs)
# Column order of the frame the pipeline was fitted on, to read the
# coefficients against.
print("Variables:", num_train_data.columns)

Intercept: -2.866828872245475
Coefficients: [ 0.00143299  0.04364774  0.06328045  1.07677325 -0.21980253 -0.00361685
  0.02189016  1.15004277]
Variables: Index(['age', 'balance', 'day_of_month', 'duration', 'campaign', 'pdays',
       'previous', 'cat_prob'],
      dtype='object')
