In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Train and Test Data Loading

In [None]:
# Read both workbooks; convert_dtypes() is applied straight away so each column
# gets the best-fitting pandas extension dtype before any feature work happens.
train_data = pd.read_excel('../Data/train_data.xlsx').convert_dtypes()
test_data = pd.read_excel('../Data/test_data.xlsx').convert_dtypes()

In [3]:
def add_log_features(frame):
    """Return a copy of ``frame`` with log-compressed ``balance`` and ``previous`` columns.

    Two columns are appended; the input frame itself is not modified:

    * ``log_balance``  -- ``sign(balance) * log1p(|balance|)``, a signed log that
      stays defined for negative and zero balances.
    * ``log_previous`` -- ``log1p(previous)``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``balance`` and ``previous`` columns.
        Assumes both are numeric and ``previous`` is > -1 -- TODO confirm.

    Returns
    -------
    pandas.DataFrame
        New frame with ``log_balance`` and ``log_previous`` added at the end.
    """
    balance = frame["balance"]
    return frame.assign(
        log_balance=np.sign(balance) * np.log1p(np.abs(balance)),
        log_previous=np.log1p(frame["previous"]),
    )


# One helper for both splits, so the train and test transformations cannot drift apart.
train_data = add_log_features(train_data)
test_data = add_log_features(test_data)

In [6]:
# Display the column labels currently on train_data.
# NOTE(review): the recorded output below has no 'balance' or 'previous', although the
# feature-engineering cell reads both -- presumably they were dropped in a cell that is
# not shown here (execution counts jump from 3 to 6); confirm the notebook still passes
# Restart Kernel -> Run All.
train_data.columns

Index(['y', 'age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'day_of_month', 'month', 'duration', 'campaign', 'pdays',
       'poutcome', 'log_balance', 'log_previous'],
      dtype='object')

In [7]:
# Evaluation helper
def print_metrics(y_true, y_pred, y_prob=None):
    """Print a confusion matrix and a classification report, plus ROC AUC if scores are given.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, compared against ``y_true``.
    y_prob : array-like, optional
        Scores handed to ``roc_auc_score``. When None (the default) the
        ROC AUC line is not printed.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    matrix = confusion_matrix(y_true, y_pred)
    print("Confusion Matrix:\n", matrix)

    # digits=3 -> the report is formatted with three decimal places.
    report = classification_report(y_true, y_pred, digits=3)
    print("\nClassification Report:\n", report)

    if y_prob is None:
        return
    print(f"ROC AUC: {roc_auc_score(y_true, y_prob):.4f}")
        
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# --- 1. Feature groups -------------------------------------------------------
# Numeric columns go through the scaler, categorical columns through the encoder.
num_cols = ["age", "log_balance", "pdays", "log_previous"]
cat_cols = [
    "job",
    "marital",
    "education",
    "default",
    "housing",
    "loan",
    "contact",
    "month",
    "poutcome",
]

# --- 2. Preprocessing: standardize numerics, one-hot encode categoricals -----
# handle_unknown="ignore" keeps transform() from raising on categories that were
# not present when the encoder was fitted.
preprocess = ColumnTransformer(
    [
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
    ]
)

# --- 3. Full pipeline: preprocessing followed by logistic regression ---------
# class_weight="balanced" asks scikit-learn to weight classes inversely to their
# frequency in the training labels.
model = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("logreg", LogisticRegression(max_iter=500, class_weight="balanced")),
    ]
)

# --- 4. Feature matrices and integer targets ---------------------------------
# Both splits use the same ordered feature list; the target is encoded as an
# integer that is 1 where the 'y' column equals 'yes'.
feature_cols = num_cols + cat_cols

X_train = train_data[feature_cols]
y_train = train_data['y'].eq('yes').astype(int)

X_test = test_data[feature_cols]
y_test = test_data['y'].eq('yes').astype(int)

# -------------------------------
# 5. Fit model
# -------------------------------
model.fit(X_train, y_train)

# -------------------------------
# 6. Predict on test
# -------------------------------
y_pred = model.predict(X_test)
# The targets are encoded as 0/1, so column 1 of predict_proba is the
# probability of the positive class (label 1).
y_prob = model.predict_proba(X_test)[:, 1]

# Evaluate. y_prob is passed along so print_metrics also reports the ROC AUC;
# without it the AUC line is skipped and y_prob goes unused.
print_metrics(y_test, y_pred, y_prob)

Confusion Matrix:
 [[6261 1725]
 [ 404  653]]

Classification Report:
               precision    recall  f1-score   support

           0      0.939     0.784     0.855      7986
           1      0.275     0.618     0.380      1057

    accuracy                          0.765      9043
   macro avg      0.607     0.701     0.617      9043
weighted avg      0.862     0.765     0.799      9043

