In [None]:
import os

import pandas as pd
import numpy as np
import shap
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from catboost import CatBoostClassifier

# Load Dataset
#
# The CSV location can be overridden through the BANK_DATA_PATH environment
# variable so the notebook is runnable on machines other than the author's.
# When the variable is unset, the original local path is used unchanged.
DATA_PATH = os.environ.get(
    "BANK_DATA_PATH",
    r"C:\Users\Shruthilaya\GUVI\data\bank_prediction\train.csv",
)

data = pd.read_csv(DATA_PATH)

print("Shape of dataset:", data.shape)
print("Target variable distribution:\n", data["y"].value_counts())

# Features are every column except the target column "y".
X = data.drop("y", axis=1)

# Encode the target as binary: "yes" -> 1, "no" -> 0.
# Series.map yields NaN for any label missing from the mapping, so validate
# here and report the offending labels instead of failing later in the split
# or during model fitting with a less helpful message.
TARGET_MAP = {"yes": 1, "no": 0}
y = data["y"].map(TARGET_MAP)
if y.isna().any():
    unexpected = sorted(data.loc[y.isna(), "y"].astype(str).unique())
    raise ValueError(f"Unexpected values in target column 'y': {unexpected}")

# Train-test split
#
# Hold out 20% of the rows for evaluation. The split is seeded for
# repeatability and stratified on the target so both partitions keep the
# same class proportions.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Preprocessing
#
# Numeric columns are standardised; every other column is one-hot encoded.
# The two groups are defined as "number" and "not number" so that together
# they cover all feature columns. ColumnTransformer drops any column that is
# not listed (remainder="drop" is its default), so selecting only specific
# dtypes such as int64/float64/object would silently discard columns stored
# as, e.g., int32, float32, category or bool.
numeric_features = X.select_dtypes(include="number").columns
categorical_features = X.select_dtypes(exclude="number").columns

numeric_transformer = StandardScaler()
# handle_unknown="ignore": categories that appear only at prediction time are
# encoded as all-zero rows instead of raising an error.
categorical_transformer = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Models
#
# Candidate classifiers keyed by the display name used in the printed report.
# Every estimator that accepts a seed shares RANDOM_STATE so reruns are
# comparable across models.
RANDOM_STATE = 42

models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
    # `use_label_encoder` is intentionally not passed: the XGBoost version this
    # notebook was run with reports it as an unused parameter and emits a
    # warning on every fit. The target is already encoded as 0/1.
    "XGBoost": xgb.XGBClassifier(eval_metric="logloss", random_state=RANDOM_STATE),
    "CatBoost": CatBoostClassifier(verbose=0, random_state=RANDOM_STATE),
}

# Training and Evaluation
#
# Each candidate model is wrapped in a pipeline behind the shared
# preprocessor, fitted on the training split and scored on the held-out
# split. `results` maps the model's display name to its test accuracy.
results = {}

for name, model in models.items():
    print(f"\nTraining {name}...")

    clf = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", model)])
    # Pipeline.fit returns the pipeline itself, so fit and predict are chained.
    y_pred = clf.fit(X_train, y_train).predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    results.update({name: acc})

    print(f"{name} Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))

print("\nSummary of Accuracies:", results)

Shape of dataset: (45211, 17)
Target variable distribution:
 y
no     39922
yes     5289
Name: count, dtype: int64

🚀 Training Logistic Regression...
Logistic Regression Accuracy: 0.9012
              precision    recall  f1-score   support

           0       0.92      0.97      0.95      7985
           1       0.64      0.35      0.45      1058

    accuracy                           0.90      9043
   macro avg       0.78      0.66      0.70      9043
weighted avg       0.89      0.90      0.89      9043


🚀 Training Random Forest...
Random Forest Accuracy: 0.9045
              precision    recall  f1-score   support

           0       0.92      0.97      0.95      7985
           1       0.65      0.40      0.49      1058

    accuracy                           0.90      9043
   macro avg       0.79      0.68      0.72      9043
weighted avg       0.89      0.90      0.89      9043


🚀 Training XGBoost...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Accuracy: 0.9055
              precision    recall  f1-score   support

           0       0.93      0.96      0.95      7985
           1       0.63      0.47      0.54      1058

    accuracy                           0.91      9043
   macro avg       0.78      0.72      0.74      9043
weighted avg       0.90      0.91      0.90      9043


🚀 Training CatBoost...
CatBoost Accuracy: 0.9106
              precision    recall  f1-score   support

           0       0.94      0.97      0.95      7985
           1       0.66      0.50      0.57      1058

    accuracy                           0.91      9043
   macro avg       0.80      0.73      0.76      9043
weighted avg       0.90      0.91      0.91      9043


✅ Summary of Accuracies: {'Logistic Regression': 0.901249585314608, 'Random Forest': 0.9044564856795311, 'XGBoost': 0.9054517306203693, 'CatBoost': 0.9106491208669689}
