### Import

In [16]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
import warnings
import joblib
import pickle

%matplotlib inline
warnings.filterwarnings('ignore')

## Loading the dataset

In [17]:
df = pd.read_csv('bank-additional.csv', sep=';')
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,30,blue-collar,married,basic.9y,no,yes,no,cellular,may,fri,...,2,999,0,nonexistent,-1.8,92.893,-46.2,1.313,5099.1,no
1,39,services,single,high.school,no,no,no,telephone,may,fri,...,4,999,0,nonexistent,1.1,93.994,-36.4,4.855,5191.0,no
2,25,services,married,high.school,no,yes,no,telephone,jun,wed,...,1,999,0,nonexistent,1.4,94.465,-41.8,4.962,5228.1,no
3,38,services,married,basic.9y,no,unknown,unknown,telephone,jun,fri,...,3,999,0,nonexistent,1.4,94.465,-41.8,4.959,5228.1,no
4,47,admin.,married,university.degree,no,yes,no,cellular,nov,mon,...,1,999,0,nonexistent,-0.1,93.2,-42.0,4.191,5195.8,no


In [18]:
df.describe()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
count,4119.0,4119.0,4119.0,4119.0,4119.0,4119.0,4119.0,4119.0,4119.0,4119.0
mean,40.11362,256.788055,2.537266,960.42219,0.190337,0.084972,93.579704,-40.499102,3.621356,5166.481695
std,10.313362,254.703736,2.568159,191.922786,0.541788,1.563114,0.579349,4.594578,1.733591,73.667904
min,18.0,0.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.635,4963.6
25%,32.0,103.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.334,5099.1
50%,38.0,181.0,2.0,999.0,0.0,1.1,93.749,-41.8,4.857,5191.0
75%,47.0,317.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1
max,88.0,3643.0,35.0,999.0,6.0,1.4,94.767,-26.9,5.045,5228.1


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4119 entries, 0 to 4118
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             4119 non-null   int64  
 1   job             4119 non-null   object 
 2   marital         4119 non-null   object 
 3   education       4119 non-null   object 
 4   default         4119 non-null   object 
 5   housing         4119 non-null   object 
 6   loan            4119 non-null   object 
 7   contact         4119 non-null   object 
 8   month           4119 non-null   object 
 9   day_of_week     4119 non-null   object 
 10  duration        4119 non-null   int64  
 11  campaign        4119 non-null   int64  
 12  pdays           4119 non-null   int64  
 13  previous        4119 non-null   int64  
 14  poutcome        4119 non-null   object 
 15  emp.var.rate    4119 non-null   float64
 16  cons.price.idx  4119 non-null   float64
 17  cons.conf.idx   4119 non-null   f

## Data Cleaning

In [20]:
df['y'] = df['y'].map({'no': 0, 'yes': 1})

In [21]:
df.apply(lambda x: len(x.unique()))

age                67
job                12
marital             4
education           8
default             3
housing             3
loan                3
contact             2
month              10
day_of_week         5
duration          828
campaign           25
pdays              21
previous            7
poutcome            3
emp.var.rate       10
cons.price.idx     26
cons.conf.idx      26
euribor3m         234
nr.employed        11
y                   2
dtype: int64

In [22]:
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,30,blue-collar,married,basic.9y,no,yes,no,cellular,may,fri,...,2,999,0,nonexistent,-1.8,92.893,-46.2,1.313,5099.1,0
1,39,services,single,high.school,no,no,no,telephone,may,fri,...,4,999,0,nonexistent,1.1,93.994,-36.4,4.855,5191.0,0
2,25,services,married,high.school,no,yes,no,telephone,jun,wed,...,1,999,0,nonexistent,1.4,94.465,-41.8,4.962,5228.1,0
3,38,services,married,basic.9y,no,unknown,unknown,telephone,jun,fri,...,3,999,0,nonexistent,1.4,94.465,-41.8,4.959,5228.1,0
4,47,admin.,married,university.degree,no,yes,no,cellular,nov,mon,...,1,999,0,nonexistent,-0.1,93.2,-42.0,4.191,5195.8,0


In [23]:
unknown_values = ['unknown']
df.replace(unknown_values, np.nan, inplace=True)

In [24]:
df.isnull().sum()

age                 0
job                39
marital            11
education         167
default           803
housing           105
loan              105
contact             0
month               0
day_of_week         0
duration            0
campaign            0
pdays               0
previous            0
poutcome            0
emp.var.rate        0
cons.price.idx      0
cons.conf.idx       0
euribor3m           0
nr.employed         0
y                   0
dtype: int64

In [25]:
categorical_features = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
for feature in categorical_features:
    mode_value = df[feature].mode().values[0]
    df[feature].fillna(mode_value, inplace=True)

In [26]:
print('Value counts in y column:\n', df['y'].value_counts())

Value counts in y column:
 y
0    3668
1     451
Name: count, dtype: int64


## Class Imbalance Handling
### We apply both oversampling and undersampling to balance the target classes

In [27]:

df_majority = df[df.y == 0]
df_minority = df[df.y == 1]

# Oversample: yes (1) Duplicate the class
df_minority_oversampled = resample(df_minority,
                                   replace=True,
                                   n_samples=len(df_majority),
                                   random_state=42)
df_oversampled = pd.concat([df_majority, df_minority_oversampled])
df_oversampled = df_oversampled.sample(frac=1, random_state=42).reset_index(drop=True)

# Undersample: no (0) reduce class
df_majority_undersampled = resample(df_majority,
                                    replace=False,
                                    n_samples=len(df_minority),
                                    random_state=42)
df_undersampled = pd.concat([df_majority_undersampled, df_minority])
df_undersampled = df_undersampled.sample(frac=1, random_state=42).reset_index(drop=True)

In [28]:

def evaluate_models(df, dataset_name):
    X = df.drop('y', axis=1)
    y = df['y']

    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

    categorical_features = X.select_dtypes(include=['object']).columns.tolist()
    numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()

    preprocessor = ColumnTransformer([
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

    models = {
        'Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
        'Random Forest': RandomForestClassifier(random_state=42),
        'Gradient Boosting': GradientBoostingClassifier(random_state=42)
    }

    results = {}
    summary = []

    for name, model in models.items():
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', model)
        ])
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        report = classification_report(y_test, y_pred, output_dict=True)

        results[name] = {
            'accuracy': acc,
            'f1_score': report['1']['f1-score'],
            'pipeline': pipeline
        }

        summary.append({
            'Dataset': dataset_name,
            'Model': name,
            'Accuracy': round(acc, 4),
            'F1 Score (class 1)': round(report['1']['f1-score'], 4)
        })

    best_model_name = max(results, key=lambda name: results[name]['f1_score'])
    best_model = results[best_model_name]

    print(f"📊 Dataset: {dataset_name}")
    print(f"✅ Best Model: {best_model_name}")
    print(f"   - Accuracy: {best_model['accuracy']:.4f}")
    print(f"   - F1 Score (class 1): {best_model['f1_score']:.4f}\n")

    return results, best_model_name, best_model['pipeline'], pd.DataFrame(summary)



## Best Model Saving
### Save the best performing model based

In [29]:
# Evaluate the model for both datasets
oversampled_results, best_name_os, best_pipeline_os, summary_os = evaluate_models(df_oversampled, "Oversampled")
undersampled_results, best_name_us, best_pipeline_us, summary_us = evaluate_models(df_undersampled, "Undersampled")

# Merge all results
combined_summary = pd.concat([summary_os, summary_us], ignore_index=True)

print("📋Performance Summary of All Models: \n")
print(combined_summary)

# Determine the best model and save it
best_f1_os = oversampled_results[best_name_os]['f1_score']
best_f1_us = undersampled_results[best_name_us]['f1_score']

if best_f1_os >= best_f1_us:
    best_model_pipeline = best_pipeline_os
    best_dataset = "Oversampled"
else:
    best_model_pipeline = best_pipeline_us
    best_dataset = "Undersampled"

# Save model
#joblib.dump(best_model_pipeline, "best_model.pkl")
with open('best_model.pkl', 'wb') as f:
    pickle.dump(best_model_pipeline, f)
    
print(f"\n💾 The best model trained on the \"{best_dataset}\" dataset is \"{best_name_os if best_f1_os >= best_f1_us else best_name_us}\" and it has been saved as 'best_model.pkl'.")

📊 Dataset: Oversampled
✅ Best Model: Random Forest
   - Accuracy: 0.9728
   - F1 Score (class 1): 0.9735

📊 Dataset: Undersampled
✅ Best Model: Gradient Boosting
   - Accuracy: 0.8674
   - F1 Score (class 1): 0.8667

📋Performance Summary of All Models: 

        Dataset                Model  Accuracy  F1 Score (class 1)
0   Oversampled  Logistic Regression    0.8658              0.8675
1   Oversampled        Random Forest    0.9728              0.9735
2   Oversampled    Gradient Boosting    0.9101              0.9144
3  Undersampled  Logistic Regression    0.8398              0.8343
4  Undersampled        Random Forest    0.8619              0.8634
5  Undersampled    Gradient Boosting    0.8674              0.8667

💾 The best model trained on the "Oversampled" dataset is "Random Forest" and it has been saved as 'best_model.pkl'.


## Final Prediction Test
### Predict using saved model and a new input

In [30]:

# Input variables and their values
feature_names = ['age', 'job', 'marital', 'education', 'default', 'housing', 
                 'loan', 'contact', 'month', 'day_of_week', 'duration', 'campaign', 
                 'pdays', 'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx', 
                 'cons.conf.idx', 'euribor3m', 'nr.employed']

input_data = (50,'blue-collar','married','basic.4y','no','no','yes','cellular','jul','tue', 849,1,999,0,'nonexistent',1.4,93.918,-42.7,4.961,5228.1)
#input_data = (39,'services','single','high.school','no','no','no','telephone','may','fri',346,4,999,0,'nonexistent',1.1,93.994,-36.4,4.855,5191)

# 1. Load the model
best_model = joblib.load("best_model.pkl")

# 2. Convert data to DataFrame
input_data_df = pd.DataFrame([input_data], columns=feature_names)

# 3. Preprocessing is applied in the pipeline and prediction is done
prediction = best_model.predict(input_data_df)[0]

# 4. Print the result
prediction_label = "yes" if prediction == 1 else "no"
print("Prediction:", prediction_label)


Prediction: yes
