In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('bank.csv')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,2,-1,0,unknown,yes


In [3]:
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'deposit'],
      dtype='object')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11162 entries, 0 to 11161
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        11162 non-null  int64 
 1   job        11162 non-null  object
 2   marital    11162 non-null  object
 3   education  11162 non-null  object
 4   default    11162 non-null  object
 5   balance    11162 non-null  int64 
 6   housing    11162 non-null  object
 7   loan       11162 non-null  object
 8   contact    11162 non-null  object
 9   day        11162 non-null  int64 
 10  month      11162 non-null  object
 11  duration   11162 non-null  int64 
 12  campaign   11162 non-null  int64 
 13  pdays      11162 non-null  int64 
 14  previous   11162 non-null  int64 
 15  poutcome   11162 non-null  object
 16  deposit    11162 non-null  object
dtypes: int64(7), object(10)
memory usage: 1.4+ MB


In [6]:
df.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
deposit      0
dtype: int64

### feature engineering

In [7]:
df.deposit.value_counts()

no     5873
yes    5289
Name: deposit, dtype: int64

In [8]:
df.education.value_counts()

secondary    5476
tertiary     3689
primary      1500
unknown       497
Name: education, dtype: int64

In [9]:
df.marital.value_counts()

married     6351
single      3518
divorced    1293
Name: marital, dtype: int64

In [10]:
df.contact.value_counts()

cellular     8042
unknown      2346
telephone     774
Name: contact, dtype: int64

In [11]:
df['deposit'].mask(df['deposit'] == 'yes', 1, inplace=True)
df['deposit'].mask(df['deposit'] == 'no', 0, inplace=True)

In [12]:
df['education'].mask(df['education'] == 'unknown', 0, inplace=True)
df['education'].mask(df['education'] == 'primary', 1, inplace=True)
df['education'].mask(df['education'] == 'secondary', 2, inplace=True)
df['education'].mask(df['education'] == 'tertiary', 3, inplace=True)

In [13]:
df['marital'].mask(df['marital'] == 'single', 0, inplace=True)
df['marital'].mask(df['marital'] == 'married', 1, inplace=True)
df['marital'].mask(df['marital'] == 'divorced', 2, inplace=True)

In [14]:
df['poutcome'].mask(df['poutcome'] == 'unknown', 0, inplace=True)
df['poutcome'].mask(df['poutcome'] == 'failure', 1, inplace=True)
df['poutcome'].mask(df['poutcome'] == 'other', 2, inplace=True)
df['poutcome'].mask(df['poutcome'] == 'success', 3, inplace=True)

In [15]:
df['default'].mask(df['default'] == 'yes', 1, inplace=True)
df['default'].mask(df['default'] == 'no', 0, inplace=True)

In [16]:
df['housing'].mask(df['housing'] == 'no', 0, inplace=True)
df['housing'].mask(df['housing'] == 'yes', 1, inplace=True)

In [17]:
df['loan'].mask(df['loan'] == 'no', 0, inplace=True)
df['loan'].mask(df['loan'] == 'yes', 1, inplace=True)

In [18]:
df['contact'].mask(df['contact'] == 'cellular', 1, inplace=True)
df['contact'].mask(df['contact'] == 'unknown', 2, inplace=True)
df['contact'].mask(df['contact'] == 'telephone', 3, inplace=True)

In [19]:
month = {
    'jan': 1,
    'feb': 2,
    'mar': 3,
    'apr': 4,
    'may': 5,
    'jun': 6,
    'jul': 7,
    'aug': 8,
    'sep': 9,
    'oct': 10,
    'nov': 11,
    'dec': 12
}
df["month"] = df["month"].map(month)
df["month"].value_counts()

5     2824
8     1519
7     1514
6     1222
11     943
4      923
2      776
10     392
1      344
9      319
3      276
12     110
Name: month, dtype: int64

In [20]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
0,59,admin.,1,2,0,2343,1,0,2,5,5,1042,1,-1,0,0,1
1,56,admin.,1,2,0,45,0,0,2,5,5,1467,1,-1,0,0,1
2,41,technician,1,2,0,1270,1,0,2,5,5,1389,1,-1,0,0,1
3,55,services,1,2,0,2476,1,0,2,5,5,579,1,-1,0,0,1
4,54,admin.,1,3,0,184,0,0,2,5,5,673,2,-1,0,0,1


In [21]:
df['deposit'] = pd.to_numeric(df['deposit'], errors='coerce')
df['education'] = pd.to_numeric(df['education'], errors='coerce')
df['month'] = pd.to_numeric(df['month'], errors='coerce')
df['poutcome'] = pd.to_numeric(df['poutcome'], errors='coerce')

In [22]:
df = pd.get_dummies(df, columns=["job", "marital", "contact"])

Data Splitting

In [23]:
X = df.drop('deposit', axis=1)
y = df['deposit']

In [24]:
# data splitting
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Classification Decision Tree

In [25]:
clf = DecisionTreeClassifier()

In [27]:
clf.fit(X_train, y_train)

DecisionTreeClassifier()

In [28]:
y_pred = clf.predict(X_test)

In [29]:
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Confusion Matrix:
[[929 237]
 [264 803]]


In [30]:
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.78      0.80      0.79      1166
           1       0.77      0.75      0.76      1067

    accuracy                           0.78      2233
   macro avg       0.78      0.77      0.77      2233
weighted avg       0.78      0.78      0.78      2233



### Classification with Binary / Multinomial Logistic Regression

In [31]:
B_classifier = LogisticRegression(multi_class='multinomial', solver='lbfgs')
B_classifier.fit(X_train, y_train)

LogisticRegression(multi_class='multinomial')

In [32]:
# Predicting the Test set results
y_pred = B_classifier.predict(X_test)

### Classification with KNN

In [33]:
KNN_classifier = KNeighborsClassifier(n_neighbors=5)
KNN_classifier.fit(X_train, y_train)

KNeighborsClassifier()

In [34]:
y_pred = KNN_classifier.predict(X_test)

In [35]:
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Confusion Matrix: ', confusion_matrix(y_test, y_pred))
print('Classification Report: ', classification_report(y_test, y_pred))

Accuracy:  0.7456336766681594
Confusion Matrix:  [[893 273]
 [295 772]]
Classification Report:                precision    recall  f1-score   support

           0       0.75      0.77      0.76      1166
           1       0.74      0.72      0.73      1067

    accuracy                           0.75      2233
   macro avg       0.75      0.74      0.74      2233
weighted avg       0.75      0.75      0.75      2233



### Classification with Naive Bayes

In [36]:
B_classifier = GaussianNB()
B_classifier.fit(X_train, y_train)

GaussianNB()

In [37]:
y_pred = B_classifier.predict(X_test)

In [38]:
print('Accuracy: ', accuracy_score(y_test, y_pred))
print('Confusion Matrix: ', confusion_matrix(y_test, y_pred))
print('Classification Report: ', classification_report(y_test, y_pred))

Accuracy:  0.7389162561576355
Confusion Matrix:  [[886 280]
 [303 764]]
Classification Report:                precision    recall  f1-score   support

           0       0.75      0.76      0.75      1166
           1       0.73      0.72      0.72      1067

    accuracy                           0.74      2233
   macro avg       0.74      0.74      0.74      2233
weighted avg       0.74      0.74      0.74      2233



### Conclusion

- Decision Tree is the best model in terms of accuracy for this dataset, achieving 78%. However, overfitting may be a concern.
- KNN performs well but requires more computational resources, especially for larger datasets.
- Naive Bayes and Logistic Regression are simpler models, yet still competitive, especially if the dataset has a linear or near-linear distribution.