In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score


In [4]:
# Load the semicolon-separated CSV into a DataFrame and preview it.
csv_path = 'bank.csv'
data = pd.read_csv(csv_path, sep=';')
data

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,33,services,married,secondary,no,-333,yes,no,cellular,30,jul,329,5,-1,0,unknown,no
4517,57,self-employed,married,tertiary,yes,-3313,yes,yes,unknown,9,may,153,1,-1,0,unknown,no
4518,57,technician,married,secondary,no,295,no,no,cellular,19,aug,151,11,-1,0,unknown,no
4519,28,blue-collar,married,secondary,no,1137,no,no,cellular,6,feb,129,4,211,3,other,no


In [15]:
# Per-column count of missing values (isna is the same check as isnull).
data.isna().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [17]:
# Summary statistics for the columns of `data`.
# NOTE(review): the saved output below reports numeric statistics for the
# text-valued columns (job, marital, ...), and this cell's execution count
# (17) is higher than the label-encoding cell's (6), so it was run after
# encoding. On a fresh top-to-bottom run `data` still holds strings here and
# the output will differ; consider moving this cell below the encoding step.
data.describe()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
count,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0,4521.0
mean,41.170095,4.411192,1.147755,1.231365,0.01681,1422.657819,0.566025,0.152842,0.652289,15.915284,5.540146,263.961292,2.79363,39.766645,0.542579,2.559168,0.11524
std,10.576211,3.255716,0.59965,0.748744,0.128575,3009.638142,0.495676,0.359875,0.901498,8.247667,3.002763,259.856633,3.109807,100.121124,1.693562,0.992051,0.319347
min,19.0,0.0,0.0,0.0,0.0,-3313.0,0.0,0.0,0.0,1.0,0.0,4.0,1.0,-1.0,0.0,0.0,0.0
25%,33.0,1.0,1.0,1.0,0.0,69.0,0.0,0.0,0.0,9.0,3.0,104.0,1.0,-1.0,0.0,3.0,0.0
50%,39.0,4.0,1.0,1.0,0.0,444.0,1.0,0.0,0.0,16.0,6.0,185.0,2.0,-1.0,0.0,3.0,0.0
75%,49.0,7.0,2.0,2.0,0.0,1480.0,1.0,0.0,2.0,21.0,8.0,329.0,3.0,-1.0,0.0,3.0,0.0
max,87.0,11.0,2.0,3.0,1.0,71188.0,1.0,1.0,2.0,31.0,11.0,3025.0,50.0,871.0,25.0,3.0,1.0


In [19]:
# Show the dtype of every column.
# NOTE(review): the saved output lists the text-valued columns as int32, i.e.
# it reflects `data` after the label-encoding cell (execution count 6) had
# already run. In notebook order this cell comes before encoding, so a fresh
# Restart & Run All will not reproduce the output shown.
data.dtypes

age          int64
job          int32
marital      int32
education    int32
default      int32
balance      int64
housing      int32
loan         int32
contact      int32
day          int64
month        int32
duration     int64
campaign     int64
pdays        int64
previous     int64
poutcome     int32
y            int32
dtype: object

In [20]:
# Print the column overview (non-null counts, dtypes, memory usage).
# NOTE(review): the saved output shows ten int32 columns, which matches the
# label-encoded frame rather than the freshly loaded CSV; this cell was run
# after the encoding cell even though it is placed before it.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype
---  ------     --------------  -----
 0   age        4521 non-null   int64
 1   job        4521 non-null   int32
 2   marital    4521 non-null   int32
 3   education  4521 non-null   int32
 4   default    4521 non-null   int32
 5   balance    4521 non-null   int64
 6   housing    4521 non-null   int32
 7   loan       4521 non-null   int32
 8   contact    4521 non-null   int32
 9   day        4521 non-null   int64
 10  month      4521 non-null   int32
 11  duration   4521 non-null   int64
 12  campaign   4521 non-null   int64
 13  pdays      4521 non-null   int64
 14  previous   4521 non-null   int64
 15  poutcome   4521 non-null   int32
 16  y          4521 non-null   int32
dtypes: int32(10), int64(7)
memory usage: 424.0 KB


In [6]:
# Replace every object-dtype column of `data` (the target `y` included) with
# integer codes, keeping one fitted LabelEncoder per column for later lookup.
# NOTE(review): `data` is overwritten in place, so re-running this cell finds
# no object columns and leaves `label_encoders` empty; reload `data` first.
categorical_columns = data.select_dtypes(include=['object']).columns
label_encoders = {column: LabelEncoder() for column in categorical_columns}
for column, encoder in label_encoders.items():
    data[column] = encoder.fit_transform(data[column])

In [7]:
# Separate the target column from the predictors.
# NOTE(review): `duration` is kept as a feature; if it records the length of
# the same contact whose outcome is `y`, it would not be known before the
# call. Confirm it belongs in X.
y = data['y']                  # target variable
X = data.drop(columns=['y'])   # every remaining column is a feature

In [12]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split
# repeatable. Then print the size of each part as a sanity check.
# NOTE(review): the split is not stratified although the positive class is
# rare (mean of y is about 0.115 in the saved describe() output); consider
# passing stratify=y.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
for part in (X_train, X_test, y_train, y_test):
    print(len(part))

3164
1357
3164
1357


In [9]:
# Fit a decision tree with default hyperparameters on the training split;
# random_state is fixed so the fitted tree is reproducible.
classifier = DecisionTreeClassifier(random_state=42)
classifier.fit(X=X_train, y=y_train)


In [10]:
# Predict the encoded class label for every row of the held-out test split.
y_pred = classifier.predict(X_test)


In [11]:
# Score the held-out predictions: overall accuracy first, then per-class
# precision / recall / F1.
# NOTE(review): in the saved run class 0 accounts for 1205 of the 1357 test
# rows, so always predicting 0 would reach about 0.888 accuracy, above the
# 0.861 reported here. Judge the model by the class-1 row, not by accuracy.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print(report)


Accuracy: 0.8614591009579956
              precision    recall  f1-score   support

           0       0.92      0.92      0.92      1205
           1       0.38      0.39      0.39       152

    accuracy                           0.86      1357
   macro avg       0.65      0.65      0.65      1357
weighted avg       0.86      0.86      0.86      1357

