In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('C:/Internship/bank_marketing.csv', header=None)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
1,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
2,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
3,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
4,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11158,33,blue-collar,single,primary,no,1,yes,no,cellular,20,apr,257,1,-1,0,unknown,no
11159,39,services,married,secondary,no,733,no,no,unknown,16,jun,83,4,-1,0,unknown,no
11160,32,technician,single,secondary,no,29,no,no,cellular,19,aug,156,2,-1,0,unknown,no
11161,43,technician,married,secondary,no,0,no,yes,cellular,8,may,9,2,172,5,failure,no


In [3]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,deposit
1,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,1,-1,0,unknown,yes
2,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,1,-1,0,unknown,yes
3,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,1,-1,0,unknown,yes
4,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,1,-1,0,unknown,yes


In [4]:
data.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
11158,33,blue-collar,single,primary,no,1,yes,no,cellular,20,apr,257,1,-1,0,unknown,no
11159,39,services,married,secondary,no,733,no,no,unknown,16,jun,83,4,-1,0,unknown,no
11160,32,technician,single,secondary,no,29,no,no,cellular,19,aug,156,2,-1,0,unknown,no
11161,43,technician,married,secondary,no,0,no,yes,cellular,8,may,9,2,172,5,failure,no
11162,34,technician,married,secondary,no,0,no,no,cellular,9,jul,628,1,-1,0,unknown,no


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11163 entries, 0 to 11162
Data columns (total 17 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       11163 non-null  object
 1   1       11163 non-null  object
 2   2       11163 non-null  object
 3   3       11163 non-null  object
 4   4       11163 non-null  object
 5   5       11163 non-null  object
 6   6       11163 non-null  object
 7   7       11163 non-null  object
 8   8       11163 non-null  object
 9   9       11163 non-null  object
 10  10      11163 non-null  object
 11  11      11163 non-null  object
 12  12      11163 non-null  object
 13  13      11163 non-null  object
 14  14      11163 non-null  object
 15  15      11163 non-null  object
 16  16      11163 non-null  object
dtypes: object(17)
memory usage: 1.4+ MB


In [6]:
data.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
count,11163,11163,11163,11163,11163,11163,11163,11163,11163,11163,11163,11163,11163,11163,11163,11163,11163
unique,77,13,4,5,3,3806,3,3,4,32,13,1429,37,473,35,5,3
top,31,management,married,secondary,no,0,no,no,cellular,20,may,97,1,-1,0,unknown,no
freq,496,2566,6351,5476,10994,774,5881,9702,8042,570,2824,39,4798,8324,8324,8326,5873


In [7]:
data.columns

Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16], dtype='int64')

In [8]:
data.shape

(11163, 17)

In [9]:
data.isnull().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
dtype: int64

In [10]:
data.drop(columns=[0, 1], inplace=True)

In [11]:
numerical_cols = [col for col in data.columns if data[col].dtype.kind in ['i', 'f']]
categorical_cols = [col for col in data.columns if data[col].dtype.kind == 'O']

In [12]:
if numerical_cols:
    # Scale numerical columns
    scaler = StandardScaler()
    numerical_scaled = scaler.fit_transform(data[numerical_cols])
else:
    numerical_scaled = pd.DataFrame() 

In [13]:
encoder = OneHotEncoder(sparse_output=False)
categorical_encoded = pd.DataFrame(encoder.fit_transform(data[categorical_cols]))


In [14]:
data_encoded = pd.concat([pd.DataFrame(numerical_scaled), categorical_encoded], axis=1)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(data_encoded, data_encoded.iloc[:, -1], test_size=0.2, random_state=42)

In [16]:
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

In [17]:
accuracy = clf.score(X_test, y_test)
print(f'Accuracy: {accuracy:.3f}')

Accuracy: 1.000


In [18]:
predictions = clf.predict(X_test)
print('predictions',predictions)

predictions [1. 1. 0. ... 1. 0. 0.]
