In [1]:
import pandas as pd
# Read the bank marketing dataset from its CSV file (path is relative to the
# notebook's working directory) and preview the first five rows.
DATA_PATH = 'dataset/bank-additional-full.csv'
bank_data = pd.read_csv(DATA_PATH)
bank_data.head(5)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [2]:
from sklearn.preprocessing import LabelEncoder

# Build the feature matrix and the target vector.
# 'duration' is dropped together with the label column 'y', so neither is used
# as a predictor.
predictors = bank_data.drop(['duration', 'y'], axis=1)
target = bank_data['y']

# Encode every predictor column as integer codes, keeping one fitted
# LabelEncoder per column. Re-fitting a single shared encoder column by column
# leaves only the last column's mapping available afterwards, so codes could
# not be decoded (inverse_transform) or re-applied to new rows.
encoders = {
    column: LabelEncoder().fit(predictors[column])
    for column in predictors.columns
}
predictors_encoded = predictors.apply(
    lambda column: encoders[column.name].transform(column)
)

# NOTE(review): numeric columns are label-encoded as well (the preview shows
# age 56 as code 39). Consider restricting the encoding to the categorical
# columns; left as-is here so downstream results stay unchanged.

# Preview the encoded predictors.
predictors_encoded.head()


Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,39,3,1,0,0,0,0,1,6,1,0,26,0,1,8,18,16,287,8
1,40,7,1,3,1,0,0,1,6,1,0,26,0,1,8,18,16,287,8
2,20,7,1,3,0,2,0,1,6,1,0,26,0,1,8,18,16,287,8
3,23,0,1,1,0,0,0,1,6,1,0,26,0,1,8,18,16,287,8
4,39,7,1,3,0,0,2,1,6,1,0,26,0,1,8,18,16,287,8


In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable across runs.
# NOTE(review): the split is not stratified on `target` — if the classes are
# imbalanced, consider passing stratify=target. TODO confirm.
X_train, X_test, y_train, y_test = train_test_split(
    predictors_encoded,
    target,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of each piece of the split.
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")


X_train shape: (32950, 19), y_train shape: (32950,)
X_test shape: (8238, 19), y_test shape: (8238,)


In [4]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Baseline: a decision tree with default hyperparameters, seeded so the fitted
# tree is the same on every run.
model_initial = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Predict labels for the held-out test rows.
y_test_pred_initial = model_initial.predict(X_test)

# Score the baseline. average=None yields one precision/recall value per class
# rather than a single aggregated number.
accuracy_initial = accuracy_score(y_test, y_test_pred_initial)
precision_initial, recall_initial = (
    metric(y_test, y_test_pred_initial, average=None)
    for metric in (precision_score, recall_score)
)

# Report the baseline metrics, one per line.
print(
    "Initial Model Metrics:",
    f"Accuracy: {accuracy_initial}",
    f"Precision: {precision_initial}",
    f"Recall: {recall_initial}",
    sep="\n",
)


Initial Model Metrics:
Accuracy: 0.833818888079631
Precision: [0.91392299 0.29719626]
Recall: [0.89702862 0.34010695]


In [5]:
# Second model: a depth-limited tree (max_depth=5) that only splits nodes
# holding at least 5 samples, with the same seed as the baseline.
# NOTE(review): these values are fixed by hand, not searched — consider a
# cross-validated search if real tuning is intended.
model_tuned = DecisionTreeClassifier(
    max_depth=5,
    min_samples_split=5,
    random_state=42,
).fit(X_train, y_train)

# Predict labels for the held-out test rows with the constrained tree.
y_test_pred_tuned = model_tuned.predict(X_test)

# Score the constrained tree. average=None yields one precision/recall value
# per class rather than a single aggregated number.
accuracy_tuned = accuracy_score(y_test, y_test_pred_tuned)
precision_tuned, recall_tuned = (
    metric(y_test, y_test_pred_tuned, average=None)
    for metric in (precision_score, recall_score)
)

# Report the metrics, one per line (the header starts with a blank line).
print(
    "\nTuned Model Metrics:",
    f"Accuracy: {accuracy_tuned}",
    f"Precision: {precision_tuned}",
    f"Recall: {recall_tuned}",
    sep="\n",
)



Tuned Model Metrics:
Accuracy: 0.8965768390386016
Precision: [0.9111536  0.60559796]
Recall: [0.97877585 0.25454545]


In [6]:
# Side-by-side summary of the two models' test-set metrics, one model per line.
print(
    "Comparison of Initial and Tuned Models:",
    f"Initial Model - Accuracy: {accuracy_initial}, Precision: {precision_initial}, Recall: {recall_initial}",
    f"Tuned Model   - Accuracy: {accuracy_tuned}, Precision: {precision_tuned}, Recall: {recall_tuned}",
    sep="\n",
)


Comparison of Initial and Tuned Models:
Initial Model - Accuracy: 0.833818888079631, Precision: [0.91392299 0.29719626], Recall: [0.89702862 0.34010695]
Tuned Model   - Accuracy: 0.8965768390386016, Precision: [0.9111536  0.60559796], Recall: [0.97877585 0.25454545]
