In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [4]:
# Read bank.csv into a DataFrame. The file is ';'-separated, so the
# separator is passed explicitly instead of relying on the ',' default.
data = pd.read_csv('bank.csv', sep=';')

In [5]:
# Print the first five rows as a quick check that the columns were parsed
# as expected.
print(data.head(n=5))

   age          job  marital  education default  balance housing loan  \
0   30   unemployed  married    primary      no     1787      no   no   
1   33     services  married  secondary      no     4789     yes  yes   
2   35   management   single   tertiary      no     1350     yes   no   
3   30   management  married   tertiary      no     1476     yes  yes   
4   59  blue-collar  married  secondary      no        0     yes   no   

    contact  day month  duration  campaign  pdays  previous poutcome   y  
0  cellular   19   oct        79         1     -1         0  unknown  no  
1  cellular   11   may       220         1    339         4  failure  no  
2  cellular   16   apr       185         1    330         1  failure  no  
3   unknown    3   jun       199         4     -1         0  unknown  no  
4   unknown    5   may       226         1     -1         0  unknown  no  


In [6]:
# Preprocessing: one-hot encode the categorical columns.
# drop_first=True omits the first level of every encoded column, so the
# two-level target `y` ends up as the single indicator column `y_yes`
# that the following cells rely on.
data = pd.get_dummies(data=data, drop_first=True)

In [7]:
# Separate the label from the predictors: `y_yes` is the target column,
# every remaining column is used as a feature.
y = data['y_yes']
X = data.drop(columns='y_yes')

In [8]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
# NOTE(review): the split is not stratified on y. The target looks
# imbalanced, so consider stratify=y — confirm before changing, since it
# alters every metric reported further down.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [9]:
# Fit a baseline decision tree on the training split. Only random_state is
# set (for reproducibility); every other hyper-parameter keeps its default.
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X=X_train, y=y_train)

In [10]:
# Predict labels for the held-out rows with the baseline tree.
y_pred = clf.predict(X=X_test)

In [11]:
# Score the baseline tree on the held-out split: overall accuracy, the
# per-class precision/recall/F1 report, and the confusion matrix.
print(
    "Accuracy:",
    accuracy_score(y_true=y_test, y_pred=y_pred),
)
print(
    "Classification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred),
)
print(
    "Confusion Matrix:\n",
    confusion_matrix(y_true=y_test, y_pred=y_pred),
)

Accuracy: 0.8695652173913043
Classification Report:
               precision    recall  f1-score   support

       False       0.93      0.92      0.93      1205
        True       0.42      0.44      0.43       152

    accuracy                           0.87      1357
   macro avg       0.68      0.68      0.68      1357
weighted avg       0.87      0.87      0.87      1357

Confusion Matrix:
 [[1113   92]
 [  85   67]]


In [12]:
# Hyper-parameter search for the decision tree.
# The grid spans 2 * 6 * 3 * 3 = 108 combinations; with 5-fold
# cross-validation that is 540 fits, dispatched with n_jobs=-1.
# No `scoring` is passed, so candidates are ranked by the estimator's
# default score.
param_grid = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 10, 20, 30, 40, 50],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# GridSearchCV is handed `clf` as the estimator to tune and is fitted on
# the training split only.
grid_search = GridSearchCV(clf, param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [13]:
# Report the hyper-parameter combination selected by the grid search.
print(
    "Best Parameters:",
    grid_search.best_params_,
    sep=" ",
)

Best Parameters: {'criterion': 'entropy', 'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2}


In [14]:
# Take the estimator the grid search selected and predict labels for the
# held-out rows with it.
best_clf = grid_search.best_estimator_
y_pred_best = best_clf.predict(X=X_test)

In [15]:
# Score the tuned tree on the held-out split with the same three metrics
# used for the baseline, so the two models can be compared directly.
print(
    "Best Model Accuracy:",
    accuracy_score(y_true=y_test, y_pred=y_pred_best),
)
print(
    "Best Model Classification Report:\n",
    classification_report(y_true=y_test, y_pred=y_pred_best),
)
print(
    "Best Model Confusion Matrix:\n",
    confusion_matrix(y_true=y_test, y_pred=y_pred_best),
)

Best Model Accuracy: 0.8887251289609432
Best Model Classification Report:
               precision    recall  f1-score   support

       False       0.92      0.96      0.94      1205
        True       0.50      0.36      0.42       152

    accuracy                           0.89      1357
   macro avg       0.71      0.66      0.68      1357
weighted avg       0.87      0.89      0.88      1357

Best Model Confusion Matrix:
 [[1152   53]
 [  98   54]]
