# Task 3
Build a decision tree classifier to predict whether a customer will purchase a product or service based on their demographic and behavioral data. Use a dataset such as the Bank Marketing dataset from the UCI Machine Learning Repository.

### Loading the dataset

In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder


In [36]:
bank_data = pd.read_csv("data/bank.csv")

In [37]:
bank_data

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,33,services,married,secondary,no,-333,yes,no,cellular,30,jul,329,5,-1,0,unknown,no
4517,57,self-employed,married,tertiary,yes,-3313,yes,yes,unknown,9,may,153,1,-1,0,unknown,no
4518,57,technician,married,secondary,no,295,no,no,cellular,19,aug,151,11,-1,0,unknown,no
4519,28,blue-collar,married,secondary,no,1137,no,no,cellular,6,feb,129,4,211,3,other,no


In [38]:
bank_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        4521 non-null   int64 
 1   job        4521 non-null   object
 2   marital    4521 non-null   object
 3   education  4521 non-null   object
 4   default    4521 non-null   object
 5   balance    4521 non-null   int64 
 6   housing    4521 non-null   object
 7   loan       4521 non-null   object
 8   contact    4521 non-null   object
 9   day        4521 non-null   int64 
 10  month      4521 non-null   object
 11  duration   4521 non-null   int64 
 12  campaign   4521 non-null   int64 
 13  pdays      4521 non-null   int64 
 14  previous   4521 non-null   int64 
 15  poutcome   4521 non-null   object
 16  y          4521 non-null   object
dtypes: int64(7), object(10)
memory usage: 600.6+ KB


### Pre-processing the data
Preprocess the dataset by handling missing values, encoding categorical variables, and splitting it into features (independent variables) and target (dependent variable).

> There are no missing values in this dataset

1. Encoding categorical variables

In [39]:
label_encoder = LabelEncoder()
bank_data['job'] = label_encoder.fit_transform(bank_data['job'])
bank_data['marital'] = label_encoder.fit_transform(bank_data['marital'])
bank_data['education'] = label_encoder.fit_transform(bank_data['education'])
bank_data['default'] = label_encoder.fit_transform(bank_data['default'])
bank_data['housing'] = label_encoder.fit_transform(bank_data['housing'])
bank_data['loan'] = label_encoder.fit_transform(bank_data['loan'])
bank_data['contact'] = label_encoder.fit_transform(bank_data['contact'])
bank_data['month'] = label_encoder.fit_transform(bank_data['month'])
bank_data['poutcome'] = label_encoder.fit_transform(bank_data['poutcome'])
bank_data['y'] = label_encoder.fit_transform(bank_data['y'])

In [40]:
bank_data

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,10,1,0,0,1787,0,0,0,19,10,79,1,-1,0,3,0
1,33,7,1,1,0,4789,1,1,0,11,8,220,1,339,4,0,0
2,35,4,2,2,0,1350,1,0,0,16,0,185,1,330,1,0,0
3,30,4,1,2,0,1476,1,1,2,3,6,199,4,-1,0,3,0
4,59,1,1,1,0,0,1,0,2,5,8,226,1,-1,0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4516,33,7,1,1,0,-333,1,0,0,30,5,329,5,-1,0,3,0
4517,57,6,1,2,1,-3313,1,1,2,9,8,153,1,-1,0,3,0
4518,57,9,1,1,0,295,0,0,0,19,1,151,11,-1,0,3,0
4519,28,1,1,1,0,1137,0,0,0,6,3,129,4,211,3,1,0


> All categorical variables are now converted to numerical variables.

2. Split features and target variable

In [41]:
X = bank_data.drop('y', axis=1)
y = bank_data['y']

### Train Test Split
Split the dataset into training and testing sets to evaluate the model's performance.

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

### Model Training
Train a decision tree classifier using the training data.

In [43]:
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

### Model Evaluation
Evaluate the performance of the trained model using the testing data.

In [44]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.8718232044198895

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.92      0.93       807
           1       0.42      0.47      0.44        98

    accuracy                           0.87       905
   macro avg       0.68      0.70      0.68       905
weighted avg       0.88      0.87      0.88       905


Confusion Matrix:
 [[743  64]
 [ 52  46]]


### Prediction
Use the trained model to make predictions on new data. Let's load the larger dataset bank-full.csv and use the trained decision tree classifier to make predictions on it.

1. Load the dataset

In [45]:
bank_data_full = pd.read_csv('data/bank-full.csv')

In [46]:
bank_data_full

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [47]:
bank_data_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


2. Preprocess the larger dataset

In [48]:
label_encoder = LabelEncoder()
bank_data_full['job'] = label_encoder.fit_transform(bank_data_full['job'])
bank_data_full['marital'] = label_encoder.fit_transform(bank_data_full['marital'])
bank_data_full['education'] = label_encoder.fit_transform(bank_data_full['education'])
bank_data_full['default'] = label_encoder.fit_transform(bank_data_full['default'])
bank_data_full['housing'] = label_encoder.fit_transform(bank_data_full['housing'])
bank_data_full['loan'] = label_encoder.fit_transform(bank_data_full['loan'])
bank_data_full['contact'] = label_encoder.fit_transform(bank_data_full['contact'])
bank_data_full['month'] = label_encoder.fit_transform(bank_data_full['month'])
bank_data_full['poutcome'] = label_encoder.fit_transform(bank_data_full['poutcome'])
bank_data_full['y'] = label_encoder.fit_transform(bank_data_full['y'])

In [49]:
bank_data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,10,1,0,0,1787,0,0,0,19,10,79,1,-1,0,3,0
1,33,7,1,1,0,4789,1,1,0,11,8,220,1,339,4,0,0
2,35,4,2,2,0,1350,1,0,0,16,0,185,1,330,1,0,0
3,30,4,1,2,0,1476,1,1,2,3,6,199,4,-1,0,3,0
4,59,1,1,1,0,0,1,0,2,5,8,226,1,-1,0,3,0


3. Seperate Features and Target variable

In [50]:
X_full = bank_data_full.drop('y', axis=1)
y_full = bank_data_full['y']

4. Make predictions

In [51]:
predictions_full = model.predict(X_full)

# Convert predictions to DataFrame for further analysis if needed
predictions_df = pd.DataFrame({'Prediction': predictions_full})

In [52]:
predictions_df

Unnamed: 0,Prediction
0,0
1,0
2,0
3,0
4,0
...,...
45206,1
45207,1
45208,1
45209,0


Let's check the accuracy of our model on the larger dataset

In [53]:
# Calculate accuracy
accuracy_full = accuracy_score(y_full, predictions_full)
print("Accuracy:", accuracy_full)

# Generate classification report
report_full = classification_report(y_full, predictions_full)
print("\nClassification Report:\n", report_full)

# Generate confusion matrix
matrix_full = confusion_matrix(y_full, predictions_full)
print("\nConfusion Matrix:\n", matrix_full)

Accuracy: 0.8787463227975493

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.93      0.93     39922
           1       0.48      0.49      0.49      5289

    accuracy                           0.88     45211
   macro avg       0.71      0.71      0.71     45211
weighted avg       0.88      0.88      0.88     45211


Confusion Matrix:
 [[37144  2778]
 [ 2704  2585]]
