#  <font color = 'blue'> Data Modeling</font>

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE


from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeClassifier

import tqdm
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, accuracy_score, balanced_accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc

In [2]:
# Load the cleaned bank-marketing data set (Cleaned_BankMarketing.csv) into a DataFrame
bank_mkt = pd.read_csv('Cleaned_BankMarketing.csv')

In [3]:
# Dimensions of the loaded frame as (rows, columns)
bank_mkt.shape

(41188, 38)

In [4]:
# Preview the top of the frame (head() shows 5 rows by default)
bank_mkt.head()

Unnamed: 0,subscription,month,day_of_week,age,jobblue.collar,jobentrepreneur,jobhousemaid,jobmanagement,jobretired,jobself.employed,...,housingyes,loanunknown,loanyes,method_of_contact,duration,campaign,contacted_previous_campaign,previous,poutcomenonexistent,poutcomesuccess
0,0,3,1,5,0,0,1,0,0,0,...,0,0,0,1,2,0,0,0,1,0
1,0,3,1,5,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,1,0
2,0,3,1,3,0,0,0,0,0,0,...,1,0,0,1,2,0,0,0,1,0
3,0,3,1,3,0,0,0,0,0,0,...,0,0,0,1,2,0,0,0,1,0
4,0,3,1,5,0,0,0,0,0,0,...,0,0,1,1,3,0,0,0,1,0


In [None]:
# # Check the structure of the dataset
# bank_mkt.info()

In [None]:
# # Check for missing values in the dataset
# bank_mkt.isna().sum()

In [5]:
# Predictor matrix: every column except the target.
# The target is removed by name rather than by position (previously iloc[:, 1:38]),
# so the selection does not silently include the target or lose features if the
# CSV's column order or column count changes. This matches how y is selected.
X = bank_mkt.drop(columns='subscription')
X.shape

(41188, 37)

In [6]:
# Target vector: the 'subscription' column, picked out by name
y = bank_mkt['subscription']
y.shape

(41188,)

### <font color = 'blue'> Data Splitting</font>

In [7]:
# Hold out 30% of the rows for testing; the fixed seed keeps the partition reproducible.
# NOTE(review): the split is not stratified although the classes are imbalanced —
# consider passing stratify=y if class proportions should match across partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [8]:
# Preview the first 5 rows of the training predictors
X_train.head(n=5)

Unnamed: 0,month,day_of_week,age,jobblue.collar,jobentrepreneur,jobhousemaid,jobmanagement,jobretired,jobself.employed,jobservices,...,housingyes,loanunknown,loanyes,method_of_contact,duration,campaign,contacted_previous_campaign,previous,poutcomenonexistent,poutcomesuccess
39075,10,1,2,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,1,0,0
34855,3,5,2,0,0,0,0,0,0,0,...,0,0,0,1,1,1,0,0,1,0
7107,3,4,3,1,0,0,0,0,0,0,...,1,0,0,1,3,1,0,0,1,0
31614,3,4,2,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,1,0,0
34878,3,5,2,0,0,0,0,0,0,0,...,0,0,0,0,3,1,0,0,1,0


In [9]:
# Report the dimensions of each partition produced by the split
for split_name, split_part in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"{split_name} shape:", split_part.shape)

X_train shape: (28831, 37)
y_train shape: (28831,)
X_test shape: (12357, 37)
y_test shape: (12357,)


### <font color = 'blue'> Handling Unbalanced Data: SMOTE</font>

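Synthetic Minority Oversampling Technique (SMOTE) is a statistical technique for increasing the number of minority-class cases in a dataset so that the classes become balanced. It works by generating new synthetic instances from the existing minority-class cases supplied as input. The SMOTE cells below are left commented out; this notebook balances the training data with random oversampling instead.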

In [None]:
# # Create a smote object
# smote = SMOTE(random_state=42)
# # Resample the training data
# X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# # Print the class distribution before and after oversampling
# print("Before oversampling:", Counter(y_train))
# print("After oversampling:", Counter(y_resampled))

In [None]:
# # Plot a bar chart to visualize the target variable after random oversampling
# plt.hist(y_resampled);

### <font color = 'blue'> Handling Unbalanced Data: Random Oversampling</font>

This technique involves randomly duplicating samples from the minority class to balance the number of samples in each class. 

In [10]:
# Random oversampler; the fixed seed makes the resampling repeatable
oversampler = RandomOverSampler(
    random_state=42,
)

In [11]:
# Oversample the training partition only, then unpack the resampled predictors and target
resampled_pair = oversampler.fit_resample(X_train, y_train)
X_resampled, y_resampled = resampled_pair

In [12]:
# Compare the per-class row counts of the training target before and after resampling
class_counts_before = Counter(y_train)
class_counts_after = Counter(y_resampled)
print("Before oversampling:", class_counts_before)
print("After oversampling:", class_counts_after)

Before oversampling: Counter({0: 25580, 1: 3251})
After oversampling: Counter({0: 25580, 1: 25580})


In [None]:
# # Plot a bar chart to visualize the target variable after random oversampling
# plt.hist(y_resampled);

In [19]:
# Build the predictor frame used for the merge with the resampled target.
# reset_index() exposes the row labels of X_resampled as an 'index' column, which is
# the join key. Deriving a new variable (instead of relying on commented-out
# assignments) keeps this cell runnable on a fresh kernel and safe to re-run,
# because X_resampled itself is never modified.
X_training_random_oversampler = X_resampled.reset_index()
X_training_random_oversampler.head(1)

Unnamed: 0,index,month,day_of_week,age,jobblue.collar,jobentrepreneur,jobhousemaid,jobmanagement,jobretired,jobself.employed,...,housingyes,loanunknown,loanyes,method_of_contact,duration,campaign,contacted_previous_campaign,previous,poutcomenonexistent,poutcomesuccess
0,0,10,1,2,0,0,0,0,0,0,...,0,0,0,0,1,1,0,1,0,0


In [21]:
# Build the target frame used for the merge with the resampled predictors.
# reset_index() turns y_resampled into a two-column frame: 'index' (the join key)
# and 'subscription'. Deriving a new variable (instead of relying on commented-out
# assignments) keeps this cell runnable on a fresh kernel and safe to re-run,
# because y_resampled itself is never modified.
y_training_random_oversampler = y_resampled.reset_index()
y_training_random_oversampler.head(1)

Unnamed: 0,index,subscription
0,0,0


In [22]:
# Join the resampled predictors and target on the shared 'index' key, then discard
# the key: it is only a helper for the join and must not be written to the training
# CSV as if it were a feature. Chaining the drop keeps the cell safe to re-run.
training_bal = (
    pd.merge(X_training_random_oversampler, y_training_random_oversampler, on='index')
    .drop(columns='index')
)
training_bal.head()

Unnamed: 0,month,day_of_week,age,jobblue.collar,jobentrepreneur,jobhousemaid,jobmanagement,jobretired,jobself.employed,jobservices,...,loanunknown,loanyes,method_of_contact,duration,campaign,contacted_previous_campaign,previous,poutcomenonexistent,poutcomesuccess,subscription
0,10,1,2,0,0,0,0,0,0,0,...,0,0,0,1,1,0,1,0,0,0
1,3,5,2,0,0,0,0,0,0,0,...,0,0,1,1,1,0,0,1,0,0
2,3,4,3,1,0,0,0,0,0,0,...,0,0,1,3,1,0,0,1,0,0
3,3,4,2,0,0,0,0,0,0,1,...,0,0,0,1,0,0,1,0,0,0
4,3,5,2,0,0,0,0,0,0,0,...,0,0,0,3,1,0,0,1,0,0


In [23]:
# Persist the balanced training frame to disk without writing the row index
training_bal.to_csv(path_or_buf='training_ros.csv', index=False)

In [24]:
# Expose the row labels of the test predictors as an 'index' column so the frame
# can be joined to the test target on that key.
# The guard keeps the cell re-runnable: without it, a second reset_index() would
# insert another, redundant index column.
if 'index' not in X_test.columns:
    X_test = X_test.reset_index()
X_test.head(1)

Unnamed: 0,index,month,day_of_week,age,jobblue.collar,jobentrepreneur,jobhousemaid,jobmanagement,jobretired,jobself.employed,...,housingyes,loanunknown,loanyes,method_of_contact,duration,campaign,contacted_previous_campaign,previous,poutcomenonexistent,poutcomesuccess
0,32884,3,1,5,0,0,0,0,0,0,...,0,0,1,0,3,0,0,1,0,0


In [25]:
# Turn the test target into a two-column frame: 'index' (the row labels, used as
# the join key) and 'subscription'.
# The isinstance guard keeps the cell re-runnable: once y_test has been converted
# to a DataFrame, the conversion is skipped.
if isinstance(y_test, pd.Series):
    y_test = y_test.reset_index()
y_test.head(1)

Unnamed: 0,index,subscription
0,32884,0


In [27]:
# Join the test predictors and test target on the shared 'index' key, then discard
# the key: it is only a helper for the join and must not be written to the test CSV
# as if it were a feature. Chaining the drop keeps the cell safe to re-run.
testing_ros_bal = (
    pd.merge(X_test, y_test, on='index')
    .drop(columns='index')
)
testing_ros_bal.head()

Unnamed: 0,month,day_of_week,age,jobblue.collar,jobentrepreneur,jobhousemaid,jobmanagement,jobretired,jobself.employed,jobservices,...,loanunknown,loanyes,method_of_contact,duration,campaign,contacted_previous_campaign,previous,poutcomenonexistent,poutcomesuccess,subscription
0,3,1,5,0,0,0,0,0,0,0,...,0,1,0,3,0,0,1,0,0,0
1,3,4,4,0,0,0,0,0,0,0,...,0,0,1,3,1,0,0,1,0,0
2,3,5,2,1,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
3,4,5,3,0,0,0,0,0,0,0,...,0,0,1,3,1,0,0,1,0,0
4,5,5,2,0,0,1,0,0,0,0,...,0,0,0,2,1,0,0,1,0,0


In [28]:
# Write the merged test frame to CSV, omitting the row index
testing_ros_bal.to_csv(path_or_buf='testing_ros_bal.csv', index=False)