In [59]:
# Load libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics

In [60]:
# Load the data
bank_df = pd.read_csv('bank.csv') 

In [61]:
bank_df.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing-loan,personal-loan,current-campaign,previous-campaign,subscribed
0,30,unemployed,married,primary,no,1787,no,no,1,0,no
1,33,services,married,secondary,no,4789,yes,yes,1,4,no
2,35,management,single,tertiary,no,1350,yes,no,1,1,no
3,30,management,married,tertiary,no,1476,yes,yes,4,0,no
4,59,blue-collar,married,secondary,no,0,yes,no,1,0,no


In [62]:
bank_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4521 entries, 0 to 4520
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   age                4521 non-null   int64 
 1   job                4521 non-null   object
 2   marital            4521 non-null   object
 3   education          4521 non-null   object
 4   default            4521 non-null   object
 5   balance            4521 non-null   int64 
 6   housing-loan       4521 non-null   object
 7   personal-loan      4521 non-null   object
 8   current-campaign   4521 non-null   int64 
 9   previous-campaign  4521 non-null   int64 
 10  subscribed         4521 non-null   object
dtypes: int64(4), object(7)
memory usage: 388.6+ KB


In [63]:
bank_df['subscribed'].value_counts()

no     4000
yes     521
Name: subscribed, dtype: int64

In [64]:
## Importing resample from *sklearn.utils* package.
from sklearn.utils import resample

In [65]:
# Separate the case of yes-subscribes and no-subscribes
bank_subscribed_no = bank_df[bank_df['subscribed'] == 'no']
bank_subscribed_yes = bank_df[bank_df['subscribed'] == 'yes']

In [66]:
##Upsample the yes-subscribed cases.
df_minority_upsampled = resample(bank_subscribed_yes, replace=True, n_samples=2000, random_state=42)

In [67]:
# Combine majority class with upsampled minority class
new_bank_df = pd.concat([bank_subscribed_no, df_minority_upsampled])

In [68]:
new_bank_df['subscribed'].value_counts()

no     4000
yes    2000
Name: subscribed, dtype: int64

In [69]:
# Assigning list of all column names in the DataFrame
X_features = list(new_bank_df.columns)
# Remove the response variable from the list
X_features.remove('subscribed')
X_features

['age',
 'job',
 'marital',
 'education',
 'default',
 'balance',
 'housing-loan',
 'personal-loan',
 'current-campaign',
 'previous-campaign']

In [70]:
## get_dummies() will convert all the columns with data type as objects

encoded_bank_df = pd.get_dummies(new_bank_df[X_features], drop_first = True )

X = encoded_bank_df

In [71]:
X.head()

Unnamed: 0,age,balance,current-campaign,previous-campaign,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,...,job_unemployed,job_unknown,marital_married,marital_single,education_secondary,education_tertiary,education_unknown,default_yes,housing-loan_yes,personal-loan_yes
0,30,1787,1,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
1,33,4789,1,4,0,0,0,0,0,0,...,0,0,1,0,1,0,0,0,1,1
2,35,1350,1,1,0,0,0,1,0,0,...,0,0,0,1,0,1,0,0,1,0
3,30,1476,4,0,0,0,0,1,0,0,...,0,0,1,0,0,1,0,0,1,1
4,59,0,1,0,1,0,0,0,0,0,...,0,0,1,0,1,0,0,0,1,0


In [72]:
# Encoding the subscribed column and assigning to Y
y = new_bank_df['subscribed'].map(lambda x: int(x =='yes'))

In [73]:
#  Create a PCA that will retain 6 features
pca = PCA(n_components=6, whiten=True)

# whiten = True transforms the values of each principal component so that they have zero mean
# and unit variance.

# if n_components is given 0 to 1, it means the fraction of variation that you want to retain.
# if n_components > 1, it means the number of features you want to retain

In [74]:
# Conduct PCA
features_pca = pca.fit_transform(X)

In [77]:
# Show results
print("Original number of features:", X.shape[1])
print("Reduced number of features:", features_pca.shape[1])

Original number of features: 23
Reduced number of features: 1


array([[ 0.10686138],
       [ 1.1508941 ],
       [-0.04511719],
       ...,
       [-0.35394115],
       [-0.51461484],
       [ 0.74016978]])

In [76]:
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(features_pca, y, test_size=0.3, random_state=42)

In [23]:
from sklearn.linear_model import LogisticRegression

# Initializing the model

model = LogisticRegression(max_iter=100000)

# Fitting the model with X and y values of the dataset

model.fit(train_X, train_y)

LogisticRegression(max_iter=100000)

In [24]:
pred_y = model.predict(test_X)

In [25]:
print(confusion_matrix(test_y,pred_y))

[[1158   67]
 [ 475  100]]


In [26]:
print(classification_report(test_y,pred_y))

              precision    recall  f1-score   support

           0       0.71      0.95      0.81      1225
           1       0.60      0.17      0.27       575

    accuracy                           0.70      1800
   macro avg       0.65      0.56      0.54      1800
weighted avg       0.67      0.70      0.64      1800



In [27]:
metrics.roc_auc_score(test_y, model.predict_proba(test_X)[:,1])

0.671645075421473

In [28]:
from sklearn.model_selection import GridSearchCV

## Creating a dictionary with hyperparameters and possible values 
## for searching

params =  {'penalty': ['l2'], 'C': range(1,50), 'solver': ['lbfgs', 'liblinear'],
            }

## Configuring grid search

modelCV = GridSearchCV(LogisticRegression(max_iter=100000), params, cv=5, scoring='roc_auc')

## fit the search with training set
modelCV.fit(train_X, train_y)

GridSearchCV(cv=5, estimator=LogisticRegression(max_iter=100000),
             param_grid={'C': range(1, 50), 'penalty': ['l2'],
                         'solver': ['lbfgs', 'liblinear']},
             scoring='roc_auc')

In [29]:
modelCV.best_score_

0.6728307254623045

In [30]:
modelCV.best_params_

{'C': 3, 'penalty': 'l2', 'solver': 'lbfgs'}

In [31]:
print(confusion_matrix(test_y,pred_y))

[[1158   67]
 [ 475  100]]


In [32]:
print(classification_report(test_y,pred_y))

              precision    recall  f1-score   support

           0       0.71      0.95      0.81      1225
           1       0.60      0.17      0.27       575

    accuracy                           0.70      1800
   macro avg       0.65      0.56      0.54      1800
weighted avg       0.67      0.70      0.64      1800



In [33]:
metrics.roc_auc_score(test_y, modelCV.predict_proba(test_X)[:,1])

0.6716422360248447