In [1]:

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV,KFold
from sklearn.preprocessing import LabelEncoder,StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import plotly.graph_objects as go
import seaborn as sns
import warnings
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
warnings.filterwarnings('ignore')
import plotly as px


In [2]:
df = pd.read_csv('bank-full.csv',sep = ';')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [3]:
df.y.value_counts()

no     39922
yes     5289
Name: y, dtype: int64

In [4]:
df.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  y          45211 non-null  object
dtypes: int64(7), object(10)
memory usage: 5.9+ MB


In [6]:
def preprocess(df):
    cols = df.columns.tolist()
    for col in cols:
        if df[col].dtype == 'O':
            ls = LabelEncoder()
            df[col] = ls.fit_transform(df[col])
    ss = StandardScaler()
    y = df["y"]
    l = ss.fit_transform(df)
    df = pd.DataFrame(l, columns = cols)
    df['y'] = y


In [7]:
preprocess(df)

In [8]:
X,y = df.drop(columns = 'y'),df['y']
X_train, X_test , y_train, y_test = train_test_split(X,y , test_size=0.3, random_state=42)
y_train = y_train.values.reshape(-1,1)
y_test = y_test.values.reshape(-1,1)   
lr = LogisticRegression()
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)

In [9]:
accuracy_score(y_test,y_pred)
print(classification_report(y_test,y_pred))


              precision    recall  f1-score   support

           0       0.90      0.98      0.94     11966
           1       0.56      0.17      0.26      1598

    accuracy                           0.89     13564
   macro avg       0.73      0.58      0.60     13564
weighted avg       0.86      0.89      0.86     13564



In [10]:
data_1 = df[df['y']==1]
data_0 = df[df['y']==0].sample(data_1.shape[0],random_state = 42)
data_under = pd.concat([data_1,data_0])

df= data_under.sample(frac = 1).reset_index(drop =True)


In [11]:
df.shape

(10578, 17)

In [12]:
X,y = df.drop(columns = 'y'),df['y']
X_train, X_test , y_train, y_test = train_test_split(X,y , test_size=0.3, random_state=42)
y_train = y_train.values.reshape(-1,1)
y_test = y_test.values.reshape(-1,1)
lr = LogisticRegression()
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))


0.7488972904851922
              precision    recall  f1-score   support

           0       0.73      0.78      0.75      1556
           1       0.77      0.72      0.74      1618

    accuracy                           0.75      3174
   macro avg       0.75      0.75      0.75      3174
weighted avg       0.75      0.75      0.75      3174



In [13]:
import plotly.express as px

cm1 = confusion_matrix(y_test,y_pred)

fig = px.imshow(cm1,
                labels=dict(x = 'Real',y = 'Predicted'),
                x = [0,1],
                y = [0,1])
fig.show()

In [14]:

y_pred_proba_log = lr.predict_proba(X_test)[:,1]
fpr1, tpr1,thresholds1= roc_curve(y_test,y_pred_proba_log)
auc_score=roc_auc_score(y_test,y_pred_proba_log)


trace1 = go.Scatter(x=fpr1, y=tpr1, mode='lines', name='ROC curve(AUC = %0.2f)' % auc_score)
trace2 = go.Scatter(x=[0,1], y=[0,1], mode='lines', name='Reference guess')
trace3 = go.Scatter(x=fpr1, y=fpr1, mode='lines', name='Reference line')
layout = go.Layout(title='ROC-AUC Curve', xaxis=dict(title='False Positive Rate'), yaxis=dict(title='True Positive Rate'))
fig = go.Figure(data=[trace1,trace2, trace3], layout=layout)
fig.show()