## This tutorial shows how to deal with an imbalanced dataset
Two resampling techniques are used here:
* SMOTE (oversampling the minority class)
* NearMiss (undersampling the majority class)

In [1]:
# Importing the libraries:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import NearMiss

In [2]:
## Load the semicolon-delimited "bank-full.csv"; the literal string "unknown"
## is read as a missing value (NaN).
bank = pd.read_csv(
    "bank-full.csv",
    sep=";",
    na_values="unknown",
)

In [4]:
## Preview the data: the first 5 rows (the default for head()), then the
## frame's shape and its column labels, one item per line.
print(bank.head(), bank.shape, bank.columns, sep="\n")

   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married        NaN      no     1506     yes   no   
4   33           NaN   single        NaN      no        1      no   no   

  contact  day month  duration  campaign  pdays  previous poutcome   y  
0     NaN    5   may       261         1     -1         0      NaN  no  
1     NaN    5   may       151         1     -1         0      NaN  no  
2     NaN    5   may        76         1     -1         0      NaN  no  
3     NaN    5   may        92         1     -1         0      NaN  no  
4     NaN    5   may       198         1     -1         0      NaN  no  
(45211, 17)
Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', '

In [5]:
## Encode the text-valued columns as numbers.
# Binary yes/no columns (including the target "y") become 0/1.
yes_no_codes = {"no": 0, "yes": 1}
for binary_col in ("default", "housing", "loan", "y"):
    bank[binary_col] = bank[binary_col].map(yes_no_codes)

# Education levels are encoded as the integers 0, 1, 2; values that are not
# keys of the mapping (including NaN) come out of map() as NaN.
education_codes = {"primary": 0, "secondary": 1, "tertiary": 2}
bank["education"] = bank["education"].map(education_codes)

# Convert abbreviated month names (e.g. "may") to month numbers (e.g. 5).
month_as_date = pd.to_datetime(bank["month"], format="%b")
bank["month"] = month_as_date.dt.month

In [7]:
## Count the missing (NaN) values in each column.
bank.isna().sum()

age              0
job            288
marital          0
education     1857
default          0
balance          0
housing          0
loan             0
contact      13020
day              0
month            0
duration         0
campaign         0
pdays            0
previous         0
poutcome     36959
y                0
dtype: int64

In [8]:
## The NaN counts above show that "poutcome" and "contact" have far more
## missing values than any other column, so both columns are removed entirely.
sparse_columns = ["poutcome", "contact"]
bank.drop(columns=sparse_columns, inplace=True)

## Then discard every remaining row that still contains at least one NaN.
bank.dropna(axis="index", how="any", inplace=True)



In [None]:
## NOTE(review): this cell was never executed (In [None]) and repeats the work
## of the three cells that follow it (one-hot encoding, target counts, X/y
## split). It is redundant and can be deleted from the notebook.

## One-hot encode the remaining text columns; drop_first=True leaves out the
## first category of each encoded column.
bank = pd.get_dummies(bank, drop_first = True)

## The bare `bank.y.value_counts()` that used to sit here was removed: a
## mid-cell expression is evaluated but never displayed, so it had no effect.

## Features are every column except the target "y".
X = bank.drop("y", axis = 1)
y = bank.y

In [9]:
## One-hot encode the remaining text columns; drop_first=True leaves out the
## first category of each encoded column.
bank = pd.get_dummies(data=bank, drop_first=True)

In [14]:
## Class distribution of the target column "y".
bank["y"].value_counts()

0    38172
1     5021
Name: y, dtype: int64

In [15]:
## Split the frame into the target vector y and the feature matrix X
## (every column except "y").
y = bank["y"]
X = bank.drop(columns="y")

In [16]:
## Baseline: logistic regression trained on the original (imbalanced) data.
# Stratifying on y keeps the class ratio the same in the train and test splits;
# the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1, stratify=y)

# Show the class counts of the training split. This has to be printed: a bare
# expression in the middle of a cell is evaluated but never displayed.
print(y_train.value_counts())

# Fit the model; fit() is the last expression, so the estimator repr is the cell output.
lr = LogisticRegression()
lr.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [17]:
## Predict class labels for the held-out test split with the fitted model `lr`.
y_pred = lr.predict(X_test)

In [18]:
# Confusion matrix of the baseline model on the test split
# (rows = true class, columns = predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[9374,  170],
       [ 982,  273]])

In [22]:
# Accuracy and recall of the current predictions on the test split.
baseline_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
baseline_recall = recall_score(y_true=y_test, y_pred=y_pred)

print("The accuracy score is", baseline_accuracy)
print("Recall is", baseline_recall)

The accuracy score is 0.8933234558755441
Recall is 0.21752988047808766


In [25]:
## Balance the training data with SMOTE (oversampling) and refit the model.
# Fresh stratified split so this experiment starts from the original,
# imbalanced training set.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1, stratify=y)

# Resample the training split only; the test split keeps its real class ratio.
# `fit_resample` is the current method name: the older `fit_sample` alias is
# no longer available in recent imbalanced-learn releases.
# A fixed random_state makes the synthetic samples reproducible.
smt = SMOTE(random_state = 1)
X_train, y_train = smt.fit_resample(X_train, y_train)

# Show the class counts after resampling. This has to be printed: a bare
# expression in the middle of a cell is evaluated but never displayed.
print("Class counts after SMOTE:", np.bincount(y_train))

# Refit logistic regression on the resampled training data.
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Predict on the untouched test split.
y_pred = lr.predict(X_test)

# Recall on the test split is this cell's displayed output. The confusion
# matrix and accuracy are shown by the cells that follow, so the discarded
# mid-cell calls to them were removed here.
recall_score(y_test, y_pred)



0.8047808764940239

In [26]:
# Confusion matrix of the current predictions on the test split
# (rows = true class, columns = predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[7670, 1874],
       [ 245, 1010]])

In [28]:
# Accuracy and recall of the current predictions on the test split.
smote_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
smote_recall = recall_score(y_true=y_test, y_pred=y_pred)

print(smote_accuracy)
print("Recall has improve manifold", smote_recall)

0.8037781276044078
Recall has improve manifold 0.8047808764940239


In [33]:
## Balance the training data with NearMiss (undersampling) and refit the model.
# Fresh stratified split so this experiment starts from the original,
# imbalanced training set.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1, stratify=y)

# Resample the training split only; the test split keeps its real class ratio.
# `fit_resample` is the current method name: the older `fit_sample` alias is
# no longer available in recent imbalanced-learn releases.
nr = NearMiss()
X_train, y_train = nr.fit_resample(X_train, y_train)

# Show the class counts after resampling. This has to be printed: a bare
# expression in the middle of a cell is evaluated but never displayed.
print("Class counts after NearMiss:", np.bincount(y_train))

# Refit logistic regression on the resampled training data.
lr = LogisticRegression()
lr.fit(X_train, y_train)

# Predict on the untouched test split.
y_pred = lr.predict(X_test)



In [32]:
# Confusion matrix of the current predictions on the test split
# (rows = true class, columns = predicted class).
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[5102, 4442],
       [ 162, 1093]])

In [31]:
# Accuracy and recall of the current predictions on the test split.
nearmiss_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
nearmiss_recall = recall_score(y_true=y_test, y_pred=y_pred)

print("Accuracy score is", nearmiss_accuracy)
print("Recall value", nearmiss_recall)

Accuracy score is 0.573664228169275
Recall value 0.8709163346613545
