# Handling Imbalaned Dataset with Machine Learning

In [2]:
import pandas as pd
df = pd.read_csv('creditcard.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
df.shape

(284807, 31)

In [4]:
# To check for null values
df.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [5]:
df.isnull().count()

Time      284807
V1        284807
V2        284807
V3        284807
V4        284807
V5        284807
V6        284807
V7        284807
V8        284807
V9        284807
V10       284807
V11       284807
V12       284807
V13       284807
V14       284807
V15       284807
V16       284807
V17       284807
V18       284807
V19       284807
V20       284807
V21       284807
V22       284807
V23       284807
V24       284807
V25       284807
V26       284807
V27       284807
V28       284807
Amount    284807
Class     284807
dtype: int64

#### No null values

In [6]:
#### To check whether the dataset is balanced or no
df['Class'].value_counts()
#### It is not balanced, there is huge difference between 0 and 1

0    284315
1       492
Name: Class, dtype: int64

In [7]:
#### Dividing the dataset into Independent and Dependent feature
X = df.drop('Class', axis= 'columns')
y = df.Class

In [8]:
# We need to import the models
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import KFold, GridSearchCV
import numpy as np

In [9]:
10.0 ** np.arange(-2, 3)

array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02])

### Hyper Parameter Tuning -- Cross Validation, KFold

In [10]:
log_reg = LogisticRegression()
grid = {'C': 10.0 ** np.arange(-2, 3), 'penalty':['l1', 'l2']}
# cv means the cross validation types
cv = KFold(n_splits=5, shuffle=False, random_state=None)

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7)

In [12]:
clf = GridSearchCV(log_reg, grid, cv = cv,n_jobs=-1 ,scoring= 'f1_macro')
clf.fit(X_train, y_train)

25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "C:\ProgramData\Anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty.

        nan 0.84650553       

In [13]:
y_pred = clf.predict(X_test)
print(y_pred)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[0 0 1 ... 1 0 0]
[[85278    35]
 [   40    90]]
0.9991222218320986
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85313
           1       0.72      0.69      0.71       130

    accuracy                           1.00     85443
   macro avg       0.86      0.85      0.85     85443
weighted avg       1.00      1.00      1.00     85443



In [14]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)

In [15]:
y_pred = classifier.predict(X_test)
print(y_pred)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[0 0 0 ... 1 0 0]
[[85303    10]
 [   26   104]]
0.9995786664794073
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85313
           1       0.91      0.80      0.85       130

    accuracy                           1.00     85443
   macro avg       0.96      0.90      0.93     85443
weighted avg       1.00      1.00      1.00     85443



# Under Sampling

In [16]:
y_train.value_counts()

0    199002
1       362
Name: Class, dtype: int64

In [17]:
# For the undersampling technique, we are going toimport the Nearmiss module from the imblearn library
from collections import Counter
from imblearn.under_sampling import NearMiss
ns = NearMiss(sampling_strategy = 0.8)
X_train_ns, y_train_ns = ns.fit_resample(X_train, y_train)
print('The number of clases before fit {}'.format(Counter(y_train)))
print('The number of clases after fit {}'.format(Counter(y_train_ns)))

The number of clases before fit Counter({0: 199002, 1: 362})
The number of clases after fit Counter({0: 452, 1: 362})


In [18]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(X_train_ns, y_train_ns)

In [19]:
y_pred = classifier.predict(X_test)
print(y_pred)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[0 1 1 ... 1 1 1]
[[67823 17490]
 [   11   119]]
0.7951733904474327
              precision    recall  f1-score   support

           0       1.00      0.79      0.89     85313
           1       0.01      0.92      0.01       130

    accuracy                           0.80     85443
   macro avg       0.50      0.86      0.45     85443
weighted avg       1.00      0.80      0.88     85443



# Over Sampling

In [20]:
from imblearn.over_sampling import RandomOverSampler
from collections import Counter
os = RandomOverSampler(sampling_strategy= 0.5)
X_train_os, y_train_os = os.fit_resample(X_train, y_train)

print('The number of clases before fit {}'.format(Counter(y_train)))
print('The number of clases after fit {}'.format(Counter(y_train_os)))

The number of clases before fit Counter({0: 199002, 1: 362})
The number of clases after fit Counter({0: 199002, 1: 99501})


In [21]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(X_train_os, y_train_os)

In [22]:
y_pred = classifier.predict(X_test)
print(y_pred)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[0 0 0 ... 1 0 0]
[[85304     9]
 [   25   105]]
0.999602073897218
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85313
           1       0.92      0.81      0.86       130

    accuracy                           1.00     85443
   macro avg       0.96      0.90      0.93     85443
weighted avg       1.00      1.00      1.00     85443



# SMOTETomek TECHNIQUE

In [23]:
from imblearn.combine import SMOTETomek

In [24]:
sm = SMOTETomek(sampling_strategy = 0.5)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

print('The number of clases before fit {}'.format(Counter(y_train)))
print('The number of clases after fit {}'.format(Counter(y_train_sm)))

The number of clases before fit Counter({0: 199002, 1: 362})
The number of clases after fit Counter({0: 198171, 1: 98670})


In [25]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier()
classifier.fit(X_train_sm, y_train_sm)

In [26]:
y_pred = classifier.predict(X_test)
print(y_pred)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[0 0 0 ... 1 0 0]
[[85291    22]
 [   21   109]]
0.9994967405170698
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85313
           1       0.83      0.84      0.84       130

    accuracy                           1.00     85443
   macro avg       0.92      0.92      0.92     85443
weighted avg       1.00      1.00      1.00     85443



# Ensemble Techniques

In [27]:
from imblearn.ensemble import EasyEnsembleClassifier

In [31]:
easy = EasyEnsembleClassifier(sampling_strategy = 0.8)
easy.fit(X_train, y_train)

In [32]:
y_pred = easy.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[83113  2200]
 [   11   119]]
0.9741230996102664
              precision    recall  f1-score   support

           0       1.00      0.97      0.99     85313
           1       0.05      0.92      0.10       130

    accuracy                           0.97     85443
   macro avg       0.53      0.94      0.54     85443
weighted avg       1.00      0.97      0.99     85443

