# Handling Imbalanced Dataset with Machine Learning

# Not only ACCURACY, try to improve PRECISION and RECALL as well.

Try to get more True Positive, True Negatives and less False Positive, False Negative in Confusion Matrix

In [1]:
pip install imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
df=pd.read_csv('creditcard.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [3]:
df.shape

(284807, 31)

In [4]:
df.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [5]:
df['Class'].value_counts()

#Below values indicate that, the dataset is imabalanced
#Because, there's a huge difference between 0's and 1's of 'Class' feature

0    284315
1       492
Name: Class, dtype: int64

In [6]:
#Independent and Dependent Features
X=df.drop("Class",axis=1)
y=df.Class

# Computing the Accuracy, Precision and Recall of Imbalanaced dataset without applying any techniques to avoid imbalanced dataset

# 1. K-Fold Cross Validation and Hyperparameter Tuning helps in handling imbalanced dataset

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import KFold
import numpy as np
from sklearn.model_selection import GridSearchCV

In [8]:
10.0 **np.arange(-2,3)

array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02])

In [9]:
log_class=LogisticRegression()
grid={'C':10.0 **np.arange(-2,3),'penalty':['l1','l2']}
cv=KFold(n_splits=5,random_state=None,shuffle=False)

In [10]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=0.7)

In [11]:
clf=GridSearchCV(log_class,grid,cv=cv,n_jobs=-1,scoring='f1_macro')
clf.fit(X_train,y_train)

        nan 0.83955585        nan 0.83947582]
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


GridSearchCV(cv=KFold(n_splits=5, random_state=None, shuffle=False),
             estimator=LogisticRegression(), n_jobs=-1,
             param_grid={'C': array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02]),
                         'penalty': ['l1', 'l2']},
             scoring='f1_macro')

In [12]:
y_pred=clf.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[85252    34]
 [   57   100]]
0.998934962489613
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85286
           1       0.75      0.64      0.69       157

    accuracy                           1.00     85443
   macro avg       0.87      0.82      0.84     85443
weighted avg       1.00      1.00      1.00     85443



In [13]:
347*100

34700

In [14]:
y_train.value_counts()

0    199029
1       335
Name: Class, dtype: int64

In [15]:
#Giving more importance to class '1' compared to class '0'

class_weight=dict({0:1,1:100})

In [16]:
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier(class_weight=class_weight)
classifier.fit(X_train,y_train)

RandomForestClassifier(class_weight={0: 1, 1: 100})

y_pred=classifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

In [17]:
y_pred=classifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

##Observe ACCURACY, F1 SCORE, RECALL values are better when compared to Logistic Regression

[[85283     3]
 [   32   125]]
0.9995903701883126
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85286
           1       0.98      0.80      0.88       157

    accuracy                           1.00     85443
   macro avg       0.99      0.90      0.94     85443
weighted avg       1.00      1.00      1.00     85443



# 2. Under Sampling - Reducing the size of that class which has more datapoints (Here, '0')

In [18]:
from collections import Counter
Counter(y_train)

Counter({0: 199029, 1: 335})

In [24]:
from collections import Counter
from imblearn.under_sampling import NearMiss
ns=NearMiss(0.8)
X_train_ns,y_train_ns=ns.fit_resample(X_train,y_train)
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_ns)))

#The class with more number of datapoints = datapoints in the other class(335) / 0.8
# 418 = 335 / 0.8



The number of classes before fit Counter({0: 199029, 1: 335})
The number of classes after fit Counter({0: 418, 1: 335})


In [25]:
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier()
classifier.fit(X_train_ns,y_train_ns)

RandomForestClassifier()

In [27]:
y_pred=classifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

#We have a lot of FALSE POSITIVES, hence UNDERSAMPLING is not so good for handling imbalanced dataset

[[63609 21677]
 [   10   147]]
0.7461816649696289
              precision    recall  f1-score   support

           0       1.00      0.75      0.85     85286
           1       0.01      0.94      0.01       157

    accuracy                           0.75     85443
   macro avg       0.50      0.84      0.43     85443
weighted avg       1.00      0.75      0.85     85443



# 3. Over Sampling

In [28]:
from imblearn.over_sampling import RandomOverSampler

In [32]:
os=RandomOverSampler(0.75)
X_train_ns,y_train_ns=os.fit_resample(X_train,y_train)
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_ns)))

#149271 = 199029 * 0.75
#In oversampling, we increase the datapoints for that class which has less datapoints



The number of classes before fit Counter({0: 199029, 1: 335})
The number of classes after fit Counter({0: 199029, 1: 149271})


In [33]:
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier()
classifier.fit(X_train_ns,y_train_ns)

RandomForestClassifier()

In [34]:
y_pred=classifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

#Better results when compared to UNDERsampling

[[85281     5]
 [   27   130]]
0.9996254813150287
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85286
           1       0.96      0.83      0.89       157

    accuracy                           1.00     85443
   macro avg       0.98      0.91      0.95     85443
weighted avg       1.00      1.00      1.00     85443



# Undersampling would decrease the proportion of your majority class until the number is similar to the minority class. At the same time, Oversampling would resample the minority class proportion following the majority class proportion.

# 4. SMOTETomek

SMOTE is an oversampling technique and creates new minority class synthetic samples, and Tomek Links is an undersampling technique

In [36]:
from imblearn.combine import SMOTETomek

In [38]:
os=SMOTETomek(0.75)
X_train_ns,y_train_ns=os.fit_resample(X_train,y_train)
print("The number of classes before fit {}".format(Counter(y_train)))
print("The number of classes after fit {}".format(Counter(y_train_ns)))

The number of classes before fit Counter({0: 199029, 1: 335})
The number of classes after fit Counter({0: 198301, 1: 148543})


In [42]:
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier()
classifier.fit(X_train_ns,y_train_ns)

RandomForestClassifier()

In [45]:
y_pred=classifier.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

#Observe that the results are better when compared to OVERsampling and Undersampling

[[85268    18]
 [   24   133]]
0.9995084442259752
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85286
           1       0.88      0.85      0.86       157

    accuracy                           1.00     85443
   macro avg       0.94      0.92      0.93     85443
weighted avg       1.00      1.00      1.00     85443



# 5. Ensemble Techniques

In [46]:
from imblearn.ensemble import EasyEnsembleClassifier

In [50]:
easy=EasyEnsembleClassifier()
easy.fit(X_train,y_train)

EasyEnsembleClassifier()

In [51]:
y_pred=easy.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[82138  3148]
 [    8   149]]
0.9630630946947087
              precision    recall  f1-score   support

           0       1.00      0.96      0.98     85286
           1       0.05      0.95      0.09       157

    accuracy                           0.96     85443
   macro avg       0.52      0.96      0.53     85443
weighted avg       1.00      0.96      0.98     85443

