# Handling Imbalanced Dataset with Machine Learning

In [1]:
import numpy as np
import pandas as pd
df=pd.read_csv('/content/creditcard.csv')
df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5969,6634,-1.611463,0.190648,0.901715,1.531254,-1.535865,0.799245,1.513786,0.495829,0.200390,...,0.211223,0.007477,1.026272,0.057628,-0.024955,-0.368263,0.081684,0.140669,458.92,0.0
5970,6635,-1.420272,1.449354,1.320110,-1.894320,0.913695,0.454601,0.894179,-0.385450,2.433841,...,-0.529027,-0.368394,-0.247773,-1.189156,-0.126040,0.701487,0.277333,-0.222694,0.77,0.0
5971,6637,-1.206696,0.284728,2.152053,-2.850437,-0.437285,-0.238376,-0.333341,0.334679,2.870542,...,0.039460,0.464476,-0.457193,-0.556105,0.517579,0.008006,0.366054,0.185008,14.00,0.0
5972,6644,1.067611,0.091006,-0.153917,0.704233,0.113894,-0.826866,0.567690,-0.464181,0.957295,...,-0.476723,-1.410090,-0.037550,-0.177773,0.321810,0.114930,-0.109640,0.023205,139.90,0.0


In [2]:
df.shape

(5974, 31)

In [3]:
# check the null values and drop it
df.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       1
V19       1
V20       1
V21       1
V22       1
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

In [4]:
# check my dependent feature ( output value ) i.e. how many 0 or 1 (not fraud or fraud)
df['Class'].value_counts()
# It is imbalanced data set
# 0 - not fraud transaction
# 1 - fraud transaction

0.0    5970
1.0       3
Name: Class, dtype: int64

In [5]:
# Create my X and Y
x=df.drop('Class',axis=1) # independent feature
y=df['Class'] # dependent feature

In [6]:
x

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62
1,0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69
2,1,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.524980,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.208038,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5969,6634,-1.611463,0.190648,0.901715,1.531254,-1.535865,0.799245,1.513786,0.495829,0.200390,...,0.935516,0.211223,0.007477,1.026272,0.057628,-0.024955,-0.368263,0.081684,0.140669,458.92
5970,6635,-1.420272,1.449354,1.320110,-1.894320,0.913695,0.454601,0.894179,-0.385450,2.433841,...,0.884055,-0.529027,-0.368394,-0.247773,-1.189156,-0.126040,0.701487,0.277333,-0.222694,0.77
5971,6637,-1.206696,0.284728,2.152053,-2.850437,-0.437285,-0.238376,-0.333341,0.334679,2.870542,...,0.082038,0.039460,0.464476,-0.457193,-0.556105,0.517579,0.008006,0.366054,0.185008,14.00
5972,6644,1.067611,0.091006,-0.153917,0.704233,0.113894,-0.826866,0.567690,-0.464181,0.957295,...,0.211006,-0.476723,-1.410090,-0.037550,-0.177773,0.321810,0.114930,-0.109640,0.023205,139.90


In [7]:
y

0       0.0
1       0.0
2       0.0
3       0.0
4       0.0
       ... 
5969    0.0
5970    0.0
5971    0.0
5972    0.0
5973    NaN
Name: Class, Length: 5974, dtype: float64

# Cross validation like K fold and Hyper Parameter tuning

In [8]:
# apply simple algorithm i.e. Random forest
# one thing about imbalanced dataset is it doesnot heavily impact ensemble techniques where decision trees are used(Random Forest)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.model_selection import KFold # cross validation
from sklearn.model_selection import GridSearchCV

In [9]:
10.0**np.arange(-2,3) # 10 to the power of

array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02])

In [24]:
log_class=LogisticRegression(penalty='l2')
grid={'C':10.0**np.arange(-2,3),'penalty':['l2']} # c is basically a hyper parameter , penalty is 'L2' is also a hyper parameter
cv=KFold(n_splits=5,shuffle=False,random_state=None)

In [25]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,train_size=0.7)

In [26]:
# run gridsearch CV
clf = GridSearchCV(log_class, grid, cv=cv, n_jobs=-1, scoring='f1_macro')
clf.fit(x_train, y_train)
# grid is the parameter for param_grid,
# fi macro is scoring parameter
# n_job is the number of cores of computer
# cv is cross validation


5 fits failed out of a total of 25.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/linear_model/_logistic.py", line 1241, in fit
    raise ValueError(
ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0.0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.h

In [29]:
x_test = x_test.dropna()
y_test = y_test[x_test.index]
x_test=x_test.dropna()
y_pred=clf.predict(x_test)
confusion_matrix(y_test,y_pred)

array([[1789,    1],
       [   2,    0]])

In [30]:
print(accuracy_score(y_pred,y_test))
# it is a imbalanced datset
# Donot focus on accuracy
# focus on reacall and precision
# we should focus on reducing (40,57)

0.9983258928571429


In [31]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1791
         1.0       0.00      0.00      0.00         1

    accuracy                           1.00      1792
   macro avg       0.50      0.50      0.50      1792
weighted avg       1.00      1.00      1.00      1792



In [32]:
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier()
classifier.fit(x_train,y_train)

# Decision Tree forms hierichal structure  with respect to imbalanced data set, it gives us a good accuracy

In [33]:
y_pred=classifier.predict(x_test)
print(confusion_matrix(y_test,y_pred))

[[1790    0]
 [   2    0]]


In [34]:
print(accuracy_score(y_pred,y_test))
# it is a imbalanced datset

0.9988839285714286


In [35]:
print(classification_report(y_pred,y_test))
# check the difference with respect to previous decision tree

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1792
         1.0       0.00      0.00      0.00         0

    accuracy                           1.00      1792
   macro avg       0.50      0.50      0.50      1792
weighted avg       1.00      1.00      1.00      1792



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [36]:
y_train.value_counts()

0.0    4180
1.0       1
Name: Class, dtype: int64

In [37]:
# just increase the class weights in random forest classifier
class_weight=dict({0:1,1:100})
# always create a dictionary
# 0 and 1 , that weight is 1:1 (output importance )
# for 1:100 , increase the weight (1:100)
# classweight tells on which value to give importance 0 and 1
# without class weight , both 0 and 1 will get equal importance

In [38]:
from sklearn.ensemble import RandomForestClassifier
classifier=RandomForestClassifier(class_weight=class_weight)
classifier.fit(x_train,y_train)


In [39]:
y_pred=classifier.predict(x_test)
print(confusion_matrix(y_test,y_pred))

[[1790    0]
 [   2    0]]


In [40]:
print(accuracy_score(y_pred,y_test))
# it is a imbalanced datset

0.9988839285714286


In [41]:
print(classification_report(y_pred,y_test))
# check the difference with respect to previous decision tree
# not much difference

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1792
         1.0       0.00      0.00      0.00         0

    accuracy                           1.00      1792
   macro avg       0.50      0.50      0.50      1792
weighted avg       1.00      1.00      1.00      1792



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Under Sampling

In [42]:
# By using under sampling , I will try to reduce the points of the maximum labels ( 0 and 1)
# we should not use this as there is loss of data , we should use when there is less data
df['Class'].value_counts() # reduce the value count of zero

0.0    5970
1.0       3
Name: Class, dtype: int64

In [43]:
y_train.value_counts()
# Try to reduce the value by 75 percentage  of 0 feature

0.0    4180
1.0       1
Name: Class, dtype: int64

In [None]:
# pip install imblearn
# for any issues remove the # and run it

In [44]:
from collections import Counter
Counter(y_train)

Counter({0.0: 4180, 1.0: 1})

In [52]:
import numpy as np
y_train_np = np.array(y_train)

In [55]:
from collections import Counter  # counts the number of 0 and 1
from imblearn.under_sampling import NearMiss
ns=NearMiss() # reduce to  80 percentage of the total parameter with respect to 1
x_train_ns , y_train_ns = ns.(x_train,y_train_np)
print(" The number of class before the fit ()",format(Counter(y_train)))
print(" The number of class afer  the fit ()",format(Counter(y_train_ns)))
# connot use because of version issue

AttributeError: 'NearMiss' object has no attribute 'fit_sample'

In [None]:
sklearn.__version__


'1.4.0'

In [None]:
from imblearn.over_sampling import RandomOverSampler
# error

ImportError: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (C:\Users\HP\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py)

In [None]:
from imblearn.combine import SMOTEomek

ImportError: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (C:\Users\HP\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py)

In [None]:
from imblearn.ensemble import EasyEnsembleClassifier

ImportError: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (C:\Users\HP\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py)