In [2]:
import pandas as pd
df = pd.read_csv('creditcard.csv')
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
# df.isnull().sum()

In [5]:
df.shape

(284807, 31)

In [6]:
df['Class'].value_counts()

Class
0    284315
1       492
Name: count, dtype: int64

In [6]:
## Separating into dependent and independant features.
X = df.drop('Class',axis=1)
y = df['Class']

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix,accuracy_score, classification_report
from sklearn.model_selection import KFold, GridSearchCV
import numpy as np

In [4]:
10.0**np.arange(-2,3)

array([1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02])

### Cross validation and Hyperparameter Tunning.

In [44]:
log_class = LogisticRegression()
grid = {'C':10.0**np.arange(-2,3),'penalty': ['l2','l1'], "solver": ["liblinear"]}
cv = KFold(n_splits=5, random_state=None, shuffle=False)

In [45]:
clf = GridSearchCV(log_class,grid,cv=cv,n_jobs=-1,scoring='f1_macro')

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [47]:
clf.fit(X_train,y_train)

In [48]:
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[85295    14]
 [   51    83]]
0.9992392589211521
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85309
           1       0.86      0.62      0.72       134

    accuracy                           1.00     85443
   macro avg       0.93      0.81      0.86     85443
weighted avg       1.00      1.00      1.00     85443



we have to perform cross validation and Hyper parameter tuning to deal with imbalance data.

In [49]:
from sklearn.ensemble import RandomForestClassifier
classifer = RandomForestClassifier() # can set class weighted parameter to the class which is less in number.
classifer.fit(X_train,y_train)

In [51]:
y_pred = classifer.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[85302     7]
 [   32   102]]
0.9995435553526912
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85309
           1       0.94      0.76      0.84       134

    accuracy                           1.00     85443
   macro avg       0.97      0.88      0.92     85443
weighted avg       1.00      1.00      1.00     85443



Ensemble techniques works well with imbalanced data.

### Under Sampling.
* This method should not use used in every case becuase it causes loss of info.
* In this method, we reduces the number of samples from the maximum record class.

In [15]:
y_train.value_counts()

Class
0    199024
1       340
Name: count, dtype: int64

In [6]:
from imblearn.under_sampling import NearMiss
from collections import Counter
ns = NearMiss(sampling_strategy='majority')
X_train_ns, y_train_ns = ns.fit_resample(X_train,y_train)
print(f'Count before under sampling: {Counter(y_train)}')
print(f'Count after under sampling: {Counter(y_train_ns)}')

Count before under sampling: Counter({0: 199027, 1: 337})
Count after under sampling: Counter({0: 337, 1: 337})


In [25]:
from sklearn.ensemble import RandomForestClassifier
classifer = RandomForestClassifier() # can set class weighted parameter to the class which is less in number.
classifer.fit(X_train_ns,y_train_ns)

In [26]:
y_pred = classifer.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[65372 19919]
 [   10   142]]
0.7667567852252378
              precision    recall  f1-score   support

           0       1.00      0.77      0.87     85291
           1       0.01      0.93      0.01       152

    accuracy                           0.77     85443
   macro avg       0.50      0.85      0.44     85443
weighted avg       1.00      0.77      0.87     85443



Note: Here we can see Under sampling has reduced the scores due to loss of data.

Over Sampling: it create more points aroud the neiboring points.

In [7]:
from imblearn.over_sampling import RandomOverSampler
os = RandomOverSampler()
X_train_os, y_train_os = os.fit_resample(X_train,y_train)
print(f'Count before under sampling: {Counter(y_train)}')
print(f'Count after under sampling: {Counter(y_train_os)}')

Count before under sampling: Counter({0: 199027, 1: 337})
Count after under sampling: Counter({0: 199027, 1: 199027})


In [8]:
from sklearn.ensemble import RandomForestClassifier
classifer = RandomForestClassifier() # can set class weighted parameter to the class which is less in number.
classifer.fit(X_train_os,y_train_os)
y_pred = classifer.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[85287     1]
 [   42   113]]
0.9994967405170698
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85288
           1       0.99      0.73      0.84       155

    accuracy                           1.00     85443
   macro avg       1.00      0.86      0.92     85443
weighted avg       1.00      1.00      1.00     85443



#### SMOTETomek

In [10]:
from imblearn.combine import SMOTETomek
from collections import Counter
smot = SMOTETomek()
X_train_os, y_train_os = smot.fit_resample(X_train,y_train)
print(f'Count before under sampling: {Counter(y_train)}')
print(f'Count after under sampling: {Counter(y_train_os)}')

Count before under sampling: Counter({0: 199034, 1: 330})
Count after under sampling: Counter({0: 198521, 1: 198521})


In [11]:
from sklearn.ensemble import RandomForestClassifier
classifer = RandomForestClassifier() # can set class weighted parameter to the class which is less in number.
classifer.fit(X_train_os,y_train_os)
y_pred = classifer.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))
print(classification_report(y_test,y_pred))

[[85263    18]
 [   35   127]]
0.9993797034280163
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     85281
           1       0.88      0.78      0.83       162

    accuracy                           1.00     85443
   macro avg       0.94      0.89      0.91     85443
weighted avg       1.00      1.00      1.00     85443

