In [83]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
from imblearn.under_sampling import RandomUnderSampler, NearMiss, TomekLinks
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.pipeline import Pipeline

In [8]:
df = pd.read_csv('creditcard.csv')

In [9]:
pd.set_option('display.max_columns', 50)
df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.551600,-0.617801,-0.991390,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.524980,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.119670,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,4.356170,-1.593105,2.711941,-0.689256,4.626942,-0.924459,1.107641,1.991691,0.510632,-0.682920,1.475829,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,-0.975926,-0.150189,0.915802,1.214756,-0.675143,1.164931,-0.711757,-0.025693,-1.221179,-1.545556,0.059616,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,-0.484782,0.411614,0.063119,-0.183699,-0.510602,1.329284,0.140716,0.313502,0.395652,-0.577252,0.001396,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,-0.399126,-1.933849,-0.962886,-1.042082,0.449624,1.962563,-0.608577,0.509928,1.113981,2.897849,0.127434,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [10]:
df.shape

(284807, 31)

In [11]:
df.Class.value_counts(normalize=True)*100

Class
0    99.827251
1     0.172749
Name: proportion, dtype: float64

In [12]:
df.duplicated().sum(), df.duplicated(), ~df.duplicated()

(np.int64(1081),
 0         False
 1         False
 2         False
 3         False
 4         False
           ...  
 284802    False
 284803    False
 284804    False
 284805    False
 284806    False
 Length: 284807, dtype: bool,
 0         True
 1         True
 2         True
 3         True
 4         True
           ... 
 284802    True
 284803    True
 284804    True
 284805    True
 284806    True
 Length: 284807, dtype: bool)

In [13]:
df[~df.duplicated()]

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.551600,-0.617801,-0.991390,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.524980,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.119670,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,4.356170,-1.593105,2.711941,-0.689256,4.626942,-0.924459,1.107641,1.991691,0.510632,-0.682920,1.475829,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,-0.975926,-0.150189,0.915802,1.214756,-0.675143,1.164931,-0.711757,-0.025693,-1.221179,-1.545556,0.059616,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,-0.484782,0.411614,0.063119,-0.183699,-0.510602,1.329284,0.140716,0.313502,0.395652,-0.577252,0.001396,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,-0.399126,-1.933849,-0.962886,-1.042082,0.449624,1.962563,-0.608577,0.509928,1.113981,2.897849,0.127434,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [14]:
df.drop_duplicates(inplace=True)
df

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.551600,-0.617801,-0.991390,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.524980,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.119670,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,4.356170,-1.593105,2.711941,-0.689256,4.626942,-0.924459,1.107641,1.991691,0.510632,-0.682920,1.475829,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,-0.975926,-0.150189,0.915802,1.214756,-0.675143,1.164931,-0.711757,-0.025693,-1.221179,-1.545556,0.059616,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,-0.484782,0.411614,0.063119,-0.183699,-0.510602,1.329284,0.140716,0.313502,0.395652,-0.577252,0.001396,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,-0.399126,-1.933849,-0.962886,-1.042082,0.449624,1.962563,-0.608577,0.509928,1.113981,2.897849,0.127434,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [15]:
df = df.drop('Time', axis=1)
df

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.551600,-0.617801,-0.991390,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.524980,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.119670,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,4.356170,-1.593105,2.711941,-0.689256,4.626942,-0.924459,1.107641,1.991691,0.510632,-0.682920,1.475829,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,-0.975926,-0.150189,0.915802,1.214756,-0.675143,1.164931,-0.711757,-0.025693,-1.221179,-1.545556,0.059616,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,-0.484782,0.411614,0.063119,-0.183699,-0.510602,1.329284,0.140716,0.313502,0.395652,-0.577252,0.001396,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,-0.399126,-1.933849,-0.962886,-1.042082,0.449624,1.962563,-0.608577,0.509928,1.113981,2.897849,0.127434,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [16]:
df.info(), df.isna().sum()

<class 'pandas.core.frame.DataFrame'>
Index: 283726 entries, 0 to 284806
Data columns (total 30 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   V1      283726 non-null  float64
 1   V2      283726 non-null  float64
 2   V3      283726 non-null  float64
 3   V4      283726 non-null  float64
 4   V5      283726 non-null  float64
 5   V6      283726 non-null  float64
 6   V7      283726 non-null  float64
 7   V8      283726 non-null  float64
 8   V9      283726 non-null  float64
 9   V10     283726 non-null  float64
 10  V11     283726 non-null  float64
 11  V12     283726 non-null  float64
 12  V13     283726 non-null  float64
 13  V14     283726 non-null  float64
 14  V15     283726 non-null  float64
 15  V16     283726 non-null  float64
 16  V17     283726 non-null  float64
 17  V18     283726 non-null  float64
 18  V19     283726 non-null  float64
 19  V20     283726 non-null  float64
 20  V21     283726 non-null  float64
 21  V22     283726 

(None,
 V1        0
 V2        0
 V3        0
 V4        0
 V5        0
 V6        0
 V7        0
 V8        0
 V9        0
 V10       0
 V11       0
 V12       0
 V13       0
 V14       0
 V15       0
 V16       0
 V17       0
 V18       0
 V19       0
 V20       0
 V21       0
 V22       0
 V23       0
 V24       0
 V25       0
 V26       0
 V27       0
 V28       0
 Amount    0
 Class     0
 dtype: int64)

In [17]:
df.groupby('Class').Amount.mean()

Class
0     88.413575
1    123.871860
Name: Amount, dtype: float64

In [18]:
X = df.drop('Class', axis=1)
y = df.Class

In [19]:
X.shape, y.shape

((283726, 29), (283726,))

In [20]:
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, stratify=y, train_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, stratify=y_train_val, train_size=0.2, random_state=42)

In [21]:
y_train_val.value_counts(normalize=True), y_test.value_counts(normalize=True)

(Class
 0    0.998326
 1    0.001674
 Name: proportion, dtype: float64,
 Class
 0    0.998335
 1    0.001665
 Name: proportion, dtype: float64)

In [22]:
y_train.value_counts(normalize=True), y_val.value_counts(normalize=True)

(Class
 0    0.998326
 1    0.001674
 Name: proportion, dtype: float64,
 Class
 0    0.998326
 1    0.001674
 Name: proportion, dtype: float64)

In [23]:
lr = LogisticRegression(max_iter=1000)
kf = StratifiedKFold(n_splits=5, shuffle=False)

In [24]:
recall_scores = []
precision_scores = []
f1_scores = []
accuracy_scores = []

for train_index, val_index in tqdm(kf.split(X_train_val, y_train_val)):
    X_fold_train, X_fold_val = X_train_val.iloc[train_index], X_train_val.iloc[val_index]
    y_fold_train, y_fold_val = y_train_val.iloc[train_index], y_train_val.iloc[val_index]

    lr.fit(X_fold_train, y_fold_train)

    y_pred = lr.predict(X_fold_val)

    recall = recall_score(y_fold_val, y_pred)
    precision = precision_score(y_fold_val, y_pred)
    f1 = f1_score(y_fold_val, y_pred)
    accuracy = accuracy_score(y_fold_val, y_pred)

    recall_scores.append(recall)
    precision_scores.append(precision)
    f1_scores.append(f1)
    accuracy_scores.append(accuracy)

5it [00:19,  3.83s/it]


In [25]:
average_recall = np.mean(recall_scores)
average_precision = np.mean(precision_scores)
average_f1 = np.mean(f1_scores)
average_accuracy = np.mean(accuracy_scores)

In [26]:
scores_df = pd.DataFrame(data=[(average_recall, average_precision, average_f1, average_accuracy)], columns=['recall', 'precision', 'f1', 'accuracy'])
scores_df

Unnamed: 0,recall,precision,f1,accuracy
0,0.494737,0.785578,0.604915,0.998925


In [27]:
lr.fit(X_train, y_train)

In [28]:
y_prob = lr.predict_proba(X_val)[:, 1]
y_pred = (y_prob >= 0.002).astype(int)
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [29]:
recall = recall_score(y_val, y_pred)
precision = precision_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred)
accuracy = accuracy_score(y_val, y_pred) 

pd.DataFrame(data=[(recall, precision, f1, accuracy)], columns=['recall', 'precision', 'f1', 'accuracy'])

Unnamed: 0,recall,precision,f1,accuracy
0,0.855263,0.025958,0.050388,0.94603


In [30]:
cm = confusion_matrix(y_val, y_pred)
cm

#(tn, fp)
#(fn, tp)

array([[42881,  2439],
       [   11,    65]])

In [31]:
rus = RandomUnderSampler(random_state=42)

In [32]:
X_under, y_under = rus.fit_resample(X_train, y_train)

In [33]:
X_under.shape, X_train.shape

((38, 29), (11349, 29))

In [34]:
y_under.value_counts()

Class
0    19
1    19
Name: count, dtype: int64

In [56]:
random_under_pipeline = Pipeline(steps=[
    #('standard_scaler', StandardScaler()),
    ('random_under', RandomUnderSampler(random_state=42)),
    ('lr', LogisticRegression(max_iter=1000, random_state=13))
])
random_under_pipeline

In [57]:
recall_scores = []
precision_scores = []
f1_scores = []
accuracy_scores = []

for train_index, val_index in tqdm(kf.split(X_train_val, y_train_val)):
    X_fold_train, X_fold_val = X_train_val.iloc[train_index], X_train_val.iloc[val_index]
    y_fold_train, y_fold_val = y_train_val.iloc[train_index], y_train_val.iloc[val_index]

    random_under_pipeline.fit(X_fold_train, y_fold_train)

    y_pred = random_under_pipeline.predict(X_fold_val)

    recall = recall_score(y_fold_val, y_pred)
    precision = precision_score(y_fold_val, y_pred)
    f1 = f1_score(y_fold_val, y_pred)
    accuracy = accuracy_score(y_fold_val, y_pred)

    recall_scores.append(recall)
    precision_scores.append(precision)
    f1_scores.append(f1)
    accuracy_scores.append(accuracy)

5it [00:00,  5.52it/s]


In [58]:
average_recall = np.mean(recall_scores)
average_precision = np.mean(precision_scores)
average_f1 = np.mean(f1_scores)
average_accuracy = np.mean(accuracy_scores)

In [59]:
scores_df = pd.DataFrame(data=[(average_recall, average_precision, average_f1, average_accuracy)], columns=['recall', 'precision', 'f1', 'accuracy'])
scores_df

Unnamed: 0,recall,precision,f1,accuracy
0,0.905263,0.018038,0.035366,0.916733


In [62]:
nearmiss_under_pipeline = Pipeline(steps=[
    ('nearmiss_under', NearMiss()),
    ('lr', LogisticRegression(max_iter=1000, random_state=13))
])
nearmiss_under_pipeline

In [63]:
recall_scores = []
precision_scores = []
f1_scores = []
accuracy_scores = []

for train_index, val_index in tqdm(kf.split(X_train_val, y_train_val)):
    X_fold_train, X_fold_val = X_train_val.iloc[train_index], X_train_val.iloc[val_index]
    y_fold_train, y_fold_val = y_train_val.iloc[train_index], y_train_val.iloc[val_index]

    nearmiss_under_pipeline.fit(X_fold_train, y_fold_train)

    y_pred = nearmiss_under_pipeline.predict(X_fold_val)

    recall = recall_score(y_fold_val, y_pred)
    precision = precision_score(y_fold_val, y_pred)
    f1 = f1_score(y_fold_val, y_pred)
    accuracy = accuracy_score(y_fold_val, y_pred)

    recall_scores.append(recall)
    precision_scores.append(precision)
    f1_scores.append(f1)
    accuracy_scores.append(accuracy)

5it [00:01,  4.06it/s]


In [64]:
average_recall = np.mean(recall_scores)
average_precision = np.mean(precision_scores)
average_f1 = np.mean(f1_scores)
average_accuracy = np.mean(accuracy_scores)

In [65]:
scores_df = pd.DataFrame(data=[(average_recall, average_precision, average_f1, average_accuracy)], columns=['recall', 'precision', 'f1', 'accuracy'])
scores_df

Unnamed: 0,recall,precision,f1,accuracy
0,0.957895,0.002445,0.004878,0.343731


In [68]:
tomelinks_under_pipeline = Pipeline(steps=[
    ('tomelinks_under', TomekLinks()),
    ('lr', LogisticRegression(max_iter=1000, random_state=13))
])
tomelinks_under_pipeline

In [69]:
recall_scores = []
precision_scores = []
f1_scores = []
accuracy_scores = []

for train_index, val_index in tqdm(kf.split(X_train_val, y_train_val)):
    X_fold_train, X_fold_val = X_train_val.iloc[train_index], X_train_val.iloc[val_index]
    y_fold_train, y_fold_val = y_train_val.iloc[train_index], y_train_val.iloc[val_index]

    tomelinks_under_pipeline.fit(X_fold_train, y_fold_train)

    y_pred = tomelinks_under_pipeline.predict(X_fold_val)

    recall = recall_score(y_fold_val, y_pred)
    precision = precision_score(y_fold_val, y_pred)
    f1 = f1_score(y_fold_val, y_pred)
    accuracy = accuracy_score(y_fold_val, y_pred)

    recall_scores.append(recall)
    precision_scores.append(precision)
    f1_scores.append(f1)
    accuracy_scores.append(accuracy)

5it [00:56, 11.39s/it]


In [70]:
average_recall = np.mean(recall_scores)
average_precision = np.mean(precision_scores)
average_f1 = np.mean(f1_scores)
average_accuracy = np.mean(accuracy_scores)

In [71]:
scores_df = pd.DataFrame(data=[(average_recall, average_precision, average_f1, average_accuracy)], columns=['recall', 'precision', 'f1', 'accuracy'])
scores_df

Unnamed: 0,recall,precision,f1,accuracy
0,0.536842,0.767007,0.626916,0.998943


In [73]:
ros = RandomOverSampler(random_state=42)
ros.fit(X_train, y_train)

In [74]:
X_over, y_over = ros.fit_resample(X_train, y_train)

In [76]:
X_over.shape, y_over.shape

((22660, 29), (22660,))

In [78]:
X_over.duplicated().sum(), y_over.value_counts()

(np.int64(11363),
 Class
 0    11330
 1    11330
 Name: count, dtype: int64)

In [87]:
random_over_pipeline = Pipeline(steps=[
    ('random_over', RandomOverSampler(random_state=42)),
    ('lr', LogisticRegression(max_iter=1000, random_state=13))
])
random_over_pipeline

In [88]:
recall_scores = []
precision_scores = []
f1_scores = []
accuracy_scores = []

for train_index, val_index in tqdm(kf.split(X_train_val, y_train_val)):
    X_fold_train, X_fold_val = X_train_val.iloc[train_index], X_train_val.iloc[val_index]
    y_fold_train, y_fold_val = y_train_val.iloc[train_index], y_train_val.iloc[val_index]

    random_over_pipeline.fit(X_fold_train, y_fold_train)

    y_pred = random_over_pipeline.predict(X_fold_val)

    recall = recall_score(y_fold_val, y_pred)
    precision = precision_score(y_fold_val, y_pred)
    f1 = f1_score(y_fold_val, y_pred)
    accuracy = accuracy_score(y_fold_val, y_pred)

    recall_scores.append(recall)
    precision_scores.append(precision)
    f1_scores.append(f1)
    accuracy_scores.append(accuracy)

0it [00:00, ?it/s]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [89]:
average_recall = np.mean(recall_scores)
average_precision = np.mean(precision_scores)
average_f1 = np.mean(f1_scores)
average_accuracy = np.mean(accuracy_scores)

In [90]:
scores_df = pd.DataFrame(data=[(average_recall, average_precision, average_f1, average_accuracy)], columns=['recall', 'precision', 'f1', 'accuracy'])
scores_df

Unnamed: 0,recall,precision,f1,accuracy
0,0.863158,0.039236,0.075044,0.964367


In [91]:
smote_over_pipeline = Pipeline(steps=[
    ('smote_over', SMOTE(random_state=42)),
    ('lr', LogisticRegression(max_iter=1000, random_state=13))
])
smote_over_pipeline

In [92]:
recall_scores = []
precision_scores = []
f1_scores = []
accuracy_scores = []

for train_index, val_index in tqdm(kf.split(X_train_val, y_train_val)):
    X_fold_train, X_fold_val = X_train_val.iloc[train_index], X_train_val.iloc[val_index]
    y_fold_train, y_fold_val = y_train_val.iloc[train_index], y_train_val.iloc[val_index]

    smote_over_pipeline.fit(X_fold_train, y_fold_train)

    y_pred = smote_over_pipeline.predict(X_fold_val)

    recall = recall_score(y_fold_val, y_pred)
    precision = precision_score(y_fold_val, y_pred)
    f1 = f1_score(y_fold_val, y_pred)
    accuracy = accuracy_score(y_fold_val, y_pred)

    recall_scores.append(recall)
    precision_scores.append(precision)
    f1_scores.append(f1)
    accuracy_scores.append(accuracy)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
5it [01:12, 14.46s/it]


In [93]:
average_recall = np.mean(recall_scores)
average_precision = np.mean(precision_scores)
average_f1 = np.mean(f1_scores)
average_accuracy = np.mean(accuracy_scores)

scores_df = pd.DataFrame(data=[(average_recall, average_precision, average_f1, average_accuracy)], columns=['recall', 'precision', 'f1', 'accuracy'])
scores_df

Unnamed: 0,recall,precision,f1,accuracy
0,0.821053,0.060978,0.113263,0.978148
