In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load only the first 80,000 transactions. `nrows` makes the parser stop there
# instead of reading the whole CSV and slicing the result afterwards.
df = pd.read_csv('./tutorial_data/creditcard.csv', nrows=80000)
print(df.head(3))

# Feature matrix: every column except 'Time', 'Amount' and the label.
X = df.drop(columns=['Time', 'Amount', 'Class']).values
# Target vector: the 'Class' column (0 = non-fraud, 1 = fraud).
y = df['Class'].values

print(f'Shape of X: {X.shape}')
print(f'Shape of y: {y.shape}')
print(f'Fraud Cases: {y.sum()}')
# Heavily imbalanced dataset: only 196 fraud cases among the 80,000 rows.

   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   

        V26       V27       V28  Amount  Class  
0 -0.189115  0.133558 -0.021053  149.62      0  
1  0.125895 -0.008983  0.014724    2.69      0  
2 -0.139097 -0.055353 -0.059752  378.66      0  

[3 rows x 31 columns]
Shape of X: (80000, 28)
Shape of y: (80000,)
Fraud Cases: 196


In [3]:
from sklearn.linear_model import LogisticRegression

# Logistic regression in which a fraud sample (class 1) carries twice the
# weight of a non-fraud sample (class 0).
mod = LogisticRegression(max_iter=1000, class_weight={0: 1, 1: 2})
mod.fit(X, y)

# Number of training rows the fitted model labels as fraud.
mod.predict(X).sum()

171

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Grid search over the weight given to the fraud class (1, 2, 3) with 4-fold
# cross-validation. No `scoring` is passed, so candidates are ranked by the
# classifier's default score (accuracy).
grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid={'class_weight': [{0: 1, 1: v} for v in range(1, 4)]},
    cv=4,
)
grid.fit(X, y)

In [5]:
# Turn the grid search's `cv_results_` dict into a table with one row per
# candidate (fit/score timings, per-split test scores, mean/std and rank).
grid_results = pd.DataFrame(data=grid.cv_results_)
print(grid_results)

   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       1.394206      0.072633         0.005563        0.001752   
1       1.338704      0.671255         0.007093        0.003436   
2       2.441844      0.420996         0.011762        0.004497   

  param_class_weight                          params  split0_test_score  \
0       {0: 1, 1: 1}  {'class_weight': {0: 1, 1: 1}}            0.99405   
1       {0: 1, 1: 2}  {'class_weight': {0: 1, 1: 2}}            0.99025   
2       {0: 1, 1: 3}  {'class_weight': {0: 1, 1: 3}}            0.98730   

   split1_test_score  split2_test_score  split3_test_score  mean_test_score  \
0            0.99835            0.99945            0.99780         0.997413   
1            0.99840            0.99960            0.99805         0.996575   
2            0.99845            0.99960            0.99815         0.995875   

   std_test_score  rank_test_score  
0        0.002030                1  
1        0.003697                2  
2 

#### Der Accuracy-Score ist sehr gut, aber vermutlich nur, weil es kaum Fraud-Fälle in den Daten gibt und das Modell dementsprechend fast immer „kein Fraud“ vorhersagt.
#### Deshalb wäre es sinnvoll, eine andere Metrik zu nutzen (z. B. Precision oder Recall).

In [12]:
from sklearn.metrics import precision_score, recall_score, make_scorer

# Sweep the fraud-class weight over 30 evenly spaced values in [1, 20] and
# evaluate every candidate on precision and recall instead of accuracy.
grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid={'class_weight': [{0: 1, 1: v} for v in np.linspace(1, 20, 30)]},
    # The dict keys become part of the cv_results_ column names
    # (e.g. 'mean_test_precision', 'mean_test_recall_score').
    scoring={
        'precision': make_scorer(precision_score),
        'recall_score': make_scorer(recall_score),
    },
    # With more than one scorer, `refit` names the metric used to pick the
    # best candidate, which is then refit on the full data.
    refit='precision',
    return_train_score=True,
    cv=10,
    # 30 candidates x 10 folds = 300 independent fits: spread them over all
    # available cores. The scores themselves are unaffected.
    n_jobs=-1,
)
grid.fit(X, y)
print(pd.DataFrame(grid.cv_results_))

   mean_fit_time  std_fit_time  mean_score_time  std_score_time  \
0       0.851581      0.180361         0.018123        0.001126   
1       0.811322      0.210956         0.018546        0.001301   
2       0.893184      0.134789         0.020220        0.001535   

  param_class_weight                          params  split0_test_precision  \
0       {0: 1, 1: 1}  {'class_weight': {0: 1, 1: 1}}               0.281250   
1       {0: 1, 1: 2}  {'class_weight': {0: 1, 1: 2}}               0.190678   
2       {0: 1, 1: 3}  {'class_weight': {0: 1, 1: 3}}               0.154882   

   split1_test_precision  split2_test_precision  split3_test_precision  ...  \
0                    1.0               0.952381               0.857143  ...   
1                    1.0               0.955556               0.812500  ...   
2                    1.0               0.955556               0.800000  ...   

   split3_test_recall_score  mean_test_recall_score  std_test_recall_score  \
0                  