# Scikit-Learn Metrics

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt

In [4]:
# Only the first 80,000 rows are used in this notebook. Passing nrows makes the
# parser stop there instead of loading the whole file and slicing afterwards.
df = pd.read_csv("creditcard.csv", nrows=80000)
df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [7]:
# List the column labels to see which ones are features and which is the target
df.columns

Index(['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')

In [14]:
# Features: every column except Time, Amount and the Class label.
features = df.drop(columns=['Time', 'Amount', 'Class'])
x = features.to_numpy()

# Target: the Class column as a flat array (summing it counts the fraud rows).
y = df['Class'].to_numpy()

print(f"Shapes of x={x.shape}, y={y.shape}, #Fraud Cases = {y.sum()}")

Shapes of x=(80000, 28), y=(80000,), #Fraud Cases = 196


**Note:** This data is heavily imbalanced: only 196 of the 80,000 rows are fraud cases; the rest are non-fraud.

In [15]:
from sklearn.linear_model import LogisticRegression

# Weight class 1 (fraud) twice as heavily as class 0 when fitting.
mod = LogisticRegression(class_weight={0: 1, 1: 2}, max_iter=1000)
mod.fit(x, y)

# Count how many of the training rows the fitted model labels as class 1.
train_predictions = mod.predict(x)
train_predictions.sum()

171

In [27]:
# IPython introspection: show the source and docstring of `mod.score`
# (development aid only; not needed for the analysis itself).
??mod.score

Signature: mod.score(X, y, sample_weight=None)
Source:   
    def score(self, X, y, sample_weight=None):
        """
        Return the mean accuracy on the given test data and labels.

        In multi-label classification, this is the subset accuracy
        which is a harsh metric since you require for each sample that
        each label set be correctly predicted.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples.

        y : array-like of shape (n_samples,) or (n_samples, n_outputs)
            True labels for X.

        sample_weight : array-like of shape (n_samples,), default=None
           

In [16]:
from sklearn.model_selection import GridSearchCV

In [20]:
# Candidate weightings: class 0 fixed at 1, class 1 weighted 1, 2 and 3.
class_weight_candidates = [{0: 1, 1: weight} for weight in (1, 2, 3)]

grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid={'class_weight': class_weight_candidates},
    cv=4,       # 4-fold cross-validation
    n_jobs=-1,  # run the fits in parallel on all available cores
)

In [21]:
# Run the cross-validated search over the class_weight candidates
grid.fit(x,y)

GridSearchCV(cv=4, estimator=LogisticRegression(max_iter=1000), n_jobs=-1,
             param_grid={'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 2},
                                          {0: 1, 1: 3}]})

In [22]:
# One row per class_weight candidate. No `scoring` was passed to the search, so
# the test scores come from the estimator's own `score` method (mean accuracy).
pd.DataFrame(grid.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,mean_test_score,std_test_score,rank_test_score
0,3.215928,0.755951,0.012534,0.003594,"{0: 1, 1: 1}","{'class_weight': {0: 1, 1: 1}}",0.99405,0.99835,0.99945,0.9978,0.997413,0.00203,1
1,3.902,1.266062,0.007896,0.001477,"{0: 1, 1: 2}","{'class_weight': {0: 1, 1: 2}}",0.99025,0.9984,0.9996,0.99805,0.996575,0.003697,2
2,2.625607,0.309669,0.007394,0.001714,"{0: 1, 1: 3}","{'class_weight': {0: 1, 1: 3}}",0.9873,0.99845,0.9996,0.99815,0.995875,0.00498,3


In [32]:
from sklearn.metrics import precision_score, recall_score, make_scorer

In [30]:
# Precision of the refitted best model, measured on the same data it was fit on.
predicted = grid.predict(x)
precision_score(y, predicted)

0.7682119205298014

In [31]:
# Recall of the refitted best model, measured on the same data it was fit on.
predicted = grid.predict(x)
recall_score(y, predicted)

0.5918367346938775

**Iteration 2 w/ Metrics**

In [41]:
# Evaluate every candidate on both precision and recall. The dict keys become
# part of the column names in cv_results_.
scorers = {
    'precision': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
}

grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid={'class_weight': [{0: 1, 1: weight} for weight in range(1, 4)]},
    scoring=scorers,
    # With several scorers, refit names the one used to pick the best candidate.
    refit='precision',
    return_train_score=True,
    cv=4,
    n_jobs=-1,
)

In [42]:
# Re-run the search, now scoring every candidate on precision and recall
grid.fit(x,y)

GridSearchCV(cv=4, estimator=LogisticRegression(max_iter=1000), n_jobs=-1,
             param_grid={'class_weight': [{0: 1, 1: 1}, {0: 1, 1: 2},
                                          {0: 1, 1: 3}]},
             refit='precision', return_train_score=True,
             scoring={'precision': make_scorer(precision_score),
                      'recall_score': make_scorer(recall_score)})

In [43]:
# One row per candidate, with per-split train and test columns for each scorer
pd.DataFrame(grid.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,params,split0_test_precision,split1_test_precision,split2_test_precision,split3_test_precision,...,split3_test_recall_score,mean_test_recall_score,std_test_recall_score,rank_test_recall_score,split0_train_recall_score,split1_train_recall_score,split2_train_recall_score,split3_train_recall_score,mean_train_recall_score,std_train_recall_score
0,4.735719,1.195462,0.05665,0.002722,"{0: 1, 1: 1}","{'class_weight': {0: 1, 1: 1}}",0.28125,1.0,0.952381,0.857143,...,0.122449,0.545918,0.331397,3,0.863946,0.585034,0.530612,0.693878,0.668367,0.127301
1,4.514259,1.331143,0.056772,0.014615,"{0: 1, 1: 2}","{'class_weight': {0: 1, 1: 2}}",0.190678,1.0,0.955556,0.8125,...,0.265306,0.602041,0.297672,2,0.870748,0.659864,0.632653,0.782313,0.736395,0.095889
2,3.828712,0.402053,0.03735,0.004547,"{0: 1, 1: 3}","{'class_weight': {0: 1, 1: 3}}",0.154882,1.0,0.955556,0.8,...,0.326531,0.627551,0.281816,1,0.870748,0.714286,0.680272,0.816327,0.770408,0.076568


**Iteration 3**
- Increase the number of cross-validation folds (4 → 10)
- Widen the range of class weights for the fraud class (1–3 → 30 values between 1 and 20)

In [44]:
# 30 evenly spaced weights for class 1, from 1 to 20; class 0 stays at 1.
class_weight_candidates = [{0: 1, 1: weight} for weight in np.linspace(1, 20, 30)]

# Same two metrics as the previous iteration; the keys name the cv_results_ columns.
scorers = {
    'precision': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
}

grid = GridSearchCV(
    estimator=LogisticRegression(max_iter=1000),
    param_grid={'class_weight': class_weight_candidates},
    scoring=scorers,
    # With several scorers, refit names the one used to pick the best candidate.
    refit='precision',
    return_train_score=True,
    cv=10,
    n_jobs=-1,
)

In [45]:
# Run the wider search, then tabulate the per-candidate cross-validation results.
grid.fit(x, y)
cv_results = pd.DataFrame(grid.cv_results_)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,params,split0_test_precision,split1_test_precision,split2_test_precision,split3_test_precision,...,split2_train_recall_score,split3_train_recall_score,split4_train_recall_score,split5_train_recall_score,split6_train_recall_score,split7_train_recall_score,split8_train_recall_score,split9_train_recall_score,mean_train_recall_score,std_train_recall_score
0,5.459089,1.080728,0.022008,0.006768,"{0: 1, 1: 1.0}","{'class_weight': {0: 1, 1: 1.0}}",1.0,0.463415,0.583333,1.0,...,0.627119,0.548023,0.573864,0.573864,0.5625,0.613636,0.636364,0.607955,0.612185,0.054733
1,3.935615,0.632611,0.018449,0.006785,"{0: 1, 1: 1.6551724137931034}","{'class_weight': {0: 1, 1: 1.6551724137931034}}",1.0,0.463415,0.583333,1.0,...,0.683616,0.627119,0.670455,0.647727,0.630682,0.6875,0.698864,0.6875,0.680239,0.050286
2,3.659381,0.619272,0.017998,0.005718,"{0: 1, 1: 2.310344827586207}","{'class_weight': {0: 1, 1: 2.310344827586207}}",1.0,0.463415,0.583333,1.0,...,0.740113,0.683616,0.710227,0.698864,0.6875,0.715909,0.744318,0.727273,0.724454,0.043881
3,3.453383,0.480236,0.016995,0.004173,"{0: 1, 1: 2.9655172413793105}","{'class_weight': {0: 1, 1: 2.9655172413793105}}",1.0,0.452381,0.583333,1.0,...,0.785311,0.706215,0.744318,0.732955,0.715909,0.755682,0.772727,0.738636,0.749978,0.039589
4,3.436573,0.581563,0.019401,0.005343,"{0: 1, 1: 3.6206896551724137}","{'class_weight': {0: 1, 1: 3.6206896551724137}}",1.0,0.452381,0.583333,1.0,...,0.824859,0.740113,0.755682,0.744318,0.727273,0.778409,0.784091,0.761364,0.771498,0.037959
5,3.588105,0.617191,0.021206,0.00763,"{0: 1, 1: 4.275862068965517}","{'class_weight': {0: 1, 1: 4.275862068965517}}",1.0,0.452381,0.583333,1.0,...,0.841808,0.768362,0.778409,0.789773,0.772727,0.789773,0.795455,0.772727,0.792485,0.029289
6,3.670359,0.619388,0.016644,0.00279,"{0: 1, 1: 4.931034482758621}","{'class_weight': {0: 1, 1: 4.931034482758621}}",1.0,0.452381,0.583333,1.0,...,0.847458,0.80226,0.789773,0.818182,0.801136,0.801136,0.818182,0.789773,0.812327,0.021063
7,3.574319,1.022666,0.025367,0.018726,"{0: 1, 1: 5.586206896551724}","{'class_weight': {0: 1, 1: 5.586206896551724}}",1.0,0.452381,0.583333,1.0,...,0.847458,0.813559,0.829545,0.829545,0.8125,0.806818,0.835227,0.818182,0.82708,0.01719
8,4.655176,1.100321,0.022311,0.00644,"{0: 1, 1: 6.241379310344827}","{'class_weight': {0: 1, 1: 6.241379310344827}}",0.944444,0.452381,0.583333,0.947368,...,0.847458,0.824859,0.846591,0.835227,0.823864,0.829545,0.857955,0.829545,0.838431,0.013974
9,5.190244,1.255142,0.027623,0.01533,"{0: 1, 1: 6.896551724137931}","{'class_weight': {0: 1, 1: 6.896551724137931}}",0.944444,0.452381,0.583333,0.947368,...,0.853107,0.830508,0.846591,0.835227,0.829545,0.835227,0.863636,0.846591,0.844665,0.012015
