# Understanding Classification Metrics in Scikit - Learn
--- 

In [133]:
import pandas as pd
import numpy as np

data = pd.read_csv('/Users/qab/Desktop/data.csv')
data.head()

Unnamed: 0,actual_label,model_RF,model_LR
0,1,0.639816,0.531904
1,0,0.490993,0.414496
2,1,0.623815,0.569883
3,1,0.506616,0.443674
4,0,0.418302,0.369532


In [134]:
thresh = 0.5
data['predicted_RF'] = (data.model_RF >= 0.5).astype('int')
data['predicted_LR'] = (data.model_LR >= 0.5).astype('int')
data.head()

Unnamed: 0,actual_label,model_RF,model_LR,predicted_RF,predicted_LR
0,1,0.639816,0.531904,1,1
1,0,0.490993,0.414496,0,0
2,1,0.623815,0.569883,1,1
3,1,0.506616,0.443674,1,0
4,0,0.418302,0.369532,0,0


In [135]:
from sklearn.metrics import confusion_matrix
confusion_matrix(data.actual_label.values, data.predicted_RF.values)

array([[5519, 2360],
       [2832, 5047]])

In [136]:
def find_TP (y_true, y_pred):
    return sum((y_true == 1) & (y_pred == 1))

def find_FN (y_true, y_pred):
    return sum((y_true == 1) & (y_pred == 0))

def find_TN (y_true, y_pred):
    return sum((y_true == 0) & (y_pred == 0))

def find_FP (y_true, y_pred):
    return sum((y_true == 0) & (y_pred == 1))

In [137]:
find_TP(data.actual_label, data.predicted_RF)

5047

In [138]:
find_FN(data.actual_label, data.predicted_RF)

2832

In [139]:
find_TN(data.actual_label, data.predicted_RF)

5519

In [140]:
find_FP(data.actual_label, data.predicted_RF)

2360

In [141]:
print('TP:',find_TP(data.actual_label.values, data.predicted_RF.values))
print('FN:',find_FN(data.actual_label.values, data.predicted_RF.values))
print('FP:',find_FP(data.actual_label.values, data.predicted_RF.values))
print('TN:',find_TN(data.actual_label.values, data.predicted_RF.values))

TP: 5047
FN: 2832
FP: 2360
TN: 5519


In [207]:
def find_conf_matrix_values(y_true, y_pred):
    TP = find_TP(y_true, y_pred)
    FN = find_FN(y_true, y_pred)
    FP = find_FP(y_true, y_pred)
    TN = find_TN(y_true, y_pred)
    return TP, TN, FN, FP

def my_confusion_matrix(y_true, y_pred):
    TP, FN, FP, TN = find_conf_matrix_values(y_true, y_pred)
    return np.array([[TN, FP], [FN, TP]])

In [209]:
my_confusion_matrix(data.actual_label.values, data.predicted_RF.values)

array([[2360, 5519],
       [2832, 5047]])

In [210]:
find_conf_matrix_values(data.actual_label, data.predicted_RF)

(5047, 5519, 2832, 2360)

In [211]:
assert  np.array_equal(my_confusion_matrix(data.actual_label.values, data.predicted_RF.values),\
                       confusion_matrix(data.actual_label.values, data.predicted_RF.values) ), 'my_confusion_matrix() is not correct for RF'

assert  np.array_equal(my_confusion_matrix(data.actual_label.values, data.predicted_LR.values),\
                       confusion_matrix(data.actual_label.values, data.predicted_LR.values) ), 'my_confusion_matrix() is not correct for LR'

AssertionError: my_confusion_matrix() is not correct for RF

In [212]:
from sklearn.metrics import accuracy_score
accuracy_score(data.actual_label.values, data.predicted_RF.values)

0.6705165630156111

In [217]:
def my_accuracy_score(y_true, y_pred):
    TP, FN, FP, TN = find_conf_matrix_values(y_true, y_pred)
    return accuracy_score(y_true, y_pred)

In [222]:
assert my_accuracy_score(data.actual_label.values, data.predicted_RF.values) == accuracy_score(data.actual_label.values, data.predicted_RF.values), 'my_accuracy_score failed on RF'
assert my_accuracy_score(data.actual_label.values, data.predicted_LR.values) == accuracy_score(data.actual_label.values, data.predicted_LR.values), 'my_accuracy_score failed on LR'
print('Accuracy RF: %.3f'%(my_accuracy_score(data.actual_label.values, data.predicted_RF.values)))
print('Accuracy LR: %.3f'%(my_accuracy_score(data.actual_label.values, data.predicted_LR.values)))

Accuracy RF: 0.671
Accuracy LR: 0.616
