# Evaluation metric for ML model

## Classification problem

In [1]:
import pandas as pd
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, StratifiedKFold

import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

In [2]:
# import data
diabetes_data = pd.read_csv('./input/diabetes.csv')

X =  diabetes_data.drop(["Outcome"],axis = 1)
y = diabetes_data["Outcome"]

In [3]:
X.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148,72,35,0,33.6,0.627,50
1,1,85,66,29,0,26.6,0.351,31
2,8,183,64,0,0,23.3,0.672,32
3,1,89,66,23,94,28.1,0.167,21
4,0,137,40,35,168,43.1,2.288,33


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 56)

In [5]:
cv = StratifiedKFold(n_splits=10, shuffle = True)

### Logistic regression

In [6]:
clf_logreg = LogisticRegression()
clf_logreg.fit(X_train, y_train)

In [7]:
# Make class predictions for the validation set.
y_pred_class_logreg = cross_val_predict(clf_logreg, X_train, y_train, cv = cv)

In [8]:
# predicted probabilities for class 1, probabilities of positive class
y_pred_prob_logreg = cross_val_predict(clf_logreg, 
                                       X_train, y_train, 
                                       cv = cv, method="predict_proba")
y_pred_prob_logreg_class1 = y_pred_prob_logreg[:, 1]

In [9]:
print(y_pred_class_logreg[0:5])
print(y_pred_prob_logreg[0:5])
print(y_pred_prob_logreg_class1[0:5])

[0 1 1 0 0]
[[0.84896091 0.15103909]
 [0.39280507 0.60719493]
 [0.45541236 0.54458764]
 [0.81538766 0.18461234]
 [0.9521612  0.0478388 ]]
[0.15103909 0.60719493 0.54458764 0.18461234 0.0478388 ]


### SGD classifier

In [10]:
clf_SGD = SGDClassifier()
clf_SGD.fit(X_train, y_train)
# make class predictions for the validation set
y_pred_class_SGD = cross_val_predict(clf_SGD, X_train, y_train, cv = cv)
# predicted probabilities for class 1
y_pred_prob_SGD = cross_val_predict(clf_SGD, X_train, y_train, cv = cv, method="decision_function")

## Random forest

In [11]:
clf_rfc = RandomForestClassifier()
clf_rfc.fit(X_train, y_train)
y_pred_class_rfc = cross_val_predict(clf_rfc, X_train, y_train, cv = cv)
y_pred_prob_rfc = cross_val_predict(clf_rfc, X_train, y_train, cv = cv, method="predict_proba")
y_pred_prob_rfc_class1 = y_pred_prob_rfc[:, 1]

### BaseClassifier for null accuracy

In [12]:
from sklearn.base import BaseEstimator
import numpy as np

class BaseClassifier(BaseEstimator):
    def fit(self, X, y=None):
        pass
    def predict(self, X):
        return np.zeros((len(X), 1), dtype=bool)
    
base_clf = BaseClassifier()
cross_val_score(base_clf, X_train, y_train, cv=10, scoring="accuracy")

array([0.55172414, 0.63793103, 0.60344828, 0.75862069, 0.65517241,
       0.72413793, 0.61403509, 0.68421053, 0.66666667, 0.61403509])

In [13]:
null_accuracy = y_train.value_counts().head(1) / len(y_train)

In [14]:
y_train.value_counts().head(1)/len(y_train)

0    0.651042
Name: Outcome, dtype: float64

### classification accuracy
This is for balanced data.
Don't use in imbalanced data

In [15]:
acc_logreg = cross_val_score(clf_logreg, X_train, y_train, 
                             cv = cv, scoring = 'accuracy').mean()
acc_SGD = cross_val_score(clf_SGD, X_train, y_train, 
                          cv = cv, scoring = 'accuracy').mean()
acc_rfc = cross_val_score(clf_rfc, X_train, y_train, 
                          cv = cv, scoring = 'accuracy').mean()
print(acc_logreg, acc_SGD, acc_rfc)

0.7848154869933455 0.5812764670296431 0.7743194192377496


## Logarithmic Loss / Log Loss / Logistic Loss / Cross-Entropy Loss
Log loss measures the UNCERTAINTY of the probabilities of the model by comparing them to the true labels and penalising the false classifications.

In [16]:
logloss_logreg = cross_val_score(clf_logreg, X_train, y_train, cv = cv, scoring = 'neg_log_loss').mean()
logloss_rfc = cross_val_score(clf_rfc, X_train, y_train, cv = cv, scoring = 'neg_log_loss').mean()


In [17]:
# SGDClassifier's hinge loss doesn't support probability estimates.
# We can set SGDClassifier as the base estimator in Scikit-learn's CalibratedClassifierCV, which will generate probability estimates.

from sklearn.calibration import CalibratedClassifierCV

new_clf_SGD = CalibratedClassifierCV(clf_SGD)
new_clf_SGD.fit(X_train, y_train)
logloss_SGD = cross_val_score(new_clf_SGD, X_train, y_train, cv = cv, scoring = 'neg_log_loss').mean()

logloss_logreg, logloss_SGD, logloss_rfc

(-0.4757450889825341, -0.6427222442658042, -0.4788442934999625)