In [1]:
# Agenda
## What is the purpose of model evaluation, and what are some common evaluation procedures?
## What is the usage of classification accuracy, and what are its limitations?
## How does a confusion matrix describe the performance of a classifier?
## What metrics can be computed from a confusion matrix?
## How can you adjust classifier performance by changing the classification threshold?
## What is the purpose of an ROC curve?
## How does Area Under the Curve (AUC) differ from classification accuracy?

In [2]:
# Review of model evaluation
## Need a way to choose between models: different model types, tuning parameters, and features
## Use a model evaluation procedure to estimate how well a model will generalize to out-of-sample data
## Requires a model evaluation metric to quantify the model performance

In [3]:
#Model evaluation procedures
##Training and testing on the same data
##Rewards overly complex models that "overfit" the training data and won't necessarily generalize
#Train/test split
##Split the dataset into two pieces, so that the model can be trained and tested on different data
##Better estimate of out-of-sample performance, but still a "high variance" estimate
##Useful due to its speed, simplicity, and flexibility
# K-fold cross-validation
##Systematically create "K" train/test splits and average the results together
##Even better estimate of out-of-sample performance
##Runs "K" times slower than train/test split

In [4]:
#Model evaluation metrics
##Regression problems: Mean Absolute Error, Mean Squared Error, Root Mean Squared Error
##Classification problems: Classification accuracy

In [13]:
# Import pandas in this cell too: this display-options cell sits above the
# notebook's main `import pandas as pd` cell, so on a fresh kernel
# (Restart & Run All) `pd` would otherwise be undefined here.
import pandas as pd

# Let pandas show up to 999 rows before truncating displayed output.
pd.options.display.max_rows = 999

# Classification accuracy

In [5]:
import pandas as pd

In [6]:
# Original UCI location of the Pima Indians diabetes data. The link no longer
# works, so the dataset has to be downloaded from Kaggle instead.
url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "pima-indians-diabetes/pima-indians-diabetes.data"
)

# Short names for nine columns (presumably meant for the header-less UCI file;
# the cells visible here do not use them).
col_names = [
    'pregnant', 'glucose', 'bp', 'skin thickness', 'insulin',
    'bmi', 'pedigree', 'age', 'label',
]

In [9]:
import os
from pathlib import Path

# Location of the diabetes CSV (per the note in this notebook, downloaded from
# Kaggle). Set the PIMA_CSV environment variable to point at your own copy so
# the notebook runs on any machine; the original author's local path is kept
# only as a backward-compatible fallback.
PIMA_CSV = Path(os.environ.get("PIMA_CSV", r"C:\Users\mynam\Downloads\diabetes.csv"))
pima = pd.read_csv(PIMA_CSV)

In [15]:
# Preview the first rows to check that the file loaded with the expected columns
pima.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [17]:
# Predictor columns fed to the model, and the 0/1 Outcome column as the response
feature_col = ['Pregnancies', 'Insulin', 'BMI', 'Age']
X = pima.loc[:, feature_col]
Y = pima['Outcome']

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
# Split features and response into training and testing sets.
# random_state=0 fixes the shuffle so the split is reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    random_state=0,
)

In [21]:
from sklearn.linear_model import LogisticRegression

In [22]:
# Create a logistic regression classifier with scikit-learn's default settings
logreg = LogisticRegression()
# Train on the training split only; the testing split is held back for evaluation
logreg.fit(X_train,Y_train)

LogisticRegression()

In [23]:
# Predict a class label for every observation in the testing set
y_pred = logreg.predict(X_test)

In [24]:
from sklearn import metrics

In [26]:
# Classification Accuracy

In [25]:
# Fraction of testing-set observations whose predicted class equals the true class
metrics.accuracy_score(y_true=Y_test, y_pred=y_pred)

0.6770833333333334

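# We should always compare the classification accuracy with the null accuracy.
# (Here the model's accuracy of 0.677 turns out to be no better than the null accuracy computed below.)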

In [27]:
# Null accuracy: the accuracy that could be achieved by always predicting the most frequent class

In [28]:
# Examine the class distribution of the testing set (value_counts is a pandas Series method)
Y_test.value_counts()

0    130
1     62
Name: Outcome, dtype: int64

In [30]:
# calculate the proportion of ones (the mean of a 0/1 column is the fraction of 1s, not a percentage)
Y_test.mean()

0.3229166666666667

In [32]:
# calculate the proportion of zeros (the complement of the proportion of ones)
1 - Y_test.mean()

0.6770833333333333

In [33]:
# calculate null accuracy (for binary classification problems coded as 0/1):
# the larger of the two class proportions, i.e. the accuracy obtained by
# always predicting the most frequent class
max(Y_test.mean(), 1 - Y_test.mean())

0.6770833333333333

In [34]:
# calculate null accuracy (for multi-class classification problems):
# value_counts() lists classes from most to least frequent, so head(1) keeps the
# most frequent class; dividing by the number of test observations gives its proportion
Y_test.value_counts().head(1) / len(Y_test)

0    0.677083
Name: Outcome, dtype: float64

In [35]:
#  Comparing the true and predicted response values

In [38]:
# Show the first 25 true responses alongside the first 25 predictions
print('True:', Y_test.values[:25])
print('Pred:', y_pred[:25])

True: [1 0 0 1 0 0 1 1 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 0 0]
Pred: [0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [39]:
# Conclusion:
##Classification accuracy is the easiest classification metric to understand
##But, it does not tell you the underlying distribution of response values
##And, it does not tell you what "types" of errors your classifier is making

In [40]:
# This is the issue addressed by the confusion matrix

In [41]:
# Confusion matrix
## Table that describes the performance of a classification model

In [42]:
# IMPORTANT: first argument is true values, second argument is predicted values
print(metrics.confusion_matrix(Y_test, y_pred))
# If the arguments are passed in the reverse order, the matrix comes out transposed
# and no error is raised: scikit-learn's metric functions simply assume that the
# first argument holds the true values.

[[114  16]
 [ 46  16]]


In [43]:
# In the matrix above, the rows represent the actual values: row [0] is class 0 and row [1] is class 1
# The columns represent the predicted values: column [0] is class 0 and column [1] is class 1

In [44]:
#Every observation in the testing set is represented in exactly one box
#It's a 2x2 matrix because there are 2 response classes
#The format shown here is not universal

In [45]:
#Basic terminology

#True Positives (TP): we correctly predicted that they do have diabetes
#True Negatives (TN): we correctly predicted that they don't have diabetes
#False Positives (FP): we incorrectly predicted that they do have diabetes (a "Type I error")
#False Negatives (FN): we incorrectly predicted that they don't have diabetes (a "Type II error")

In [46]:
# print the first 25 true and predicted responses again, so that individual
# TP / TN / FP / FN cases can be picked out using the terminology above
print('True:', Y_test.values[0:25])
print('Pred:', y_pred[0:25])

True: [1 0 0 1 0 0 1 1 0 0 1 1 0 0 0 0 1 0 0 0 1 1 0 0 0]
Pred: [0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [47]:
# Save the confusion matrix and unpack its four cells.
# Rows are actual classes and columns are predicted classes, so flattening the
# 2x2 array row by row yields TN, FP, FN, TP in that order.
confusion = metrics.confusion_matrix(Y_test, y_pred)
TN, FP, FN, TP = confusion.ravel()

In [48]:
#The confusion matrix helps us understand how well the classifier performed, but on its own it does not
# tell us which model is the best.
# However, many metrics can be calculated from the confusion matrix, and those can be used in
# the model selection procedure.