<a href="https://colab.research.google.com/github/Xaypanya/Machine-Learning/blob/main/Lab3_2_Confusion_Matrix2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Binary Classification: Confusion Matrix

---
* author:  [Prasert Kanawattanachai](mailto:prasert.k@chula.ac.th)
* YouTube: https://www.youtube.com/prasertcbs
* github: https://github.com/prasertcbs/scikitlearn_tutorial
* [Chulalongkorn Business School](https://www.cbs.chula.ac.th/en/)
---

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [None]:
# Report the versions of the core libraries, one aligned line per library.
for lib_label, lib in (('pandas ', pd), ('numpy  ', np), ('seaborn', sns)):
    print(f'{lib_label} version = {lib.__version__}')

### data source
https://en.wikipedia.org/wiki/Logistic_regression#Probability_of_passing_an_exam_versus_hours_of_study

In [None]:
# Study-hours dataset; the 'Hours' and 'Pass' columns are used in the cells below.
data_url = 'https://github.com/prasertcbs/basic-dataset/raw/master/study_hours.csv'
df = pd.read_csv(data_url)
df.transpose()  # show the frame sideways so every row fits on screen

In [None]:
# Scatter of the observed outcomes with a fitted logistic curve (logistic=True).
sns.lmplot(x='Hours', y='Pass', data=df,
           logistic=True, height=4, aspect=1.5, 
           line_kws={'color': 'orange'})
plt.ylabel('Probability of passing exam');
# Reference lines: vertical at Hours = 2.71, horizontal at probability 0.5.
# NOTE(review): 2.71 is presumably the number of hours at which the fitted
# curve crosses 0.5 (the decision boundary in the linked Wikipedia example) — confirm.
plt.axvline(2.71, color='green', linestyle='--')
plt.axhline(.5, color='red', linestyle='--');

# Scikit-learn: LogisticRegression
doc: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

In [None]:
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [None]:
# Record the scikit-learn version this notebook was run with.
print(f"sklearn version = {sklearn.__version__}")

In [None]:
# Target: the 'Pass' column as a 1-D Series.
y_train = df['Pass']
# Features: the list selector keeps 'Hours' as a one-column (2-D) DataFrame.
X_train = df[['Hours']]

In [None]:
# Feature matrix: selected with df[['Hours']], so it stays a one-column DataFrame.
X_train # pandas DataFrame

In [None]:
# Target vector: the 'Pass' column selected as df.Pass.
y_train # pandas Series

In [None]:
# Estimator with default settings; the commented-out alternative below uses a
# very large C to minimize regularization instead.
model = LogisticRegression()
# model = LogisticRegression(C=1e10) # minimize regularization
model

In [None]:
# Fit on X_train / y_train, which here are the entire dataset (no split is applied above).
model.fit(X_train, y_train)

In [None]:
# Scored on the same data the model was fitted on, so this is training accuracy.
model.score(X_train, y_train) # accuracy of the model

In [None]:
# Predicted class labels for the training rows; reused by the metric cells below.
predicted = model.predict(X_train)
predicted

In [None]:
# Actual labels as a plain array, for side-by-side comparison with `predicted`.
y_train.values

## Confusion matrix

In [None]:
from sklearn import metrics
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_recall_fscore_support,
    precision_score,
    recall_score,
)

# plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2.
# Import it only where it still exists so this cell does not abort (and leave
# the names above undefined) on newer versions.
try:
    from sklearn.metrics import plot_confusion_matrix
except ImportError:
    plot_confusion_matrix = None

### further reading:
* https://scikit-learn.org/stable/modules/model_evaluation.html#classification-metrics
* https://en.wikipedia.org/wiki/Confusion_matrix

In [None]:
# Recap before building the confusion matrix: hours studied, as a plain array.
X_train['Hours'].values

In [None]:
# Recap before building the confusion matrix: the actual labels.
y_train.values

In [None]:
# Recap before building the confusion matrix: the labels predicted by the model.
predicted

In [None]:
# 2 x 2 count matrix: rows are the actual classes, columns the predicted ones.
cm = confusion_matrix(y_true=y_train, y_pred=predicted)
cm

In [None]:
# plot_confusion_matrix was removed in scikit-learn 1.2. Its replacement,
# ConfusionMatrixDisplay.from_estimator (available since 1.0), draws the same
# heat map from the fitted model. Accessed through the `metrics` module
# imported above; the trailing ';' hides the display object's repr.
metrics.ConfusionMatrixDisplay.from_estimator(model, X_train, y_train, cmap='Oranges');

## scikit-learn: confusion matrix
$$
\begin{bmatrix}
C_{0,0} & C_{0,1} \\
C_{1,0} & C_{1,1}
\end{bmatrix}
=
\begin{bmatrix}
tn&fp \\
fn&tp
\end{bmatrix}
$$

<table>
<tr>
    <td></td>
    <td>predicted false</td>
    <td>predicted true</td>
</tr>
<tr>
    <td>actual false</td>
    <td>tn</td>
    <td>fp</td>
</tr>
<tr>
    <td>actual true</td>
    <td>fn</td>
    <td>tp</td>
</tr>
</table>

In [None]:
# Same counts as the confusion matrix, as a labelled table with row/column totals.
pd.crosstab(
    index=y_train,
    columns=predicted,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
    margins_name='Total',
)

In [None]:
# Actual and predicted labels side by side, one row per observation.
dd = pd.DataFrame(dict(actual=y_train.values, predicted=predicted))
dd

In [None]:
# Cross-tabulate the two label columns: rows = actual, columns = predicted.
pd.crosstab(index=dd.actual, columns=dd.predicted)

In [None]:
# The same matrix, reached through the `metrics` module namespace.
metrics.confusion_matrix(y_true=y_train, y_pred=predicted)

In [None]:
# Wrap the raw matrix in a DataFrame so rows and columns carry readable labels.
pd.DataFrame(
    data=metrics.confusion_matrix(y_train, predicted),
    index=['actual_fail', 'actual_pass'],
    columns=['pred_fail', 'pred_pass'],
)

In [None]:
def pretty_confusion_matrix(cm_array):
    '''
    Add labels and grand totals to a 2 x 2 scikit-learn confusion matrix.

    Parameters
    ----------
    cm_array : array-like of shape (2, 2)
        Confusion matrix laid out as [[tn, fp], [fn, tp]]
        (rows = actual class, columns = predicted class).

    Returns
    -------
    pandas.DataFrame
        3 x 3 frame with rows actual_fail / actual_pass / total and
        columns pred_fail / pred_pass / total.
    '''
    labeled=pd.DataFrame(cm_array,
                         columns=['pred_fail', 'pred_pass'],
                         index=['actual_fail', 'actual_pass'])

    # DataFrame.append was removed in pandas 2.0, so the column-sum row is
    # built as a one-row frame labelled 'total' and attached with pd.concat.
    column_totals=labeled.sum(axis=0).rename('total').to_frame().T
    labeled=pd.concat([labeled, column_totals])

    # Row sums are taken after the totals row exists, so the bottom-right
    # cell holds the grand total.
    labeled['total']=labeled.sum(axis=1)
    return labeled

In [None]:
# Labelled confusion matrix with row, column and grand totals.
pretty_confusion_matrix(metrics.confusion_matrix(y_train, predicted))

In [None]:
# ravel() flattens the 2 x 2 matrix row by row, so with the [[tn, fp], [fn, tp]]
# layout shown above the four counts unpack in exactly this order.
tn, fp, fn, tp = metrics.confusion_matrix(y_train, predicted).ravel()

$$Accuracy={\frac {{TP+TN} }{TP + TN +FP + FN} }\\
Precision={\frac {TP}{TP+FP}}\\
Recall={\frac {TP}{TP+FN}}\\
F1 = 2 \times {\frac {precision \times recall} {precision + recall}}
$$

In [None]:
# Accuracy by hand: correct predictions (tp + tn) over all observations.
(tp+tn)/(tp+tn+fp+fn) # accuracy score

In [None]:
# Library accuracy, for comparison with the hand-computed (tp+tn)/(tp+tn+fp+fn).
metrics.accuracy_score(y_train, predicted)

In [None]:
# Library precision, for comparison with the hand-computed tp/(tp+fp).
metrics.precision_score(y_train, predicted)

In [None]:
# Precision by hand: of everything predicted positive (tp + fp), the share that is truly positive.
tp/(tp+fp) # precision score

In [None]:
# Library recall, for comparison with the hand-computed tp/(tp+fn).
metrics.recall_score(y_train, predicted)

In [None]:
# Recall by hand: of everything actually positive (tp + fn), the share the model found.
tp/(tp+fn) # recall score

In [None]:
# F1: combines precision and recall as 2 * (precision * recall) / (precision + recall).
metrics.f1_score(y_train, predicted)

In [None]:
# Text summary of the classification metrics; print() is needed because the
# report is returned as a preformatted string.
print(metrics.classification_report(y_train, predicted))

In [None]:
# Precision for class 0: of everything predicted 0 (tn + fn), the share that is truly 0.
tn/(tn+fn) # precision (class=0)

In [None]:
# Recall for class 0: of everything actually 0 (tn + fp), the share predicted as 0.
tn/(tn+fp) # recall (class=0)