# 实现混淆矩阵，精准率和召回率

In [1]:
import numpy as np
from sklearn import datasets

In [3]:
# Load the handwritten-digits dataset bundled with scikit-learn.
digits = datasets.load_digits()
X = digits.data

# Turn the ten-class problem into a heavily skewed binary one: the digit 9 is
# the class of interest (label 1) and every other digit becomes label 0.
# The comparison yields a fresh array, so `digits.target` is left untouched.
y = (digits.target == 9).astype(digits.target.dtype)

In [4]:
# Import from the public package; `sklearn.model_selection._split` is a
# private module and not part of the supported API.
from sklearn.model_selection import train_test_split

# Hold out a test split (test_size left at its default); the fixed seed keeps
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)

In [5]:
# Import from the public package; the `sklearn.linear_model.logistic`
# submodule path is private and is not importable in newer releases.
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier with default settings; the cell's last
# expression displays its score on the held-out split.
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
log_reg.score(X_test, y_test)

0.97555555555555551

虽然 0.9756 的准确率看上去很高，但我们的数据是极度偏斜的：数字 9 只占全部样本的约十分之一，所以即使把所有样本都预测为"非9"，也会有 0.9 左右的准确率。因此，仅凭准确率无法判断这个分类器的好坏。

In [6]:
# Predicted labels for the test split; the metric cells below all reuse them.
y_predict = log_reg.predict(X_test)

## 求TP，FP，FN，TN的值

In [7]:
def TN(y_true, y_predict):
    """Count the true negatives: samples labelled 0 that were also predicted 0.

    Args:
        y_true: Sequence of ground-truth labels (0 = negative, 1 = positive).
        y_predict: Sequence of predicted labels, same length as ``y_true``.

    Returns:
        The number of positions where both ``y_true`` and ``y_predict`` are 0.

    Raises:
        AssertionError: If the two inputs differ in length.
    """
    assert len(y_true) == len(y_predict),'y_true与y_predict的样本数目必须一致'
    # Coerce to arrays so plain Python lists are compared element-wise; on a
    # list, `y_true == 0` is a single False and the count would silently be 0.
    y_true = np.asarray(y_true)
    y_predict = np.asarray(y_predict)
    return np.sum((y_true == 0) & (y_predict == 0))

# True negatives on the test split.
TN(y_test, y_predict)

403

In [8]:
def FP(y_true, y_predict):
    """Count the false positives: samples labelled 0 but predicted 1.

    Args:
        y_true: Sequence of ground-truth labels (0 = negative, 1 = positive).
        y_predict: Sequence of predicted labels, same length as ``y_true``.

    Returns:
        The number of positions where ``y_true`` is 0 and ``y_predict`` is 1.

    Raises:
        AssertionError: If the two inputs differ in length.
    """
    assert len(y_true) == len(y_predict),'y_true与y_predict的样本数目必须一致'
    # Coerce to arrays so plain Python lists are compared element-wise; on a
    # list, `y_true == 0` is a single False and the count would silently be 0.
    y_true = np.asarray(y_true)
    y_predict = np.asarray(y_predict)
    return np.sum((y_true == 0) & (y_predict == 1))

# False positives on the test split.
FP(y_test, y_predict)

2

In [9]:
def FN(y_true, y_predict):
    """Count the false negatives: samples labelled 1 but predicted 0.

    Args:
        y_true: Sequence of ground-truth labels (0 = negative, 1 = positive).
        y_predict: Sequence of predicted labels, same length as ``y_true``.

    Returns:
        The number of positions where ``y_true`` is 1 and ``y_predict`` is 0.

    Raises:
        AssertionError: If the two inputs differ in length.
    """
    assert len(y_true) == len(y_predict),'y_true与y_predict的样本数目必须一致'
    # Coerce to arrays so plain Python lists are compared element-wise; on a
    # list, `y_true == 1` is a single False and the count would silently be 0.
    y_true = np.asarray(y_true)
    y_predict = np.asarray(y_predict)
    return np.sum((y_true == 1) & (y_predict == 0))

# False negatives on the test split.
FN(y_test, y_predict)

9

In [10]:
def TP(y_true, y_predict):
    """Count the true positives: samples labelled 1 that were also predicted 1.

    Args:
        y_true: Sequence of ground-truth labels (0 = negative, 1 = positive).
        y_predict: Sequence of predicted labels, same length as ``y_true``.

    Returns:
        The number of positions where both ``y_true`` and ``y_predict`` are 1.

    Raises:
        AssertionError: If the two inputs differ in length.
    """
    assert len(y_true) == len(y_predict),'y_true与y_predict的样本数目必须一致'
    # Coerce to arrays so plain Python lists are compared element-wise; on a
    # list, `y_true == 1` is a single False and the count would silently be 0.
    y_true = np.asarray(y_true)
    y_predict = np.asarray(y_predict)
    return np.sum((y_true == 1) & (y_predict == 1))

# True positives on the test split.
TP(y_test, y_predict)

36

In [12]:
def confusion_matrix(y_true, y_predict):
    """Return the 2x2 confusion matrix for binary 0/1 labels.

    Rows follow the true label and columns the predicted label, both in the
    order (0, 1), so the layout is ``[[TN, FP], [FN, TP]]``.
    """
    actual_negative_row = [TN(y_true, y_predict), FP(y_true, y_predict)]
    actual_positive_row = [FN(y_true, y_predict), TP(y_true, y_predict)]
    return np.array([actual_negative_row, actual_positive_row])

# Confusion matrix of the test-split predictions, laid out as [[TN, FP], [FN, TP]].
confusion_matrix(y_test, y_predict)

array([[403,   2],
       [  9,  36]])

## 根据混淆矩阵求精准率和召回率

In [13]:
def precision_score(y_true, y_predict):
    """Compute precision, TP / (TP + FP), for binary 0/1 labels.

    Args:
        y_true: Sequence of ground-truth labels (0 = negative, 1 = positive).
        y_predict: Sequence of predicted labels, same length as ``y_true``.

    Returns:
        The fraction of predicted positives that are truly positive, or 0.0
        when nothing was predicted positive and the ratio is undefined.
    """
    tp = TP(y_true, y_predict)
    fp = FP(y_true, y_predict)
    predicted_positives = tp + fp
    # Test the denominator explicitly: dividing NumPy integers by zero yields
    # nan with a RuntimeWarning instead of raising, so a try/except around the
    # division never fires and nan would leak out.
    if predicted_positives == 0:
        return 0.0
    return tp / predicted_positives

# Precision on the test split
precision_score(y_test, y_predict)

0.94736842105263153

In [14]:
def recall_score(y_true, y_predict):
    """Compute recall, TP / (TP + FN), for binary 0/1 labels.

    Args:
        y_true: Sequence of ground-truth labels (0 = negative, 1 = positive).
        y_predict: Sequence of predicted labels, same length as ``y_true``.

    Returns:
        The fraction of truly positive samples that were predicted positive,
        or 0.0 when there are no positive samples and the ratio is undefined.
    """
    tp = TP(y_true, y_predict)
    fn = FN(y_true, y_predict)
    actual_positives = tp + fn
    # Test the denominator explicitly: dividing NumPy integers by zero yields
    # nan with a RuntimeWarning instead of raising, so a try/except around the
    # division never fires and nan would leak out.
    if actual_positives == 0:
        return 0.0
    return tp / actual_positives

# Recall on the test split
recall_score(y_test, y_predict)

0.80000000000000004

# scikit-learn中的混淆矩阵，精准率和召回率

混淆矩阵

In [15]:
# The public metrics API lives in `sklearn.metrics`; the old
# `sklearn.metrics.classification` submodule path is not importable in newer
# releases. The alias is kept so the library functions stay namespaced and do
# not collide with the same-named functions defined earlier in this notebook.
import sklearn.metrics as classification
classification.confusion_matrix(y_test, y_predict)

array([[403,   2],
       [  9,  36]])

精准率

In [16]:
# Library precision, for comparison with the hand-written precision_score.
classification.precision_score(y_test, y_predict)

0.94736842105263153

召回率

In [17]:
# Library recall, for comparison with the hand-written recall_score.
classification.recall_score(y_test, y_predict)

0.80000000000000004