# 精准率和召回率的平衡

In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from sklearn import datasets

digits = datasets.load_digits()
X = digits.data

# Turn the 10-class digits problem into a heavily skewed binary one:
# the digit 9 is the class of interest (label 1), every other digit is 0.
# The comparison yields a fresh array, so digits.target itself is untouched.
y = (digits.target == 9).astype(digits.target.dtype)

In [3]:
# Import from the public package path: `sklearn.model_selection._split` is a
# private module, and its location is not part of the supported API.
from sklearn.model_selection import train_test_split

# A fixed random_state keeps the train/test partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)

In [4]:
# `sklearn.linear_model.logistic` was deprecated in scikit-learn 0.22 and
# removed in 0.24; the estimator must be imported from the public package.
from sklearn.linear_model import LogisticRegression

# Fit a default logistic regression on the skewed binary labels and keep the
# default-threshold predictions for the test set.
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_predict = log_reg.predict(X_test)

In [5]:
# `sklearn.metrics.scorer` is a removed private module that only exposed
# f1_score incidentally; import the metric from the public `sklearn.metrics`.
from sklearn.metrics import f1_score

# F1 of the default-threshold predictions on the test set.
f1_score(y_test, y_predict)

0.86746987951807231

对于极度偏斜的数据，F1 分数（f1_score）是比准确率（accuracy）更可靠的评判标准

In [6]:
# `sklearn.metrics.classification` was deprecated in scikit-learn 0.22 and
# removed in 0.24; import from the public `sklearn.metrics` package instead.
from sklearn.metrics import confusion_matrix

# Confusion matrix of the default-threshold predictions.
confusion_matrix(y_test, y_predict)

array([[403,   2],
       [  9,  36]])

In [7]:
# Raw decision-function score of the fitted model for every test sample.
log_reg.decision_function(X_test)

array([-22.05698982, -33.02941569, -16.21335556, -80.37917862,
       -48.25125007, -24.54006981, -44.39168237, -25.04296365,
        -0.97828044, -19.71746541, -66.25139253, -51.09604623,
       -31.49349131, -46.05333357, -38.67878372, -29.804725  ,
       -37.58850985, -82.57570471, -37.81905611, -11.01165092,
        -9.17440401, -85.13005039, -16.71614385, -46.23727242,
        -5.32995324, -47.91762196, -11.66731351, -39.19611282,
       -25.25295477, -14.36646658, -16.99785232, -28.91907577,
       -34.33943416, -29.4761261 ,  -7.85813152,  -3.82091535,
       -24.08168877, -22.16362583, -33.61223199, -23.14024822,
       -26.91806299, -62.38939364, -38.85693085, -66.77261823,
       -20.14483308, -17.47886804, -18.0680003 , -22.22227201,
       -29.62306026, -19.7317084 ,   1.49551879,   8.32082289,
       -36.29315563, -42.50734136, -25.90458854, -34.9896239 ,
        -8.42013899, -50.04728023, -51.48209551,  19.88961877,
        -8.91888431, -31.9934551 , -11.66101689,  -0.47

In [8]:
# The first 10 decision scores are all negative
log_reg.decision_function(X_test)[:10]

array([-22.05698982, -33.02941569, -16.21335556, -80.37917862,
       -48.25125007, -24.54006981, -44.39168237, -25.04296365,
        -0.97828044, -19.71746541])

In [9]:
# Negative scores fall below the default threshold of 0, so the first 10
# samples are all classified as 0
log_reg.predict(X_test)[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

## 使用 scikit-learn 调整精准率-召回率

In [10]:
# Keep the raw decision scores so that custom thresholds can be applied to them.
decision_scores = log_reg.decision_function(X_test)

In [11]:
# Range of the decision scores: (smallest, largest).
decision_scores.min(), decision_scores.max()

(-85.686099770956332, 19.889618774310772)

### 调整threshold的值，默认是0，现在增大到5

In [12]:
# Predict class 1 only when the decision score reaches the raised threshold of 5.
y_predict_2 = (decision_scores >= 5).astype(int)

In [14]:
# Confusion matrix for the predictions made with the threshold raised to 5.
confusion_matrix(y_test, y_predict_2)

array([[404,   1],
       [ 21,  24]])

In [15]:
# `sklearn.metrics.scorer` is a removed private module; precision_score and
# recall_score are part of the public `sklearn.metrics` API.
from sklearn.metrics import precision_score, recall_score

In [16]:
# Precision and recall before adjusting the threshold
precision_score(y_test, y_predict), recall_score(y_test, y_predict)

(0.94736842105263153, 0.80000000000000004)

In [17]:
# Precision and recall after raising the threshold to 5
precision_score(y_test, y_predict_2), recall_score(y_test, y_predict_2)

(0.95999999999999996, 0.53333333333333333)

可以看到threshold增大后，精准率增加了，召回率减少了

### 把threshold的值由默认的0减少为-5

In [18]:
# Predict class 1 whenever the decision score reaches the lowered threshold of -5.
y_predict_3 = (decision_scores >= -5).astype(int)

In [19]:
# Precision and recall before adjusting the threshold
precision_score(y_test, y_predict), recall_score(y_test, y_predict)

(0.94736842105263153, 0.80000000000000004)

In [20]:
# Precision and recall after lowering the threshold to -5
precision_score(y_test, y_predict_3), recall_score(y_test, y_predict_3)

(0.72727272727272729, 0.88888888888888884)

可以看到threshold减少后，精准率下降，召回率提高了