## 精准率和召回率的平衡

In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [3]:
from sklearn import  datasets

In [4]:
# Load the handwritten-digits dataset and turn it into a skewed binary problem:
# digit 9 becomes the positive class (1), every other digit the negative class (0).
digits = datasets.load_digits()
X = digits.data
# Comparing against 9 yields a boolean mask; casting back to the target dtype gives 1/0 labels.
y = (digits.target == 9).astype(digits.target.dtype)

In [5]:
from sklearn.model_selection import train_test_split
# Split features and binary labels into train/test parts; random_state is pinned to 666
# so the split is the same on every run.
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=666)

In [6]:
from sklearn.linear_model import LogisticRegression

# Train a default-configured logistic regression on the training split
# (fit() hands back the estimator itself), then label the test split.
log_reg = LogisticRegression().fit(X_train, y_train)
y_predict = log_reg.predict(X_test)

In [7]:
'''For skewed datasets the F1 score is a better summary than taking a plain average.'''
from sklearn.metrics import f1_score
f1_score(y_true=y_test, y_pred=y_predict)

0.86746987951807231

In [15]:
'''Confusion matrix of the default predictions.'''
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true=y_test, y_pred=y_predict)

array([[403,   2],
       [  9,  36]], dtype=int64)

In [16]:
'''Precision of the default predictions.'''
from sklearn.metrics import precision_score
precision_score(y_true=y_test, y_pred=y_predict)

0.94736842105263153

In [17]:
'''Recall of the default predictions.'''
from sklearn.metrics import recall_score
recall_score(y_true=y_test, y_pred=y_predict)

0.80000000000000004

In [None]:
'''
精准率和召回率的平衡，是通过调整“分类为 1”的阈值来实现的。
但是 predict 函数本身并不支持在预测时传入这样的参数，
自带的 predict 函数都是以 0 作为基准的。如果我们想要调整分类阈值，
就需要绕个弯子：对于像逻辑回归这样的算法，它是根据一个 score 值（也就是 theta*X 的值）
大于 0 还是小于 0 来进行分类的。模型还有一个函数，叫做 decision_function，
这个函数是一个决策函数，它返回的数值就是我们所说的 score 值。
'''

In [18]:
# Show the raw decision-function score of every test sample (this prints the whole array).
log_reg.decision_function(X_test)

array([-22.05705181, -33.02949573, -16.21340238, -80.37917009,
       -48.25125209, -24.54010044, -44.39166152, -25.04298784,
        -0.97831701, -19.71745321, -66.2513864 , -51.09609927,
       -31.49354859, -46.05334586, -38.6788697 , -29.80474166,
       -37.5885736 , -82.57576637, -37.81902892, -11.01172596,
        -9.17441466, -85.1300934 , -16.71622699, -46.2373814 ,
        -5.33001429, -47.9176172 , -11.66732594, -39.19614376,
       -25.25298529, -14.36652407, -16.99787221, -28.91910077,
       -34.33945987, -29.47614656,  -7.85819265,  -3.82095014,
       -24.08174422, -22.16364571, -33.61231771, -23.14031933,
       -26.91810677, -62.38943784, -38.85701958, -66.77268117,
       -20.14492423, -17.47892943, -18.06805716, -22.22229886,
       -29.62307347, -19.73172345,   1.49545888,   8.32075374,
       -36.29320492, -42.50740983, -25.90461737, -34.98965016,
        -8.42018324, -50.04725465, -51.48215611,  19.88957373,
        -8.91894758, -31.99350749, -11.66107835,  -0.47

In [19]:
# Decision scores of the first ten test samples only.
log_reg.decision_function(X_test)[:10]

array([-22.05705181, -33.02949573, -16.21340238, -80.37917009,
       -48.25125209, -24.54010044, -44.39166152, -25.04298784,
        -0.97831701, -19.71745321])

In [21]:
# Predicted labels of the first ten test samples, to compare against their decision scores.
log_reg.predict(X_test)[:10]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
'''通过对比上面两个结果可以看出：decision_function 的值小于 0 时，预测结果为 0'''

In [None]:
'''我们可以通过decision_function 来实现我们自己的基于不同的阈值来进行分类的结果'''

In [22]:
'''First, store the decision_function values so they can be thresholded manually.'''
# NOTE: `decision_caores` looks like a misspelling of "decision_scores"; the name is
# left unchanged here because the following cells reference it.
decision_caores = log_reg.decision_function(X_test)


In [23]:
# Lowest decision score among the test samples.
decision_caores.min()

-85.68617092628044

In [24]:
# Highest decision score among the test samples.
decision_caores.max()

19.889573725637934

In [33]:
'''Set the threshold to 5: scores below 5 are classified as 0, scores of 5 or above as 1.'''
# The comparison yields a boolean mask; casting it to int gives the 0/1 labels.
y_predict2 = (decision_caores >= 5).astype('int')

In [28]:
'''First, look at the confusion matrix for the threshold-5 predictions.'''
from sklearn.metrics import confusion_matrix
confusion_matrix(y_true=y_test, y_pred=y_predict2)

array([[404,   1],
       [ 21,  24]], dtype=int64)

In [29]:
'''Precision of the threshold-5 predictions.'''
precision_score(y_true=y_test, y_pred=y_predict2)

0.95999999999999996

In [30]:
'''Recall of the threshold-5 predictions.'''
recall_score(y_true=y_test, y_pred=y_predict2)

0.53333333333333333

In [32]:
'''Set the threshold to -5: scores below -5 are classified as 0, scores of -5 or above as 1.'''
# The comparison yields a boolean mask; casting it to int gives the 0/1 labels.
y_predict3 = (decision_caores >= -5).astype('int')

In [34]:
# Confusion matrix for the predictions made with threshold -5.
confusion_matrix(y_true=y_test, y_pred=y_predict3)

array([[390,  15],
       [  5,  40]], dtype=int64)

In [35]:
# Precision of the predictions made with threshold -5.
precision_score(y_true=y_test, y_pred=y_predict3)

0.72727272727272729

In [36]:
# Recall of the predictions made with threshold -5.
recall_score(y_true=y_test, y_pred=y_predict3)

0.88888888888888884

In [None]:
'''随着阈值降低，召回率越来越高，精准率越来越低'''

In [57]:
'''Set the threshold to 18: scores below 18 are classified as 0, scores of 18 or above as 1.'''
# The comparison yields a boolean mask; casting it to int gives the 0/1 labels.
y_predict4 = (decision_caores >= 18).astype('int')

In [58]:
# Confusion matrix for the predictions made with threshold 18.
confusion_matrix(y_true=y_test, y_pred=y_predict4)

array([[405,   0],
       [ 43,   2]], dtype=int64)

In [59]:
# Precision of the predictions made with threshold 18.
precision_score(y_true=y_test, y_pred=y_predict4)

1.0

In [60]:
# Recall of the predictions made with threshold 18.
recall_score(y_true=y_test, y_pred=y_predict4)

0.044444444444444446