使用评价指标来画图，进行学习指标的学习
---

In [4]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
# 读入数据
train=pd.read_csv('../data/train_data.csv')
test=pd.read_csv('../data/test_data.csv')

In [3]:
# 训练集与测试集的导入
train_y=train['status']
train_X=train.drop(['status'],axis=1)

test_y=test['status']
test_X=test.drop(['status'],axis=1)

In [5]:
# 数据归一化操作
from sklearn.preprocessing import minmax_scale
train_X=minmax_scale(train_X)
test_X=minmax_scale(test_X)

## 使用模型来拟合数据并且查看准确率

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import *
from sklearn.model_selection import cross_val_score

In [16]:
#  LogisticRegression
lr=LogisticRegression()
lr.fit(train_X,train_y)

# SVM
svm=SVC(probability=True)
svm.fit(train_X,train_y)

# DecisionTree
dt=DecisionTreeClassifier()
dt.fit(train_X,train_y)

# RF
rf=RandomForestClassifier()
rf.fit(train_X,train_y)

# GBDT
gbdt=GradientBoostingClassifier()
gbdt.fit(train_X,train_y)

# KNN
knn=KNeighborsClassifier()
knn.fit(train_X,train_y)


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [17]:
# 当存在多个模型的时候，我们可以一起输出准确率，尝试使用列表的形式
names=["LR","SVM","DecisionTree","RF","KNN","GBDT"]
models=[lr,svm,dt,rf,knn,gbdt]

In [18]:
import numpy as np
df_list=[]
for name,model in zip(names,models):
    y_train_pred=model.predict(train_X)
    y_test_pred=model.predict(test_X)
    
    # accuracy
    train_accuracy=model.score(train_X,y_train_pred)
    test_accuracy=model.score(test_X,y_test_pred)
    
    # precision
    train_precision=precision_score(train_y,y_train_pred)
    test_precision=precision_score(test_y,y_test_pred)
    
    # recall
    train_recall=recall_score(train_y,y_train_pred)
    test_recall=recall_score(test_y,y_test_pred)
    
    # f1
    train_f1=f1_score(train_y,y_train_pred)
    test_f1=f1_score(test_y,y_test_pred)
    
    # auc
    y_train_pred=model.predict_proba(train_X)[:,1]
    y_test_pred=model.predict_proba(test_X)[:,1]
    
    train_auc=roc_auc_score(train_y,y_train_pred)
    test_auc=roc_auc_score(test_y,y_test_pred)
    
    print('{} 训练集： accuracy:{:.3},precision:{:.3}, recall:{:.3}, f1:{:.3}, auc:{:.3}'.format(name,train_accuracy,train_precision,train_recall,train_f1,train_auc))
    print('{} 测试集： accuracy:{:.3},precision:{:.3}, recall:{:.3}, f1:{:.3}, auc:{:.3}'.format(name,test_accuracy,test_precision,test_recall,test_f1,test_auc))
    print('\n')
    df = pd.DataFrame(np.array([train_accuracy,train_precision,train_recall,train_f1,train_auc,test_accuracy,test_precision,test_recall,test_f1,test_auc]).reshape(2,-1),
                  index = ['train','test'],
                  columns = ['Accuracy','Precision','Recall','F1-Score','AUC-Score'])
    df_list.append(df)
pd.concat(df_list,axis=0,keys=names)

LR 训练集： accuracy:1.0,precision:0.748, recall:0.291, f1:0.419, auc:0.801
LR 测试集： accuracy:1.0,precision:0.573, recall:0.536, f1:0.554, auc:0.798


SVM 训练集： accuracy:1.0,precision:0.0, recall:0.0, f1:0.0, auc:0.807
SVM 测试集： accuracy:1.0,precision:1.0, recall:0.0058, f1:0.0115, auc:0.799


DecisionTree 训练集： accuracy:1.0,precision:1.0, recall:1.0, f1:1.0, auc:1.0
DecisionTree 测试集： accuracy:1.0,precision:0.328, recall:0.539, f1:0.408, auc:0.575


RF 训练集： accuracy:1.0,precision:0.997, recall:0.919, f1:0.957, auc:0.999
RF 测试集： accuracy:1.0,precision:0.481, recall:0.4, f1:0.437, auc:0.705


KNN 训练集： accuracy:1.0,precision:0.775, recall:0.39, f1:0.519, auc:0.852
KNN 测试集： accuracy:1.0,precision:0.52, recall:0.27, f1:0.355, auc:0.645


GBDT 训练集： accuracy:1.0,precision:0.855, recall:0.507, f1:0.637, auc:0.926
GBDT 测试集： accuracy:1.0,precision:0.505, recall:0.623, f1:0.558, auc:0.774




Unnamed: 0,Unnamed: 1,Accuracy,Precision,Recall,F1-Score,AUC-Score
LR,train,1.0,0.748322,0.290743,0.418779,0.800847
LR,test,1.0,0.572755,0.536232,0.553892,0.797769
SVM,train,1.0,0.0,0.0,0.0,0.80676
SVM,test,1.0,1.0,0.005797,0.011527,0.799397
DecisionTree,train,1.0,1.0,1.0,1.0,1.0
DecisionTree,test,1.0,0.328042,0.53913,0.407895,0.575376
RF,train,1.0,0.997171,0.919166,0.956581,0.999268
RF,test,1.0,0.480836,0.4,0.436709,0.705348
KNN,train,1.0,0.774611,0.389831,0.518647,0.852384
KNN,test,1.0,0.519553,0.269565,0.354962,0.645343


In [25]:
import matplotlib.pyplot as plt
def draw_roc_curve(train_pre_proba,test_pre_proba,train_auc,test_auc,model_name,num):
    fpr,tpr,roc_auc = train_pre_proba
    test_fpr,test_tpr,test_roc_auc = test_pre_proba
    
    plt.figure()
    lw = 2
    plt.plot(fpr, tpr, color='darkorange',
             lw=lw, label='ROC curve (area = %0.2f)' % train_auc)
    plt.plot(test_fpr, test_tpr, color='red',
         lw=lw, label='ROC curve (area = %0.2f)' %test_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Roc example '+ model_name)
    plt.legend(loc="lower right")
#     plt.savefig("img/five model_{}.png".format(num))
    plt.close()
    
for num,name,model in zip(range(1,7),names,models):
    
    y_train_pred = model.predict_proba(train_X)[:,1]
    y_test_pred = model.predict_proba(test_X)[:,1]


    train_roc = roc_curve(train_y,y_train_pred)
    test_roc = roc_curve(test_y,y_test_pred)

    train_auc = roc_auc_score(train_y,y_train_pred)
    test_auc = roc_auc_score(test_y,y_test_pred)

    draw_roc_curve(train_roc,test_roc,train_auc,test_auc,name,num)

### 使用k折交叉验证法建立模型，然后然后比较结果

In [26]:
from sklearn.model_selection import KFold

def run_cv(X,y,clf_class,**kwargs):
    kf = KFold(n_splits = 5, shuffle = False, random_state = 2018)
    y_pred = y.copy()
    clf = clf_class(**kwargs)
    
    for train_index , test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        clf.fit(X_train,y_train)
        y_pred[test_index] = clf.predict(X_test)
    return y_pred

In [27]:
# 测试几种方式
LR_CV_result = run_cv(train_X,train_y,LogisticRegression)
RF_CV_result = run_cv(train_X,train_y,RandomForestClassifier)
KNN_CV_result = run_cv(train_X,train_y,KNeighborsClassifier)

In [31]:
def accuracy(y_true,y_pred):
    return np.mean(y_true == y_pred)

print ("Logistic Regression (L2 is default): " + str(accuracy(train_y, LR_CV_result)))
print ("Random forest: " + str(accuracy(train_y, RF_CV_result)))
print ("K-nearest-neighbors: " + str(accuracy(train_y, KNN_CV_result)))

Logistic Regression (L2 is default): 0.7958656330749354
Random forest: 0.7677648578811369
K-nearest-neighbors: 0.7458010335917312
