In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from sklearn.model_selection import cross_val_score
from numpy.core.umath_tests import inner1d
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.externals import joblib
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from numpy.core.umath_tests import inner1d
from sklearn.svm import SVC

  import sys


# 1 比较模型

挑选几种备选模型，用交叉验证计算各自的效果，筛选出合适的模型。这里只比较决策树、随机森林和SVM。这个挑选步骤只是粗选，因此模型的参数只需设定为默认值即可。

In [2]:
# Load the previously saved training data.
# Column 0 of each CSV is dropped (presumably the index column written when the
# files were saved -- TODO confirm against the notebook that produced them).
train_X = np.delete(pd.read_csv('train_X.csv').values, obj=0, axis=1)
print(train_X.shape)
train_Y = np.delete(pd.read_csv('train_Y.csv').values, obj=0, axis=1)
print(train_Y.shape)

(889, 13)
(889, 1)


## 1.1 决策树

In [3]:
def display_scores(scores):
    """Print a set of cross-validation scores with their mean and spread.

    Parameters
    ----------
    scores : object exposing ``.mean()`` and ``.std()``
        The per-fold scores (the notebook passes the array returned by
        ``cross_val_score``).

    Prints three lines: the raw scores, their mean, and their standard
    deviation. Returns ``None``.
    """
    print('Scores:', scores)
    # Each statistic is computed right before it is printed, in this order.
    for label, stat_name in (('Mean:', 'mean'), ('Standard deviation:', 'std')):
        print(label, getattr(scores, stat_name)())
# Baseline 1: a single decision tree with default hyper-parameters.
# random_state is pinned so that re-running the cell reproduces the same scores.
tree_cl=DecisionTreeClassifier(random_state=42)
# 10-fold cross-validation. The labels are flattened to 1-D with ravel(), the
# same way the random forest and SVM cells pass them.
scores=cross_val_score(tree_cl,train_X,train_Y.ravel(),scoring='accuracy',cv=10)
display_scores(scores)

Scores: [0.75280899 0.82022472 0.73033708 0.78651685 0.84269663 0.76404494
 0.82022472 0.76404494 0.78651685 0.85227273]
Mean: 0.7919688457609806
Standard deviation: 0.0384155711161994


## 1.2 随机森林

In [4]:
# Baseline 2: random forest with 500 trees.
# random_state is pinned so that re-running the cell reproduces the same scores
# and the model comparison does not shift from run to run.
forest_cl=RandomForestClassifier(n_estimators=500,random_state=42)
# 10-fold cross-validation on accuracy, with the labels flattened to 1-D.
scores=cross_val_score(forest_cl,train_X,train_Y.ravel(),scoring='accuracy',cv=10)
display_scores(scores)

Scores: [0.71910112 0.80898876 0.76404494 0.83146067 0.88764045 0.82022472
 0.80898876 0.7752809  0.78651685 0.84090909]
Mean: 0.8043156281920327
Standard deviation: 0.04399862284858091


## 1.3 SVM 

In [5]:
# Baseline 3: support vector classifier with an RBF kernel; all other
# hyper-parameters are left at the library defaults.
svc_cl = SVC(kernel='rbf')
# 10-fold cross-validation on accuracy, with the labels flattened to 1-D.
scores = cross_val_score(
    estimator=svc_cl,
    X=train_X,
    y=train_Y.ravel(),
    scoring='accuracy',
    cv=10,
)
display_scores(scores)

Scores: [0.79775281 0.83146067 0.76404494 0.86516854 0.83146067 0.79775281
 0.79775281 0.78651685 0.86516854 0.80681818]
Mean: 0.8143896833503576
Standard deviation: 0.031526169608360424


从交叉验证的平均准确度来看，SVM的效果略好一点，所以我们选择SVM作为训练模型。

# 2 调整模型

已经选定了SVM作为应用模型，那么下面运用网格搜索调整超参数，让模型达到最佳状态

In [6]:
# Re-read the training data from disk, dropping column 0 of each file.
# The labels are flattened to a 1-D vector here.
train_X = np.delete(pd.read_csv('train_X.csv').values, obj=0, axis=1)
train_Y = np.delete(pd.read_csv('train_Y.csv').values, obj=0, axis=1).ravel()

# One sub-grid per kernel; the linear kernel has no gamma entry.
rbf_grid = {'kernel': ['rbf'], 'gamma': [0.1, 1], 'C': [0.1, 1, 10]}
linear_grid = {'kernel': ['linear'], 'C': [0.1, 1]}
poly_grid = {'kernel': ['poly'], 'C': [0.1, 1], 'gamma': [0.1, 1]}
param_grid = [rbf_grid, linear_grid, poly_grid]

# Exhaustive search over every combination, scored by 5-fold CV accuracy.
svc = SVC()
grid_search = GridSearchCV(
    estimator=svc,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(train_X, train_Y)
print(grid_search.best_params_)

# List the mean CV score next to each candidate parameter set.
cvres = grid_search.cv_results_
for idx in range(len(cvres['params'])):
    print(cvres['mean_test_score'][idx], cvres['params'][idx])

{'C': 1, 'gamma': 0.1, 'kernel': 'poly'}
0.7874015748031497 {'C': 0.1, 'gamma': 0.1, 'kernel': 'rbf'}
0.7266591676040495 {'C': 0.1, 'gamma': 1, 'kernel': 'rbf'}
0.8143982002249719 {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
0.7986501687289089 {'C': 1, 'gamma': 1, 'kernel': 'rbf'}
0.81214848143982 {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
0.8031496062992126 {'C': 10, 'gamma': 1, 'kernel': 'rbf'}
0.7862767154105736 {'C': 0.1, 'kernel': 'linear'}
0.7862767154105736 {'C': 1, 'kernel': 'linear'}
0.6287964004499438 {'C': 0.1, 'gamma': 0.1, 'kernel': 'poly'}
0.81214848143982 {'C': 0.1, 'gamma': 1, 'kernel': 'poly'}
0.8188976377952756 {'C': 1, 'gamma': 0.1, 'kernel': 'poly'}
0.7941507311586051 {'C': 1, 'gamma': 1, 'kernel': 'poly'}


通过网格搜索可以看到C=1,gamma=0.1,kernel=poly是最佳参数

# 3 保存模型

使用搜索出的最佳参数构造并训练模型，将模型保存到本地

In [7]:
# Build the final model from the parameters selected by the grid search, rather
# than copying them by hand from its printed output, so the two cannot drift apart.
svc=SVC(**grid_search.best_params_)
svc.fit(train_X,train_Y)
# NOTE: this is accuracy on the training data itself (the model predicts the
# same rows it was fitted on), not an estimate of performance on unseen data.
train_pred=svc.predict(train_X)
print(metrics.accuracy_score(train_Y,train_pred))

0.8222722159730034


In [8]:
# Persist the fitted classifier to 'my_model.pkl' in the working directory.
# The call is left as the cell's last expression so its return value is shown.
joblib.dump(value=svc, filename='my_model.pkl')

['my_model.pkl']