## 任务5 模型调优

内容：记使用网格搜索法对5个模型进行调优（调参时采用五折交叉验证的方式），并进行模型评估，记得展示代码的运行结果

In [13]:
import numpy as np
import pandas as pd
import matplotlib as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## 5.1 导入数据

In [2]:
df5 = pd.read_csv('./data_clean.csv')
df5.head()

Unnamed: 0,history_fail_fee,trans_fail_top_count_enum_last_1_month,loans_score,apply_score,latest_one_month_fail,loans_overdue_count,max_cumulative_consume_later_1_month,trans_amount_3_month,repayment_capability,latest_query_day,...,rank_trad_1_month,consume_top_time_last_1_month,latest_six_month_loan,latest_one_month_suc,latest_six_month_apply,trans_activity_month,transd_mcc,latest_three_month_apply,take_amount_in_later_12_month_highest,status
0,7.0,1.0,552.0,583.0,0.0,2.0,2170,34030,19890,12.0,...,0.4,4.0,13.0,1.0,8.0,0.55,17.0,5.0,0,1
1,4.0,0.0,635.0,653.0,1.0,0.0,2100,10590,16970,4.0,...,0.35,13.0,8.0,2.0,8.0,1.0,19.0,6.0,2000,0
2,2.0,3.0,633.0,654.0,1.0,0.0,0,5710,9710,2.0,...,1.0,0.0,4.0,1.0,14.0,1.0,13.0,5.0,0,1
3,26.0,1.0,542.0,595.0,0.0,4.0,8140,91690,6210,2.0,...,0.15,6.0,34.0,2.0,17.0,0.57,22.0,16.0,2000,0
4,25.0,3.0,479.0,541.0,0.0,6.0,1000,9770,11150,22.0,...,0.65,0.0,10.0,0.0,9.0,1.0,13.0,8.0,0,1


In [4]:
feature = df5.iloc[:, :-1]
label = df5.iloc[:, -1]

In [5]:
feature.shape

(4455, 50)

In [6]:
label.shape

(4455,)

In [10]:
# 拆分训练集和测试集
X_train,X_test,y_train,y_test = train_test_split(feature,label,test_size = 0.3,random_state=2018)

In [11]:
# 数据标准化处理
std_scaler = StandardScaler()
X_train = std_scaler.fit_transform(X_train)
X_test = std_scaler.fit_transform(X_test)

In [12]:
print("Size of training set:{} size of testing set:{}".format(X_train.shape[0],X_test.shape[0]))

Size of training set:3118 size of testing set:1337


## 5.2 模型调优

* **Grid Search（网格搜索）**：一种调参手段；穷举搜索：在所有候选的参数选择中，通过循环遍历，尝试每一种可能性，表现最好的参数就是最终的结果。其原理就像是在数组里找最大值。


* **Cross Validation（交叉验证）**：交叉验证用于评估模型的预测性能,尤其是训练好的模型在新数据上的表现,可以在一定程度上减少偶然性，减小过拟合。


* **Grid Search with Cross Validation**：一种结合了网格搜索和交叉验证的参数评价方法（利用sklearn中的类，GridSearchCV）。

In [22]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

In [24]:
# SVM
param_grid = {"gamma":[0.001,0.01,0.1,1,10,100],
             "C":[0.001,0.01,0.1,1,10,100]}
print("Parameters:{}".format(param_grid))

grid_search = GridSearchCV(SVC(),param_grid,cv=5)
grid_search.fit(X_train,y_train)

print("Test set score:{:.2f}".format(grid_search.score(X_test,y_test)))
print("Best parameters:{}".format(grid_search.best_params_))
print("Best score on train set:{:.2f}".format(grid_search.best_score_))

Parameters:{'gamma': [0.001, 0.01, 0.1, 1, 10, 100], 'C': [0.001, 0.01, 0.1, 1, 10, 100]}
Test set score:0.80
Best parameters:{'C': 100, 'gamma': 0.001}
Best score on train set:0.79


In [26]:
# 逻辑回归
param_grid = [
    {
        "C":[0.001,0.01,0.1,1,10,100],
        "penalty":['l2'],
        "tol":[1e-4,1e-5,1e-6]
    },
    {
        "C":[0.001,0.01,0.1,1,10,100],
        "penalty":['l1'],
        "tol":[1e-4,1e-5,1e-6]
    }
]
print("Parameters:{}".format(param_grid))

grid_search = GridSearchCV(LogisticRegression(),param_grid,cv=5)
grid_search.fit(X_train,y_train)

print("Test set score:{:.2f}".format(grid_search.score(X_test,y_test)))
print("Best parameters:{}".format(grid_search.best_params_))
print("Best score on train set:{:.2f}".format(grid_search.best_score_))

Parameters:[{'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l2'], 'tol': [0.0001, 1e-05, 1e-06]}, {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'penalty': ['l1'], 'tol': [0.0001, 1e-05, 1e-06]}]
Test set score:0.79
Best parameters:{'C': 0.1, 'penalty': 'l2', 'tol': 0.0001}
Best score on train set:0.80


In [27]:
# 决策树
param_grid = {"max_depth":[m for m in range(5,10)],
             "class_weight":['balanced',None]}
print("Parameters:{}".format(param_grid))

grid_search = GridSearchCV(DecisionTreeClassifier(),param_grid,cv=5)
grid_search.fit(X_train,y_train)

print("Test set score:{:.2f}".format(grid_search.score(X_test,y_test)))
print("Best parameters:{}".format(grid_search.best_params_))
print("Best score on train set:{:.2f}".format(grid_search.best_score_))

Parameters:{'max_depth': [5, 6, 7, 8, 9], 'class_weight': ['balanced', None]}
Test set score:0.79
Best parameters:{'class_weight': None, 'max_depth': 5}
Best score on train set:0.76


In [None]:
# XGboost
param_grid = {"max_depth":[10,30,50],
             "min_child_weight" : [1,3,6],
             "n_estimators": [200],
             "learning_rate": [0.05, 0.1,0.16]}
print("Parameters:{}".format(param_grid))

grid_search = GridSearchCV(XGBClassifier(),param_grid,cv=5)
grid_search.fit(X_train,y_train)

print("Test set score:{:.2f}".format(grid_search.score(X_test,y_test)))
print("Best parameters:{}".format(grid_search.best_params_))
print("Best score on train set:{:.2f}".format(grid_search.best_score_))

Parameters:{'max_depth': [10, 30, 50], 'min_child_weight': [1, 3, 6], 'n_estimators': [200], 'learning_rate': [0.05, 0.1, 0.16]}


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


In [None]:
# 随机森林
param_grid = {"n_estimators":[50,120,160,200,250],
             "max_depth":[1,2,3,5,7,9,11,13],
             "min_samples_split":[100,120,150,180,200,300]}
print("Parameters:{}".format(param_grid))

grid_search = GridSearchCV(RandomForestClassifier(),param_grid,cv=5)
grid_search.fit(X_train,y_train)

print("Test set score:{:.2f}".format(grid_search.score(X_test,y_test)))
print("Best parameters:{}".format(grid_search.best_params_))
print("Best score on train set:{:.2f}".format(grid_search.best_score_))