## 糖尿病分类
要求:采用5折交叉验证，分别用log似然损失和正确率，对Logistic回归模型的正则超参数调优。
特征工程见:1_FE_diabetes.ipynb

In [1]:
import numpy  as np
import matplotlib.pyplot as plt
import seaborn as sn

import pandas as pd

### 1.读取数据

In [3]:
# Load the feature-engineered Pima Indians diabetes table and preview the
# first five rows (5 is the default row count of DataFrame.head).
train = pd.read_csv(filepath_or_buffer='FE_pima-indians-diabetes.csv')
train.head(n=5)

Unnamed: 0,pregnants,Plasma_glucose_concentration,blood_pressure,Triceps_skin_fold_thickness,serum_insulin,BMI,Diabetes_pedigree_function,Age,Target
0,0.639947,0.866045,-0.03199,0.670643,-0.181541,0.166619,0.468492,1.425995,1
1,-0.844885,-1.205066,-0.528319,-0.012301,-0.181541,-0.8522,-0.365061,-0.190672,0
2,1.23388,2.016662,-0.693761,-0.012301,-0.181541,-1.3325,0.604397,-0.105584,1
3,-0.844885,-1.073567,-0.528319,-0.695245,-0.540642,-0.633881,-0.920763,-1.041549,0
4,-1.141852,0.504422,-2.679076,0.670643,0.316566,1.549303,5.484909,-0.020496,1


### 2.数据切割

In [10]:
from sklearn.model_selection import train_test_split

# Feature matrix: every column except the label.
X = train.drop(labels=['Target'], axis=1)
# Label vector.
y = train['Target']
# Keep the feature names for the weight table built later.
columns_name = X.columns

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2)

### 3. 训练数据

In [24]:
# Logistic regression classifier
from sklearn.linear_model import LogisticRegression

# Build the model with its default configuration and train it in one step;
# fit() returns the estimator itself.
lr = LogisticRegression().fit(X_train, y_train)

# Hard class predictions on the held-out test set
y_pred_test_lr = lr.predict(X_test)

In [14]:
# Inspect the learned feature weights, largest first.
# lr.coef_ is 2-D (one row per class boundary), so flatten it: otherwise each
# cell of the "coef" column holds a one-element array instead of a number,
# which leaves the column as object dtype and is awkward to sort or plot.
fs = pd.DataFrame({"columns": list(columns_name), "coef": lr.coef_.ravel()})
fs.sort_values(by=['coef'], ascending=False)

Unnamed: 0,coef,columns
1,[0.8450167931818476],Plasma_glucose_concentration
5,[0.4974677447841612],BMI
7,[0.32418580077435705],Age
0,[0.2004541490190203],pregnants
6,[0.16806422103761665],Diabetes_pedigree_function
3,[0.1130747438391962],Triceps_skin_fold_thickness
4,[-0.024811872284968724],serum_insulin
2,[-0.0501715234151964],blood_pressure


### 评价模型性能

In [28]:
from sklearn.metrics import r2_score, log_loss
# 评价模型性能
#logloss 损失评价
print "log loss on the test is ", log_loss(y_test, y_pred_test_lr)

# 正确率评价
print "正确率评价:", lr.score(X_test, y_test)

log loss on the test is  8.522643481577399
正确率评价: 0.7532467532467533


In [29]:
# 用5折交叉验证
from sklearn.model_selection import cross_val_score
loss = cross_val_score(lr, X_train, y_train, cv=5, scoring="neg_log_loss")

print "logloss of each fold is ", -loss

logloss of each fold is  [0.51946855 0.44590087 0.48482846 0.48318246 0.46157091]


### 4. 参数调优

### logloss 正则参数调优

In [56]:
from sklearn.model_selection import GridSearchCV

# Search space: penalty type and inverse regularisation strength C
penaltys = ["l1", "l2"]
Cs = [0.1, 1, 10, 100, 1000]
tuned_para = {"penalty": penaltys, "C": Cs}

# Use the liblinear solver
lr.set_params(solver="liblinear")

# Exhaustive 5-fold search, ranked by negated log loss
lr_cv = GridSearchCV(estimator=lr, param_grid=tuned_para,
                     scoring='neg_log_loss', cv=5)
lr_cv.fit(X_train, y_train)

# Predict the test set with the refitted best estimator
lr_cv.predict(X_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0,
       0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0])

In [57]:
print 'best score:', -lr_cv.best_score_
print 'best_params:', lr_cv.best_params_

best score: 0.47873419003005585
best_params: {'penalty': 'l1', 'C': 1}


### 超参数调优

### GridSearchCV 和 LogisticRegression 都没有 alphas 参数：Logistic 回归的正则强度由 C（正则系数的倒数）控制，细化搜索范围需通过 GridSearchCV 的 param_grid 设置

In [58]:
# 细化参数范围
#alphas_to_test = np.linspace(0.1, )
print lr_cv.get_params().keys()
lr_cv.set_params(alphas=np.array([0.1, 0.5, 0.9, 1.3 , 1.7]))
# 再训练
lr_cv.fit(X_train, y_train)

# 预测
y_test_pred_lr = lr_cv.predict(X_test)

print "the best alpha is: ", gird.alphas

['n_jobs', 'verbose', 'estimator__penalty', 'param_grid', 'cv', 'scoring', 'estimator__verbose', 'pre_dispatch', 'estimator__intercept_scaling', 'fit_params', 'estimator__max_iter', 'estimator__warm_start', 'estimator__solver', 'refit', 'iid', 'estimator__dual', 'estimator__fit_intercept', 'estimator__n_jobs', 'estimator__class_weight', 'estimator__C', 'estimator__random_state', 'return_train_score', 'estimator', 'error_score', 'estimator__multi_class', 'estimator__tol']


ValueError: Invalid parameter alphas for estimator GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.1, 1, 10, 100, 1000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_log_loss', verbose=0). Check the list of available parameters with `estimator.get_params().keys()`.

### 正确率调优

In [59]:
from sklearn.model_selection import GridSearchCV

# Search space: penalty type and inverse regularisation strength C
penaltys = ['l1', 'l2']
Cs = [0.001, 0.1, 1, 100]
tuned_para = {'penalty': penaltys, 'C': Cs}

# Use the liblinear solver
lr.set_params(solver='liblinear')

# Exhaustive 5-fold search, this time ranked by classification accuracy
lr_cv = GridSearchCV(estimator=lr, param_grid=tuned_para,
                     scoring='accuracy', cv=5)
lr_cv.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.001, 0.1, 1, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [60]:
print "best score :", lr_cv.best_score_
print "best params: ", lr_cv.best_params_

best score : 0.7671009771986971
best params:  {'penalty': 'l1', 'C': 1}
