# 皮马印第安人糖尿病数据集（Pima Indians Diabetes）

In [1]:
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Read the comma-separated file into a NumPy array.
DATA_FILE = 'indians-diabetes.csv'
dataset = np.loadtxt(DATA_FILE, delimiter=",")

# Preview the first ten rows.
preview_rows = dataset[:10]
print(preview_rows)

[[6.000e+00 1.480e+02 7.200e+01 3.500e+01 0.000e+00 3.360e+01 6.270e-01
  5.000e+01 1.000e+00]
 [1.000e+00 8.500e+01 6.600e+01 2.900e+01 0.000e+00 2.660e+01 3.510e-01
  3.100e+01 0.000e+00]
 [8.000e+00 1.830e+02 6.400e+01 0.000e+00 0.000e+00 2.330e+01 6.720e-01
  3.200e+01 1.000e+00]
 [1.000e+00 8.900e+01 6.600e+01 2.300e+01 9.400e+01 2.810e+01 1.670e-01
  2.100e+01 0.000e+00]
 [0.000e+00 1.370e+02 4.000e+01 3.500e+01 1.680e+02 4.310e+01 2.288e+00
  3.300e+01 1.000e+00]
 [5.000e+00 1.160e+02 7.400e+01 0.000e+00 0.000e+00 2.560e+01 2.010e-01
  3.000e+01 0.000e+00]
 [3.000e+00 7.800e+01 5.000e+01 3.200e+01 8.800e+01 3.100e+01 2.480e-01
  2.600e+01 1.000e+00]
 [1.000e+01 1.150e+02 0.000e+00 0.000e+00 0.000e+00 3.530e+01 1.340e-01
  2.900e+01 0.000e+00]
 [2.000e+00 1.970e+02 7.000e+01 4.500e+01 5.430e+02 3.050e+01 1.580e-01
  5.300e+01 1.000e+00]
 [8.000e+00 1.250e+02 9.600e+01 0.000e+00 0.000e+00 0.000e+00 2.320e-01
  5.400e+01 1.000e+00]]


In [2]:
# Columns 0-7 are used as input features; column 8 (the last one) is the target.
X = dataset[:, :8]
Y = dataset[:, 8]

# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

# Fit an XGBoost classifier with hand-picked hyperparameters.
model = XGBClassifier(max_depth=3, learning_rate=0.3, gamma=0.1)
model.fit(X_train, y_train)

# Predict on the held-out rows and report accuracy as a percentage.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 80.52%


In [3]:
import warnings
from sklearn.model_selection import GridSearchCV

# Silence every warning raised from here on so the search log stays readable.
warnings.filterwarnings('ignore')

# Candidate values per hyperparameter; the grid search tries every combination.
param_range = {
    'max_depth': [4, 3],
    'learning_rate': [0.3, 0.05, 0.1, 0.5],
    'gamma': [0.1, 0.05, 0.2, 0.3],
    'min_child_weight': [5, 10],
    'n_estimators': [50, 100, 200],
}

# Base estimator with a fixed seed; the hyperparameters listed in the grid
# are overridden for each candidate.
model = XGBClassifier(random_state=27)
print(model)

# 6-fold cross-validated search ranked by ROC AUC. refit=True retrains the
# best candidate on the whole training set so `clf` can be scored directly.
clf = GridSearchCV(
    estimator=model,
    param_grid=param_range,
    scoring='roc_auc',
    cv=6,
    refit=True,
    n_jobs=2,
    verbose=1,
)
clf.fit(X_train, y_train)

# Because scoring='roc_auc' was given, clf.score reports ROC AUC, not accuracy.
train_score = clf.score(X_train, y_train)
test_score = clf.score(X_test, y_test)

print('训练集分数：{:.3f}'.format(train_score))
print('测试集分数：{:.3f}'.format(test_score))
print('最优参数：{}'.format(clf.best_params_))


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic',
       random_state=27, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1)
Fitting 6 folds for each of 192 candidates, totalling 1152 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done 101 tasks      | elapsed:    5.4s
[Parallel(n_jobs=2)]: Done 701 tasks      | elapsed:   25.4s


训练集分数：0.883
测试集分数：0.858
最优参数：{'gamma': 0.05, 'learning_rate': 0.05, 'max_depth': 3, 'min_child_weight': 10, 'n_estimators': 50}


[Parallel(n_jobs=2)]: Done 1149 out of 1152 | elapsed:   40.3s remaining:    0.1s
[Parallel(n_jobs=2)]: Done 1152 out of 1152 | elapsed:   40.4s finished


In [92]:
from scipy.sparse import csr_matrix

# CSR layout: row i owns the entries data[indptr[i]:indptr[i + 1]], and
# indices holds the column of each of those entries.
data = np.array([1, 2, 3, 4, 5, 6])
indices = np.array([0, 2, 2, 0, 1, 2])
indptr = np.array([0, 2, 3, 6])

# Build the 3x3 sparse matrix, then expand it to a dense array for display.
sparse = csr_matrix((data, indices, indptr), shape=(3, 3))
sparse.toarray()


array([[1, 0, 2],
       [0, 0, 3],
       [4, 5, 6]])