In [1]:
import os
import pandas as pd
os.chdir(r"C:\Users\Nicol\Google Drive\Learning\Jupyter")

## Scaling, Filling NA

In [3]:
# Pre-processing 
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# read the data in
df = pd.read_csv("Data/Diabetes.csv")
# Let's use some weak features as predictors
predictors = ['age','serum_insulin']
target = 'class'
# Most common preprocessing step include label encoding and missing value treatment
from sklearn import preprocessing
for f in df.columns:
    if df[f].dtype=='object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(df[f].values))
        df[f] = lbl.transform(list(df[f].values))
        
df.fillna((-999), inplace=True) # missing value treatment
# Let's use some week features to build the tree
X = df[['age','serum_insulin']] # independent variables
y = df['class'].values # dependent variables

#Normalize
X = StandardScaler().fit_transform(X)
# evaluate the model by splitting into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
random_state=2017)
num_rounds = 100

## XGBOOST

In [4]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

import os
mingw_path = 'C:\\Program Files\\mingw-w64\\x86_64-5.3.0-posix-seh-rt_v4-rev0\\mingw64\\bin'
os.environ['PATH'] = mingw_path + ';' + os.environ['PATH']
import xgboost as xgb
from xgboost.sklearn import XGBClassifier

In [5]:
clf_XGB = XGBClassifier(n_estimators = num_rounds,
objective= 'binary:logistic',
seed=2017)
# use early_stopping_rounds to stop the cv when there is no score imporovement
clf_XGB.fit(X_train,y_train, early_stopping_rounds=20, eval_set=[(X_test,
y_test)], verbose=False)
results = cross_val_score(clf_XGB, X_train,y_train, cv=5)
print("\nxgBoost - CV Train : %.2f" % results.mean())
print("xgBoost - Train : %.2f" % metrics.accuracy_score(clf_XGB.predict(X_train), y_train))
print("xgBoost - Test : %.2f" % metrics.accuracy_score(clf_XGB.predict(X_test), y_test))


xgBoost - CV Train : 0.69
xgBoost - Train : 0.75
xgBoost - Test : 0.69


In [6]:
xgtrain = xgb.DMatrix(X_train, label=y_train, missing=-999)
xgtest = xgb.DMatrix(X_test, label=y_test, missing=-999)
# set xgboost params
param = {'max_depth': 3, # the maximum depth of each tree
'objective': 'binary:logistic'}
clf_xgb_cv = xgb.cv(param, xgtrain, num_rounds,
stratified=True,
nfold=5,
early_stopping_rounds=20,
seed=2017)
print ("Optimal number of trees/estimators is %i" % clf_xgb_cv.shape[0])
watchlist = [(xgtest,'test'), (xgtrain,'train')]
clf_xgb = xgb.train(param, xgtrain,clf_xgb_cv.shape[0], watchlist)
# predict function will produce the probability
# so we'll use 0.5 cutoff to convert probability to class label
y_train_pred = (clf_xgb.predict(xgtrain, ntree_limit=clf_xgb.best_iteration)
> 0.5).astype(int)
y_test_pred = (clf_xgb.predict(xgtest, ntree_limit=clf_xgb.best_iteration) >
0.5).astype(int)
print("XGB - Train : %.2f" % metrics.accuracy_score(y_train_pred, y_train))
print("XGB - Test : %.2f" % metrics.accuracy_score(y_test_pred, y_test))

Optimal number of trees/estimators is 7
[0]	test-error:0.344156	train-error:0.299674
[1]	test-error:0.324675	train-error:0.273616
[2]	test-error:0.272727	train-error:0.281759
[3]	test-error:0.266234	train-error:0.278502
[4]	test-error:0.266234	train-error:0.273616
[5]	test-error:0.311688	train-error:0.254072
[6]	test-error:0.318182	train-error:0.254072
XGB - Train : 0.75
XGB - Test : 0.69
