In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV
import matplotlib.pylab as plt
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

%matplotlib inline



In [30]:
# Read the prepared CSV and separate the ARSON label from the feature columns.
df_downsampled = pd.read_csv('./df_down.csv')
y = df_downsampled['ARSON']
X = df_downsampled.drop('ARSON', axis=1)

# Fit a 6-component PCA on the features, then project the same rows onto it.
pca = PCA(n_components=6).fit(X)
X_train = pca.transform(X)

In [31]:
# Wrap the PCA output in a DataFrame so it can be concatenated with the label Series.
X_train = pd.DataFrame(data=X_train)

In [32]:
# Sanity check: X_train should now be a pandas DataFrame.
type(X_train)

pandas.core.frame.DataFrame

In [33]:
# Sanity check: y should be a pandas Series before it is joined to X_train.
type(y)

pandas.core.series.Series

In [34]:
# Place the label next to the PCA components; column-wise concat aligns rows on the index.
train = pd.concat(objs=(X_train, y), axis=1)

In [35]:
# Preview the first three rows of the combined components + label frame.
train.head(3)

Unnamed: 0,0,1,2,3,4,5,ARSON
0,207.695617,-3.342275,2.61197,-0.106462,0.023264,-0.171066,0
1,-92.204587,-0.521998,-0.73227,-0.607829,-0.006265,-0.172929,0
2,-91.304603,0.002993,-0.53771,1.153308,0.01524,-0.190643,0


In [36]:
# Sanity check: the concatenation should have produced a DataFrame.
type(train)

pandas.core.frame.DataFrame

In [37]:
# Name of the label column in `train`; modelfit() reads this as a module-level global.
target = 'ARSON'

In [41]:
# Inspect the column labels: the PCA components are labelled with integers,
# the label column with the string 'ARSON'.
train.columns

Index([0, 1, 2, 3, 4, 5, 'ARSON'], dtype='object')

I was deep in the weeds with this, coding everything by hand, when I realized someone had already done this and put it online somewhere. That somewhere was https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

The code below is adapted from that tutorial; I made a lot of changes.

In [38]:
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=6, early_stopping_rounds=50):
    """Fit ``alg`` on ``dtrain`` and report training-set metrics plus feature importances.

    The label column is taken from the module-level ``target`` variable, so
    ``dtrain`` must contain a column with that name.

    Parameters
    ----------
    alg : xgboost sklearn-style classifier
        Estimator to fit. It is modified in place: when ``useTrainCV`` is true
        its ``n_estimators`` is overwritten, and it is always (re)fitted.
    dtrain : pandas.DataFrame
        Frame holding both the predictor columns and the ``target`` column.
    predictors : list
        Column labels of ``dtrain`` to use as features.
    useTrainCV : bool, default True
        If true, run ``xgb.cv`` first and set ``n_estimators`` to the number
        of rows in the cross-validation result.
    cv_folds : int, default 6
        Number of folds passed to ``xgb.cv``.
    early_stopping_rounds : int, default 50
        Early-stopping patience passed to ``xgb.cv``.

    Returns
    -------
    None
        Results are printed and a bar chart of feature importances is drawn.
    """
    features = dtrain[predictors]
    # Use the shared `target` column everywhere; the tutorial's hard-coded
    # 'Disbursed' column does not exist in this data set.
    labels = dtrain[target]

    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(features.values, label=labels.values)
        # NOTE(review): `show_progress` was superseded by `verbose_eval` in later
        # xgboost releases - switch the keyword if the library is upgraded.
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          metrics='auc', early_stopping_rounds=early_stopping_rounds, show_progress=False)
        # One row per boosting round kept by CV -> use that as the tree count.
        alg.set_params(n_estimators=cvresult.shape[0])

    # Fit the algorithm on the data
    alg.fit(features, labels, eval_metric='auc')

    # Predict training set:
    dtrain_predictions = alg.predict(features)
    dtrain_predprob = alg.predict_proba(features)[:, 1]

    # Print model report (scores are on the training data, not held-out data):
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(labels.values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(labels, dtrain_predprob))

    # NOTE(review): `alg.booster()` was renamed `get_booster()` in later xgboost
    # releases - update this call if the library is upgraded.
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

In [None]:
# Every column except the label is a predictor (here: the PCA component columns).
# The previous `train.loc['0':'5']` sliced *rows* by string label, which fails on the
# integer row index, and iterating a DataFrame yields all of its column labels, so the
# label column would have been included as a feature.
predictors = [col for col in train.columns if col != target]

# Baseline classifier; n_estimators is only an upper bound because modelfit()
# replaces it with the round count chosen by cross-validation.
xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=1000,
    max_depth=5,
    min_child_weight=2,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=-1,
    scale_pos_weight=1,
    seed=42,
)
modelfit(xgb1, train, predictors)

In [None]:
# Grid over tree depth and minimum child weight; all other hyper-parameters stay fixed.
# (The comma between the two entries was missing, which made the cell a SyntaxError.)
param_test1 = {
    'max_depth': [3, 4, 5, 6, 7],
    'min_child_weight': [5, 6, 7, 8, 10, 12],
}
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=140,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=-1,
        scale_pos_weight=1,
        seed=42,
    ),
    param_grid=param_test1,
    scoring='roc_auc',
    n_jobs=-1,
    iid=False,
    cv=6,
)
# Pass the label column itself; `target` alone is just the string column name.
gsearch1.fit(train[predictors], train[target])
# Bare tuple as the last expression so the notebook displays all three results.
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

In [None]:
# Grid over gamma only (0.0 to 0.4 in steps of 0.1); all other hyper-parameters stay fixed.
param_test3 = {'gamma': [tenth / 10.0 for tenth in range(5)]}
gsearch3 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=140,
        max_depth=5,
        min_child_weight=6,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test3,
    scoring='roc_auc',
    n_jobs=4,
    iid=False,
    cv=5,
)
gsearch3.fit(train[predictors], train[target])
# Bare tuple as the last expression so the notebook displays all three results.
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_