# Improving a gradient boosting classifier

In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

%matplotlib inline
sns.set_style('whitegrid')

In [15]:
df = pd.read_csv((
    "https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/"
    "master/ESS_practice_data/ESSdata_Thinkful.csv")).dropna()
df.head()

Unnamed: 0,cntry,idno,year,tvtot,ppltrst,pplfair,pplhlp,happy,sclmeet,sclact,gndr,agea,partner
0,CH,5.0,6,3.0,3.0,10.0,5.0,8.0,5.0,4.0,2.0,60.0,1.0
1,CH,25.0,6,6.0,5.0,7.0,5.0,9.0,3.0,2.0,2.0,59.0,1.0
2,CH,26.0,6,1.0,8.0,8.0,8.0,7.0,6.0,3.0,1.0,24.0,2.0
3,CH,28.0,6,4.0,6.0,6.0,7.0,10.0,6.0,2.0,2.0,64.0,1.0
4,CH,29.0,6,5.0,6.0,7.0,5.0,8.0,7.0,2.0,2.0,55.0,1.0


In [3]:
df.cntry.value_counts()

ES    2292
SE    1726
CH    1475
NO    1420
CZ    1207
DE      27
Name: cntry, dtype: int64

In [16]:
df['partner'] = df.partner - 1
df['gndr'] = df.gndr - 1

#### Notes: Dummies

We can generate dummy features for the countries listed.

In [17]:
df = pd.concat([df, pd.get_dummies(df.cntry, prefix='cntry')], 1).drop('cntry', 1)

In [18]:
df.head()

Unnamed: 0,idno,year,tvtot,ppltrst,pplfair,pplhlp,happy,sclmeet,sclact,gndr,agea,partner,cntry_CH,cntry_CZ,cntry_DE,cntry_ES,cntry_NO,cntry_SE
0,5.0,6,3.0,3.0,10.0,5.0,8.0,5.0,4.0,1.0,60.0,0.0,1,0,0,0,0,0
1,25.0,6,6.0,5.0,7.0,5.0,9.0,3.0,2.0,1.0,59.0,0.0,1,0,0,0,0,0
2,26.0,6,1.0,8.0,8.0,8.0,7.0,6.0,3.0,0.0,24.0,1.0,1,0,0,0,0,0
3,28.0,6,4.0,6.0,6.0,7.0,10.0,6.0,2.0,1.0,64.0,0.0,1,0,0,0,0,0
4,29.0,6,5.0,6.0,7.0,5.0,8.0,7.0,2.0,1.0,55.0,0.0,1,0,0,0,0,0


In [20]:
X = df.drop('partner', 1)
y = df.partner

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## GridSearchCV

#### Let's try several hyperparameters and see how much the predictions react to them.

In [62]:
params = {'learning_rate': [0.1, 0.33], 'subsample': [0.25, 0.33, 1], 
          'n_estimators': [100, 300], 'max_depth': [2, 3, 4]}
gbc_clf = GradientBoostingClassifier()
search = GridSearchCV(gbc_clf, params, n_jobs=-1)
search.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'learning_rate': [0.1, 0.33], 'subsample': [0.25, 0.33, 1], 'n_estimators': [100, 300], 'max_depth': [2, 3, 4]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [63]:
search.best_params_

{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.33}

In [64]:
search.best_score_

0.7552554856529078

In [65]:
search.grid_scores_

[mean: 0.75004, std: 0.00237, params: {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 100, 'subsample': 0.25},
 mean: 0.75234, std: 0.00388, params: {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 100, 'subsample': 0.33},
 mean: 0.74988, std: 0.00212, params: {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 100, 'subsample': 1},
 mean: 0.74467, std: 0.00289, params: {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 300, 'subsample': 0.25},
 mean: 0.75203, std: 0.00571, params: {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 300, 'subsample': 0.33},
 mean: 0.75326, std: 0.00310, params: {'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 300, 'subsample': 1},
 mean: 0.75403, std: 0.00163, params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.25},
 mean: 0.75526, std: 0.00218, params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.33},
 mean: 0.75464, std: 0.00060, params: {'learning_rate': 0.1, '

In [66]:
new_gbc_clf = GradientBoostingClassifier(learning_rate=0.1, max_depth=3, n_estimators=100, subsample=0.33)
new_gbc_clf.fit(X_train, y_train).score(X_test, y_test)

0.7429447852760737

In [67]:
confusion_matrix(y_test, new_gbc_clf.fit(X_train, y_train).predict(X_test))

array([[918, 106],
       [309, 297]], dtype=int64)

In [68]:
print('Type I error: {}'.format(confusion_matrix(y_test, new_gbc_clf.fit(X_train, y_train).predict(X_test))[0][1] / len(y_test)))
print('Type II error: {}'.format(confusion_matrix(y_test, new_gbc_clf.fit(X_train, y_train).predict(X_test))[1][0] / len(y_test)))

Type I error: 0.06134969325153374
Type II error: 0.19631901840490798


#### Notes: GridSearchCV

After trying several different hyperparameters, the scores don't seem to be changing very much at all - they sit at roughly 0.70 to 0.75. We picked the best results it returned.