In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV,train_test_split,GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost.sklearn import XGBClassifier

In [4]:
# data prep from previous module
file=r'C:\Users\Hanfi\Documents\Python\Data Sets\census_income.csv'

ci=pd.read_csv(file)


In [5]:
# there is perfect correspondance between education and education.num, we'll drop education
ci.drop('education',axis=1,inplace=True)

# convert target Y to 1,0
ci['Y']=(ci['Y']==' >50K').astype(int)

In [6]:
cat_cols=ci.select_dtypes(['object']).columns
print(cat_cols)

Index(['workclass', 'marital.status', 'occupation', 'relationship', 'race',
       'sex', 'native.country'],
      dtype='object')


In [7]:
for col in cat_cols:
    freqs=ci[col].value_counts()
    selected_cats=freqs.index[freqs>100][:-1]
    
    print(col)
    for cat in selected_cats:
        name=col+'_'+cat
        
        ci[name]=(ci[col]==cat).astype(int)
    del ci[col]
    

workclass
marital.status
occupation
relationship
race
sex
native.country


In [6]:
ci.shape

(32561, 48)

In [8]:
x_train = ci.drop('Y',axis=1)

y_train = ci['Y']

# Gradient Boosting Machine

In [15]:
gbm_params = {'n_estimators' : [80,90,100,110,120],
'learning_rate' : [0.2,0.3,0.4,0.5,0.6],
'max_depth' : [2,3],
'subsample' : [0.5,0.8,1],
'max_features' : [18,19,20,21,22] }

In [16]:
gbm = GradientBoostingClassifier()

In [21]:
random_search = RandomizedSearchCV(gbm,scoring='roc_auc',param_distributions=gbm_params, cv=5, n_iter=300, n_jobs=-1)

In [22]:
random_search.fit(x_train,y_train)

RandomizedSearchCV(cv=5, error_score='raise',
          estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False),
          fit_params=None, iid=True, n_iter=300, n_jobs=-1,
          param_distributions={'n_estimators': [80, 90, 100, 110, 120], 'learning_rate': [0.2, 0.3, 0.4, 0.5, 0.6], 'max_depth': [2, 3], 'subsample': [0.5, 0.8, 1], 'max_features': [18, 19, 20, 21, 22]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring='roc_auc', verbose=0)

In [23]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.5f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [24]:
report(random_search.cv_results_,5)

Model with rank: 1
Mean validation score: 0.926 (std: 0.00180)
Parameters: {'subsample': 1, 'n_estimators': 120, 'max_features': 18, 'max_depth': 3, 'learning_rate': 0.4}

Model with rank: 2
Mean validation score: 0.926 (std: 0.00214)
Parameters: {'subsample': 1, 'n_estimators': 110, 'max_features': 22, 'max_depth': 3, 'learning_rate': 0.4}

Model with rank: 3
Mean validation score: 0.926 (std: 0.00249)
Parameters: {'subsample': 1, 'n_estimators': 120, 'max_features': 19, 'max_depth': 3, 'learning_rate': 0.3}

Model with rank: 4
Mean validation score: 0.926 (std: 0.00168)
Parameters: {'subsample': 1, 'n_estimators': 100, 'max_features': 22, 'max_depth': 3, 'learning_rate': 0.4}

Model with rank: 5
Mean validation score: 0.926 (std: 0.00228)
Parameters: {'subsample': 1, 'n_estimators': 110, 'max_features': 18, 'max_depth': 3, 'learning_rate': 0.4}



# XGBoost

In [9]:
xgb_params = {  "learning_rate":[0.01,0.05,0.1,0.3,0.5],
        "gamma":[i/10.0 for i in range(0,5)],
        "max_depth":[2,3,4,5,6,7,8],
        "min_child_weight":[1,2,5,10],
        "max_delta_step":[0,1,2,5,10],
        "subsample":[i/10.0 for i in range(5,10)],
        "colsample_bytree":[i/10.0 for i in range(5,10)],
        "colsample_bylevel":[i/10.0 for i in range(5,10)],
        "reg_lambda":[1e-5,1e-2,0.1,1,100],
        "reg_alpha":[1e-5,1e-2,0.1,1,100],
        "scale_pos_weight":[1,2,3,4,5,6,7,8,9],
        "n_estimators":[100,500,700,10000]}

In [10]:
xgb = XGBClassifier(objective = 'binary:logistic')

In [11]:
n_iter = 10
random_search = RandomizedSearchCV(xgb,n_jobs=-1,cv=5,n_iter=n_iter,scoring='roc_auc',param_distributions=xgb_params)

In [13]:
random_search.fit(x_train,y_train)

KeyboardInterrupt: 

In [None]:
random_search.best_estimator_

In [None]:
report(random_search.cv_results_,5)