In [1]:
import pandas as pd 
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [2]:
file='G:/EdData/census_income.csv'

ci=pd.read_csv(file)
ci.drop(['education'],axis=1,inplace=True)
ci['Y']=(ci['Y']==' >50K').astype(int)
cat_cols=ci.select_dtypes(['object']).columns

for col in cat_cols:
    freqs=ci[col].value_counts()
    k=freqs.index[freqs>500][:-1]
    for cat in k:
        name=col+'_'+cat
        ci[name]=(ci[col]==cat).astype(int)
    del ci[col]

x=ci.drop(['Y'],1)
y=ci['Y']
    

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

In [5]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [6]:
from sklearn.model_selection import cross_val_score

In [7]:
def acc_model(params):
    clf = RandomForestClassifier(**params)
    return cross_val_score(clf, x_train, y_train).mean()

In [8]:
x_train.shape

(26048, 38)

In [9]:
param_space = {
    'max_depth': hp.choice('max_depth', range(1,20)),
    'max_features': hp.choice('max_features', range(1,30)),
    'n_estimators': hp.choice('n_estimators', range(100,500)),
    'criterion': hp.choice('criterion', ["gini", "entropy"])}

In [10]:
best = 0
def f(params):
    global best
    acc = acc_model(params)
    if acc > best:
        best = acc
    print ('new best:', best, params)
    return {'loss': -acc, 'status': STATUS_OK}

In [12]:
trials = Trials()
best = fmin(f, param_space, algo=tpe.suggest, max_evals=3, trials=trials)
print ('best:')
print (best)

new best:                                                                                                              
0.8636362577835719                                                                                                     
{'criterion': 'gini', 'max_depth': 15, 'max_features': 6, 'n_estimators': 308}                                         
new best:                                                                                                              
0.8636362577835719                                                                                                     
{'criterion': 'gini', 'max_depth': 8, 'max_features': 24, 'n_estimators': 303}                                         
new best:                                                                                                              
0.8636362577835719                                                                                                     
{'criterion': 'entropy', 'max_depth': 10

In [13]:
rf=RandomForestClassifier(**{'criterion': 'gini', 'max_depth': 14, 'max_features': 5, 'n_estimators': 208})

In [14]:
rf.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=14, max_features=5,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=208,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [15]:
from sklearn.metrics import roc_auc_score

In [16]:
roc_auc_score(y_test,rf.predict_proba(x_test)[:,1])

0.9197164009257246