In [1]:
import pandas as pd 
import numpy as np
from sklearn.ensemble import RandomForestClassifier

In [2]:
file=r'/Users/lalitsachan/Dropbox/0.0 Data/census_income.csv'

ci=pd.read_csv(file)
ci.drop(['education'],axis=1,inplace=True)
ci['Y']=(ci['Y']==' >50K').astype(int)
cat_cols=ci.select_dtypes(['object']).columns

for col in cat_cols:
    freqs=ci[col].value_counts()
    k=freqs.index[freqs>500][:-1]
    for cat in k:
        name=col+'_'+cat
        ci[name]=(ci[col]==cat).astype(int)
    del ci[col]

x=ci.drop(['Y'],1)
y=ci['Y']
    

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

In [5]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [6]:
from sklearn.model_selection import cross_val_score

In [7]:
def acc_model(params):
    clf = RandomForestClassifier(**params)
    return cross_val_score(clf, x_train, y_train).mean()

In [8]:
x_train.shape

(26048, 38)

In [9]:
param_space = {
    'max_depth': hp.choice('max_depth', range(1,20)),
    'max_features': hp.choice('max_features', range(1,30)),
    'n_estimators': hp.choice('n_estimators', range(100,500)),
    'criterion': hp.choice('criterion', ["gini", "entropy"])}

In [10]:
best = 0
def f(params):
    global best
    acc = acc_model(params)
    if acc > best:
        best = acc
    print ('new best:', best, params)
    return {'loss': -acc, 'status': STATUS_OK}

In [11]:
trials = Trials()
best = fmin(f, param_space, algo=tpe.suggest, max_evals=10, trials=trials)
print ('best:')
print (best)

  0%|          | 0/10 [00:00<?, ?it/s, best loss: ?]




new best:                                           
0.853693090005122                                   
{'criterion': 'entropy', 'max_depth': 6, 'max_features': 6, 'n_estimators': 233}
 10%|█         | 1/10 [00:03<00:35,  3.93s/it, best loss: -0.853693090005122]




new best:                                                                    
0.853693090005122                                                            
{'criterion': 'entropy', 'max_depth': 3, 'max_features': 29, 'n_estimators': 316}
 20%|██        | 2/10 [00:13<00:44,  5.55s/it, best loss: -0.853693090005122]




new best:                                                                    
0.8614097106467673                                                           
{'criterion': 'gini', 'max_depth': 16, 'max_features': 13, 'n_estimators': 294}
 30%|███       | 3/10 [00:27<00:57,  8.19s/it, best loss: -0.8614097106467673]




new best:                                                                     
0.8617167843683111                                                            
{'criterion': 'entropy', 'max_depth': 12, 'max_features': 21, 'n_estimators': 370}
 40%|████      | 4/10 [00:53<01:21, 13.60s/it, best loss: -0.8617167843683111]




new best:                                                                     
0.8624462010792854                                                            
{'criterion': 'entropy', 'max_depth': 18, 'max_features': 8, 'n_estimators': 482}
 50%|█████     | 5/10 [01:16<01:21, 16.37s/it, best loss: -0.8624462010792854]




new best:                                                                     
0.8624462010792854                                                            
{'criterion': 'gini', 'max_depth': 16, 'max_features': 22, 'n_estimators': 205}
 60%|██████    | 6/10 [01:33<01:06, 16.53s/it, best loss: -0.8624462010792854]




new best:                                                                     
0.8624462010792854                                                            
{'criterion': 'entropy', 'max_depth': 19, 'max_features': 20, 'n_estimators': 239}
 70%|███████   | 7/10 [01:52<00:52, 17.36s/it, best loss: -0.8624462010792854]




new best:                                                                     
0.8624462010792854                                                            
{'criterion': 'entropy', 'max_depth': 4, 'max_features': 4, 'n_estimators': 386}
 80%|████████  | 8/10 [01:57<00:26, 13.40s/it, best loss: -0.8624462010792854]




new best:                                                                     
0.8624462010792854                                                            
{'criterion': 'entropy', 'max_depth': 7, 'max_features': 16, 'n_estimators': 269}
 90%|█████████ | 9/10 [02:06<00:12, 12.20s/it, best loss: -0.8624462010792854]




new best:                                                                     
0.8624462010792854                                                            
{'criterion': 'gini', 'max_depth': 14, 'max_features': 23, 'n_estimators': 259}
100%|██████████| 10/10 [02:25<00:00, 14.29s/it, best loss: -0.8624462010792854]
best:
{'criterion': 1, 'max_depth': 17, 'max_features': 7, 'n_estimators': 382}


In [12]:
rf=RandomForestClassifier(**{'criterion': 'entropy', 'max_depth': 17, 'max_features': 7, 'n_estimators': 382})

In [13]:
rf.fit(x_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
                       max_depth=17, max_features=7, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=382,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [14]:
from sklearn.metrics import roc_auc_score

In [15]:
roc_auc_score(y_test,rf.predict_proba(x_test)[:,1])

0.917132138950613