In [1]:
__author__ = 'Vladimir Iglovikov'

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score, learning_curve
from sklearn.metrics import mean_absolute_error, make_scorer
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import pickle
import hyperopt.pyll.stochastic
import time



In [2]:
%ls

1114.model                                                params.txt
Santhosh SharmaExploratory study on ML algorithms?.ipynb  sub_v.csv
Vladimir Iglovikovxgb 1114?.py                            submission.csv
[0m[01;34mallstate_capstone-master[0m/                                 test.csv
handtune.ipynb                                            train.csv
hyperopt.ipynb


In [13]:
%cat params.txt

socre:1156.8227052, params:{'reg_alpha': 0.26995003848906524, 'colsample_bytree': 0.2266549861816051, 'scale_pos_weight': 1, 'learning_rate': 0.05344032085694122, 'nthread': 10, 'min_child_weight': 4, 'subsample': 0.48379887781040276, 'seed': 20, 'max_depth': 16, 'gamma': 0.402588613598941}socre:1149.3825198, params:{'reg_alpha': 0.9999978085959014, 'colsample_bytree': 0.2502484001654734, 'scale_pos_weight': 1, 'learning_rate': 0.03509717328153762, 'nthread': 10, 'min_child_weight': 4, 'subsample': 0.6409135720554807, 'seed': 20, 'max_depth': 16, 'gamma': 0.1306661498079924}

In [2]:
# Read the training and test files from the working directory.
train = pd.read_csv('train.csv')

# The test file gets an all-NaN 'loss' column so both frames share one schema.
test = pd.read_csv('test.csv').assign(loss=np.nan)

# Stack train on top of test. Row labels are kept exactly as read (no
# ignore_index), so the labels of the test part repeat those of the train part.
joined = pd.concat([train, test])

In [3]:
def evalerror(preds, dtrain):
    """Evaluation metric for xgboost: mean absolute error after exponentiation.

    Both the predictions and the labels stored in ``dtrain`` are passed
    through ``np.exp`` before the error is computed.

    Parameters
    ----------
    preds : array-like
        Raw model predictions.
    dtrain : object exposing ``get_label()``
        Data container holding the true labels (an ``xgb.DMatrix`` when
        called by xgboost).

    Returns
    -------
    tuple
        ``('mae', value)`` -- the metric name followed by its value.
    """
    truth = dtrain.get_label()
    exp_preds = np.exp(preds)
    exp_truth = np.exp(truth)
    score = mean_absolute_error(exp_preds, exp_truth)
    return 'mae', score

In [4]:
# Integer-encode every categorical (object-dtype) column of `joined`.
#
# A level that occurs in only one of train/test cannot be learned from or
# applied consistently, so such levels are blanked to NaN first; pd.factorize
# then gives all of them the missing-value code -1 while the shared levels get
# codes in sorted order.
for column in train.select_dtypes(include=['object']).columns:
    # Compare the actual level sets, not just their sizes: two columns can have
    # the same number of distinct levels and still disagree on which ones.
    unshared = set(train[column].unique()) ^ set(test[column].unique())

    values = joined[column].values
    if unshared:
        # Work on the raw arrays so nothing depends on aligning the repeated
        # row labels that `joined` carries.
        values = np.where(joined[column].isin(unshared).values, np.nan, values)

    joined[column] = pd.factorize(values, sort=True)[0]

# Split the encoded frame back apart: rows with a known loss are training
# rows, rows whose loss is NaN are the test rows.
train = joined[joined['loss'].notnull()]
test = joined[joined['loss'].isnull()]

# The model is fitted on log(loss + shift); the prediction cell undoes this
# with exp(...) - shift.
shift = 200
y = np.log(train['loss'] + shift)

# Keep the test ids for the submission file.
ids = test['id']

# Feature matrices: everything except the target and the row identifier.
# `axis` is passed by keyword because pandas >= 2.0 no longer accepts it
# positionally in DataFrame.drop.
X = train.drop(['loss', 'id'], axis=1)
X_test = test.drop(['loss', 'id'], axis=1)

xgtrain = xgb.DMatrix(X, label=y)
xgtest = xgb.DMatrix(X_test)

In [18]:
# Wall-clock reference; optf() prints the seconds elapsed since this moment.
time0 = time.time()

# Fold splitter for scikit-learn style cross-validation. No random_state is
# given: without shuffle=True a seed has no effect on the splits, and
# scikit-learn >= 0.24 raises ValueError when one is passed anyway.
cv = StratifiedKFold(n_splits=5)

# Hyperopt search space for the xgboost parameters.
space4xgb = {
    # Tree shape.
    'max_depth': hp.choice('max_depth', range(3, 20)),
    'min_child_weight': hp.choice('min_child_weight', range(1, 10)),

    # Log-uniform between 10**-2.5 and 10**-1.
    'learning_rate': hp.loguniform('learning_rate', -2.5*np.log(10), -1*np.log(10)),

    'scale_pos_weight': hp.uniform('scale_pos_weight', 0, 10),
    'gamma': hp.uniform('gamma', 0, 1),

    # Row / column subsampling and L1 regularisation.
    'subsample': hp.uniform('subsample', 0.1, 0.9),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.1, 1),
    'reg_alpha': hp.uniform('reg_alpha', 0, 1000),

    # Single-option choice, i.e. a constant.
    'nthread': hp.choice('nthread', [10])
}

# Best (lowest) CV score seen so far; optf() only records a parameter set when
# its score is below this value.
minacc = 1160
#{'reg_alpha': 0.8673861702351379, 'colsample_bytree': 0.8759854865376723, 'scale_pos_weight': 1, 'learning_rate': 0.06925571303753118, 'nthread': 20, 'min_child_weight': 1, 'subsample': 0.629375884543008, 'max_depth': 7, 'gamma': 0.19254932932610697}
def optf(params):
    """Hyperopt objective: cross-validated MAE of xgboost for one parameter set.

    Runs 5-fold ``xgb.cv`` on the module-level ``xgtrain`` with ``evalerror``
    as the metric, prints the elapsed time and the score, and appends the
    parameters to ``params2.txt`` whenever the score beats the module-level
    ``minacc`` (which is then updated).

    Parameters
    ----------
    params : dict
        xgboost parameters sampled by hyperopt. The dict is copied, so the
        caller's object is left untouched.

    Returns
    -------
    dict
        ``{'loss': <cv mae>, 'status': STATUS_OK}``. The MAE is returned
        un-negated: hyperopt's ``fmin`` minimises ``'loss'`` and a lower MAE
        is better.
    """
    global minacc

    # Copy before pinning the fixed settings so hyperopt's dict is not mutated.
    params = dict(params)
    params['nthread'] = 10
    params['seed'] = 20

    bst = xgb.cv(params, xgtrain, num_boost_round=500, nfold=5, seed=7,
                 feval=evalerror, early_stopping_rounds=10)
    # Score of the last row of the CV history (with early stopping this is
    # expected to be the best round -- confirm for the xgboost version in use).
    acc = bst['test-mae-mean'].iloc[-1]

    print(time.time() - time0)
    print(acc)

    if acc < minacc:
        minacc = acc
        print('new best: {} {}'.format(minacc, params))

        with open("params2.txt", "a") as text_file:
            text_file.write('socre:{}, params:{}\n'.format(minacc, params))

    return {'loss': acc, 'status': STATUS_OK}

In [None]:
def run_trials(trials_path="/home/phe002/shicheng/dataset/hyperoptModel", trials_step=1):
    """Extend the saved hyperopt search by ``trials_step`` evaluations.

    Loads a pickled ``Trials`` object from ``trials_path`` if one can be read,
    otherwise starts a new search with a budget of one evaluation. After
    ``fmin`` returns, the trials object is pickled back to ``trials_path``.

    Parameters
    ----------
    trials_path : str
        File holding the pickled trials; read at the start, rewritten at the
        end.
    trials_step : int
        Number of evaluations to add on top of the trials already saved.
        With 1 the state is saved after every evaluation.
    """
    max_trials = 1  # budget for a brand-new search; small so the first save comes quickly

    try:
        # NOTE(security): unpickling can execute arbitrary code. Only point
        # trials_path at files written by this function.
        with open(trials_path, "rb") as f:
            trials = pickle.load(f)
    except (IOError, OSError, EOFError, pickle.UnpicklingError) as err:
        # Missing or unreadable file: start over. Anything else (including
        # KeyboardInterrupt) propagates, so an interrupted load can never end
        # with the saved trials being replaced by an empty search.
        print("No usable saved trials ({}); starting a new search".format(err))
        trials = Trials()
    else:
        max_trials = len(trials.trials) + trials_step
        print("Rerunning from {} trials to {} (+{}) trials".format(len(trials.trials), max_trials, trials_step))

    fmin(fn=optf, space=space4xgb, algo=tpe.suggest, max_evals=max_trials, trials=trials)

    # Persist the (extended) trials for the next call.
    with open(trials_path, "wb") as f:
        pickle.dump(trials, f)

# Loop indefinitely; interrupt the kernel to stop. Each run_trials() call
# saves the trials object before returning, so everything up to the last
# completed call is already on disk when the interrupt arrives.
try:
    while True:
        run_trials()
except KeyboardInterrupt:
    print("Hyperparameter search interrupted")

Rerunning from 457 trials to 458 (+1) trials
109.290174961
1157.5361086
new best: 1157.5361086 {'reg_alpha': 0.991470826095777, 'colsample_bytree': 0.1235200182991481, 'scale_pos_weight': 1, 'learning_rate': 0.04127155920388758, 'nthread': 10, 'min_child_weight': 6, 'subsample': 0.24989254385528198, 'seed': 20, 'max_depth': 7, 'gamma': 0.47772585814974006}
Rerunning from 458 trials to 459 (+1) trials
201.662240982
2049.5875246
Rerunning from 459 trials to 460 (+1) trials
291.468131065
1684.0019042
Rerunning from 460 trials to 461 (+1) trials
582.61702013
1150.8644044
new best: 1150.8644044 {'reg_alpha': 0.9999174896084708, 'colsample_bytree': 0.15010562804817196, 'scale_pos_weight': 1, 'learning_rate': 0.061547110425257914, 'nthread': 10, 'min_child_weight': 6, 'subsample': 0.3332350811823883, 'seed': 20, 'max_depth': 7, 'gamma': 0.49966763825465166}
Rerunning from 463 trials to 464 (+1) trials
654.76593399
2474.997754
Rerunning from 464 trials to 465 (+1) trials
752.932347059
1153.462

In [66]:
# Recorded result: a score followed by the parameter set that produced it
# (presumably a cross-validated MAE from the search above -- TODO confirm).
#1148.9639894 {'reg_alpha': 0.13614510960026047, 'colsample_bytree': 0.48613283826428166, 'scale_pos_weight': 1, 'learning_rate': 0.018415349849039475, 'nthread': 10, 'min_child_weight': 3, 'subsample': 0.8045758877474857, 'max_depth': 16, 'gamma': 3.738381824865164}
# NOTE: everything between the triple quotes below is a string literal, not
# live code -- running this cell trains nothing and does not assign `model`.
# The text is the disabled final-training step (fixed parameters, then
# xgb.train on the full training matrix).
'''RANDOM_STATE = 2016
params = {
    'min_child_weight': 1,
    'eta': 0.01,
    'colsample_bytree': 0.5,
    'max_depth': 12,
    'subsample': 0.8,
    'alpha': 1,
    'gamma': 1,
    'silent': 1,
    'verbose_eval': True,
    'seed': RANDOM_STATE,
    'nthread':10
}

xgtrain = xgb.DMatrix(X, label=y)
xgtest = xgb.DMatrix(X_test)

model = xgb.train(params, xgtrain, int(2012 / 0.9), feval=evalerror)'''

In [None]:
#model.save_model('1114.model')

In [None]:
# No live cell in this notebook assigns `model` (the training code is disabled
# and the save call is commented out), so on a fresh kernel fall back to the
# booster file that the commented-out save_model call names.
try:
    model
except NameError:
    model = xgb.Booster(model_file='1114.model')

# Undo the log(loss + shift) transform applied to the training target.
prediction = np.exp(model.predict(xgtest)) - shift

# Attach ids by position (.values) so the pairing does not depend on the row
# labels of `ids` happening to match the new frame's 0..n-1 index.
submission = pd.DataFrame({'loss': prediction, 'id': ids.values},
                          columns=['loss', 'id'])
submission.to_csv('sub_v.csv', index=False)

In [None]:
# Entry-point guard: prints a fixed string when the file is run as a script.
# Call-form print keeps the cell valid on Python 3 and prints the same text on
# Python 2.
if __name__ == '__main__':
    print("a+b")