In [11]:
__author__ = 'Vladimir Iglovikov'

import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score, learning_curve
from sklearn.metrics import mean_absolute_error, make_scorer
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import pickle
import hyperopt.pyll.stochastic
import time

In [12]:
%ls

1114.model                                                my_model.hyperopt
Santhosh SharmaExploratory study on ML algorithms?.ipynb  params.txt
Vladimir Iglovikovxgb 1114?.py                            sub_v.csv
[0m[01;34mallstate_capstone-master[0m/                                 submission.csv
handtune.ipynb                                            test.csv
hyperopt.ipynb                                            train.csv


In [13]:
%cat params.txt

socre:1156.8227052, params:{'reg_alpha': 0.26995003848906524, 'colsample_bytree': 0.2266549861816051, 'scale_pos_weight': 1, 'learning_rate': 0.05344032085694122, 'nthread': 10, 'min_child_weight': 4, 'subsample': 0.48379887781040276, 'seed': 20, 'max_depth': 16, 'gamma': 0.402588613598941}socre:1149.3825198, params:{'reg_alpha': 0.9999978085959014, 'colsample_bytree': 0.2502484001654734, 'scale_pos_weight': 1, 'learning_rate': 0.03509717328153762, 'nthread': 10, 'min_child_weight': 4, 'subsample': 0.6409135720554807, 'seed': 20, 'max_depth': 16, 'gamma': 0.1306661498079924}

In [14]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
test['loss'] = np.nan
joined = pd.concat([train, test])

In [15]:
def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(preds), np.exp(labels))

In [None]:
for column in list(train.select_dtypes(include=['object']).columns):
    if train[column].nunique() != test[column].nunique():
        set_train = set(train[column].unique())
        set_test = set(test[column].unique())
        remove_train = set_train - set_test
        remove_test = set_test - set_train

        remove = (remove_train|remove_test)
        def filter_cat(x):
            if x in remove:
                return np.nan
            return x

        joined[column] = joined[column].apply(lambda x: np.nan if x in remove else x, 1)

    joined[column] = pd.factorize(joined[column].values, sort=True)[0]

train = joined[joined['loss'].notnull()]
test = joined[joined['loss'].isnull()]

shift = 200
y = np.log(train['loss'] + shift)
ids = test['id']
X = train.drop(['loss', 'id'], 1)
X_test = test.drop(['loss', 'id'], 1)
xgtrain = xgb.DMatrix(X, label=y)
xgtest = xgb.DMatrix(X_test)

In [None]:
time0= time.time()
cv = StratifiedKFold(n_splits=5,random_state=42)
space4xgb = {
    'max_depth': hp.choice('max_depth', range(3,20)),
    'learning_rate': hp.loguniform('learning_rate', -2.5*np.log(10), -1*np.log(10)),
    'min_child_weight':hp.choice('min_child_weight',range(1,7)),
    'scale_pos_weight': hp.choice('scale_pos_weight', [1]), 
    'reg_alpha': hp.uniform('reg_alpha',0.1,1),
    'gamma': hp.uniform('gamma',0,0.5),
    
    'subsample': hp.uniform('subsample',0.1,0.9),
    'colsample_bytree': hp.uniform('colsample_bytree',0.1,1),
    
    'nthread': hp.choice('nthread', [10])
}
minacc=1160
#{'reg_alpha': 0.8673861702351379, 'colsample_bytree': 0.8759854865376723, 'scale_pos_weight': 1, 'learning_rate': 0.06925571303753118, 'nthread': 20, 'min_child_weight': 1, 'subsample': 0.629375884543008, 'max_depth': 7, 'gamma': 0.19254932932610697}
def optf(params): 
    params['nthread']=10
    params['seed']=20
    #clf = xgb.XGBRegressor(**params)
    #acc = cross_val_score(clf, X, y,cv=cv,n_jobs=1,scoring=evalerror).mean()
    bst = xgb.cv(params, xgtrain, num_boost_round=500, nfold=5, seed=7, 
                    feval=evalerror,early_stopping_rounds=10)
    acc = bst.iloc[-1,:]['test-mae-mean']
    print time.time()-time0
    print acc
    global minacc
    if acc < minacc:
        minacc = acc
        print 'new best:', minacc, params
        
        with open("params.txt", "a") as text_file:
            text_file.write('socre:{}, params:{}\n'.format(minacc,params))
    return {'loss': -acc, 'status': STATUS_OK}

In [None]:
def run_trials():

    trials_step = 1  # how many additional trials to do after loading saved trials. 1 = save after iteration
    max_trials = 1  # initial max_trials. put something small to not have to wait

    
    try:  # try to load an already saved trials object, and increase the max
        trials = pickle.load(open("/home/phe002/shicheng/kaggle/Allstate Claims Severity/my_model.hyperopt", "rb"))
        max_trials = len(trials.trials) + trials_step
        print("Rerunning from {} trials to {} (+{}) trials".format(len(trials.trials), max_trials, trials_step))
    except:  # create a new trials object and start searching
        trials = Trials()

    best = fmin(fn=optf, space=space4xgb, algo=tpe.suggest, max_evals=max_trials, trials=trials)
    # save the trials object
    with open("my_model.hyperopt", "wb") as f:
        pickle.dump(trials, f)

# loop indefinitely and stop whenever you like
while True:
    run_trials()

Rerunning from 353 trials to 354 (+1) trials
205.044909954
2091.0032714
Rerunning from 354 trials to 355 (+1) trials
357.92079401
2469.3193358
Rerunning from 355 trials to 356 (+1) trials
488.391077042
1677.0664306
Rerunning from 356 trials to 357 (+1) trials
715.206483126
1790.186914
Rerunning from 357 trials to 358 (+1) trials
832.859982967
1161.454956
Rerunning from 358 trials to 359 (+1) trials
1010.66010094
2241.1940432
Rerunning from 359 trials to 360 (+1) trials
1144.06762505
2581.3344236
Rerunning from 360 trials to 361 (+1) trials
1330.87800407
1161.4091552
Rerunning from 361 trials to 362 (+1) trials
1460.34931517
1156.8227052
new best: 1156.8227052 {'reg_alpha': 0.26995003848906524, 'colsample_bytree': 0.2266549861816051, 'scale_pos_weight': 1, 'learning_rate': 0.05344032085694122, 'nthread': 10, 'min_child_weight': 4, 'subsample': 0.48379887781040276, 'seed': 20, 'max_depth': 16, 'gamma': 0.402588613598941}
Rerunning from 362 trials to 363 (+1) trials
1735.93543005
1924.710

In [66]:
#1148.9639894 {'reg_alpha': 0.13614510960026047, 'colsample_bytree': 0.48613283826428166, 'scale_pos_weight': 1, 'learning_rate': 0.018415349849039475, 'nthread': 10, 'min_child_weight': 3, 'subsample': 0.8045758877474857, 'max_depth': 16, 'gamma': 3.738381824865164}
'''RANDOM_STATE = 2016
params = {
    'min_child_weight': 1,
    'eta': 0.01,
    'colsample_bytree': 0.5,
    'max_depth': 12,
    'subsample': 0.8,
    'alpha': 1,
    'gamma': 1,
    'silent': 1,
    'verbose_eval': True,
    'seed': RANDOM_STATE,
    'nthread':10
}

xgtrain = xgb.DMatrix(X, label=y)
xgtest = xgb.DMatrix(X_test)

model = xgb.train(params, xgtrain, int(2012 / 0.9), feval=evalerror)'''

In [None]:
#model.save_model('1114.model')

In [None]:
prediction = np.exp(model.predict(xgtest)) - shift
submission = pd.DataFrame()
submission['loss'] = prediction
submission['id'] = ids
submission.to_csv('sub_v.csv', index=False)

In [None]:
if __name__ == '__main__':
    print "a+b"