In [13]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold,GridSearchCV
from sklearn.model_selection import cross_val_score, learning_curve
from sklearn.metrics import mean_absolute_error, make_scorer
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import matplotlib.pyplot as plt
import pickle
import hyperopt.pyll.stochastic
import time

In [14]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
test['loss'] = np.nan
joined = pd.concat([train, test])

In [15]:
features = [x for x in train.columns if x not in ['id','loss', 'log_loss']]

cat_features = [x for x in train.select_dtypes(
        include=['object']).columns if x not in ['id','loss', 'log_loss']]
num_features = [x for x in train.select_dtypes(
        exclude=['object']).columns if x not in ['id','loss', 'log_loss']]

print "Categorical features:", len(cat_features)
print "Numerical features:", len(num_features)

Categorical features: 116
Numerical features: 14


In [16]:
def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(preds), np.exp(labels)),False

def xg_eval_mae(yhat, dtrain):
    y = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(y), np.exp(yhat)),False

def mae_score(y_true, y_pred):
    return mean_absolute_error(np.exp(y_true), np.exp(y_pred))

mae_scorer = make_scorer(mae_score, greater_is_better=False)

In [17]:
for column in list(train.select_dtypes(include=['object']).columns):
    if train[column].nunique() != test[column].nunique():
        set_train = set(train[column].unique())
        set_test = set(test[column].unique())
        remove_train = set_train - set_test
        remove_test = set_test - set_train

        remove = (remove_train|remove_test)
        def filter_cat(x):
            if x in remove:
                return np.nan
            return x

        joined[column] = joined[column].apply(lambda x: 'np.nan' if x in remove else x, 1)

    joined[column] = pd.factorize(joined[column].values, sort=True,na_sentinel=-1)[0]
    joined[column] = joined[column] + 1

train = joined[joined['loss'].notnull()]
test = joined[joined['loss'].isnull()]

shift = 200
y = np.log(train['loss'] + shift)
ids = test['id']
X = train.drop(['loss', 'id'], 1)
X_test = test.drop(['loss', 'id'], 1)
lgtrain = lgb.Dataset(X, label=y,categorical_feature=cat_features,)
lgtest = lgb.Dataset(X_test)

In [21]:
time0= time.time()
space4lgb = {
    'max_depth': hp.choice('max_depth', range(3,29)),
    'min_child_weight':hp.choice('min_child_weight',range(2,100)),
    'num_leaves':hp.choice('num_leaves',range(50,400)),
    
    'min_split_gain':hp.choice('min_split_gain',[ 0.01 * i for i in range(0,100)]),
    'colsample_bytree':hp.choice('colsample_bytree',[ 0.01 * i for i in range(50,100)]),
    'subsample':hp.choice('subsample',[ 0.01 * i for i in range(50,100)]),
    
    'subsample_freq':hp.choice('subsample_freq',range(0,10)),
    
    'reg_alpha':hp.uniform('reg_alpha',0,1000),
    'reg_lambda':hp.uniform('reg_lambda',0,1000),
    
    'nthread': hp.choice('nthread', [10]),
}
minacc = 1512.60244356
def optf(params): 
    cv_result_lgb = lgb.cv(params, lgtrain, num_boost_round=100, nfold=3, 
                           stratified=False,categorical_feature=cat_features,
                    feval=evalerror, early_stopping_rounds=5) 
    acc = cv_result_lgb['mae-mean'][-1]
    print acc
    global minacc
    if acc < minacc:
        minacc = acc
        print 'new best:', minacc, params
        
        with open("lgbparams.txt", "a") as text_file:
            text_file.write('socre:{}, params:{}\n'.format(minacc,params))
    return {'loss': -acc, 'status': STATUS_OK}

In [None]:
def run_trials():

    trials_step = 1  # how many additional trials to do after loading saved trials. 1 = save after iteration
    max_trials = 1  # initial max_trials. put something small to not have to wait

    
    try:  # try to load an already saved trials object, and increase the max
        trials = pickle.load(open("/home/phe002/shicheng/dataset/lgbhyperopt", "rb"))
        max_trials = len(trials.trials) + trials_step
        print("Rerunning from {} trials to {} (+{}) trials".format(len(trials.trials), max_trials, trials_step))
    except:  # create a new trials object and start searching
        trials = Trials()

    best = fmin(fn=optf, space=space4lgb, algo=tpe.suggest, max_evals=max_trials, trials=trials)
    # save the trials object
    with open("/home/phe002/shicheng/dataset/lgbhyperopt", "wb") as f:
        pickle.dump(trials, f)

# loop indefinitely and stop whenever you like
while True:
    run_trials()



1270.19421331
new best: 1270.19421331 {'num_leaves': 344, 'reg_alpha': 854.9946380195793, 'subsample_freq': 7, 'colsample_bytree': 0.6900000000000001, 'verbose': 1, 'nthread': 10, 'min_child_weight': 39, 'min_split_gain': 0.22, 'subsample': 0.79, 'reg_lambda': 380.092886126646, 'max_bin': 255, 'categorical_column': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115], 'max_depth': 6}
Rerunning from 1 trials to 2 (+1) trials
1282.11649259
Rerunning from 2 trials to 3 (+1) trials
1198.12102817
new best: 1198.12102817 {'num_leaves': 367, 'reg_alpha': 102.31314041450234, 'subsamp

In [66]:
'''params = {
    'min_child_weight': 1,
    'eta': 0.01,
    'colsample_bytree': 0.5,
    'max_depth': 12,
    'subsample': 0.8,
    'alpha': 1,
    'gamma': 1,
    'silent': 1,
    'verbose_eval': True,
    'seed': RANDOM_STATE,
    'nthread':10
}

xgtrain = xgb.DMatrix(X, label=y)
xgtest = xgb.DMatrix(X_test)

model = xgb.train(params, xgtrain, int(2012 / 0.9), feval=evalerror)'''

In [None]:
prediction = np.exp(model.predict(xgtest)) - shift
submission = pd.DataFrame()
submission['loss'] = prediction
submission['id'] = ids
submission.to_csv('sub_v.csv', index=False)

In [None]:
if __name__ == '__main__':
    print "a+b"