In [1]:
import numpy as np
import pandas as pd
import xgboost as xgb

from datetime import datetime
from sklearn.metrics import mean_absolute_error
from sklearn.cross_validation import KFold
from scipy.stats import skew, boxcox
from sklearn.preprocessing import StandardScaler
import itertools



In [2]:
shift = 200
COMB_FEATURE = 'cat80,cat87,cat57,cat12,cat79,cat10,cat7,cat89,cat2,cat72,' \
               'cat81,cat11,cat1,cat13,cat9,cat3,cat16,cat90,cat23,cat36,' \
               'cat73,cat103,cat40,cat28,cat111,cat6,cat76,cat50,cat5,' \
               'cat4,cat14,cat38,cat24,cat82,cat25'.split(',')

In [4]:
len(COMB_FEATURE)

35

In [5]:
def encode(charcode):
    r = 0
    ln = len(str(charcode))
    for i in range(ln):
        r += (ord(str(charcode)[i]) - ord('A') + 1) * 26 ** (ln - i - 1)
    return r

fair_constant = 2
def fair_obj(preds, dtrain):
    labels = dtrain.get_label()
    x = (preds - labels)
    den = abs(x) + fair_constant
    grad = fair_constant * x / (den)
    hess = fair_constant * fair_constant / (den * den)
    return grad, hess

def xg_eval_mae(yhat, dtrain):
    y = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(y)-shift, np.exp(yhat)-shift)

def mungeskewed(train, test, numeric_feats):
    ntrain = train.shape[0]
    test['loss'] = 0
    train_test = pd.concat((train, test)).reset_index(drop=True)
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))
    skewed_feats = skewed_feats[skewed_feats > 0.25]
    skewed_feats = skewed_feats.index

    for feats in skewed_feats:
        train_test[feats] = train_test[feats] + 1
        train_test[feats], lam = boxcox(train_test[feats])
    return train_test, ntrain

In [14]:
print('\nStarted')
train = pd.read_csv(r'D:\allstate\train.csv.zip')
test = pd.read_csv(r'D:\allstate\test.csv.zip')


Started


In [15]:
numeric_feats = [x for x in train.columns[1:-1] if 'cont' in x]
categorical_feats = [x for x in train.columns[1:-1] if 'cat' in x]
train_test, ntrain = mungeskewed(train, test, numeric_feats)

In [21]:
print('')
for comb in itertools.combinations(COMB_FEATURE, 2):
    feat = comb[0] + "_" + comb[1]
    train_test[feat] = train_test[comb[0]] + train_test[comb[1]]
    train_test[feat] = train_test[feat].apply(encode)
    print('Analyzing Columns:', feat)


Analyzing Columns: cat80_cat87
Analyzing Columns: cat80_cat57
Analyzing Columns: cat80_cat12
Analyzing Columns: cat80_cat79
Analyzing Columns: cat80_cat10
Analyzing Columns: cat80_cat7
Analyzing Columns: cat80_cat89
Analyzing Columns: cat80_cat2
Analyzing Columns: cat80_cat72
Analyzing Columns: cat80_cat81
Analyzing Columns: cat80_cat11
Analyzing Columns: cat80_cat1
Analyzing Columns: cat80_cat13
Analyzing Columns: cat80_cat9
Analyzing Columns: cat80_cat3
Analyzing Columns: cat80_cat16
Analyzing Columns: cat80_cat90
Analyzing Columns: cat80_cat23
Analyzing Columns: cat80_cat36
Analyzing Columns: cat80_cat73
Analyzing Columns: cat80_cat103
Analyzing Columns: cat80_cat40
Analyzing Columns: cat80_cat28
Analyzing Columns: cat80_cat111
Analyzing Columns: cat80_cat6
Analyzing Columns: cat80_cat76
Analyzing Columns: cat80_cat50
Analyzing Columns: cat80_cat5
Analyzing Columns: cat80_cat4
Analyzing Columns: cat80_cat14
Analyzing Columns: cat80_cat38
Analyzing Columns: cat80_cat24
Analyzing Col

In [24]:
# не надо все перебирать 
categorical_feats = [x for x in train_test.columns[1:] if 'cat' in x]

In [25]:
print('')
for col in categorical_feats:
    print('Analyzing Column:', col)
    train_test[col] = train_test[col].apply(encode)


Analyzing Column: cat1
Analyzing Column: cat2
Analyzing Column: cat3
Analyzing Column: cat4
Analyzing Column: cat5
Analyzing Column: cat6
Analyzing Column: cat7
Analyzing Column: cat8
Analyzing Column: cat9
Analyzing Column: cat10
Analyzing Column: cat11
Analyzing Column: cat12
Analyzing Column: cat13
Analyzing Column: cat14
Analyzing Column: cat15
Analyzing Column: cat16
Analyzing Column: cat17
Analyzing Column: cat18
Analyzing Column: cat19
Analyzing Column: cat20
Analyzing Column: cat21
Analyzing Column: cat22
Analyzing Column: cat23
Analyzing Column: cat24
Analyzing Column: cat25
Analyzing Column: cat26
Analyzing Column: cat27
Analyzing Column: cat28
Analyzing Column: cat29
Analyzing Column: cat30
Analyzing Column: cat31
Analyzing Column: cat32
Analyzing Column: cat33
Analyzing Column: cat34
Analyzing Column: cat35
Analyzing Column: cat36
Analyzing Column: cat37
Analyzing Column: cat38
Analyzing Column: cat39
Analyzing Column: cat40
Analyzing Column: cat41
Analyzing Column: cat42


In [28]:
ss = StandardScaler()
train_test[numeric_feats] = ss.fit_transform(train_test[numeric_feats].values)

In [29]:
train = train_test.iloc[:ntrain, :].copy()
test = train_test.iloc[ntrain:, :].copy()
print('\nMedian Loss:', train.loss.median())
print('Mean Loss:', train.loss.mean())


Median Loss: 2115.5699999999997
Mean Loss: 3037.3376856699792


In [31]:
ids = pd.read_csv(r'D:\allstate\test.csv.zip')['id']
train_y = np.log(train['loss'] + shift)
train_x = train.drop(['loss','id'], axis=1)
test_x = test.drop(['loss','id'], axis=1)

In [35]:
n_folds = 10
cv_sum = 0
early_stopping = 100
fpred = []
xgb_rounds = []

In [36]:
d_train_full = xgb.DMatrix(train_x, label=train_y)
d_test = xgb.DMatrix(test_x)

In [41]:
kf = KFold(train.shape[0], n_folds=n_folds)
for i, (train_index, test_index) in enumerate(kf):
    print('\n Fold %d' % (i+1))
    X_train, X_val = train_x.iloc[train_index], train_x.iloc[test_index]
    y_train, y_val = train_y.iloc[train_index], train_y.iloc[test_index]
    rand_state = 42
    params = {
        'seed': 0,
        'colsample_bytree': 0.7,
        'silent': 1,
        'subsample': 0.7,
        'learning_rate': 0.03,
        'objective': 'reg:linear',
        'max_depth': 12,
        'min_child_weight': 100,
        'booster': 'gbtree'
    }
    d_train = xgb.DMatrix(X_train, label=y_train)
    d_valid = xgb.DMatrix(X_val, label=y_val)
    watchlist = [(d_train, 'train'), (d_valid, 'eval')]
    clf = xgb.train(params, d_train, 100000, watchlist, early_stopping_rounds=50, obj=fair_obj, feval=xg_eval_mae)

    xgb_rounds.append(clf.best_iteration)
    scores_val = clf.predict(d_valid, ntree_limit=clf.best_ntree_limit)
    cv_score = mean_absolute_error(np.exp(y_val), np.exp(scores_val))
    
    print('eval-MAE: %.6f' % cv_score)
    y_pred = np.exp(clf.predict(d_test, ntree_limit=clf.best_ntree_limit)) - shift

    if i > 0:
        fpred = pred + y_pred
    else:
        fpred = y_pred
    pred = fpred
    cv_sum = cv_sum + cv_score


 Fold 1
[0]	train-mae:3231.96	eval-mae:3240.29
Multiple eval metrics have been passed: 'eval-mae' will be used for early stopping.

Will train until eval-mae hasn't improved in 50 rounds.
[1]	train-mae:3226.58	eval-mae:3234.91
[2]	train-mae:3217.99	eval-mae:3226.32
[3]	train-mae:3205.73	eval-mae:3214.04
[4]	train-mae:3189.52	eval-mae:3197.8
[5]	train-mae:3169.31	eval-mae:3177.55
[6]	train-mae:3145.22	eval-mae:3153.39
[7]	train-mae:3117.44	eval-mae:3125.53
[8]	train-mae:3086.25	eval-mae:3094.26
[9]	train-mae:3052.11	eval-mae:3059.99
[10]	train-mae:3015.33	eval-mae:3023.06
[11]	train-mae:2976.15	eval-mae:2983.69
[12]	train-mae:2935.14	eval-mae:2942.52
[13]	train-mae:2892.62	eval-mae:2899.82
[14]	train-mae:2848.82	eval-mae:2855.82
[15]	train-mae:2804.07	eval-mae:2810.79
[16]	train-mae:2758.41	eval-mae:2764.86
[17]	train-mae:2712.44	eval-mae:2718.62
[18]	train-mae:2665.91	eval-mae:2671.81
[19]	train-mae:2619.29	eval-mae:2624.96
[20]	train-mae:2572.92	eval-mae:2578.35
[21]	train-mae:2526.4

In [42]:
mpred = pred / n_folds
score = cv_sum / n_folds
print('Average eval-MAE: %.6f' % score)
n_rounds = int(np.mean(xgb_rounds))

print("Writing results")
result = pd.DataFrame(mpred, columns=['loss'])
result["id"] = ids

result = result.set_index("id")
print("%d-fold average prediction:" % n_folds)

now = datetime.now()
score = str(round((cv_sum / n_folds), 6))
sub_file = 'output/submission_5fold-average-xgb_fairobj_' + str(score) + '_' + str(now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
print("Writing submission: %s" % sub_file)
result.to_csv(sub_file, index=True, index_label='id')

Average eval-MAE: 1131.430646
Writing results
10-fold average prediction:
Writing submission: output/submission_5fold-average-xgb_fairobj_1131.430646_2016-11-24-04-41.csv


FileNotFoundError: [Errno 2] No such file or directory: 'output/submission_5fold-average-xgb_fairobj_1131.430646_2016-11-24-04-41.csv'