In [1]:
import gc
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from scipy.stats import skew, boxcox
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
import itertools

In [2]:
# Offset added to `loss` before taking the log and subtracted again after exp()
# wherever values are mapped back to the original loss scale.
shift = 200

# Categorical columns that are combined pairwise into interaction features.
_COMB_FEATURE_CSV = 'cat80,cat87,cat57,cat12,cat79,cat10,cat7,cat89,cat2,cat72,cat81,cat11,cat1,cat13,cat9,cat3,cat16,cat90,cat23,cat36,cat73,cat103,cat40,cat28,cat111,cat6,cat76,cat50,cat5,cat4,cat14,cat38,cat24,cat82,cat25'
COMB_FEATURE = _COMB_FEATURE_CSV.split(',')

In [3]:
def encode(charcode):
    """Convert a letter code into an integer using bijective base 26.

    Each character contributes ``ord(ch) - ord('A') + 1`` (so 'A' -> 1,
    'B' -> 2, ...) and the string is read as a base-26 number with the
    leftmost character most significant: 'A' -> 1, 'Z' -> 26, 'AA' -> 27.

    Parameters
    ----------
    charcode : str or float
        The code to convert. Any float (in practice the NaN used for a
        missing category) is treated as "missing".

    Returns
    -------
    int or float
        The integer encoding, ``0`` for an empty string, or ``np.nan`` when
        ``charcode`` is a float.
    """
    # isinstance (rather than an exact type check) also catches float
    # subclasses such as numpy.float64, which would otherwise reach the loop
    # below and fail because they are not iterable.
    if isinstance(charcode, float):
        return np.nan

    # Horner accumulation: equivalent to summing digit * 26 ** position.
    code = 0
    for ch in charcode:
        code = code * 26 + (ord(ch) - ord('A') + 1)
    return code

In [4]:
def logregobj(preds, dtrain):
    """Custom training objective: return per-row gradient and hessian.

    With residual ``x = preds - labels`` and constant ``c = 2`` this returns

        grad = c * x / (|x| + c)
        hess = c**2 / (|x| + c)**2

    which is the first and second derivative of the "Fair" robust loss.

    Parameters
    ----------
    preds : array-like
        Current predictions.
    dtrain : object exposing ``get_label()``
        Source of the target values (passed in by ``xgb.train``).

    Returns
    -------
    tuple
        ``(grad, hess)``.
    """
    fair_c = 2
    residual = preds - dtrain.get_label()
    denom = np.abs(residual) + fair_c
    grad = fair_c * residual / denom
    hess = fair_c ** 2 / denom ** 2
    return grad, hess

In [5]:
def xg_eval_mae(yhat, dtrain):
    """Evaluation metric: mean absolute error on the original loss scale.

    Both the labels from ``dtrain.get_label()`` and the predictions ``yhat``
    are mapped back with ``exp(value) - shift`` before the error is computed.

    Returns
    -------
    tuple
        ``('mae', value)``, the (name, score) pair passed as ``feval``.
    """
    actual_loss = np.exp(dtrain.get_label()) - shift
    predicted_loss = np.exp(yhat) - shift
    return 'mae', mean_absolute_error(actual_loss, predicted_loss)

In [6]:
def mungeskewed(train, test, numeric_feats):
    """Stack train and test and Box-Cox transform the skewed numeric columns.

    Skewness is measured on ``train`` only; every column in ``numeric_feats``
    whose skew exceeds 0.25 is shifted by +1 and Box-Cox transformed on the
    combined train+test frame. The per-column skew is printed.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame; must contain the columns in ``numeric_feats``.
    test : pandas.DataFrame
        Test frame. It is NOT modified: a copy with a placeholder ``loss``
        column of zeros is stacked under ``train``.
    numeric_feats : list of str
        Names of the numeric columns to inspect.

    Returns
    -------
    (pandas.DataFrame, int)
        The combined frame (train rows first, index reset) and the number of
        training rows, so the caller can split it again.
    """
    ntrain = train.shape[0]
    # assign() returns a new frame, so the caller's `test` keeps its columns.
    test = test.assign(loss=0)
    train_test = pd.concat((train, test)).reset_index(drop=True)

    # compute skew and do Box-Cox transformation (Tilli)
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))
    print("\nSkew in numeric features:")
    print(skewed_feats)
    skewed_feats = skewed_feats[skewed_feats > 0.25].index

    for feats in skewed_feats:
        # +1 before the transform; boxcox requires strictly positive input.
        # The fitted lambda is not needed afterwards.
        train_test[feats], _ = boxcox(train_test[feats] + 1)
    return train_test, ntrain

In [7]:
print('Started')
train = pd.read_csv(r'D:\allstate\train.csv.zip')
test = pd.read_csv(r'D:\allstate\test.csv.zip')

# Candidate feature columns: everything except the first and the last column
# of the training frame (presumably the row id and the target - verify).
feature_columns = train.columns[1:-1]
numeric_feats = [name for name in feature_columns if 'cont' in name]
cats = [name for name in feature_columns if 'cat' in name]

# Combined train+test frame plus the number of training rows for re-splitting.
train_test, ntrain = mungeskewed(train, test, numeric_feats)

Started

Skew in numeric features:
cont1     0.516420
cont2    -0.310939
cont3    -0.010002
cont4     0.416093
cont5     0.681617
cont6     0.461211
cont7     0.826046
cont8     0.676629
cont9     1.072420
cont10    0.354998
cont11    0.280819
cont12    0.291990
cont13    0.380739
cont14    0.248672
dtype: float64


In [8]:
# Blank out (set to NaN) every categorical level that does not occur in BOTH
# train and test, so no level is seen on only one side.
#
# The level *sets* are compared directly. Comparing only the number of distinct
# levels would skip a column whose train and test sets have equal size but
# different members.
for column in list(train.select_dtypes(include=['object']).columns):
    # Symmetric difference = levels present in exactly one of the two frames.
    remove = set(train[column].unique()) ^ set(test[column].unique())
    if remove:
        unshared = train_test[column].isin(remove)
        # where() keeps values where the condition holds and writes NaN elsewhere.
        train_test[column] = train_test[column].where(~unshared)

In [9]:
# Hand-picked per-column transforms of the continuous features.

# Min-max scale to [0, 1], then square root.
for col in ("cont1", "cont4", "cont5", "cont8", "cont10", "cont11", "cont12"):
    train_test[col] = np.sqrt(preprocessing.minmax_scale(train_test[col]))

# Min-max scale to [0, 1], then log with a 0.1 offset so zero stays finite.
for col in ("cont6", "cont7", "cont9", "cont13"):
    train_test[col] = np.log(preprocessing.minmax_scale(train_test[col]) + 0.1)

# cont14: subtract a fixed offset, clip at zero, rescale, take the fourth root.
# NOTE(review): 0.179722 and 0.665122 are unexplained constants (possibly the
# column's observed min and range) - confirm their origin.
train_test["cont14"] = (np.maximum(train_test["cont14"] - 0.179722, 0) / 0.665122) ** 0.25

In [10]:
    # Build one interaction feature per unordered pair of COMB_FEATURE columns:
    # concatenate the two category strings row-wise and integer-encode the
    # result. A NaN on either side yields NaN, which encode() passes through.
    # NOTE(review): this cell is indented one level; IPython appears to dedent
    # such cells, but a plain .py export would not run as-is.
    for left, right in itertools.combinations(COMB_FEATURE, 2):
        feat = left + "_" + right
        train_test[feat] = (train_test[left] + train_test[right]).apply(encode)
        print(feat)

cat80_cat87
cat80_cat57
cat80_cat12
cat80_cat79
cat80_cat10
cat80_cat7
cat80_cat89
cat80_cat2
cat80_cat72
cat80_cat81
cat80_cat11
cat80_cat1
cat80_cat13
cat80_cat9
cat80_cat3
cat80_cat16
cat80_cat90
cat80_cat23
cat80_cat36
cat80_cat73
cat80_cat103
cat80_cat40
cat80_cat28
cat80_cat111
cat80_cat6
cat80_cat76
cat80_cat50
cat80_cat5
cat80_cat4
cat80_cat14
cat80_cat38
cat80_cat24
cat80_cat82
cat80_cat25
cat87_cat57
cat87_cat12
cat87_cat79
cat87_cat10
cat87_cat7
cat87_cat89
cat87_cat2
cat87_cat72
cat87_cat81
cat87_cat11
cat87_cat1
cat87_cat13
cat87_cat9
cat87_cat3
cat87_cat16
cat87_cat90
cat87_cat23
cat87_cat36
cat87_cat73
cat87_cat103
cat87_cat40
cat87_cat28
cat87_cat111
cat87_cat6
cat87_cat76
cat87_cat50
cat87_cat5
cat87_cat4
cat87_cat14
cat87_cat38
cat87_cat24
cat87_cat82
cat87_cat25
cat57_cat12
cat57_cat79
cat57_cat10
cat57_cat7
cat57_cat89
cat57_cat2
cat57_cat72
cat57_cat81
cat57_cat11
cat57_cat1
cat57_cat13
cat57_cat9
cat57_cat3
cat57_cat16
cat57_cat90
cat57_cat23
cat57_cat36
cat57_cat

In [11]:
# Integer-encode the original categorical columns (names taken from the raw
# training frame, before `train` is rebound below).
cats = [name for name in train.columns[1:-1] if 'cat' in name]
for col in cats:
    train_test[col] = train_test[col].apply(encode)

# Target is modelled on the log(loss + shift) scale.
train_test['loss'] = np.log(train_test['loss'] + shift)

# Standardise the continuous features, fitted on train and test together.
ss = StandardScaler()
train_test[numeric_feats] = ss.fit_transform(train_test[numeric_feats].values)

# Split back into train / test; the test rows only carried a placeholder loss.
train = train_test.iloc[:ntrain, :].copy()
test = train_test.iloc[ntrain:, :].drop('loss', axis=1)

In [12]:
# `train.loss` is already on the log(loss + shift) scale at this point.
print('Median Loss:', train['loss'].median())
print('Mean Loss:', train['loss'].mean())

# Booster settings handed to xgb.train().
xgb_params = {
    'seed': 0,
    # fraction of columns / rows sampled
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.03,
    # NOTE(review): the training calls pass obj=logregobj, which presumably
    # takes precedence over this built-in objective - confirm for the
    # installed xgboost version.
    'objective': 'reg:linear',
    'max_depth': 12,
    'min_child_weight': 100,
    'booster': 'gbtree',
}

Median Loss: 7.747411156506141
Mean Loss: 7.799837008871419


In [13]:
# Train the booster(s) and collect one column of test-set predictions per
# model in `allpredictions` (values stay on the log(loss + shift) scale).

# Upper bound on boosting rounds; early stopping (25 rounds without
# improvement) ends training sooner.
# Original note: "640 score from above commented out code (Faron)" - the code
# it refers to is not part of this notebook.
best_nrounds = 20000
allpredictions = pd.DataFrame()
# NOTE(review): the original comment here read "10 folds is better!" while the
# value is 15 - confirm which fold count is intended.
kfolds = 15

if kfolds > 1:
    # No shuffle argument is passed to KFold.
    kf = KFold(n_splits=kfolds)
    for i, (train_index, test_index) in enumerate(kf.split(train)):
        # Test matrix is rebuilt every fold because it is deleted at the end of
        # the iteration; the first column of `test` is skipped (presumably the
        # row id - verify).
        dtest = xgb.DMatrix(test[test.columns[1:]])
        print('Fold {0}'.format(i + 1))
        X_train, X_val = train.iloc[train_index], train.iloc[test_index]
        # Feature columns: every column whose name does not contain 'loss',
        # minus the first one (mirrors the column skipped for dtest above).
        cols_ = [x for x in X_train.columns if 'loss' not in x][1:]
        dtrain = xgb.DMatrix(X_train[cols_], label=X_train.loss)
        dvalid = xgb.DMatrix(X_val[cols_], label=X_val.loss)
        # Early stopping follows the last watchlist entry ('eval'), as the
        # training log states.
        watchlist = [(dtrain, 'train'), (dvalid, 'eval')]

        # Custom objective (logregobj) and custom metric (xg_eval_mae, lower
        # is better); progress is printed every 50 rounds.
        gbdt = xgb.train(xgb_params, dtrain, best_nrounds, watchlist,
                        obj=logregobj,
                        feval=xg_eval_mae, maximize=False,
                        verbose_eval=50,
                        early_stopping_rounds=25)
        # Release the fold matrices before predicting.
        del dtrain
        del dvalid
        gc.collect()
        # Predict with the tree count reported as best by early stopping.
        # Columns are named p0 .. p{kfolds-1}.
        allpredictions['p' + str(i)] = gbdt.predict(dtest, ntree_limit=gbdt.best_ntree_limit)
        del dtest
        del gbdt
        gc.collect()
        
else:
    # Single-model path: train on the full training frame.
    cols_ = [x for x in train.columns if 'loss' not in x][1:]

    dtest = xgb.DMatrix(test[cols_].values)
    dtrain = xgb.DMatrix(train[cols_].values, label=train.loss)
    # NOTE(review): the 'eval' set is the training data itself, so early
    # stopping here monitors training error, not held-out error.
    watchlist = [(dtrain, 'train'), (dtrain, 'eval')]
    gbdt = xgb.train(xgb_params, dtrain, best_nrounds, watchlist,
                    obj=logregobj,
                    feval=xg_eval_mae, maximize=False,
                    verbose_eval=50, early_stopping_rounds=25)
    
    # Stored as 'p1'; the submission cell reads allpredictions.p1 on this path.
    allpredictions['p1'] = gbdt.predict(dtest, ntree_limit=gbdt.best_ntree_limit)
    del dtrain
    del dtest
    del gbdt
    gc.collect()

print(allpredictions.head())

Fold 1
[0]	train-mae:3232.53	eval-mae:3236.54
Multiple eval metrics have been passed: 'eval-mae' will be used for early stopping.

Will train until eval-mae hasn't improved in 25 rounds.
[50]	train-mae:1547.94	eval-mae:1554.77
[100]	train-mae:1149.69	eval-mae:1187.93
[150]	train-mae:1100.82	eval-mae:1155.86
[200]	train-mae:1083.45	eval-mae:1148.51
[250]	train-mae:1070.96	eval-mae:1144.92
[300]	train-mae:1061.16	eval-mae:1142.34
[350]	train-mae:1052.27	eval-mae:1141.21
[400]	train-mae:1044.78	eval-mae:1140.04
[450]	train-mae:1036.95	eval-mae:1138.95
[500]	train-mae:1030.1	eval-mae:1138.05
[550]	train-mae:1022.53	eval-mae:1137.72
[600]	train-mae:1016.02	eval-mae:1137.09
[650]	train-mae:1009.98	eval-mae:1136.73
Stopping. Best iteration:
[653]	train-mae:1009.78	eval-mae:1136.63

Fold 2
[0]	train-mae:3231.37	eval-mae:3252.7
Multiple eval metrics have been passed: 'eval-mae' will be used for early stopping.

Will train until eval-mae hasn't improved in 25 rounds.
[50]	train-mae:1546.59	eval-

In [14]:
# Write the submission file(s). Predictions are on the log(loss + shift) scale,
# so each is mapped back with exp(...) - shift before being stored in the
# second column of the sample submission.
submission = pd.read_csv(r'D:\allstate\sample_submission.csv.zip')
if kfolds > 1:
    # Two blends of the per-fold predictions: row-wise mean, then row-wise median.
    fold_blends = (
        (allpredictions.mean, 'xgbmeansubmission2.csv'),
        (allpredictions.median, 'xgbmediansubmission2.csv'),
    )
    for aggregate, out_name in fold_blends:
        submission.iloc[:, 1] = np.exp(aggregate(axis=1).values) - shift
        submission.to_csv(out_name, index=None)
else:
    # Single-model run: only column 'p1' exists.
    submission.iloc[:, 1] = np.exp(allpredictions.p1.values) - shift
    submission.to_csv('xgbsubmission2.csv', index=None)
print('Finished')

Finished
