# Rating
1. hd = 0.744 lb = 0.76195 agg_custom
2. hd = 0.747 lb = 0.76741 aggregate
3. hd = 0.756 lb = 0.78332 agg_petrovich
4. hd = 0.746 lb = ... aggregate_modify

In [20]:
# Data-scientist prays like this:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
from utils import *
from tqdm import tqdm_notebook

from scipy import stats
from scipy.sparse import coo_matrix, hstack

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier

import xgboost as xgb

from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

# Load and prepare data

### use for first time

In [None]:
# First-time path: call load_data with resave=True.
# NOTE(review): presumably this rebuilds and re-saves the prepared data — confirm in utils.
train, test = load_data(resave=True, points=True, sort=True)

### load here

In [13]:
# Regular path: same call with resave=False.
# NOTE(review): presumably this reads the previously saved data — confirm in utils.
train, test = load_data(resave=False, points=True, sort=True)

In [14]:
# calculate_target returns a pair that is unpacked into X_train / y_train
# (presumably the feature frame and the target — confirm in utils).
X_train, y_train = calculate_target(train, offset=0)
# The test frame is used unchanged.
X_test = test
# Display both shapes as the cell output.
X_train.shape, X_test.shape

((4480835, 17), (4751035, 16))

### Add additional data

In [4]:
# Left-join the R_Quartile / F_Quartile columns from rfm-segments.csv onto
# both frames (frame `id` matched against the file's `customer`), then drop
# the join key so only the two new columns remain.
rfm = pd.read_csv('./rfm-segments.csv')
new_columns = rfm[['customer', 'R_Quartile', 'F_Quartile']]
X_train = (
    X_train
    .merge(new_columns, how='left', left_on='id', right_on='customer')
    .drop('customer', axis=1)
)
X_test = (
    X_test
    .merge(new_columns, how='left', left_on='id', right_on='customer')
    .drop('customer', axis=1)
)
X_train.shape, X_test.shape

((4480835, 19), (4751035, 18))

In [15]:
# Start from the categorical feature names supplied by get_cat_features()
# and extend them with three more columns.
cat_features_name = get_cat_features()
# NOTE(review): `+=` extends the returned list in place; if get_cat_features()
# hands back a shared list, re-running this cell would append duplicates — confirm.
cat_features_name += ['rich_category', 'R_Quartile', 'F_Quartile']
print(cat_features_name)

['n_tr', 'code_azs', 'location', 'region', 'code', 'code1', 'type', 'month', 'weekday', 'rich_category', 'R_Quartile', 'F_Quartile']


In [16]:
# Label-encode every categorical column of X_train / X_test in place.
for col in tqdm_notebook(X_train.columns):
    if col not in cat_features_name:
        continue
    # 'code' receives a string placeholder, every other column a numeric sentinel.
    fill_value = '-' if col == 'code' else -99999
    # Assign the filled column back: `fillna(..., inplace=True)` on a column
    # selection may operate on a temporary copy and leave the frame untouched.
    X_train[col] = X_train[col].fillna(fill_value)
    X_test[col] = X_test[col].fillna(fill_value)
    # Fit on train and test together so values that occur only in the test
    # frame can still be transformed. `pd.concat` replaces `Series.append`,
    # which is no longer available in current pandas.
    le = LabelEncoder().fit(pd.concat([X_train[col], X_test[col]]))
    X_train[col] = le.transform(X_train[col])
    X_test[col] = le.transform(X_test[col])

A Jupyter Widget




# Aggregate function

In [17]:
def aggregate(df, take_values=True):
    """Aggregate row-level records into one row of statistics per ``id``.

    The frame is grouped by its ``id`` column and each selected column is
    reduced with the aggregators listed in the mapping below.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``id`` and every column named in the selection list.
    take_values : bool, default True
        If True, return ``(values, index)`` of the aggregated frame;
        otherwise return the aggregated DataFrame itself.

    Returns
    -------
    tuple or pandas.DataFrame
        ``(res.values, res.index)`` when ``take_values`` is true, else ``res``.
    """
    # Kept as lambdas on purpose: pandas derives the output column label from
    # the aggregator's ``__name__``, so named functions would rename columns.
    # ``np.atleast_1d`` accepts both an array-valued and a scalar ``.mode``
    # result, which differs between SciPy releases.
    mode = lambda x: np.atleast_1d(stats.mode(x).mode)[0]
    # Sum over rows of (previous value - current value), aligned on the index.
    simple_trend = lambda vec: np.sum(vec.shift(1) - vec[1:])

    num_features = ['min', 'max', 'median', 'sum', simple_trend]

    columns = [
        'code', 'code1', 'code_azs', 'cur_points', 'first_prch_num', 'location',
        'oil_price', 'percent', 'q', 'region', 'return_num',
        'rich_category', 'sum_b', 'total_user_spend',
        'type', 'user_spend_fuel', 'v_l', 'month', 'weekday', 'true_percent',
        'time_weight', 'R_Quartile', 'F_Quartile',
    ]

    res = df.groupby('id')[columns].agg({
        'code': [unique_cnt, mode],
        'code1': [unique_cnt, mode],
        'code_azs': [unique_cnt, mode],
        'cur_points': num_features,
        'first_prch_num': 'max',
        'location': [unique_cnt, mode],
        'oil_price': ['min', 'max'],
        'percent': num_features,
        'q': num_features,
        'region': [unique_cnt, mode],
        'return_num': 'max',
        'rich_category': 'max',
        'sum_b': num_features,
        'total_user_spend': 'max',
        'type': [unique_cnt, mode],
        'user_spend_fuel': num_features,
        'v_l': num_features,
        'month': [unique_cnt, mode],
        'weekday': [unique_cnt, mode],
        'true_percent': ['max', 'median'],
        'time_weight': num_features,
        'R_Quartile': ['median'],
        'F_Quartile': ['median'],
    })

    if take_values:
        return res.values, res.index
    return res

# Holdout

In [18]:
# 75/25 hold-out split. The seed is fixed (42, as for the models in this file)
# so that re-running the cell reproduces the same rows; otherwise y_tr / y_val
# would stop matching an X_tr / X_val pair cached from an earlier run.
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, train_size=0.75, random_state=42)
X_tr.shape[0], X_val.shape[0]

(3392763, 1088072)

In [None]:
%%time
# Run add_features over the hold-out pair and rebind both frames to its results.
X_tr, X_val = add_features(X_tr, X_val, sort=True)

In [None]:
# Hand the hold-out frames to save_to with the target path 'data/holdout.hdf'.
save_to(X_tr, X_val, 'data/holdout.hdf')

In [19]:
# Rebind the hold-out frames from 'data/holdout.hdf'.
# NOTE(review): y_tr / y_val are not reloaded here; they must come from the
# same split that produced this file — confirm.
X_tr, X_val = load_from('data/holdout.hdf')

In [None]:
%%time
# Aggregate both hold-out frames per id, keeping the DataFrame form (take_values=False).
X_tr_agg, X_val_agg = aggregate(X_tr, take_values=False), aggregate(X_val, take_values=False)

In [None]:
# Hand the aggregated frames to save_to with the target path 'data/aggregate.hdf'.
save_to(X_tr_agg, X_val_agg, 'data/aggregate.hdf')

In [34]:
# Rebind the aggregated frames from 'data/aggregate.hdf'.
X_tr_agg, X_val_agg = load_from('data/aggregate.hdf')

# XGBoost

In [22]:
# Collect the aggregated columns that are treated as categorical: a column
# qualifies if its label is itself a categorical feature name, or if its
# first level is one and its second level is anything but 'unique_cnt'.
cat_features = []
for col in X_tr_agg.columns:
    if col in cat_features_name:
        cat_features.append(col)
    elif col[0] in cat_features_name and col[1] != 'unique_cnt':
        cat_features.append(col)

In [36]:
# Fit the one-hot encoder on train and validation rows together so every
# category present in either frame gets a column. `pd.concat` replaces
# `DataFrame.append`, which is no longer available in current pandas.
le = OneHotEncoder().fit(pd.concat([X_tr_agg[cat_features], X_val_agg[cat_features]]))
X_tr_ohe = le.transform(X_tr_agg[cat_features])
X_val_ohe = le.transform(X_val_agg[cat_features])
# Display the encoded shapes next to the shapes of the remaining columns.
X_tr_ohe.shape, X_tr_agg.drop(cat_features, axis=1).shape, X_val_ohe.shape, X_val_agg.drop(cat_features, axis=1).shape

((39540, 1442), (39540, 50), (13181, 1442), (13181, 50))

In [37]:
# Append the non-categorical aggregated columns (converted to sparse COO form)
# to the right of the one-hot block, for train and validation alike.
X_tr_ohe = hstack((X_tr_ohe, coo_matrix(X_tr_agg.drop(columns=cat_features))))
X_val_ohe = hstack((X_val_ohe, coo_matrix(X_val_agg.drop(columns=cat_features))))
X_tr_ohe.shape

(39540, 1492)

In [65]:
# Settings passed to the XGBoost classifier below: a linear booster
# ('gblinear') with a logistic objective and a fixed seed.
params = {
    'booster': 'gblinear',
    'objective': 'binary:logistic',
    'lambda': 0.1,
    'learning_rate': 1.0,
    'silent': 1.0,
    'seed': 42
}
# Shallow copy, so edits to eval_params leave `params` unchanged.
eval_params = params.copy()

In [None]:
# The scikit-learn wrapper takes the number of boosting rounds as
# `n_estimators`; `num_rounds` is not one of its arguments, so the requested
# 500 rounds were never applied.
clf = xgb.XGBClassifier(**eval_params, n_estimators=500, n_jobs=-1)

In [None]:
# Train on the stacked sparse matrix.
# NOTE(review): X_tr_ohe is built from the per-id aggregated frame, so y_tr
# needs exactly one label per aggregated row — confirm.
clf.fit(X_tr_ohe, y_tr)

In [None]:
# Take the second predict_proba column and report ROC AUC on the validation part.
pred = clf.predict_proba(X_val_ohe)[:, 1]
roc_auc_score(y_val, pred)

# RandomForestClassifier

In [16]:
# Random-forest baseline: 100 trees, fixed seed, fitted on the raw aggregated values.
rfclf = RandomForestClassifier(random_state=42, n_estimators=100, n_jobs=-1, verbose=1)
rfclf.fit(X_tr_agg.values, y_tr)

# ROC AUC of the second predict_proba column on the validation part.
roc_auc_score(y_val, rfclf.predict_proba(X_val_agg.values)[:, 1])

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.6s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.7s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.1s finished


0.9477369268302286

# CatBoost Tune

In [None]:
def score(params):
    """Hyperopt objective: fit CatBoost with ``params`` and score the hold-out.

    Parameters
    ----------
    params : dict
        Keyword arguments for ``CatBoostClassifier``. The dict passed in is
        not modified.

    Returns
    -------
    dict
        ``{'loss': -auc, 'status': STATUS_OK}``. The AUC is negated because
        hyperopt's ``fmin`` minimises the loss.
    """
    print("Training with params:")
    print(params)
    # Work on a copy so the caller's dict is left untouched.
    params = dict(params)
    if params.get('bootstrap_type') == 'Bernoulli':
        params['subsample'] = 0.75
        # bagging_temperature is a setting of the Bayesian bootstrap only.
        params.pop('bagging_temperature', None)
    clf = CatBoostClassifier(**params)
    clf.fit(X_tr_agg.values, y_tr, cat_features=cat_features)
    predictions = clf.predict_proba(X_val_agg.values)[:, 1]
    auc = roc_auc_score(y_val, predictions)
    print("\tScore {0}\n\n".format(auc))
    return {'loss': -auc, 'status': STATUS_OK}

In [None]:
def optimize(trials, max_evals=10, full_space=False):
    """Search CatBoost hyper-parameters with hyperopt's TPE algorithm.

    Parameters
    ----------
    trials : hyperopt.Trials
        Object that records every evaluation.
    max_evals : int, default 10
        Number of parameter sets to evaluate.
    full_space : bool, default False
        If True, search the large space; otherwise the reduced one.

    Returns
    -------
    dict
        The result of ``fmin``, keyed by the ``hp.choice`` labels. For
        ``hp.choice`` nodes ``fmin`` reports the index of the chosen option,
        not the option itself.
    """

    def int_grid(start, stop, num):
        # Evenly spaced values as plain Python ints (iteration counts must be whole numbers).
        return np.rint(np.linspace(start, stop, num)).astype(int).tolist()

    # NOTE(review): the `rsm` grids below start at 0; confirm that CatBoost
    # accepts rsm=0 or drop that value.

    # 1 075 200 fits
    space_all = {
        'random_seed': 42,
        'eval_metric': 'AUC',
        'train_dir': './catboost',
        'verbose': False,

        # The CatBoost argument is `iterations`; the hp label is left as
        # 'iteration' so the keys of the returned dict do not change.
        'iterations': hp.choice('iteration', int_grid(10, 1000, 10)),
        'learning_rate': hp.choice('learning_rate', np.linspace(0.001, 0.1, 10)),
        'bootstrap_type': hp.choice('bootstrap_type', ['Bayesian', 'Bernoulli']),
        'bagging_temperature': hp.choice('bagging_temperature', np.linspace(0, 1, 4)),

        'l2_leaf_reg': hp.choice('l2_leaf_reg', [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2]),
        'depth': hp.choice('depth', np.arange(2, 16, 2).tolist()),
        'rsm': hp.choice('rsm', np.linspace(0, 1, 4)),
        'leaf_estimation_method': hp.choice('leaf_estimation_method', ['Newton', 'Gradient']),
        # Previously a second 'leaf_estimation_method' key, which silently
        # replaced the Newton/Gradient choice above.
        'leaf_estimation_iterations': hp.choice('leaf_estimation_iterations', np.arange(1, 4).tolist()),
    }

    # 11 250 fits
    space_mini = {
        'random_seed': 42,
        'eval_metric': 'AUC',
        'train_dir': './catboost',
        'verbose': False,

        'iterations': hp.choice('iteration', int_grid(50, 1000, 5)),
        'learning_rate': hp.choice('learning_rate', np.linspace(0.001, 0.1, 5)),
        'bootstrap_type': hp.choice('bootstrap_type', ['Bayesian', 'Bernoulli']),
        'bagging_temperature': hp.choice('bagging_temperature', np.linspace(0, 1, 3)),

        'l2_leaf_reg': hp.choice('l2_leaf_reg', [1e-3, 1e-2, 1e-1, 1e0, 1e1]),
        'depth': hp.choice('depth', np.arange(2, 11, 2).tolist()),
        'rsm': hp.choice('rsm', np.linspace(0, 1, 3)),
    }

    space = space_all if full_space else space_mini
    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=max_evals)
    return best

In [None]:
# Fresh Trials object to record the search, then run it and display the result.
trials = Trials()
best_params = optimize(trials)
best_params

# Submission

In [None]:
%%time
# Run add_features over the full train / test frames.
X_train_add, X_test_add = add_features(X_train, X_test, sort=True)

In [None]:
# Aggregate both full frames per id, keeping the DataFrame form (take_values=False).
X_train_agg = aggregate(X_train_add, take_values=False)
X_test_agg = aggregate(X_test_add, take_values=False)

In [None]:
# Final model fitted on the full aggregated training data.
# NOTE(review): the only `params` assignment visible in this file is the
# XGBoost-style dict ('booster': 'gblinear', ...); confirm that `params`
# holds the intended CatBoost settings before running this cell.
clf = CatBoostClassifier(**params)
clf.fit(X_train_agg.values, y_train, cat_features=cat_features, plot=True)

In [None]:
# Load the submission template first: its `id` column fixes the row order
# and was previously referenced before being read.
sample_submission = pd.read_csv('data/sample_submission.csv')
# Column 1 of predict_proba is used everywhere else in this file
# (validation scoring); column 0 would submit the complementary probability.
y_pred = clf.predict_proba(X_test_agg.values)[:, 1]
y_pred = pd.Series(y_pred, index=X_test_agg.index)
# Align the predictions with the ids expected in the submission.
y_pred = y_pred.reindex(sample_submission.id)

In [None]:
# Read the submission template and write the predictions into its 'proba' column.
sample_submission = pd.read_csv('data/sample_submission.csv')
# Assigned positionally via .values, so y_pred must already be in the template's row order.
sample_submission['proba'] = y_pred.values
sample_submission.head()

In [None]:
# Write the filled template to disk without the DataFrame index.
sample_submission.to_csv('submissions/catboost_agg.csv', index=False)