In [1]:
import pandas as pd
import numpy as np
from utils_optimized import *
import warnings
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LinearRegression
warnings.simplefilter('ignore')
# Number of offset-based holdout splits: the split-generation loop iterates
# range(total_splits) and the cross_val calls receive it as `splits`.
total_splits = 4

In [2]:
# load_data is not defined in this notebook -- presumably it is provided by the
# `utils_optimized` star import (TODO confirm). Its result is unpacked as (train, test).
train, test = load_data()

In [3]:
def aggregate_cb(df, take_values=True):
    """Collapse row-level records into one feature row per ``id`` (CatBoost feature set).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``id`` column plus every column named in ``aggregations``.
    take_values : bool, default True
        If True, return ``(values, index)`` of the aggregated frame;
        otherwise return the aggregated DataFrame itself.

    Returns
    -------
    tuple of (numpy.ndarray, pandas.Index) or pandas.DataFrame
        Columns follow the order of ``aggregations``; because one entry is a list,
        the aggregated frame has two-level columns (source column, aggregation).
    """
    # Source column -> aggregation(s). `unique_cnt` is not defined in this notebook;
    # it is expected to be provided by the `utils_optimized` star import.
    aggregations = {
        'first_prch_num': 'max',
        'q': 'sum',
        'v_l': 'sum',
        'month': unique_cnt,
        'time_weight': ['min', 'median'],
        'v_l_tw': 'median',
    }

    # Select exactly the aggregated columns so the selection and the spec cannot drift apart.
    res = df.groupby('id')[list(aggregations)].agg(aggregations)

    if take_values:
        return res.values, res.index
    return res

In [None]:
def aggregate_xgb(df, take_values=True):
    """Collapse row-level records into one feature row per ``id`` (XGBoost feature set).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``id`` column plus every column named in ``aggregations``
        (including ``weekday``).
    take_values : bool, default True
        If True, return ``(values, index)`` of the aggregated frame;
        otherwise return the aggregated DataFrame itself.

    Returns
    -------
    tuple of (numpy.ndarray, pandas.Index) or pandas.DataFrame
        The aggregated frame has two-level columns (source column, aggregation),
        ordered as in ``aggregations``.
    """
    # Imported locally so the function does not rely on `stats` leaking in through
    # the `utils_optimized` star import.
    from scipy import stats

    # These stay lambdas on purpose: pandas names each output column after the
    # callable, so converting them to `def` would rename the resulting columns.
    # np.atleast_1d copes with both scipy conventions for `.mode`
    # (length-1 array in older releases, scalar in newer ones).
    mode = lambda x: np.atleast_1d(stats.mode(np.asarray(x)).mode)[0]
    # Sum of (previous - current) over consecutive rows; for NaN-free input this
    # telescopes to first - last. A single-row group yields 0.
    simple_trend = lambda vec: np.sum(vec.shift(1) - vec.iloc[1:])

    # Source column -> list of aggregations. `unique_cnt` is not defined in this
    # notebook; it is expected to be provided by the `utils_optimized` star import.
    aggregations = {
        'first_prch_num': ['max'],
        'percent': [simple_trend],
        'sum_b_tw': ['median', 'max'],
        'q_tw': ['median', 'sum'],
        'q': [simple_trend, 'sum'],
        'v_l': ['sum', 'max'],
        'month': [unique_cnt, mode],
        'time_weight': ['min', 'median'],
        'v_l_tw': ['median'],
        'code_azs': [mode],
        'cur_points': [simple_trend],
        'sum_b': ['sum'],
        'weekday': [mode],
        'true_percent': ['max'],
        'percent_tw': ['sum'],
    }

    # Deriving the selection from the spec guarantees that every aggregated column
    # (e.g. 'weekday') is actually selected before .agg() runs.
    res = df.groupby('id')[list(aggregations)].agg(aggregations)

    if take_values:
        return res.values, res.index
    return res

In [None]:
# Build one train/validation holdout per offset and persist the aggregated feature
# sets for the XGBoost and CatBoost models.
# calculate_target, train_test_split, add_features and save_split are not defined in
# this notebook -- presumably they come from the `utils_optimized` star import
# (TODO confirm).
for offset in range(total_splits):
    print(offset)
    X_train, y_train = calculate_target(train, offset=offset)
    X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train)
    print("Adding features..")
    X_tr, X_val = add_features(X_tr, X_val, sort=True)
    print("Aggregating..")
    # take_values=False keeps the aggregated DataFrames (not raw arrays) for saving.
    X_tr_xgb, X_val_xgb = aggregate_xgb(X_tr, take_values=False), aggregate_xgb(X_val, take_values=False)
    X_tr_cb, X_val_cb = aggregate_cb(X_tr, take_values=False), aggregate_cb(X_val, take_values=False)
    
    # Each split is stored under the key str(offset).
    # NOTE(review): only the xgb and cb files are written here; the LightGBM cell
    # further down reads 'data/holdouts_lgb.hdf', which this loop does not produce.
    save_split('data/holdouts_xgb.hdf', X_tr_xgb, X_val_xgb, y_tr, y_val, str(offset))
    save_split('data/holdouts_cb.hdf', X_tr_cb, X_val_cb, y_tr, y_val, str(offset))

0
Adding features..
Aggregating..
1
Adding features..
Aggregating..


### DataFrame with model results

In [None]:
# Cross-validate a default-parameter LightGBM classifier on the stored holdout splits
# and print the mean score.
# NOTE(review): 'data/holdouts_lgb.hdf' is not written by the split-saving loop in this
# notebook (it saves only the xgb and cb files) -- confirm it is produced elsewhere.
lgb = LGBMClassifier()
scores_lgb, probas_lgb = cross_val(lgb, None, None, return_proba=True, splits=total_splits,
                           splits_file='data/holdouts_lgb.hdf', verbose=False)
print(np.mean(scores_lgb))

In [None]:
# CatBoost hyper-parameters (AUC as the evaluation metric, fixed seed).
cb_param = dict(
    depth=8,
    eval_metric='AUC',
    l2_leaf_reg=0.01,
    random_seed=42,
    rsm=0.5,
    train_dir='./catboost',
    verbose=False,
    od_type='Iter',
)

# No columns are declared categorical for this feature set.
cat_features = []
cb = CatBoostClassifier(**cb_param)

# Cross-validate on the CatBoost holdout splits and keep the per-split probabilities.
scores_cb, probas_cb = cross_val(
    cb, None, None,
    return_proba=True,
    splits=total_splits,
    splits_file='data/holdouts_cb.hdf',
    verbose=True,
    cat_features=cat_features,
)
print(np.mean(scores_cb))

In [None]:
# Hyper-parameters for an XGBoost model with the linear booster.
xgb_params = {
    'booster': 'gblinear',
    'objective': 'binary:logistic',
    'lambda': 0.5,
    'learning_rate': 1.2,
    'silent': 1.0,
    'seed': 42
}

# `n_estimators` is the scikit-learn wrapper's name for the number of boosting
# rounds; the wrapper has no `num_rounds` argument, so that keyword never set it.
xgb = XGBClassifier(**xgb_params, n_estimators=500, n_jobs=-1)
# Use total_splits (the number of holdouts actually generated) so this model is
# evaluated on the same splits as the others.
scores_xgb, probas_xgb = cross_val(xgb, None, None, return_proba=True, splits=total_splits,
                                   splits_file='data/holdouts_xgb.hdf', verbose=False)
print(np.mean(scores_xgb))

Add one column per base model to the stacking frame (e.g. xgb, KNN, etc.).

In [None]:
# Stacking features: one column per base model, each the concatenation of that
# model's per-split probabilities.
# NOTE(review): assumes probas_cb and probas_xgb are lists of Series whose indexes
# match, so the two columns align row-for-row -- TODO confirm against cross_val.
X = pd.DataFrame({'cb': pd.concat(probas_cb), 'xgb': pd.concat(probas_xgb)})

In [None]:
# Gather the validation targets of every split, in offset order, and stack them
# into a single Series.
# NOTE(review): splits are saved under str(offset) but loaded here with the int
# offset -- confirm load_split normalises the key.
target_parts = []
for offset in range(total_splits):
    X_tr, X_val, y_tr, y_val = load_split('data/holdouts_cb.hdf', offset)
    target_parts += [y_val]
target = pd.concat(target_parts)

# Stacking

In [None]:
# AUC of each base model's out-of-fold probabilities over all splits.
# Scores the stacking frame `X` against `target` (both built above) instead of
# whatever `X_val` / `y_val` happen to be left over from an earlier cell.
for col in X.columns:
    print(col,"-",roc_auc_score(target, X[col]))

## LGB

In [None]:
from sklearn.model_selection import train_test_split as tr_val

In [None]:
# Hold out 25% of the stacked predictions to compare stackers. random_state pins the
# shuffle so the scores below are reproducible across kernel restarts.
X_tr, X_val, y_tr, y_val = tr_val(X, target, train_size=0.75, shuffle=True, random_state=42)

In [None]:
# LightGBM as the stacker: refits the existing `lgb` estimator on the stacking
# training part and scores the second predict_proba column on the held-out part.
y_pred = lgb.fit(X_tr, y_tr).predict_proba(X_val)[:, 1]
roc_auc_score(y_val, y_pred)

In [None]:
# Baseline for the stackers: AUC of the first two stacking columns on the held-out part.
tuple(roc_auc_score(y_val, X_val.iloc[:, col_pos]) for col_pos in range(2))

## Linear

In [None]:
# Linear stacker without an intercept: a weighted sum of the base-model columns.
lr = LinearRegression(fit_intercept=False).fit(X_tr, y_tr)

In [None]:
# `lr` is already fitted on (X_tr, y_tr) in the previous cell, so only predict here.
y_pred = lr.predict(X_val)
roc_auc_score(y_val, y_pred)

# Submission

Set `clf` to whichever stacker you chose above.

In [None]:
# Final stacker used for the submission (here: linear regression without intercept).
clf = LinearRegression(fit_intercept=False)

In [None]:
# Fit the final stacker on the full stacking frame and its targets.
clf.fit(X, target)

In [None]:
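# TODO: generate the per-model test-set predictions here (placeholder -- this step
# is not implemented in the notebook).
# The cells below assume that submission_cb, submission_lgb and submission_xgb
# already exist, each exposing a `proba` column.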

In [None]:
# The test-time frame must carry the same columns, in the same order, as the stacking
# frame the stacker was fitted on ('cb', 'xgb').
X_test_probas = pd.DataFrame({'cb': submission_cb.proba, 'xgb': submission_xgb.proba})

In [None]:
# Fill the submission template with the stacker's predictions.
sample_submission = pd.read_csv('data/sample_submission.csv')
# Item assignment always writes a column; attribute assignment would only set a
# plain Python attribute if the template had no `proba` column yet.
sample_submission['proba'] = clf.predict(X_test_probas)