In [1]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from utils import *
from sklearn.metrics import roc_auc_score
from scipy import stats

In [2]:
# Data-scientist prays like this:
import warnings
warnings.simplefilter('ignore')

## First run: compute the features and save them here (or re-save)

In [3]:
# Load the train/test tables through the project helper `load_data`.
# NOTE(review): the meaning of resave/points/sort is defined by the helper, which
# is not part of this notebook (presumably it lives in `utils`) — confirm there.
train, test = load_data(resave=False, points=True, sort=True)

In [4]:
# Feature engineering on the full train/test frames via the helper `add_features`.
# The last recorded run of this cell ended in a MemoryError (see the output below);
# the "Load here" section reads features from data/with_features.hdf instead.
train, test = add_features(train, test, sort=True)

MemoryError: 

In [None]:
# Persist both frames to disk; presumably so later sessions can start from the
# "Load here" section instead of recomputing the features.
save_to(train, test, 'data/with_features.hdf')

## Load the saved features here

In [6]:
# Restore the saved train/test frames from disk.
train, test = load_from('data/with_features.hdf')
# Quick sanity check: (rows, columns) of each frame.
tuple(frame.shape for frame in (train, test))

((5563602, 21), (4751035, 21))

In [7]:
# The test frame is used as-is; the training frame goes through the project helper
# `calculate_target`, which returns the training rows together with their labels.
X_test = test
X_train, y_train = calculate_target(train, offset=0)
tuple(frame.shape for frame in (X_train, X_test))

((5027172, 21), (4751035, 21))

## Classifier

### LightGBM

In [8]:
# Keyword arguments shared by every LGBMClassifier built in this notebook.
lgb_params = dict(
    random_state=42,
    n_estimators=100,
)

In [9]:
# Classifier used by the holdout and cross-validation cells, built from lgb_params.
# NOTE(review): LGBMClassifier is not imported explicitly in this notebook;
# presumably it is provided by `from utils import *` — confirm.
lgb = LGBMClassifier(**lgb_params)

### Custom aggregate function

In [10]:
def aggregate_custom(df, take_values=True):
    """Collapse row-level data into one row of summary features per ``id``.

    For each of the columns ``v_l``, ``q``, ``sum_b`` and ``percent`` the
    per-``id`` minimum, maximum, median and sum are computed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``id`` column and the four columns listed above.
    take_values : bool, default True
        If True, return the aggregated features as ``(values, index)``;
        if False, return the aggregated DataFrame itself.

    Returns
    -------
    tuple or pandas.DataFrame
        ``(res.values, res.index)`` when ``take_values`` is True, otherwise the
        aggregated frame. Its columns are a two-level index of
        ``(column, statistic)`` and its index holds the ``id`` values.
    """
    num_columns = ['v_l', 'q', 'sum_b', 'percent']
    num_features = ['min', 'max', 'median', 'sum']

    # The previous version also built unused helpers (a scipy-based mode, a
    # "first value" lambda and a categorical feature list). They were never
    # applied, yet made the function depend on an external `unique_cnt` name.
    res = df.groupby('id')[num_columns].agg({col: num_features for col in num_columns})

    if take_values:
        return res.values, res.index
    return res

In [11]:
# Choose the aggregation function used by the holdout, CV and submission cells.
# default (`aggregate` is not defined in this notebook; presumably from `utils`)
agg_func = aggregate
# custom: this second assignment wins, so aggregate_custom is what actually runs
agg_func = aggregate_custom

### Holdout

In [26]:
# Split the training data into a fitting part and a validation part.
# NOTE(review): the recorded sizes below are not an exact 75/25 row split, so
# train_test_split is presumably a project helper (from `utils`) rather than
# sklearn's function — confirm how it splits and whether it can be seeded.
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, train_size=0.75)
X_tr.shape[0], X_val.shape[0]

(3758699, 1268473)

In [27]:
%%time
X_tr, X_val = agg_func(X_tr, take_values=False), agg_func(X_val, take_values=False)

CPU times: user 2.91 s, sys: 333 ms, total: 3.25 s
Wall time: 3.24 s


In [29]:
# Fit on the aggregated holdout features; `.values` hands the classifier a plain
# array instead of the DataFrame.
lgb.fit(X_tr.values, y_tr)

LGBMClassifier(boosting_type='gbdt', colsample_bytree=1.0, learning_rate=0.1,
        max_bin=255, max_depth=-1, min_child_samples=10,
        min_child_weight=5, min_split_gain=0.0, n_estimators=100,
        n_jobs=-1, num_leaves=31, objective=None, random_state=42,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=50000, subsample_freq=1)

In [30]:
# Score the holdout part with ROC AUC on column 1 of predict_proba.
# Predict on X_val.values so the model receives the same kind of input it was
# fitted on (lgb.fit(X_tr.values, ...)), as the submission cell does with
# X_test_agg.values.
pred = lgb.predict_proba(X_val.values)[:, 1]
roc_auc_score(y_val, pred)

0.8474094155142363

### Cross Validation

In [31]:
# Cross-validate with the project helper `cross_val` (not defined in this notebook).
# The log printed below reports one score per split and their mean.
scores = cross_val(lgb, train, agg_func, splits=2, interval=2)

Split № 0
Aggregating X_tr..
Aggregating X_val..
Fitting classifier..
Target month: -0. Score: 0.8474094155142363
-------------------
Split № 1
Aggregating X_tr..
Aggregating X_val..
Fitting classifier..
Target month: -3. Score: 0.8222878353647909
-------------------
Mean score is: 0.8348486254395135


In [32]:
# List the columns available in the training frame.
train.columns

Index(['time', 'v_l', 'q', 'n_tr', 'code_azs', 'location', 'region', 'code',
       'code1', 'type', 'date', 'id', 'sum_b', 'percent', 'cur_points',
       'first_prch_num', 'return_num', 'first_prch', 'oil_price', 'month',
       'weekday'],
      dtype='object')

In [18]:
# Same helper as above, this time also returning the per-split predictions
# (return_proba=True) and with the progress log switched off.
scores, probas = cross_val(lgb, train, agg_func, splits=2, return_proba=True, 
                           train_size=0.7, verbose=False)

In [19]:
# Peek at the first element of `probas`; per the output below it is a float
# Series indexed by id.
print("Shape:", probas[0].shape)
probas[0].head()

Shape: (21917,)


id
21-8AJ-10808    0.064112
21-8AJ-10814    0.047118
21-8AJ-10817    0.053327
21-8AJ-10828    0.051719
21-8AJ-10831    0.009351
dtype: float64

## Submission

In [20]:
%%time
X_train_agg = agg_func(X_train, take_values=False)
X_test_agg = agg_func(X_test, take_values=False)

CPU times: user 5.18 s, sys: 804 ms, total: 5.99 s
Wall time: 5.98 s


In [21]:
# Build a fresh classifier from lgb_params and fit it on all aggregated training data.
lgb = LGBMClassifier(**lgb_params)
lgb.fit(X_train_agg.values, y_train)

LGBMClassifier(boosting_type='gbdt', colsample_bytree=1.0, learning_rate=0.1,
        max_bin=255, max_depth=-1, min_child_samples=10,
        min_child_weight=5, min_split_gain=0.0, n_estimators=100,
        n_jobs=-1, num_leaves=31, objective=None, random_state=42,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=50000, subsample_freq=1)

In [22]:
# Column 1 of predict_proba for every aggregated test row, wrapped in a Series
# that carries the index of X_test_agg.
y_pred = pd.Series(
    lgb.predict_proba(X_test_agg.values)[:, 1],
    index=X_test_agg.index,
)
y_pred.head(10)

id
21-186G-1142    0.005978
21-186G-1225    0.013966
21-186G-1227    0.021776
21-186G-1232    0.007700
21-186G-1258    0.002628
21-186G-1287    0.001990
21-186G-1306    0.002587
21-186G-1310    0.000899
21-186G-1344    0.001582
21-186G-1358    0.000945
dtype: float64

In [23]:
# Read the sample submission file; its `id` column is used below to order the predictions.
sample_submission = pd.read_csv('data/sample_submission.csv')

In [24]:
# Reorder the predictions to follow the ids of the sample submission;
# ids that have no prediction come back as NaN.
y_pred = y_pred.reindex(sample_submission.id)

In [25]:
# Bug fix: this cell used to write `sample_submission` back to disk unchanged,
# so the model's predictions never reached the submission file.
submission = sample_submission.copy()
# NOTE(review): assumes the first non-`id` column of the sample file is the
# prediction column (falls back to 'proba' if there is none) — confirm against
# data/sample_submission.csv.
target_col = next((col for col in submission.columns if col != 'id'), 'proba')
# y_pred was reindexed to sample_submission.id in the previous cell, so the
# values line up with the submission rows position by position.
submission[target_col] = y_pred.values
submission.to_csv('submissions/submission.csv', index=False)