In [1]:
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from utils import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from scipy import stats

In [2]:
# Load the train and test frames. load_data is not defined in the visible cells;
# presumably it is supplied by the star-import from utils — TODO confirm.
train, test = load_data()

In [3]:
# Derive calendar features from the datetime column `date`, identically on both frames.
for frame in (train, test):
    frame['month'] = frame.date.dt.month
    frame['weekday'] = frame.date.dt.dayofweek

In [4]:
# Label-encode `region` with one encoder fitted on train and test together,
# so both frames share the same value -> code mapping.
le_region = LabelEncoder()

# Assign the filled column back rather than calling fillna(inplace=True) on the
# attribute-accessed Series: the in-place form acts on an intermediate object and
# does not update the frame under pandas Copy-on-Write.
train['region'] = train['region'].fillna(0)
test['region'] = test['region'].fillna(0)

le_region.fit(pd.concat([train['region'], test['region']]))
train['region'] = le_region.transform(train['region'])
test['region'] = le_region.transform(test['region'])

In [5]:
# Sentinel written into missing entries of the code-like columns below.
MISSING_CODE = 99999

# Assign the filled column back instead of using fillna(inplace=True) on an
# attribute-accessed Series: that chained in-place call does not update the
# frame under pandas Copy-on-Write.
for frame in (train, test):
    for column in ('location', 'code_azs', 'code1'):
        frame[column] = frame[column].fillna(MISSING_CODE)

In [6]:
# Cast the three code-like columns to int on both frames.
for frame in (train, test):
    for column in ('location', 'code_azs', 'code1'):
        frame[column] = frame[column].astype(int)

In [7]:
# Build the feature rows X and the target y from the training frame.
# calculate_target is not defined in the visible cells (presumably from utils);
# the meaning of the second argument (0) cannot be seen here — TODO confirm.
X, y = calculate_target(train, 0)

In [8]:
# Preview the first rows of X.
X.head()

Unnamed: 0,time,date,v_l,q,n_tr,sum_b,code_azs,id,first_prch,location,region,code,code1,percent,type,month,weekday
3,08:24:24,2017-10-23,31.01,0,2017,1181.45,5915,21-8B2-7695,27.12.16 23:06:11,10,28,1010121,126,0.0,7,10,0
4,,2017-03-19,51.95,0,2017,1818.09,2418,21-8B2-7695,27.12.16 23:06:11,9,27,1010121,126,0.0,5,3,6
5,19:45:02,2017-08-02,48.64,0,2017,1746.05,2546,21-8B2-7695,27.12.16 23:06:11,9,26,1010121,126,0.0,7,8,2
6,,2017-05-08,17.43,0,2017,636.1,11152,21-8B2-7695,27.12.16 23:06:11,10,28,1010121,126,0.0,13,5,0
7,,2017-09-07,23.77,0,2017,908.12,5915,21-8B2-7695,27.12.16 23:06:11,10,28,1010121,126,0.0,13,9,3


In [9]:
def unique_cnt(series):
    """Return the number of distinct values in `series` (length of `series.unique()`)."""
    distinct_values = series.unique()
    return len(distinct_values)
def mode(vec):
    """Return the most frequent value of `vec`, found by counting with `np.bincount`.

    `np.bincount` requires non-negative integers. On ties the smallest value is
    returned, because `argmax` reports the first maximum.
    """
    return np.bincount(vec).argmax()
def mode_lam(vec):
    """Return `vec.mode()` unchanged.

    For a pandas Series this is a Series holding every modal value, not a scalar.
    """
    most_common = vec.mode()
    return most_common
def last_week_views(vec):
    """Count the entries of `vec` strictly greater than `vec.max() - 7`.

    Presumably `vec` holds day numbers, so this covers the latest 7 days — verify.
    """
    cutoff = vec.max() - 7
    recent = vec[vec > cutoff]
    return recent.count()
def last_2week_views(vec):
    """Count the entries of `vec` strictly greater than `vec.max() - 14`."""
    cutoff = vec.max() - 14
    recent = vec[vec > cutoff]
    return recent.count()
def last_3days_views(vec):
    """Count the entries of `vec` strictly greater than `vec.max() - 3`."""
    cutoff = vec.max() - 3
    recent = vec[vec > cutoff]
    return recent.count()
def previous_3weeks_views(vec):
    """Count the entries of `vec` inside the half-open window [max - 28, max - 7).

    NOTE(review): the upper bound is exclusive and `last_week_views` uses a strict
    `> max - 7`, so a value equal to `max - 7` is counted by neither helper —
    confirm this gap is intended.
    """
    latest = vec.max()
    lower = latest - 28
    upper = latest - 7
    in_window = (vec >= lower) & (vec < upper)
    return vec[in_window].count()
def last_month_views(vec):
    """Count the entries of `vec` strictly greater than `vec.max() - 30`."""
    cutoff = vec.max() - 30
    recent = vec[vec > cutoff]
    return recent.count()

In [50]:
# Hold-out split for local validation.
# NOTE(review): train_test_split is not imported explicitly in the visible cells;
# presumably it comes from the star-import of utils — confirm which implementation
# is used. No seed is passed here, so the split may differ between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y )

In [56]:
def mode_func(values):
    """Return the most frequent value of one group as a plain scalar.

    Replaces a direct use of `scipy.stats.mode`, which returns a (mode, count)
    result object instead of a scalar and therefore cannot serve as a numeric
    feature. On ties the smallest value wins (`Series.mode` returns the modes
    sorted); a group with no non-null values yields NaN.
    """
    modes = values.mode()
    return modes.iloc[0] if len(modes) else np.nan


# pandas labels an aggregated column with the callable's __name__; keep the
# 'mode' label that scipy.stats.mode produced so column names stay the same.
mode_func.__name__ = 'mode'


def get_aggregate(df):
    """Aggregate the rows of `df` into one feature row per `id`.

    Numeric columns get min / max / median / sum; code-like columns get the
    distinct-value count, min, max and most frequent value; `code` gets only
    the distinct-value count.

    Returns a DataFrame indexed by `id` with (column, statistic) MultiIndex columns.
    """
    numeric_stats = ['min', 'max', 'median', 'sum']
    categorical_stats = [unique_cnt, 'min', 'max', mode_func]
    spec = {
        'v_l': numeric_stats,
        'q': numeric_stats,
        'sum_b': numeric_stats,
        'location': categorical_stats,
        'code': [unique_cnt],
        'percent': numeric_stats,
        'type': categorical_stats,
        'month': categorical_stats,
        'weekday': categorical_stats,
        'code_azs': categorical_stats,
        'region': categorical_stats,
        'code1': categorical_stats,
    }
    # Select exactly the aggregated columns, in the same order as the spec.
    return df.groupby('id')[list(spec)].agg(spec)

def get_aggregate_small(df):
    """Aggregate the rows of `df` into one feature row per `id`, without mode features.

    Numeric columns get min / max / median / sum; `location`, `month` and `weekday`
    get the distinct-value count plus min and max; the remaining code-like columns
    get only the distinct-value count.

    Returns a DataFrame indexed by `id` with (column, statistic) MultiIndex columns.
    """
    spec = {
        'v_l': ['min', 'max', 'median', 'sum'],
        'q': ['min', 'max', 'median', 'sum'],
        'sum_b': ['min', 'max', 'median', 'sum'],
        'location': [unique_cnt, 'min', 'max'],
        'code': [unique_cnt],
        'percent': ['min', 'max', 'median', 'sum'],
        'type': [unique_cnt],
        'month': [unique_cnt, 'min', 'max'],
        'weekday': [unique_cnt, 'min', 'max'],
        'code_azs': [unique_cnt],
        'region': [unique_cnt],
        'code1': [unique_cnt],
    }
    selected_columns = list(spec)
    grouped = df.groupby('id')[selected_columns]
    return grouped.agg(spec)

In [57]:
%%time
# Per-id features for the training part of the split (timed by the cell magic above).
X_train_agg = get_aggregate_small(X_train)

CPU times: user 22 s, sys: 284 ms, total: 22.3 s
Wall time: 22.2 s


In [58]:
# Preview the aggregated training features.
X_train_agg.head()

Unnamed: 0_level_0,v_l,v_l,v_l,v_l,q,q,q,q,sum_b,sum_b,...,type,month,month,month,weekday,weekday,weekday,code_azs,region,code1
Unnamed: 0_level_1,min,max,median,sum,min,max,median,sum,min,max,...,unique_cnt,unique_cnt,min,max,unique_cnt,min,max,unique_cnt,unique_cnt,unique_cnt
id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
21-8A1-14021,8.28,38.18,25.39,1942.07,0,0,0.0,0,272.47,1451.13,...,3,11,1,11,7,0,6,12,1,3
21-8A1-14023,0.0,48.61,13.45,715.12,0,1,0.0,11,53.64,1817.98,...,5,10,1,11,7,0,6,9,2,14
21-8A1-1403,0.0,59.55,36.36,3499.47,0,4,0.0,28,26.36,2272.65,...,8,11,1,11,7,0,6,21,5,17
21-8A1-14032,0.0,47.85,14.0,1416.04,0,2,0.0,5,77.27,1665.02,...,5,11,1,11,7,0,6,9,4,6
21-8A1-14038,0.0,54.45,13.45,2096.19,0,1,0.0,18,0.32,1965.81,...,6,11,1,11,7,0,6,24,1,6


In [59]:
# Random-forest baseline: 100 trees with a fixed seed. Per scikit-learn's n_jobs
# convention, n_jobs=-2 uses all CPUs but one.
rf = RandomForestClassifier(n_estimators=100, n_jobs = -2, random_state=42)

In [60]:
# Fit on the bare ndarray of aggregated features.
# NOTE(review): X_train_agg rows come out of groupby('id') in sorted id order;
# y_train.sort_index() lines up with them only if y_train is indexed by id with
# exactly one entry per id — verify against calculate_target / the split helper.
rf.fit(X_train_agg.values, y_train.sort_index())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-2,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [64]:
# Per-id features for the hold-out part of the split.
X_test_agg = get_aggregate_small(X_test)

In [65]:
# Score the hold-out ids. Pass the bare ndarray, as rf.fit did, so the model
# receives the same input type (no MultiIndex column labels) at fit and predict time.
preds = rf.predict_proba(X_test_agg.values)

In [66]:
# Hold-out ROC AUC of the random forest, scored on the second probability column.
# Assumes y_test.sort_index() matches the row order of X_test_agg — TODO confirm.
roc_auc_score(y_test.sort_index(), preds[:, 1])

0.83764813645878566

### LGB

In [61]:
# LightGBM classifier with library defaults (no parameters overridden, no seed set).
lgb = LGBMClassifier()

In [67]:
# Fit LightGBM on the same ndarray / sorted-target pair as the random forest.
# NOTE(review): relies on y_train.sort_index() matching X_train_agg's id order — verify.
lgb.fit(X_train_agg.values, y_train.sort_index())

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        learning_rate=0.1, max_depth=-1, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
        n_jobs=-1, num_leaves=31, objective=None, random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1)

In [68]:
# Score the hold-out ids. Pass the bare ndarray, as lgb.fit did, so the model
# receives the same input type at fit and predict time.
preds_lgb = lgb.predict_proba(X_test_agg.values)

In [69]:
# Hold-out ROC AUC of LightGBM, scored on the second probability column.
# Assumes y_test.sort_index() matches the row order of X_test_agg — TODO confirm.
roc_auc_score(y_test.sort_index(), preds_lgb[:, 1])

0.84521833527334811

# Let's make a prediction!

In [70]:
# Submission template; its `id` column defines the row order of the final file.
sample_submission = pd.read_csv('./data/sample_submission.csv')

In [71]:
# Display only: the sorted copy is not assigned, so sample_submission keeps its original order.
sample_submission.sort_values(by = 'id')

Unnamed: 0,id,proba
0,21-186G-1142,0.5
15307,21-186G-1225,0.5
1,21-186G-1227,0.5
15308,21-186G-1232,0.5
15309,21-186G-1258,0.5
15310,21-186G-1287,0.5
2,21-186G-1306,0.5
3,21-186G-1310,0.5
15311,21-186G-1344,0.5
4,21-186G-1358,0.5


In [73]:
# Per-id features over all of X (no hold-out), used to refit the models for submission.
X_agg = get_aggregate_small(X)

In [96]:
# Refit the random forest on all aggregated rows.
# NOTE(review): relies on y.sort_index() matching X_agg's sorted id order — verify.
rf.fit(X_agg.values, y.sort_index())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-2,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [97]:
# Per-id features for the test frame, built with the same aggregation as training.
test_agg = get_aggregate_small(test)

In [77]:
# Preview the aggregated test features.
test_agg.head()

Unnamed: 0_level_0,v_l,v_l,v_l,v_l,q,q,q,q,sum_b,sum_b,...,type,month,month,month,weekday,weekday,weekday,code_azs,region,code1
Unnamed: 0_level_1,min,max,median,sum,min,max,median,sum,min,max,...,unique_cnt,unique_cnt,min,max,unique_cnt,min,max,unique_cnt,unique_cnt,unique_cnt
id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
21-186G-1142,0.0,57.04,25.705,1255.58,0,1,0.0,22,31.82,2227.27,...,5,11,2,12,7,0,6,2,1,13
21-186G-1225,0.0,32.5,0.0,544.48,0,1,1.0,41,31.82,1330.87,...,5,9,1,9,7,0,6,9,4,13
21-186G-1227,4.13,24.47,10.68,439.88,0,0,0.0,0,154.36,909.16,...,5,12,1,12,7,0,6,7,1,3
21-186G-1232,0.0,53.93,27.27,1532.83,0,1,0.0,3,53.64,1854.55,...,6,11,1,12,7,0,6,9,1,9
21-186G-1258,0.0,60.0,22.985,1669.3,0,1,0.0,8,13.64,2087.4,...,7,11,1,12,7,0,6,12,3,10


In [24]:
# (number of test ids, number of aggregated feature columns)
test_agg.shape

(31122, 19)

In [105]:
# Predict on the bare ndarray, the same input type rf.fit received.
preds = rf.predict_proba(test_agg.values)

# NOTE(review): column 0 is the probability of the first entry of rf.classes_,
# whereas the validation AUC was computed from column 1. Confirm that the
# submission really expects the probability of that class.
predictions = pd.Series(data=preds[:, 0], index=test_agg.index)

# Re-order the per-id predictions to the row order of the submission template.
predictions = predictions.reindex(index=sample_submission.id)
# reindex silently yields NaN for ids missing from test_agg; fail loudly instead
# of writing an invalid submission.
assert predictions.notna().all(), 'some sample_submission ids have no prediction in test_agg'

sample_submission['proba'] = predictions.values

In [106]:
# Preview the filled-in submission.
sample_submission.head()

Unnamed: 0,id,proba
0,21-186G-1142,0.01
1,21-186G-1227,0.11
2,21-186G-1306,0.0
3,21-186G-1310,0.0
4,21-186G-1358,0.0


In [107]:
# Write the random-forest submission without the index column.
sample_submission.to_csv('rf_baseline_submission.csv', index = False)

# Read the file back to check what was actually written.
check = pd.read_csv('./rf_baseline_submission.csv')

check.head()

Unnamed: 0,id,proba
0,21-186G-1142,0.01
1,21-186G-1227,0.11
2,21-186G-1306,0.0
3,21-186G-1310,0.0
4,21-186G-1358,0.0


## LGB submission

In [79]:
# Refit LightGBM on all aggregated rows.
# NOTE(review): relies on y.sort_index() matching X_agg's sorted id order — verify.
lgb.fit(X_agg.values, y.sort_index())

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        learning_rate=0.1, max_depth=-1, min_child_samples=20,
        min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
        n_jobs=-1, num_leaves=31, objective=None, random_state=None,
        reg_alpha=0.0, reg_lambda=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1)

In [80]:
# Predict on the bare ndarray, the same input type lgb.fit received.
preds_lgb = lgb.predict_proba(test_agg.values)

In [102]:
# NOTE(review): column 0 is the probability of the first entry of lgb.classes_,
# whereas the validation AUC was computed from column 1. Confirm that the
# submission really expects the probability of that class.
predictions = pd.Series(data=preds_lgb[:, 0], index=test_agg.index)
# Re-order the per-id predictions to the row order of the submission template.
predictions = predictions.reindex(index=sample_submission.id)
# reindex silently yields NaN for ids missing from test_agg; fail loudly instead
# of writing an invalid submission.
assert predictions.notna().all(), 'some sample_submission ids have no prediction in test_agg'
sample_submission['proba'] = predictions.values

In [103]:
# Preview the first 20 rows of the LightGBM submission.
sample_submission.head(20)

Unnamed: 0,id,proba
0,21-186G-1142,0.027764
1,21-186G-1227,0.074057
2,21-186G-1306,0.002275
3,21-186G-1310,0.0004
4,21-186G-1358,0.000561
5,21-186G-136,0.001534
6,21-186G-1495,0.002808
7,21-186G-1512,0.046592
8,21-186G-1548,0.01634
9,21-186G-1624,0.001317


In [104]:
# Write the LightGBM submission without the index column.
sample_submission.to_csv('lgb_submission.csv', index = False)

# Read back the file that was just written (the check previously re-read the
# random-forest file, so it did not verify this output).
check = pd.read_csv('./lgb_submission.csv')

check.head()

Unnamed: 0,id,proba
0,21-186G-1142,0.027764
1,21-186G-1227,0.074057
2,21-186G-1306,0.002275
3,21-186G-1310,0.0004
4,21-186G-1358,0.000561
