In [1]:

import pandas as pd
import numpy as np
from sklearn import metrics, linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMRegressor
import sys
import warnings

# Silence library warnings unless warning options were passed on the command
# line / environment (sys.warnoptions is non-empty in that case).
if not sys.warnoptions:
    warnings.simplefilter("ignore")


data_path = 'data/'

# Per-model prediction files. NOTE(review): the 'CV' in the file names suggests
# out-of-fold train predictions stacked with test predictions -- confirm.
#lgb25 = pd.read_csv('../lgCV_2505.csv.gz', compression='gzip')
lgb27 = pd.read_csv('../lgCV_2705B.csv.gz', compression='gzip')
lgb31 = pd.read_csv('../lgCV_3105.csv.gz', compression='gzip')
rnn = pd.read_csv('../rnnCV_2805.csv.gz', compression='gzip')
rnn27 = pd.read_csv('../rnnCV_2705A.csv.gz', compression='gzip')
mlp = pd.read_csv('../mlpCV_2505.csv.gz', compression='gzip')

# Labelled training rows; `y` is the target in train.csv row order.
truth = pd.read_csv('../../%strain.csv.zip'%(data_path), compression='zip', parse_dates = ["activation_date"])
y = truth['deal_probability'].values
# The former `truth.drop('deal_probability', 1)` was removed: its result was
# discarded (no effect) and the positional axis argument is rejected by
# pandas >= 2.0. The column has to stay anyway, because the NaN target is what
# later distinguishes test rows from train rows.

# Append the test rows with a NaN target so train and test share one frame.
test =  pd.read_csv('../../%stest.csv.zip'%(data_path), compression='zip', parse_dates = ["activation_date"])
test['deal_probability']=float('NAN') 
truth = pd.concat([truth,test[truth.columns]],axis=0)

In [3]:
# Disabled one-off step: it concatenated the rnn27 test predictions with the
# pickled train predictions and wrote '../rnnCV_2705A.csv.gz', the file that
# is read into `rnn27` in the loading cell.
# NOTE(review): to_csv was called without index=False, so the stored file
# presumably carries an extra unnamed index column -- confirm.
#rnn27trn = pd.read_pickle('../../features/y_pred_trn.pkl')
#rnn27tst = pd.read_csv('../rnndhsub_2705A.csv.gz', compression='gzip')
#rnn27 = pd.concat([rnn27tst, rnn27trn.reset_index().rename(columns = {0:'deal_probability'})], axis = 0)
#rnn27.to_csv('../rnnCV_2705A.csv.gz', compression='gzip')

In [4]:
# Give each model's prediction column a unique '<model>_preds' name so the
# columns do not collide with each other or with the target when merged.
#lgb25.rename(columns={'deal_probability': 'lgb25_preds' }, inplace=True)
lgb31.rename(columns={'deal_probability': 'lgb31_preds' }, inplace=True)
lgb27.rename(columns={'deal_probability': 'lgb27_preds' }, inplace=True)
# `rnn` was the only frame without an explicit rename although 'rnn_preds' is
# used downstream. rename() ignores labels that are absent, so this is a no-op
# if the file already stores the column as 'rnn_preds'.
rnn.rename(columns={'deal_probability': 'rnn_preds' }, inplace=True)
rnn27.rename(columns={'deal_probability': 'rnn27_preds' }, inplace=True)
mlp.rename(columns={'deal_probability': 'mlp_preds' }, inplace=True)

# Inner-join all prediction frames on item_id, then attach the raw features
# and the target (NaN for test rows) with a left join.
preds_df = lgb27.merge(rnn, on='item_id')\
                .merge(mlp, on='item_id')\
                .merge(lgb31, on='item_id')\
                .merge(rnn27, on='item_id')\
                .merge(truth, on='item_id',how='left')

In [5]:
# Pairwise disagreement between first-level models, used as extra features.
preds_df['difference_l27_r']=preds_df['lgb27_preds'] - preds_df['rnn_preds']
preds_df['difference_l27_r1']=preds_df['lgb27_preds'] - preds_df['rnn27_preds']
preds_df['difference_l27_m']=preds_df['lgb27_preds'] - preds_df['mlp_preds']
# NOTE(review): this uses rnn27_preds although the '_r' suffix above refers to
# rnn_preds; kept unchanged so the feature set stays the same -- confirm intent.
preds_df['difference_r_m']=preds_df['rnn27_preds'] - preds_df['mlp_preds']
#preds_df['difference_l27_l25']=preds_df['lgb27_preds'] - preds_df['lgb25_preds']

# Missing prices become -1. Assign the result back instead of calling
# fillna(inplace=True) on the selected column: the chained in-place form does
# not update the frame when pandas Copy-on-Write is active.
preds_df['price'] = preds_df['price'].fillna(-1)

### Train scores

In [8]:
# Rows with a known target (NaN marks the appended test rows).
idx = preds_df['deal_probability'].notna()
print(idx.value_counts())

# RMSE of every first-level model on the labelled rows.
y_true = preds_df.loc[idx, 'deal_probability'].values
pred_cols = [c for c in preds_df.columns if '_preds' in c]
for col in pred_cols:
    rmse = np.sqrt(metrics.mean_squared_error(y_true, preds_df.loc[idx, col].values))
    print('RMSE %s: ' % col, rmse)

True     1503424
False     508438
Name: deal_probability, dtype: int64
RMSE lgb27_preds:  0.21681527582821059
RMSE rnn_preds:  0.21771476573780568
RMSE mlp_preds:  0.2187544351068867
RMSE lgb31_preds:  0.21681135578306754
RMSE rnn27_preds:  0.21698040871120608


In [10]:
# Folds defined by activation date; each inner list is one fold.
# NOTE(review): 2017-04-04 .. 2017-04-06 are not listed in the last fold --
# confirm this gap is intentional.
foldls = [["2017-03-15", "2017-03-16", "2017-03-17"], \
       ["2017-03-18", "2017-03-19", "2017-03-20"], \
       ["2017-03-21", "2017-03-22", "2017-03-23"], \
       ["2017-03-24", "2017-03-25", "2017-03-26"], \
        ["2017-03-27", "2017-03-28", "2017-03-29", \
            "2017-03-30", "2017-03-31", "2017-04-01", \
            "2017-04-02", "2017-04-03","2017-04-07"]]
foldls = [[pd.to_datetime(d) for d in f] for f in foldls]

# Rows whose activation_date is in none of the lists keep fold == -1.
preds_df['fold'] = -1
for t, fold in enumerate(foldls):
    # Single .loc assignment instead of chained indexing
    # (preds_df['fold'][mask] = t), which can write to a temporary copy.
    preds_df.loc[preds_df['activation_date'].isin(fold), 'fold'] = t

In [11]:
# RMSE of each first-level model inside each of the five date folds.
fold_masks = [preds_df['fold'] == i for i in range(5)]
scores = [
    [col] + [
        np.sqrt(metrics.mean_squared_error(preds_df.loc[mask, 'deal_probability'].values,
                                           preds_df.loc[mask, col].values))
        for mask in fold_masks
    ]
    for col in preds_df.columns if '_preds' in col
]
fold_names = ['Fold%s'%(i) for i in range(5)]
pd.DataFrame(scores, columns = ['Model'] + fold_names)

Unnamed: 0,Model,Fold0,Fold1,Fold2,Fold3,Fold4
0,lgb27_preds,0.216848,0.217096,0.217923,0.217597,0.213692
1,rnn_preds,0.218756,0.218966,0.215764,0.219554,0.214747
2,mlp_preds,0.21966,0.220088,0.216672,0.220347,0.216385
3,lgb31_preds,0.216537,0.216643,0.21923,0.217169,0.21343
4,rnn27_preds,0.218097,0.218115,0.214982,0.218651,0.214379


### Correlations in test and train

In [20]:
# Train correlation: rows WITH a target are the labelled training rows
# (the previous "Test" label was inverted).
preds_df.loc[preds_df['deal_probability'].notna(), [c for c in preds_df.columns if '_preds' in c]].corr()

Unnamed: 0,lgb27_preds,rnn_preds,mlp_preds,lgb31_preds,rnn27_preds
lgb27_preds,1.0,0.915952,0.919171,0.99075,0.916804
rnn_preds,0.915952,1.0,0.92971,0.910602,0.98536
mlp_preds,0.919171,0.92971,1.0,0.914307,0.92501
lgb31_preds,0.99075,0.910602,0.914307,1.0,0.914204
rnn27_preds,0.916804,0.98536,0.92501,0.914204,1.0


In [21]:
# Test correlation: rows with a NaN target are the appended test rows
# (the previous "Train" label was inverted).
preds_df.loc[preds_df['deal_probability'].isna(), [c for c in preds_df.columns if '_preds' in c]].corr()

Unnamed: 0,lgb27_preds,rnn_preds,mlp_preds,lgb31_preds,rnn27_preds
lgb27_preds,1.0,0.930288,0.924283,0.991439,0.930692
rnn_preds,0.930288,1.0,0.939951,0.927835,0.988198
mlp_preds,0.924283,0.939951,1.0,0.922413,0.935069
lgb31_preds,0.991439,0.927835,0.922413,1.0,0.931468
rnn27_preds,0.930692,0.988198,0.935069,0.931468,1.0


In [25]:
# Columns that are label-encoded below and fed to the second-level model.
categorical_cols = ['region', 'param_1', 'parent_category_name', 'category_name']

# Feature order: model predictions, difference features, price, categoricals.
cols = (
    [c for c in preds_df.columns if '_preds' in c]
    + [c for c in preds_df.columns if 'difference' in c]
    + ['price']
    + categorical_cols
)

# Integer-encode each categorical column; missing values become the "0" label.
for col in categorical_cols:
    encoder = LabelEncoder()
    preds_df[col] = encoder.fit_transform(preds_df[col].fillna("0"))

# A NaN target marks a test row.
is_test = preds_df['deal_probability'].isnull()
train_df = preds_df[~is_test]
test_df = preds_df[is_test]

In [26]:
# Upper bound on boosting rounds for the validation run.
n_estimators = 4000

# 80/20 hold-out split. The target is read from train_df itself rather than
# from the separately loaded `y` (train.csv row order), so features and labels
# stay aligned even if the merges changed the row order.
train_X, valid_X, train_y, valid_y = train_test_split(train_df[cols], train_df['deal_probability'].values,
                                                      train_size=.8, random_state=12345)
eval_set = [(train_X,train_y),(valid_X,valid_y)]

In [27]:
# Number of rows in the training part of the hold-out split.
train_X.shape[0]

1202739

In [28]:
%%time
clf = LGBMRegressor(n_estimators=n_estimators, 
                    max_depth=-1, 
                    feature_fraction= 0.5,
                    num_leaves=32, 
                    learning_rate=.01)#, device='gpu')
clf.fit(train_X, train_y, early_stopping_rounds=80, 
        eval_set=eval_set, eval_metric='rmse', verbose=100, 
        categorical_feature=['param_1','region','parent_category_name','category_name'])

Training until validation scores don't improve for 80 rounds.
[100]	training's rmse: 0.221068	valid_1's rmse: 0.221411
[200]	training's rmse: 0.214745	valid_1's rmse: 0.215309
[300]	training's rmse: 0.213592	valid_1's rmse: 0.214339
[400]	training's rmse: 0.213262	valid_1's rmse: 0.214158
[500]	training's rmse: 0.213079	valid_1's rmse: 0.214105
[600]	training's rmse: 0.212935	valid_1's rmse: 0.214076
[700]	training's rmse: 0.212796	valid_1's rmse: 0.214048
[800]	training's rmse: 0.212677	valid_1's rmse: 0.214033
[900]	training's rmse: 0.212568	valid_1's rmse: 0.214022
[1000]	training's rmse: 0.212463	valid_1's rmse: 0.214013
[1100]	training's rmse: 0.212364	valid_1's rmse: 0.214004
[1200]	training's rmse: 0.212265	valid_1's rmse: 0.213993
[1300]	training's rmse: 0.212169	valid_1's rmse: 0.213987
[1400]	training's rmse: 0.212072	valid_1's rmse: 0.213981
[1500]	training's rmse: 0.211984	valid_1's rmse: 0.213977
[1600]	training's rmse: 0.211891	valid_1's rmse: 0.213972
[1700]	training's r

In [29]:
# Fixed number of boosting rounds for the final fit on all labelled rows.
# NOTE(review): presumably derived from the validation run -- confirm.
n_estimators = 2050
train_X = train_df[cols]
# Take the target from train_df itself rather than from the separately loaded
# `y` (train.csv row order), so labels stay aligned with the merged rows.
train_y = train_df['deal_probability'].values
eval_set = [(train_X,train_y)]
len(train_X)

1503424

In [30]:
%%time
clf = LGBMRegressor(n_estimators=n_estimators, 
                    max_depth=-1, 
                    feature_fraction= 0.5,
                    num_leaves=32, 
                    learning_rate=.01)#, device='gpu')

clf.fit(train_X, train_y, early_stopping_rounds=80, 
        eval_set=eval_set, eval_metric='rmse', verbose=100, 
        categorical_feature=['param_1','region','parent_category_name','category_name'])

Training until validation scores don't improve for 80 rounds.
[100]	training's rmse: 0.221141
[200]	training's rmse: 0.214856
[300]	training's rmse: 0.213731
[400]	training's rmse: 0.213421
[500]	training's rmse: 0.213256
[600]	training's rmse: 0.213129
[700]	training's rmse: 0.213008
[800]	training's rmse: 0.212907
[900]	training's rmse: 0.212813
[1000]	training's rmse: 0.212722
[1100]	training's rmse: 0.21264
[1200]	training's rmse: 0.212556
[1300]	training's rmse: 0.212476
[1400]	training's rmse: 0.212393
[1500]	training's rmse: 0.212318
[1600]	training's rmse: 0.212241
[1700]	training's rmse: 0.212169
[1800]	training's rmse: 0.212095
[1900]	training's rmse: 0.212023
[2000]	training's rmse: 0.211951
Did not meet early stopping. Best iteration is:
[2050]	training's rmse: 0.211915
CPU times: user 11min 26s, sys: 5.84 s, total: 11min 32s
Wall time: 1min 31s


In [31]:
# Predict the test rows and clip to [0.0001, 0.9999]. assign() returns a new
# frame, which avoids setting a column on a slice of preds_df (the
# SettingWithCopy pattern that the global warning filter was hiding).
test_df = test_df.assign(
    deal_probability=np.clip(clf.predict(test_df[cols]), .0001, .9999)
)

# Write the submission file: item_id plus predicted deal_probability.
test_df[['item_id', 'deal_probability']].to_csv('../lgbbsub_0106AL2.csv.gz', compression='gzip', index=False, header=True)

In [32]:
# Preview the first rows of the submission columns.
test_df.loc[:, ['item_id', 'deal_probability']].head()

Unnamed: 0,item_id,deal_probability
1503424,6544e41a8817,0.422122
1503425,65b9484d670f,0.151064
1503426,8bab230b2ecd,0.118057
1503427,8e348601fefc,0.07053
1503428,8bd2fe400b89,0.218432
