In [1]:
import pandas as pd
import numpy as np
import gc
import random

from dateutil.relativedelta import relativedelta

In [2]:
# Load the long-format training history; "date" is parsed to datetime and
# "Visits" is stored as float32.
train = pd.read_csv(
    "../data/new_train.csv",
    parse_dates=["date"],
    dtype={"Page": str, "Visits": 'float32'},
)
# Keep only 2016-03-01 .. 2016-08-31 (both bounds inclusive).
train = train.loc[train["date"].between('2016-03-01', '2016-08-31')]
# The test file is read with the same parsing options and is not date-filtered.
test = pd.read_csv(
    "../data/new_test.csv",
    parse_dates=["date"],
    dtype={"Page": str, "Visits": 'float32'},
)

In [3]:
random.seed(2)
# Sample from an ordered list rather than a set: random.sample() rejects sets
# on Python 3.11+, and the iteration order of a set of strings varies between
# interpreter runs, which would make the seed above ineffective.
# unique() keeps pages in order of first appearance, so the draw is reproducible
# for a given input file.
random_pages = random.sample(train["Page"].unique().tolist(), 50000)

In [4]:
# Rows of `train` belonging to the sampled pages; .copy() makes this an
# independent frame rather than a view-like slice of `train`.
new_train = train.loc[train["Page"].isin(random_pages)].copy()

In [5]:
# Mean of "Visits" over all pages in the date-filtered training data
# (values are not log-transformed yet at this point).
np.mean(train["Visits"])

1502.6982

In [6]:
# Same statistic on the sampled pages only, for comparison with the full-data mean.
np.mean(new_train["Visits"])

1565.8059

In [7]:
# Restrict both frames to the sampled pages and replace missing values with 0.
# fillna() is used in its non-inplace form so that each result is a fresh,
# independent frame: mutating a `.loc[...]` slice in place triggers pandas'
# chained-assignment warning and is fragile under copy-on-write.
train = train.loc[train["Page"].isin(random_pages)].fillna(0)
test = test.loc[test["Page"].isin(random_pages)].fillna(0)

# Model on log1p(visits); the custom metric maps predictions back with expm1.
train["Visits"] = np.log1p(train["Visits"]).astype("float64")
test["Visits"] = np.log1p(test["Visits"]).astype("float64")

# The earlier sampled copy was only needed for the mean comparison above.
del new_train

In [8]:
# Force a garbage-collection pass; the displayed value is gc.collect()'s return value.
gc.collect()

7

In [9]:
def create_features(df, month, target=None):
    if type(target) != type(pd.DataFrame()):
        target = df.loc[(df["date"] >= pd.to_datetime(month)) & (df["date"] <= pd.to_datetime(month) + relativedelta(months=2))]
    temp = df.loc[(train["date"] < pd.to_datetime(month) + relativedelta(days=-10)) & (df["date"] >= pd.to_datetime(month) + relativedelta(days=-40))].copy()
    temp = pd.pivot_table(temp, index=["Page"], values=["Visits"], columns=["date"]).reset_index()
    temp.columns = ["lag_{}".format(i)  if j[1] != "" else j[0] for i, j in enumerate(temp.columns)]

    temp = target.merge(temp, on="Page", how='left')

    temp['Month']     = temp["date"].dt.month
    temp['Month']     = temp['Month'] - temp['Month'].min()
    temp['Day']       = temp["date"].dt.day
    temp['DayOfWeek'] = temp["date"].dt.dayofweek
    return temp
    

In [10]:
# Training frame: targets run from 2016-04-10 through 2016-06-10 (inclusive).
new_train = create_features(train, '2016-04-10')
# Validation frame: targets run from 2016-06-10 through 2016-08-10 (inclusive).
# NOTE(review): 2016-06-10 therefore appears in both frames — confirm this
# one-day overlap is intended.
new_val = create_features(train, '2016-06-10')

In [11]:
# Preview the first rows of the training feature frame.
new_train.head()

Unnamed: 0,Page,date,Visits,lag_1,lag_2,lag_3,lag_4,lag_5,lag_6,lag_7,...,lag_24,lag_25,lag_26,lag_27,lag_28,lag_29,lag_30,Month,Day,DayOfWeek
0,2NE1_zh.wikipedia.org_all-access_spider,2016-04-10,4.276666,2.944439,3.7612,2.772589,1.791759,3.091043,4.043051,2.302585,...,2.197225,2.772589,2.484907,3.044523,4.094345,2.484907,2.944439,0,10,6
1,2PM_zh.wikipedia.org_all-access_spider,2016-04-10,2.995732,2.890372,3.135494,3.295837,2.833213,2.833213,3.367296,2.995732,...,2.70805,2.772589,2.70805,2.772589,3.367296,3.610918,3.178054,0,10,6
2,3C_zh.wikipedia.org_all-access_spider,2016-04-10,2.197225,1.791759,1.098612,0.693147,1.609438,2.079442,1.098612,1.098612,...,2.302585,1.791759,0.693147,1.94591,2.079442,1.609438,1.94591,0,10,6
3,5566_zh.wikipedia.org_all-access_spider,2016-04-10,2.890372,2.833213,2.397895,2.484907,2.944439,3.988984,5.365976,3.044523,...,2.484907,2.484907,2.197225,2.484907,2.70805,2.890372,2.639057,0,10,6
4,A'N'D_zh.wikipedia.org_all-access_spider,2016-04-10,2.397895,3.135494,3.583519,2.772589,3.367296,3.091043,3.044523,2.890372,...,3.044523,2.890372,3.044523,2.70805,2.484907,2.944439,2.772589,0,10,6


In [12]:
# Model inputs: the 30 lag columns followed by the three calendar features.
train_cols = ['lag_{}'.format(k) for k in range(1, 31)] + ['Month', 'Day', 'DayOfWeek']

In [13]:
# Report the first and last target date (YYYY-MM-DD) of each feature frame.
for template, frame in (
    ("Train:      {} - {}", new_train),
    ("Validation: {} - {}", new_val),
):
    first_day = str(frame["date"].min())[:10]
    last_day = str(frame["date"].max())[:10]
    print(template.format(first_day, last_day))

Train:      2016-04-10 - 2016-06-10
Validation: 2016-06-10 - 2016-08-10


In [14]:
import lightgbm as lgb
# LightGBM training parameters, passed unchanged to lgb.train().
param = {
    'application': 'regression_l2',
    'learning_rate': 0.1,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_freq': 1,
    'max_depth': 5,
    'num_threads': 4,
    'verbose': 0,
}

from numba import jit
import math

@jit
def smape_fast(y_true, y_pred):
    """Return the SMAPE (0-200 scale) between two equally indexed 1-D arrays.

    Predictions below 1 are replaced by 0 before scoring. Pairs whose
    actual + prediction sum is exactly 0 add nothing to the total but are
    still counted in the divisor.
    """
    n_obs = y_true.shape[0]
    total = 0
    for idx in range(n_obs):
        actual = y_true[idx]
        forecast = y_pred[idx]
        if forecast < 1:
            forecast = 0
        denom = actual + forecast
        # A zero denominator means both values are 0: contributes no error.
        if denom != 0:
            total += math.fabs(actual - forecast) / denom
    total *= (200.0 / n_obs)
    return total

def lgb_smape(preds, df):
    """Custom LightGBM metric: SMAPE computed on the original visit scale.

    ``preds`` are the model outputs and ``df`` is the dataset object whose
    ``get_label()`` supplies the targets; both are mapped back with ``expm1``
    before scoring. Returns ``('smape', value, False)``, the last element
    telling LightGBM that lower values are better.
    """
    actual = np.expm1(np.array(df.get_label()))
    predicted = np.expm1(np.array(preds))
    return 'smape', smape_fast(actual, predicted), False

In [15]:
# Wrap the training and validation frames as LightGBM datasets; the validation
# set references the training set.
lgb_train = lgb.Dataset(
    new_train[train_cols],
    label=new_train["Visits"],
    free_raw_data=False,
)
lgb_val = lgb.Dataset(
    new_val[train_cols],
    label=new_val["Visits"],
    free_raw_data=False,
    reference=lgb_train,
)

# Up to 1000 boosting rounds, scored with the custom SMAPE metric on both sets.
# NOTE(review): newer LightGBM releases appear to have replaced the
# `early_stopping_rounds` keyword with a callback — confirm against the
# installed version before upgrading.
model = lgb.train(
    param,
    lgb_train,
    1000,
    valid_sets=[lgb_train, lgb_val],
    feval=lgb_smape,
    early_stopping_rounds=10,
)

[1]	training's smape: 122.135	valid_1's smape: 119.787
Training until validation scores don't improve for 10 rounds.
[2]	training's smape: 116.116	valid_1's smape: 113.509
[3]	training's smape: 110.138	valid_1's smape: 107.303
[4]	training's smape: 104.321	valid_1's smape: 101.314
[5]	training's smape: 98.691	valid_1's smape: 95.597
[6]	training's smape: 93.3253	valid_1's smape: 90.1823
[7]	training's smape: 88.2323	valid_1's smape: 85.1667
[8]	training's smape: 83.4797	valid_1's smape: 80.5636
[9]	training's smape: 79.0923	valid_1's smape: 76.4159
[10]	training's smape: 75.0659	valid_1's smape: 72.5733
[11]	training's smape: 71.3993	valid_1's smape: 69.1857
[12]	training's smape: 68.1162	valid_1's smape: 66.2667
[13]	training's smape: 65.1659	valid_1's smape: 63.6003
[14]	training's smape: 62.5614	valid_1's smape: 61.2966
[15]	training's smape: 60.2826	valid_1's smape: 59.3545
[16]	training's smape: 58.3219	valid_1's smape: 57.7493
[17]	training's smape: 56.6229	valid_1's smape: 56.38

In [16]:
# Force another garbage-collection pass after training.
gc.collect()

4515

In [17]:
# Test frame: the rows of `test` are used as targets, while the lag features
# are taken from `train` relative to the 2016-09-10 anchor.
new_test = create_features(train, '2016-09-10', test)

In [18]:
# Release the raw frames, the first-stage datasets and the first model;
# the next cell only uses new_val, new_test, train_cols and param.
del train, new_train, test, lgb_train, lgb_val, model

In [19]:
# Final fit: train on the most recent feature frame (new_val) and monitor on
# new_test for a fixed number of rounds.
# On the author's server the optimal number of iterations was 50.

lgb_train = lgb.Dataset(
    new_val[train_cols],
    label=new_val["Visits"],
    free_raw_data=False,
)
lgb_val = lgb.Dataset(
    new_test[train_cols],
    label=new_test["Visits"],
    free_raw_data=False,
    reference=lgb_train,
)

model = lgb.train(
    param,
    lgb_train,
    50,
    valid_sets=[lgb_train, lgb_val],
    feval=lgb_smape,
)

[1]	training's smape: 120.425	valid_1's smape: 121.516
[2]	training's smape: 114.49	valid_1's smape: 115.856
[3]	training's smape: 108.633	valid_1's smape: 110.323
[4]	training's smape: 102.91	valid_1's smape: 104.937
[5]	training's smape: 97.4187	valid_1's smape: 99.6301
[6]	training's smape: 92.2466	valid_1's smape: 94.7072
[7]	training's smape: 87.3806	valid_1's smape: 90.0377
[8]	training's smape: 82.854	valid_1's smape: 85.7104
[9]	training's smape: 78.6804	valid_1's smape: 81.6777
[10]	training's smape: 74.8646	valid_1's smape: 77.9816
[11]	training's smape: 71.4296	valid_1's smape: 74.6734
[12]	training's smape: 68.347	valid_1's smape: 71.6786
[13]	training's smape: 65.6256	valid_1's smape: 69.0209
[14]	training's smape: 63.2201	valid_1's smape: 66.6279
[15]	training's smape: 61.1032	valid_1's smape: 64.5152
[16]	training's smape: 59.2803	valid_1's smape: 62.6619
[17]	training's smape: 57.7012	valid_1's smape: 61.0528
[18]	training's smape: 56.3368	valid_1's smape: 59.6398
[19]	