In [1]:
import numpy as np
import pandas as pd
import datetime
from matplotlib import pyplot as plt
%matplotlib inline

pd.options.display.max_rows = 10
pd.options.display.max_colwidth = 100
pd.options.display.max_columns = 600
from tqdm import tqdm
import gc

from sklearn.linear_model import HuberRegressor
from sklearn.model_selection import cross_val_predict, KFold
from sklearn.decomposition import PCA

In [2]:
# Seed NumPy's global RNG so that stochastic steps are reproducible.
# This has to be a call: the assignment `np.random.seed = 0` would replace the
# seeding function with the integer 0 and leave the generator unseeded.
np.random.seed(0)

In [3]:
def smape(y_true, y_pred):
    """Mean symmetric absolute percentage error, returned as a fraction.

    Each element contributes |y_true - y_pred| divided by the average of
    |y_true| and |y_pred|. Elements whose average magnitude is exactly zero
    (both values are zero) contribute 0 instead of an undefined ratio.

    Both arguments must support element-wise arithmetic and boolean-mask
    assignment (e.g. NumPy arrays or pandas Series of the same shape).
    """
    abs_error = np.abs(y_true - y_pred)
    mean_magnitude = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    ratio = abs_error / mean_magnitude
    # 0 / 0 yields NaN above; define the error of an exact zero/zero match as 0.
    ratio[mean_magnitude == 0] = 0.0
    return np.mean(ratio)

def smape_mask(y_true, y_pred, threshold):
    """Flag the elements whose individual SMAPE does not exceed `threshold`.

    The comparison is done without dividing:
    |y_true - y_pred| <= (threshold / 2) * (|y_true| + |y_pred|),
    which is the per-element SMAPE inequality multiplied through by the
    denominator. Returns a boolean array/Series shaped like the inputs.
    """
    total_magnitude = np.abs(y_true) + np.abs(y_pred)
    abs_error = np.abs(y_true - y_pred)
    abs_error[total_magnitude == 0] = 0.0

    allowed_error = (threshold / 2.0) * total_magnitude
    return abs_error <= allowed_error

In [4]:
# Width, in days (columns), of each history window sliced from the data in the
# cells below. Original note: "number of days in 2015" -- presumably the count
# of 2015 dates present in the dataset; TODO confirm.
max_size = 184

# Wide table with a `Page` column plus one column per calendar date (dates are
# looked up by column name further down).
train_all = pd.read_csv("../input/train_2.csv")

In [5]:
# Keep the page identifiers separately from the numeric data:
# `all_page` is the bare `Page` Series, `train_key` a one-column frame that
# the page-parsing cell extends with title / site / access fields.
all_page = train_all.Page.copy()
train_key = train_all[['Page']].copy()

In [6]:
def get_date_index(date, train_all=train_all):
    """Return the position of the column named `date`, or None if it is absent.

    Parameters
    ----------
    date : str
        Column label to look for (e.g. '2016-09-10').
    train_all : DataFrame, optional
        Frame whose columns are searched. The default is bound when this
        function is defined, i.e. to the frame `train_all` referred to at
        that moment, not to any later re-assignment of the name.

    Returns
    -------
    int or None
        Zero-based column position of the first exact match, else None.
    """
    # Return from inside the loop: checking the loop index afterwards cannot
    # tell "matched the last column" apart from "no match at all".
    for idx, column in enumerate(train_all.columns):
        if column == date:
            return idx
    return None

In [7]:
# `train_end` is one past the 2016-09-10 column so it can serve as an
# exclusive slice bound; `test_start` is the position of the 2016-09-13 column.
train_end = get_date_index('2016-09-10') + 1
test_start = get_date_index('2016-09-13')

In [8]:
# Cut both windows out of the full frame *before* `train_all` is rebound
# below; the column positions computed earlier only apply to the full frame.
# `train`: the `max_size` days ending at `train_end`, newest day first.
train = (
    train_all.iloc[:, train_end - max_size:train_end]
    .iloc[:, ::-1]
    .astype('float32')
)
# `test`: the 63 days starting at `test_start`, in chronological order.
test = train_all.iloc[:, test_start:test_start + 63].astype('float32')

# From here on `train_all` holds only its last `max_size` columns, newest first.
train_all = (
    train_all.iloc[:, -max_size:]
    .iloc[:, ::-1]
    .astype('float32')
)

In [9]:
# Preview the full-data window: last `max_size` columns, newest day first.
train_all.head()

Unnamed: 0,2017-09-10,2017-09-09,2017-09-08,2017-09-07,2017-09-06,2017-09-05,2017-09-04,2017-09-03,2017-09-02,2017-09-01,2017-08-31,2017-08-30,2017-08-29,2017-08-28,2017-08-27,2017-08-26,2017-08-25,2017-08-24,2017-08-23,2017-08-22,2017-08-21,2017-08-20,2017-08-19,2017-08-18,2017-08-17,2017-08-16,2017-08-15,2017-08-14,2017-08-13,2017-08-12,2017-08-11,2017-08-10,2017-08-09,2017-08-08,2017-08-07,2017-08-06,2017-08-05,2017-08-04,2017-08-03,2017-08-02,2017-08-01,2017-07-31,2017-07-30,2017-07-29,2017-07-28,2017-07-27,2017-07-26,2017-07-25,2017-07-24,2017-07-23,2017-07-22,2017-07-21,2017-07-20,2017-07-19,2017-07-18,2017-07-17,2017-07-16,2017-07-15,2017-07-14,2017-07-13,2017-07-12,2017-07-11,2017-07-10,2017-07-09,2017-07-08,2017-07-07,2017-07-06,2017-07-05,2017-07-04,2017-07-03,2017-07-02,2017-07-01,2017-06-30,2017-06-29,2017-06-28,2017-06-27,2017-06-26,2017-06-25,2017-06-24,2017-06-23,2017-06-22,2017-06-21,2017-06-20,2017-06-19,2017-06-18,2017-06-17,2017-06-16,2017-06-15,2017-06-14,2017-06-13,2017-06-12,2017-06-11,2017-06-10,2017-06-09,2017-06-08,2017-06-07,2017-06-06,2017-06-05,2017-06-04,2017-06-03,2017-06-02,2017-06-01,2017-05-31,2017-05-30,2017-05-29,2017-05-28,2017-05-27,2017-05-26,2017-05-25,2017-05-24,2017-05-23,2017-05-22,2017-05-21,2017-05-20,2017-05-19,2017-05-18,2017-05-17,2017-05-16,2017-05-15,2017-05-14,2017-05-13,2017-05-12,2017-05-11,2017-05-10,2017-05-09,2017-05-08,2017-05-07,2017-05-06,2017-05-05,2017-05-04,2017-05-03,2017-05-02,2017-05-01,2017-04-30,2017-04-29,2017-04-28,2017-04-27,2017-04-26,2017-04-25,2017-04-24,2017-04-23,2017-04-22,2017-04-21,2017-04-20,2017-04-19,2017-04-18,2017-04-17,2017-04-16,2017-04-15,2017-04-14,2017-04-13,2017-04-12,2017-04-11,2017-04-10,2017-04-09,2017-04-08,2017-04-07,2017-04-06,2017-04-05,2017-04-04,2017-04-03,2017-04-02,2017-04-01,2017-03-31,2017-03-30,2017-03-29,2017-03-28,2017-03-27,2017-03-26,2017-03-25,2017-03-24,2017-03-23,2017-03-22,2017-03-21,2017-03-20,2017-03-19,2017-03-18,2017-03-17,2017-03-16,2017-03-15,2017-03-1
4,2017-03-13,2017-03-12,2017-03-11
0,38.0,54.0,23.0,29.0,27.0,16.0,18.0,33.0,33.0,19.0,21.0,24.0,26.0,44.0,47.0,25.0,23.0,25.0,58.0,37.0,21.0,28.0,16.0,22.0,39.0,19.0,14.0,18.0,57.0,13.0,18.0,17.0,39.0,45.0,25.0,11.0,22.0,22.0,24.0,46.0,20.0,14.0,22.0,13.0,20.0,14.0,19.0,31.0,16.0,20.0,12.0,16.0,16.0,17.0,43.0,38.0,24.0,12.0,17.0,25.0,19.0,20.0,25.0,22.0,29.0,19.0,15.0,14.0,15.0,26.0,24.0,23.0,24.0,21.0,34.0,32.0,23.0,23.0,21.0,29.0,26.0,28.0,22.0,46.0,38.0,29.0,32.0,35.0,26.0,24.0,27.0,23.0,37.0,23.0,18.0,28.0,30.0,25.0,35.0,30.0,57.0,37.0,16.0,19.0,29.0,18.0,22.0,40.0,15.0,66.0,16.0,22.0,12.0,33.0,24.0,21.0,14.0,23.0,19.0,21.0,23.0,22.0,22.0,16.0,21.0,15.0,21.0,55.0,42.0,13.0,29.0,25.0,80.0,36.0,20.0,24.0,71.0,20.0,19.0,31.0,33.0,50.0,20.0,13.0,18.0,16.0,43.0,13.0,22.0,57.0,21.0,17.0,32.0,17.0,17.0,18.0,124.0,20.0,27.0,19.0,38.0,32.0,16.0,17.0,17.0,73.0,19.0,66.0,33.0,22.0,11.0,44.0,23.0,22.0,28.0,26.0,30.0,28.0,21.0,19.0,24.0,22.0,28.0,19.0
1,81.0,13.0,23.0,26.0,25.0,54.0,19.0,11.0,30.0,32.0,25.0,34.0,29.0,32.0,33.0,34.0,31.0,16.0,34.0,14.0,37.0,12.0,20.0,22.0,20.0,13.0,12.0,20.0,21.0,47.0,16.0,16.0,13.0,20.0,11.0,7.0,19.0,31.0,20.0,39.0,18.0,14.0,15.0,21.0,8.0,12.0,13.0,18.0,44.0,11.0,16.0,23.0,15.0,13.0,19.0,16.0,16.0,34.0,15.0,16.0,12.0,15.0,24.0,76.0,20.0,816.0,16.0,19.0,23.0,16.0,26.0,15.0,29.0,20.0,104.0,127.0,62.0,26.0,193.0,18.0,24.0,17.0,10.0,15.0,17.0,25.0,21.0,18.0,26.0,22.0,14.0,34.0,30.0,20.0,19.0,22.0,23.0,21.0,41.0,31.0,29.0,39.0,28.0,21.0,25.0,20.0,28.0,40.0,17.0,18.0,19.0,35.0,61.0,27.0,52.0,28.0,20.0,18.0,14.0,9.0,25.0,13.0,37.0,24.0,22.0,34.0,19.0,22.0,15.0,27.0,15.0,11.0,22.0,12.0,18.0,19.0,15.0,18.0,11.0,22.0,9.0,84.0,41.0,20.0,21.0,16.0,21.0,19.0,24.0,17.0,13.0,74.0,42.0,52.0,25.0,13.0,41.0,22.0,45.0,31.0,19.0,18.0,22.0,28.0,24.0,14.0,7.0,14.0,21.0,15.0,48.0,31.0,54.0,27.0,33.0,28.0,28.0,36.0,53.0,45.0,24.0,23.0,24.0,18.0
2,6.0,7.0,4.0,3.0,7.0,4.0,2.0,7.0,6.0,6.0,9.0,12.0,8.0,12.0,13.0,6.0,8.0,11.0,23.0,6.0,3.0,4.0,2.0,8.0,3.0,3.0,3.0,6.0,2.0,4.0,9.0,1.0,3.0,7.0,3.0,1.0,3.0,4.0,8.0,22.0,0.0,4.0,7.0,5.0,2.0,3.0,3.0,2.0,8.0,5.0,3.0,2.0,2.0,3.0,1.0,3.0,2.0,1.0,2.0,7.0,1.0,5.0,6.0,2.0,1.0,10.0,1.0,1.0,1.0,7.0,1.0,1.0,6.0,0.0,5.0,5.0,4.0,4.0,5.0,3.0,7.0,8.0,3.0,3.0,6.0,3.0,5.0,11.0,19.0,11.0,3.0,3.0,6.0,0.0,2.0,1.0,3.0,2.0,4.0,4.0,1.0,5.0,4.0,1.0,6.0,10.0,2.0,2.0,4.0,12.0,6.0,3.0,3.0,3.0,7.0,0.0,1.0,0.0,0.0,4.0,4.0,7.0,3.0,2.0,3.0,0.0,3.0,3.0,1.0,10.0,3.0,2.0,1.0,3.0,0.0,2.0,5.0,4.0,0.0,3.0,9.0,7.0,1.0,0.0,6.0,1.0,3.0,2.0,7.0,2.0,4.0,4.0,5.0,9.0,3.0,3.0,5.0,3.0,14.0,8.0,5.0,4.0,8.0,1.0,2.0,3.0,3.0,8.0,6.0,4.0,1.0,3.0,1.0,8.0,8.0,8.0,6.0,3.0,8.0,3.0,1.0,2.0,4.0,0.0
3,4.0,38.0,30.0,19.0,16.0,6.0,9.0,19.0,19.0,7.0,15.0,110.0,20.0,20.0,21.0,11.0,17.0,23.0,42.0,21.0,11.0,11.0,20.0,18.0,11.0,17.0,8.0,19.0,9.0,19.0,11.0,15.0,15.0,14.0,10.0,11.0,13.0,12.0,14.0,37.0,14.0,13.0,13.0,14.0,21.0,11.0,10.0,9.0,19.0,15.0,7.0,12.0,17.0,5.0,8.0,19.0,11.0,14.0,10.0,20.0,16.0,12.0,18.0,6.0,6.0,22.0,15.0,17.0,10.0,16.0,19.0,7.0,30.0,21.0,22.0,10.0,12.0,11.0,12.0,15.0,17.0,12.0,26.0,13.0,19.0,18.0,10.0,8.0,17.0,16.0,12.0,10.0,15.0,5.0,17.0,12.0,10.0,12.0,9.0,14.0,14.0,24.0,13.0,16.0,17.0,22.0,14.0,8.0,14.0,43.0,15.0,18.0,29.0,25.0,19.0,26.0,22.0,18.0,23.0,28.0,15.0,25.0,16.0,13.0,20.0,18.0,24.0,13.0,34.0,27.0,21.0,16.0,63.0,26.0,17.0,17.0,13.0,18.0,19.0,7.0,9.0,40.0,17.0,9.0,25.0,5.0,20.0,17.0,15.0,8.0,34.0,18.0,29.0,26.0,7.0,23.0,12.0,14.0,10.0,24.0,14.0,22.0,14.0,19.0,32.0,29.0,11.0,11.0,20.0,36.0,17.0,15.0,24.0,25.0,25.0,24.0,27.0,24.0,29.0,15.0,19.0,21.0,25.0,30.0
4,7.0,8.0,14.0,28.0,23.0,20.0,9.0,19.0,16.0,16.0,24.0,17.0,20.0,29.0,24.0,20.0,22.0,25.0,41.0,23.0,23.0,19.0,66.0,23.0,11.0,20.0,14.0,22.0,12.0,63.0,14.0,14.0,11.0,12.0,29.0,19.0,114.0,66.0,39.0,40.0,14.0,15.0,4.0,4.0,14.0,3.0,10.0,7.0,9.0,8.0,11.0,6.0,20.0,5.0,6.0,4.0,11.0,14.0,16.0,11.0,12.0,9.0,15.0,16.0,11.0,14.0,12.0,18.0,16.0,6.0,16.0,12.0,12.0,9.0,14.0,21.0,10.0,12.0,16.0,17.0,12.0,12.0,8.0,10.0,17.0,17.0,38.0,9.0,6.0,18.0,14.0,16.0,22.0,13.0,7.0,12.0,13.0,30.0,13.0,12.0,16.0,25.0,7.0,19.0,8.0,25.0,23.0,14.0,9.0,16.0,8.0,22.0,64.0,14.0,11.0,11.0,15.0,18.0,13.0,8.0,11.0,42.0,5.0,9.0,9.0,9.0,4.0,7.0,46.0,6.0,12.0,14.0,6.0,10.0,12.0,12.0,8.0,12.0,11.0,13.0,8.0,38.0,16.0,12.0,13.0,13.0,14.0,17.0,16.0,13.0,8.0,8.0,26.0,13.0,13.0,12.0,19.0,9.0,33.0,12.0,8.0,14.0,13.0,21.0,20.0,27.0,15.0,13.0,17.0,20.0,25.0,14.0,14.0,21.0,20.0,14.0,22.0,19.0,24.0,13.0,24.0,14.0,21.0,15.0


In [10]:
# Preview the validation-training window (ends 2016-09-10, newest day first).
train.head()

Unnamed: 0,2016-09-10,2016-09-09,2016-09-08,2016-09-07,2016-09-06,2016-09-05,2016-09-04,2016-09-03,2016-09-02,2016-09-01,2016-08-31,2016-08-30,2016-08-29,2016-08-28,2016-08-27,2016-08-26,2016-08-25,2016-08-24,2016-08-23,2016-08-22,2016-08-21,2016-08-20,2016-08-19,2016-08-18,2016-08-17,2016-08-16,2016-08-15,2016-08-14,2016-08-13,2016-08-12,2016-08-11,2016-08-10,2016-08-09,2016-08-08,2016-08-07,2016-08-06,2016-08-05,2016-08-04,2016-08-03,2016-08-02,2016-08-01,2016-07-31,2016-07-30,2016-07-29,2016-07-28,2016-07-27,2016-07-26,2016-07-25,2016-07-24,2016-07-23,2016-07-22,2016-07-21,2016-07-20,2016-07-19,2016-07-18,2016-07-17,2016-07-16,2016-07-15,2016-07-14,2016-07-13,2016-07-12,2016-07-11,2016-07-10,2016-07-09,2016-07-08,2016-07-07,2016-07-06,2016-07-05,2016-07-04,2016-07-03,2016-07-02,2016-07-01,2016-06-30,2016-06-29,2016-06-28,2016-06-27,2016-06-26,2016-06-25,2016-06-24,2016-06-23,2016-06-22,2016-06-21,2016-06-20,2016-06-19,2016-06-18,2016-06-17,2016-06-16,2016-06-15,2016-06-14,2016-06-13,2016-06-12,2016-06-11,2016-06-10,2016-06-09,2016-06-08,2016-06-07,2016-06-06,2016-06-05,2016-06-04,2016-06-03,2016-06-02,2016-06-01,2016-05-31,2016-05-30,2016-05-29,2016-05-28,2016-05-27,2016-05-26,2016-05-25,2016-05-24,2016-05-23,2016-05-22,2016-05-21,2016-05-20,2016-05-19,2016-05-18,2016-05-17,2016-05-16,2016-05-15,2016-05-14,2016-05-13,2016-05-12,2016-05-11,2016-05-10,2016-05-09,2016-05-08,2016-05-07,2016-05-06,2016-05-05,2016-05-04,2016-05-03,2016-05-02,2016-05-01,2016-04-30,2016-04-29,2016-04-28,2016-04-27,2016-04-26,2016-04-25,2016-04-24,2016-04-23,2016-04-22,2016-04-21,2016-04-20,2016-04-19,2016-04-18,2016-04-17,2016-04-16,2016-04-15,2016-04-14,2016-04-13,2016-04-12,2016-04-11,2016-04-10,2016-04-09,2016-04-08,2016-04-07,2016-04-06,2016-04-05,2016-04-04,2016-04-03,2016-04-02,2016-04-01,2016-03-31,2016-03-30,2016-03-29,2016-03-28,2016-03-27,2016-03-26,2016-03-25,2016-03-24,2016-03-23,2016-03-22,2016-03-21,2016-03-20,2016-03-19,2016-03-18,2016-03-17,2016-03-16,2016-03-15,2016-03-1
4,2016-03-13,2016-03-12,2016-03-11
0,16.0,55.0,18.0,19.0,18.0,23.0,19.0,25.0,20.0,21.0,25.0,8.0,26.0,18.0,15.0,40.0,14.0,20.0,33.0,20.0,30.0,13.0,37.0,42.0,49.0,12.0,14.0,24.0,17.0,55.0,30.0,23.0,41.0,26.0,19.0,10.0,49.0,20.0,17.0,57.0,19.0,19.0,12.0,29.0,26.0,31.0,17.0,18.0,13.0,12.0,30.0,18.0,22.0,24.0,15.0,15.0,14.0,18.0,15.0,18.0,17.0,45.0,56.0,115.0,30.0,15.0,21.0,18.0,16.0,15.0,17.0,20.0,12.0,23.0,18.0,11.0,69.0,15.0,22.0,8.0,22.0,26.0,62.0,8.0,234.0,14.0,15.0,17.0,13.0,14.0,18.0,14.0,11.0,10.0,18.0,18.0,14.0,15.0,11.0,13.0,27.0,21.0,16.0,17.0,15.0,12.0,14.0,19.0,20.0,21.0,10.0,14.0,13.0,15.0,12.0,16.0,35.0,19.0,16.0,22.0,23.0,21.0,23.0,39.0,22.0,50.0,11.0,19.0,22.0,20.0,13.0,16.0,18.0,29.0,15.0,15.0,19.0,40.0,25.0,17.0,31.0,6.0,44.0,22.0,18.0,23.0,16.0,23.0,17.0,17.0,79.0,57.0,21.0,71.0,126.0,38.0,102.0,189.0,490.0,9.0,13.0,14.0,12.0,17.0,18.0,11.0,59.0,20.0,11.0,15.0,8.0,11.0,12.0,11.0,13.0,18.0,6.0,17.0,14.0,10.0,9.0,17.0,9.0,8.0
1,13.0,13.0,35.0,43.0,44.0,19.0,78.0,66.0,17.0,31.0,31.0,144.0,22.0,25.0,12.0,15.0,23.0,14.0,10.0,29.0,20.0,13.0,30.0,34.0,9.0,12.0,63.0,77.0,12.0,60.0,17.0,19.0,39.0,15.0,54.0,16.0,30.0,29.0,19.0,39.0,24.0,25.0,19.0,49.0,35.0,17.0,18.0,18.0,19.0,22.0,22.0,19.0,16.0,16.0,32.0,36.0,21.0,49.0,34.0,31.0,92.0,24.0,64.0,184.0,48.0,27.0,17.0,28.0,28.0,12.0,11.0,13.0,15.0,16.0,24.0,14.0,13.0,14.0,24.0,13.0,25.0,18.0,14.0,10.0,14.0,9.0,25.0,20.0,10.0,15.0,11.0,19.0,19.0,16.0,9.0,13.0,12.0,12.0,10.0,16.0,11.0,10.0,18.0,15.0,6.0,17.0,16.0,9.0,18.0,14.0,10.0,17.0,13.0,10.0,22.0,11.0,13.0,11.0,5.0,13.0,13.0,12.0,12.0,18.0,23.0,88.0,17.0,45.0,18.0,16.0,30.0,33.0,66.0,22.0,22.0,64.0,9.0,16.0,35.0,41.0,17.0,17.0,39.0,61.0,14.0,10.0,42.0,11.0,15.0,16.0,37.0,20.0,14.0,19.0,15.0,13.0,16.0,17.0,20.0,16.0,18.0,18.0,25.0,12.0,23.0,36.0,28.0,15.0,14.0,15.0,14.0,17.0,12.0,17.0,15.0,17.0,46.0,18.0,22.0,16.0,36.0,11.0,15.0,7.0
2,19.0,5.0,10.0,11.0,6.0,7.0,2.0,5.0,7.0,5.0,2.0,4.0,5.0,3.0,2.0,3.0,5.0,2.0,4.0,4.0,2.0,2.0,7.0,2.0,1.0,4.0,2.0,6.0,5.0,3.0,13.0,7.0,6.0,5.0,1.0,2.0,7.0,9.0,3.0,3.0,5.0,4.0,5.0,7.0,3.0,2.0,2.0,6.0,11.0,2.0,5.0,5.0,3.0,3.0,3.0,6.0,6.0,6.0,5.0,7.0,4.0,12.0,3.0,1.0,0.0,2.0,4.0,3.0,4.0,2.0,4.0,5.0,4.0,1.0,5.0,6.0,1.0,5.0,3.0,4.0,1.0,6.0,1.0,6.0,3.0,3.0,6.0,16.0,1.0,2.0,2.0,6.0,4.0,3.0,9.0,1.0,4.0,3.0,5.0,7.0,7.0,5.0,4.0,5.0,1.0,5.0,1.0,7.0,5.0,4.0,9.0,5.0,4.0,6.0,4.0,2.0,4.0,12.0,3.0,0.0,5.0,2.0,5.0,7.0,4.0,5.0,3.0,8.0,3.0,1.0,2.0,6.0,1.0,3.0,6.0,7.0,3.0,7.0,2.0,5.0,6.0,4.0,8.0,7.0,9.0,10.0,7.0,6.0,4.0,3.0,7.0,5.0,8.0,8.0,13.0,6.0,16.0,31.0,48.0,83.0,155.0,155.0,4.0,2.0,6.0,4.0,7.0,6.0,1.0,5.0,9.0,10.0,1.0,1.0,7.0,4.0,3.0,2.0,2.0,4.0,2.0,1.0,3.0,3.0
3,9.0,11.0,13.0,14.0,7.0,11.0,16.0,9.0,14.0,14.0,9.0,7.0,19.0,15.0,27.0,13.0,9.0,18.0,7.0,5.0,22.0,8.0,11.0,11.0,13.0,7.0,15.0,17.0,17.0,55.0,15.0,23.0,16.0,21.0,32.0,17.0,11.0,10.0,10.0,16.0,13.0,13.0,9.0,14.0,19.0,8.0,16.0,13.0,8.0,43.0,22.0,11.0,16.0,17.0,13.0,19.0,18.0,15.0,25.0,20.0,34.0,12.0,15.0,17.0,14.0,22.0,35.0,19.0,36.0,11.0,20.0,19.0,15.0,11.0,17.0,8.0,12.0,14.0,20.0,22.0,24.0,39.0,54.0,27.0,25.0,39.0,69.0,121.0,29.0,303.0,11.0,9.0,14.0,10.0,8.0,8.0,9.0,25.0,10.0,15.0,12.0,8.0,35.0,11.0,9.0,13.0,13.0,14.0,6.0,17.0,23.0,23.0,18.0,12.0,10.0,5.0,13.0,9.0,7.0,54.0,7.0,7.0,22.0,42.0,10.0,9.0,16.0,22.0,20.0,10.0,5.0,9.0,14.0,11.0,10.0,9.0,19.0,11.0,21.0,23.0,14.0,14.0,12.0,15.0,15.0,15.0,8.0,114.0,74.0,17.0,23.0,10.0,15.0,14.0,26.0,9.0,16.0,11.0,39.0,27.0,23.0,18.0,18.0,25.0,25.0,13.0,12.0,12.0,16.0,10.0,10.0,10.0,13.0,11.0,17.0,28.0,9.0,11.0,8.0,12.0,10.0,15.0,9.0,9.0
4,5.0,5.0,5.0,5.0,1.0,3.0,4.0,39.0,3.0,4.0,2.0,3.0,2.0,5.0,3.0,1.0,3.0,3.0,4.0,2.0,4.0,13.0,0.0,2.0,3.0,4.0,3.0,2.0,3.0,1.0,2.0,6.0,6.0,2.0,1.0,1.0,5.0,2.0,8.0,5.0,2.0,7.0,5.0,2.0,10.0,3.0,5.0,10.0,2.0,4.0,4.0,1.0,11.0,2.0,3.0,2.0,2.0,5.0,2.0,6.0,1.0,0.0,9.0,1.0,1.0,4.0,3.0,2.0,4.0,3.0,1.0,10.0,3.0,2.0,2.0,2.0,5.0,1.0,2.0,2.0,2.0,6.0,2.0,4.0,0.0,1.0,6.0,8.0,2.0,1.0,1.0,1.0,2.0,5.0,4.0,3.0,8.0,7.0,3.0,8.0,2.0,6.0,6.0,7.0,7.0,3.0,6.0,8.0,8.0,5.0,5.0,2.0,3.0,3.0,5.0,6.0,2.0,4.0,6.0,4.0,5.0,7.0,10.0,6.0,6.0,6.0,9.0,9.0,0.0,4.0,4.0,5.0,57.0,234.0,55.0,3.0,0.0,5.0,0.0,2.0,9.0,10.0,1.0,4.0,9.0,159.0,38.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [11]:
# Preview the hold-out window (63 columns starting 2016-09-13, chronological).
test.head()

Unnamed: 0,2016-09-13,2016-09-14,2016-09-15,2016-09-16,2016-09-17,2016-09-18,2016-09-19,2016-09-20,2016-09-21,2016-09-22,2016-09-23,2016-09-24,2016-09-25,2016-09-26,2016-09-27,2016-09-28,2016-09-29,2016-09-30,2016-10-01,2016-10-02,2016-10-03,2016-10-04,2016-10-05,2016-10-06,2016-10-07,2016-10-08,2016-10-09,2016-10-10,2016-10-11,2016-10-12,2016-10-13,2016-10-14,2016-10-15,2016-10-16,2016-10-17,2016-10-18,2016-10-19,2016-10-20,2016-10-21,2016-10-22,2016-10-23,2016-10-24,2016-10-25,2016-10-26,2016-10-27,2016-10-28,2016-10-29,2016-10-30,2016-10-31,2016-11-01,2016-11-02,2016-11-03,2016-11-04,2016-11-05,2016-11-06,2016-11-07,2016-11-08,2016-11-09,2016-11-10,2016-11-11,2016-11-12,2016-11-13,2016-11-14
0,11.0,13.0,20.0,21.0,13.0,24.0,20.0,13.0,32.0,16.0,10.0,13.0,44.0,17.0,13.0,72.0,40.0,19.0,14.0,13.0,12.0,14.0,10.0,26.0,13.0,22.0,14.0,23.0,12.0,8.0,50.0,13.0,10.0,16.0,14.0,10.0,24.0,10.0,20.0,10.0,26.0,25.0,16.0,19.0,20.0,12.0,19.0,50.0,16.0,30.0,18.0,25.0,14.0,20.0,8.0,67.0,13.0,41.0,10.0,21.0,13.0,8.0,15.0
1,37.0,38.0,22.0,28.0,19.0,46.0,24.0,22.0,43.0,58.0,26.0,20.0,27.0,35.0,20.0,31.0,24.0,24.0,94.0,18.0,20.0,18.0,16.0,38.0,54.0,29.0,49.0,25.0,72.0,144.0,36.0,97.0,179.0,29.0,12.0,21.0,42.0,53.0,41.0,19.0,25.0,19.0,15.0,21.0,21.0,27.0,33.0,15.0,24.0,13.0,11.0,14.0,26.0,11.0,21.0,14.0,14.0,54.0,5.0,10.0,12.0,11.0,14.0
2,4.0,10.0,3.0,4.0,6.0,3.0,4.0,8.0,10.0,3.0,3.0,1.0,10.0,5.0,4.0,4.0,3.0,4.0,1.0,3.0,6.0,6.0,6.0,3.0,5.0,11.0,6.0,3.0,7.0,6.0,0.0,2.0,4.0,4.0,3.0,6.0,4.0,3.0,4.0,1.0,6.0,5.0,5.0,2.0,3.0,3.0,2.0,2.0,6.0,1.0,3.0,3.0,3.0,2.0,10.0,2.0,2.0,2.0,7.0,3.0,6.0,4.0,2.0
3,11.0,15.0,28.0,10.0,24.0,8.0,20.0,19.0,12.0,31.0,14.0,9.0,40.0,15.0,83.0,60.0,19.0,15.0,15.0,12.0,23.0,17.0,20.0,26.0,11.0,13.0,9.0,44.0,7.0,18.0,4.0,36.0,34.0,10.0,8.0,21.0,7.0,6.0,12.0,15.0,9.0,13.0,21.0,13.0,10.0,21.0,15.0,103.0,22.0,15.0,12.0,11.0,15.0,7.0,12.0,13.0,9.0,8.0,21.0,16.0,38.0,13.0,14.0
4,13.0,63.0,2.0,2.0,3.0,6.0,10.0,2.0,8.0,4.0,3.0,3.0,6.0,4.0,1.0,5.0,9.0,1.0,6.0,4.0,0.0,4.0,9.0,6.0,8.0,13.0,4.0,7.0,6.0,9.0,3.0,21.0,6.0,13.0,10.0,2.0,3.0,6.0,7.0,10.0,6.0,6.0,4.0,173.0,5.0,10.0,10.0,18.0,20.0,11.0,5.0,6.0,33.0,13.0,10.0,22.0,11.0,8.0,4.0,10.0,13.0,11.0,8.0


In [12]:
# Page keys look like "<title>_<site>_<access>_<agent>". The title may itself
# contain underscores, so the fixed fields are taken from the right-hand end.
data = [key.split('_') for key in tqdm(train_key.Page)]

page, site, access = [], [], []
for fields in data:
    page.append('_'.join(fields[:-3]))    # everything before the site
    site.append(fields[-3])               # e.g. "zh.wikipedia.org"
    access.append('_'.join(fields[-2:]))  # e.g. "all-access_spider"

train_key['PageTitle'] = page
train_key['Site'] = site
train_key['AccessAgent'] = access
train_key.head()

100%|██████████| 145063/145063 [00:00<00:00, 353671.24it/s]


Unnamed: 0,Page,PageTitle,Site,AccessAgent
0,2NE1_zh.wikipedia.org_all-access_spider,2NE1,zh.wikipedia.org,all-access_spider
1,2PM_zh.wikipedia.org_all-access_spider,2PM,zh.wikipedia.org,all-access_spider
2,3C_zh.wikipedia.org_all-access_spider,3C,zh.wikipedia.org,all-access_spider
3,4minute_zh.wikipedia.org_all-access_spider,4minute,zh.wikipedia.org,all-access_spider
4,52_Hz_I_Love_You_zh.wikipedia.org_all-access_spider,52_Hz_I_Love_You,zh.wikipedia.org,all-access_spider


In [13]:
# log(1 + x) transform of the visit counts; zero visits map to 0 and missing
# values stay NaN.
train_norm = np.log1p(train)
train_norm.head()

Unnamed: 0,2016-09-10,2016-09-09,2016-09-08,2016-09-07,2016-09-06,2016-09-05,2016-09-04,2016-09-03,2016-09-02,2016-09-01,2016-08-31,2016-08-30,2016-08-29,2016-08-28,2016-08-27,2016-08-26,2016-08-25,2016-08-24,2016-08-23,2016-08-22,2016-08-21,2016-08-20,2016-08-19,2016-08-18,2016-08-17,2016-08-16,2016-08-15,2016-08-14,2016-08-13,2016-08-12,2016-08-11,2016-08-10,2016-08-09,2016-08-08,2016-08-07,2016-08-06,2016-08-05,2016-08-04,2016-08-03,2016-08-02,2016-08-01,2016-07-31,2016-07-30,2016-07-29,2016-07-28,2016-07-27,2016-07-26,2016-07-25,2016-07-24,2016-07-23,2016-07-22,2016-07-21,2016-07-20,2016-07-19,2016-07-18,2016-07-17,2016-07-16,2016-07-15,2016-07-14,2016-07-13,2016-07-12,2016-07-11,2016-07-10,2016-07-09,2016-07-08,2016-07-07,2016-07-06,2016-07-05,2016-07-04,2016-07-03,2016-07-02,2016-07-01,2016-06-30,2016-06-29,2016-06-28,2016-06-27,2016-06-26,2016-06-25,2016-06-24,2016-06-23,2016-06-22,2016-06-21,2016-06-20,2016-06-19,2016-06-18,2016-06-17,2016-06-16,2016-06-15,2016-06-14,2016-06-13,2016-06-12,2016-06-11,2016-06-10,2016-06-09,2016-06-08,2016-06-07,2016-06-06,2016-06-05,2016-06-04,2016-06-03,2016-06-02,2016-06-01,2016-05-31,2016-05-30,2016-05-29,2016-05-28,2016-05-27,2016-05-26,2016-05-25,2016-05-24,2016-05-23,2016-05-22,2016-05-21,2016-05-20,2016-05-19,2016-05-18,2016-05-17,2016-05-16,2016-05-15,2016-05-14,2016-05-13,2016-05-12,2016-05-11,2016-05-10,2016-05-09,2016-05-08,2016-05-07,2016-05-06,2016-05-05,2016-05-04,2016-05-03,2016-05-02,2016-05-01,2016-04-30,2016-04-29,2016-04-28,2016-04-27,2016-04-26,2016-04-25,2016-04-24,2016-04-23,2016-04-22,2016-04-21,2016-04-20,2016-04-19,2016-04-18,2016-04-17,2016-04-16,2016-04-15,2016-04-14,2016-04-13,2016-04-12,2016-04-11,2016-04-10,2016-04-09,2016-04-08,2016-04-07,2016-04-06,2016-04-05,2016-04-04,2016-04-03,2016-04-02,2016-04-01,2016-03-31,2016-03-30,2016-03-29,2016-03-28,2016-03-27,2016-03-26,2016-03-25,2016-03-24,2016-03-23,2016-03-22,2016-03-21,2016-03-20,2016-03-19,2016-03-18,2016-03-17,2016-03-16,2016-03-15,2016-03-1
4,2016-03-13,2016-03-12,2016-03-11
0,2.833213,4.025352,2.944439,2.995732,2.944439,3.178054,2.995732,3.258096,3.044523,3.091043,3.258096,2.197225,3.295837,2.944439,2.772589,3.713572,2.70805,3.044523,3.526361,3.044523,3.433987,2.639057,3.637586,3.7612,3.912023,2.564949,2.70805,3.218876,2.890372,4.025352,3.433987,3.178054,3.73767,3.295837,2.995732,2.397895,3.912023,3.044523,2.890372,4.060443,2.995732,2.995732,2.564949,3.401197,3.295837,3.465736,2.890372,2.944439,2.639057,2.564949,3.433987,2.944439,3.135494,3.218876,2.772589,2.772589,2.70805,2.944439,2.772589,2.944439,2.890372,3.828641,4.043051,4.75359,3.433987,2.772589,3.091043,2.944439,2.833213,2.772589,2.890372,3.044523,2.564949,3.178054,2.944439,2.484907,4.248495,2.772589,3.135494,2.197225,3.135494,3.295837,4.143135,2.197225,5.459586,2.70805,2.772589,2.890372,2.639057,2.70805,2.944439,2.70805,2.484907,2.397895,2.944439,2.944439,2.70805,2.772589,2.484907,2.639057,3.332205,3.091043,2.833213,2.890372,2.772589,2.564949,2.70805,2.995732,3.044523,3.091043,2.397895,2.70805,2.639057,2.772589,2.564949,2.833213,3.583519,2.995732,2.833213,3.135494,3.178054,3.091043,3.178054,3.688879,3.135494,3.931826,2.484907,2.995732,3.135494,3.044523,2.639057,2.833213,2.944439,3.401197,2.772589,2.772589,2.995732,3.713572,3.258096,2.890372,3.465736,1.94591,3.806663,3.135494,2.944439,3.178054,2.833213,3.178054,2.890372,2.890372,4.382027,4.060443,3.091043,4.276666,4.844187,3.663562,4.634729,5.247024,6.196444,2.302585,2.639057,2.70805,2.564949,2.890372,2.944439,2.484907,4.094345,3.044523,2.484907,2.772589,2.197225,2.484907,2.564949,2.484907,2.639057,2.944439,1.94591,2.890372,2.70805,2.397895,2.302585,2.890372,2.302585,2.197225
1,2.639057,2.639057,3.583519,3.78419,3.806663,2.995732,4.369448,4.204693,2.890372,3.465736,3.465736,4.976734,3.135494,3.258096,2.564949,2.772589,3.178054,2.70805,2.397895,3.401197,3.044523,2.639057,3.433987,3.555348,2.302585,2.564949,4.158883,4.356709,2.564949,4.110874,2.890372,2.995732,3.688879,2.772589,4.007333,2.833213,3.433987,3.401197,2.995732,3.688879,3.218876,3.258096,2.995732,3.912023,3.583519,2.890372,2.944439,2.944439,2.995732,3.135494,3.135494,2.995732,2.833213,2.833213,3.496508,3.610918,3.091043,3.912023,3.555348,3.465736,4.532599,3.218876,4.174387,5.220356,3.89182,3.332205,2.890372,3.367296,3.367296,2.564949,2.484907,2.639057,2.772589,2.833213,3.218876,2.70805,2.639057,2.70805,3.218876,2.639057,3.258096,2.944439,2.70805,2.397895,2.70805,2.302585,3.258096,3.044523,2.397895,2.772589,2.484907,2.995732,2.995732,2.833213,2.302585,2.639057,2.564949,2.564949,2.397895,2.833213,2.484907,2.397895,2.944439,2.772589,1.94591,2.890372,2.833213,2.302585,2.944439,2.70805,2.397895,2.890372,2.639057,2.397895,3.135494,2.484907,2.639057,2.484907,1.791759,2.639057,2.639057,2.564949,2.564949,2.944439,3.178054,4.488636,2.890372,3.828641,2.944439,2.833213,3.433987,3.526361,4.204693,3.135494,3.135494,4.174387,2.302585,2.833213,3.583519,3.73767,2.890372,2.890372,3.688879,4.127134,2.70805,2.397895,3.7612,2.484907,2.772589,2.833213,3.637586,3.044523,2.70805,2.995732,2.772589,2.639057,2.833213,2.890372,3.044523,2.833213,2.944439,2.944439,3.258096,2.564949,3.178054,3.610918,3.367296,2.772589,2.70805,2.772589,2.70805,2.890372,2.564949,2.890372,2.772589,2.890372,3.850148,2.944439,3.135494,2.833213,3.610918,2.484907,2.772589,2.079442
2,2.995732,1.791759,2.397895,2.484907,1.94591,2.079442,1.098612,1.791759,2.079442,1.791759,1.098612,1.609438,1.791759,1.386294,1.098612,1.386294,1.791759,1.098612,1.609438,1.609438,1.098612,1.098612,2.079442,1.098612,0.693147,1.609438,1.098612,1.94591,1.791759,1.386294,2.639057,2.079442,1.94591,1.791759,0.693147,1.098612,2.079442,2.302585,1.386294,1.386294,1.791759,1.609438,1.791759,2.079442,1.386294,1.098612,1.098612,1.94591,2.484907,1.098612,1.791759,1.791759,1.386294,1.386294,1.386294,1.94591,1.94591,1.94591,1.791759,2.079442,1.609438,2.564949,1.386294,0.693147,0.0,1.098612,1.609438,1.386294,1.609438,1.098612,1.609438,1.791759,1.609438,0.693147,1.791759,1.94591,0.693147,1.791759,1.386294,1.609438,0.693147,1.94591,0.693147,1.94591,1.386294,1.386294,1.94591,2.833213,0.693147,1.098612,1.098612,1.94591,1.609438,1.386294,2.302585,0.693147,1.609438,1.386294,1.791759,2.079442,2.079442,1.791759,1.609438,1.791759,0.693147,1.791759,0.693147,2.079442,1.791759,1.609438,2.302585,1.791759,1.609438,1.94591,1.609438,1.098612,1.609438,2.564949,1.386294,0.0,1.791759,1.098612,1.791759,2.079442,1.609438,1.791759,1.386294,2.197225,1.386294,0.693147,1.098612,1.94591,0.693147,1.386294,1.94591,2.079442,1.386294,2.079442,1.098612,1.791759,1.94591,1.609438,2.197225,2.079442,2.302585,2.397895,2.079442,1.94591,1.609438,1.386294,2.079442,1.791759,2.197225,2.197225,2.639057,1.94591,2.833213,3.465736,3.89182,4.430817,5.049856,5.049856,1.609438,1.098612,1.94591,1.609438,2.079442,1.94591,0.693147,1.791759,2.302585,2.397895,0.693147,0.693147,2.079442,1.609438,1.386294,1.098612,1.098612,1.609438,1.098612,0.693147,1.386294,1.386294
3,2.302585,2.484907,2.639057,2.70805,2.079442,2.484907,2.833213,2.302585,2.70805,2.70805,2.302585,2.079442,2.995732,2.772589,3.332205,2.639057,2.302585,2.944439,2.079442,1.791759,3.135494,2.197225,2.484907,2.484907,2.639057,2.079442,2.772589,2.890372,2.890372,4.025352,2.772589,3.178054,2.833213,3.091043,3.496508,2.890372,2.484907,2.397895,2.397895,2.833213,2.639057,2.639057,2.302585,2.70805,2.995732,2.197225,2.833213,2.639057,2.197225,3.78419,3.135494,2.484907,2.833213,2.890372,2.639057,2.995732,2.944439,2.772589,3.258096,3.044523,3.555348,2.564949,2.772589,2.890372,2.70805,3.135494,3.583519,2.995732,3.610918,2.484907,3.044523,2.995732,2.772589,2.484907,2.890372,2.197225,2.564949,2.70805,3.044523,3.135494,3.218876,3.688879,4.007333,3.332205,3.258096,3.688879,4.248495,4.804021,3.401197,5.717028,2.484907,2.302585,2.70805,2.397895,2.197225,2.197225,2.302585,3.258096,2.397895,2.772589,2.564949,2.197225,3.583519,2.484907,2.302585,2.639057,2.639057,2.70805,1.94591,2.890372,3.178054,3.178054,2.944439,2.564949,2.397895,1.791759,2.639057,2.302585,2.079442,4.007333,2.079442,2.079442,3.135494,3.7612,2.397895,2.302585,2.833213,3.135494,3.044523,2.397895,1.791759,2.302585,2.70805,2.484907,2.397895,2.302585,2.995732,2.484907,3.091043,3.178054,2.70805,2.70805,2.564949,2.772589,2.772589,2.772589,2.197225,4.744932,4.317488,2.890372,3.178054,2.397895,2.772589,2.70805,3.295837,2.302585,2.833213,2.484907,3.688879,3.332205,3.178054,2.944439,2.944439,3.258096,3.258096,2.639057,2.564949,2.564949,2.833213,2.397895,2.397895,2.397895,2.639057,2.484907,2.890372,3.367296,2.302585,2.484907,2.197225,2.564949,2.397895,2.772589,2.302585,2.302585
4,1.791759,1.791759,1.791759,1.791759,0.693147,1.386294,1.609438,3.688879,1.386294,1.609438,1.098612,1.386294,1.098612,1.791759,1.386294,0.693147,1.386294,1.386294,1.609438,1.098612,1.609438,2.639057,0.0,1.098612,1.386294,1.609438,1.386294,1.098612,1.386294,0.693147,1.098612,1.94591,1.94591,1.098612,0.693147,0.693147,1.791759,1.098612,2.197225,1.791759,1.098612,2.079442,1.791759,1.098612,2.397895,1.386294,1.791759,2.397895,1.098612,1.609438,1.609438,0.693147,2.484907,1.098612,1.386294,1.098612,1.098612,1.791759,1.098612,1.94591,0.693147,0.0,2.302585,0.693147,0.693147,1.609438,1.386294,1.098612,1.609438,1.386294,0.693147,2.397895,1.386294,1.098612,1.098612,1.098612,1.791759,0.693147,1.098612,1.098612,1.098612,1.94591,1.098612,1.609438,0.0,0.693147,1.94591,2.197225,1.098612,0.693147,0.693147,0.693147,1.098612,1.791759,1.609438,1.386294,2.197225,2.079442,1.386294,2.197225,1.098612,1.94591,1.94591,2.079442,2.079442,1.386294,1.94591,2.197225,2.197225,1.791759,1.791759,1.098612,1.386294,1.386294,1.791759,1.94591,1.098612,1.609438,1.94591,1.609438,1.791759,2.079442,2.397895,1.94591,1.94591,1.94591,2.302585,2.302585,0.0,1.609438,1.609438,1.791759,4.060443,5.459586,4.025352,1.386294,0.0,1.791759,0.0,1.098612,2.302585,2.397895,0.693147,1.609438,2.302585,5.075174,3.663562,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [14]:
# Same log(1 + x) transform, applied to the full-data window.
train_all_norm = np.log1p(train_all)

In [15]:
# Reshape the hold-out window from wide (one column per date) to long
# (one row per page/date) and attach the parsed page fields.
test = pd.melt(
    test.assign(Page=all_page),
    id_vars=['Page'],
    var_name='Date',
    value_name='Visits_true',
)
# Zero-based 7-day bucket of the year (day-of-year // 7), not an ISO week.
test['Week'] = pd.to_datetime(test['Date']).dt.dayofyear // 7
test = test.merge(train_key, how='left', on='Page')
test['Visits_true'] = test['Visits_true'].astype('float32')
test['Visits_norm'] = np.log1p(test['Visits_true']).astype('float32')
# Rows without an observed visit count cannot be scored, so drop them.
test = test[test['Visits_true'].notnull()].reset_index(drop=True)

test.head()

Unnamed: 0,Page,Date,Visits_true,Week,PageTitle,Site,AccessAgent,Visits_norm
0,2NE1_zh.wikipedia.org_all-access_spider,2016-09-13,11.0,36,2NE1,zh.wikipedia.org,all-access_spider,2.484907
1,2PM_zh.wikipedia.org_all-access_spider,2016-09-13,37.0,36,2PM,zh.wikipedia.org,all-access_spider,3.637586
2,3C_zh.wikipedia.org_all-access_spider,2016-09-13,4.0,36,3C,zh.wikipedia.org,all-access_spider,1.609438
3,4minute_zh.wikipedia.org_all-access_spider,2016-09-13,11.0,36,4minute,zh.wikipedia.org,all-access_spider,2.484907
4,52_Hz_I_Love_You_zh.wikipedia.org_all-access_spider,2016-09-13,13.0,36,52_Hz_I_Love_You,zh.wikipedia.org,all-access_spider,2.639057


In [16]:
# Key file: one row per page/date pair (`Page`) together with its `Id`.
test_all = pd.read_csv('../input/key_2.csv')
test_all.head()

Unnamed: 0,Page,Id
0,007_スペクター_ja.wikipedia.org_all-access_all-agents_2017-09-13,0b293039387a
1,007_スペクター_ja.wikipedia.org_all-access_all-agents_2017-09-14,7114389dd824
2,007_スペクター_ja.wikipedia.org_all-access_all-agents_2017-09-15,057b02ff1f09
3,007_スペクター_ja.wikipedia.org_all-access_all-agents_2017-09-16,bd2aca21caa3
4,007_スペクター_ja.wikipedia.org_all-access_all-agents_2017-09-17,c0effb42cdd5


In [17]:
# Each key is "<page>_<YYYY-MM-DD>": the last 10 characters are the date, and
# dropping the last 11 also removes the separating underscore.
# NOTE: `Page` is overwritten in place, so running this cell a second time
# would cut real page text off the already-trimmed keys.
test_all['Date'] = [page[-10:] for page in tqdm(test_all.Page)]
test_all['Page'] = [page[:-11] for page in tqdm(test_all.Page)]
test_all.head()

100%|██████████| 8993906/8993906 [00:05<00:00, 1543546.37it/s]
100%|██████████| 8993906/8993906 [00:04<00:00, 1933295.54it/s]


Unnamed: 0,Page,Id,Date
0,007_スペクター_ja.wikipedia.org_all-access_all-agents,0b293039387a,2017-09-13
1,007_スペクター_ja.wikipedia.org_all-access_all-agents,7114389dd824,2017-09-14
2,007_スペクター_ja.wikipedia.org_all-access_all-agents,057b02ff1f09,2017-09-15
3,007_スペクター_ja.wikipedia.org_all-access_all-agents,bd2aca21caa3,2017-09-16
4,007_スペクター_ja.wikipedia.org_all-access_all-agents,c0effb42cdd5,2017-09-17


In [18]:
# Zero-based 7-day bucket of the year (day-of-year // 7), not an ISO week.
test_all['Week'] = pd.to_datetime(test_all.Date).dt.dayofyear // 7
# Attach the parsed page fields (PageTitle, Site, AccessAgent) by page key.
test_all = test_all.merge(train_key, how='left', on='Page')
test_all.head()

Unnamed: 0,Page,Id,Date,Week,PageTitle,Site,AccessAgent
0,007_スペクター_ja.wikipedia.org_all-access_all-agents,0b293039387a,2017-09-13,36,007_スペクター,ja.wikipedia.org,all-access_all-agents
1,007_スペクター_ja.wikipedia.org_all-access_all-agents,7114389dd824,2017-09-14,36,007_スペクター,ja.wikipedia.org,all-access_all-agents
2,007_スペクター_ja.wikipedia.org_all-access_all-agents,057b02ff1f09,2017-09-15,36,007_スペクター,ja.wikipedia.org,all-access_all-agents
3,007_スペクター_ja.wikipedia.org_all-access_all-agents,bd2aca21caa3,2017-09-16,37,007_スペクター,ja.wikipedia.org,all-access_all-agents
4,007_スペクター_ja.wikipedia.org_all-access_all-agents,c0effb42cdd5,2017-09-17,37,007_スペクター,ja.wikipedia.org,all-access_all-agents


In [19]:
# Distinct site names found in the page keys, in order of first appearance.
sites = train_key.Site.unique()
sites

array(['zh.wikipedia.org', 'fr.wikipedia.org', 'en.wikipedia.org',
       'commons.wikimedia.org', 'ru.wikipedia.org', 'www.mediawiki.org',
       'de.wikipedia.org', 'ja.wikipedia.org', 'es.wikipedia.org'], dtype=object)

In [20]:
# all visits is median

def add_median(test, train, train_key, periods, max_periods, first_train_weekday):
    """Attach per-page median features computed from `train` to the rows of `test`.

    Parameters
    ----------
    test : DataFrame
        Rows to featurize; must contain `Page` and `WeekDay` columns. A `WeekEnd`
        column (0/1) is written onto this frame in place.
    train : DataFrame
        One row per page; only the first ``7 * max_periods`` columns are used.
        Column ``i`` is treated as having weekday label
        ``(first_train_weekday - i) % 7``.
    train_key : DataFrame
        Has a `Page` column; its index must line up with `train`'s index, because
        the medians are assigned to it by index.
    periods : iterable of (w1, w2)
        Windows in units of 7 columns; window ``(w1, w2)`` covers columns
        ``7*w1 .. 7*w2 - 1`` of the truncated `train`.
    max_periods : int
        Number of 7-column periods of `train` to keep.
    first_train_weekday : int
        Weekday label of column 0 of `train`. Labels <= 4 are treated as week
        days and labels > 4 as weekend, the same split applied to `test.WeekDay`.

    Returns
    -------
    DataFrame
        `test` left-merged with the new float32 columns:
        `AllVisits` (median over the whole truncated window, NaN -> 0),
        `median_<w1>_<w2>` (window median minus `AllVisits`) and
        `median_day_<w1>_<w2>` (window median restricted to the columns of the
        same day type as the row, minus `AllVisits`). Missing values become 0.
    """
    train = train.iloc[:, :7 * max_periods]

    # Split the training columns by day type using their weekday labels.
    train_weekday = (first_train_weekday - np.arange(train.shape[1])) % 7
    train_week = train.iloc[:, np.flatnonzero(train_weekday <= 4)]
    train_weekend = train.iloc[:, np.flatnonzero(train_weekday > 4)]

    # Written onto the caller's frame, exactly as before, so callers still see it.
    test['WeekEnd'] = 1 * ~(test.WeekDay <= 4)

    df = train_key[['Page']].copy()
    df['AllVisits'] = train.median(axis=1).fillna(0)
    test = test.merge(df, how='left', on='Page', copy=False)
    test.AllVisits = test.AllVisits.fillna(0).astype('float32')

    # merge() returns a frame with a fresh 0..n-1 index, so the day-type masks are
    # taken from the merged frame as positional arrays. Masks built from the
    # caller's frame only line up when that frame already has a default index.
    test_week = (test.WeekDay <= 4).values
    test_weekend = ~test_week

    # The Page column never changes below, so build the per-day-type lookups once.
    test_page = test[['Page']]
    week_pages = test_page.loc[test_week]
    weekend_pages = test_page.loc[test_weekend]

    for (w1, w2) in tqdm(periods):

        # Median over every day of the window.
        c = 'median_%d_%d' % (w1, w2)
        df = train_key[['Page']].copy()
        df[c] = train.iloc[:, 7*w1:7*w2].median(axis=1, skipna=True)
        test = test.merge(df, how='left', on='Page', copy=False)
        test[c] = (test[c] - test.AllVisits).fillna(0).astype('float32')

        # Median over the window restricted to the row's own day type:
        # 5 week-day columns and 2 weekend columns per 7-column period.
        c = 'median_day_%d_%d' % (w1, w2)

        df = train_key[['Page']].copy()
        df[c] = train_week.iloc[:, 5*w1:5*w2].median(axis=1, skipna=True)
        df = week_pages.merge(df, how='left', on='Page', copy=False)
        test.loc[test_week, c] = df[c].values

        df = train_key[['Page']].copy()
        df[c] = train_weekend.iloc[:, 2*w1:2*w2].median(axis=1, skipna=True)
        df = weekend_pages.merge(df, how='left', on='Page', copy=False)
        test.loc[test_weekend, c] = df[c].values

        test[c] = (test[c] - test.AllVisits).fillna(0).astype('float32')

    gc.collect()

    return test

In [21]:
# Snapshot both frames so the modelling cell can restart from unmodified copies.
test0, test_all0 = test.copy(), test_all.copy()

In [22]:
npca = 0  # not read anywhere in this cell
max_periods = 20
# (w1, w2) windows, in units of 7 training columns, passed to add_median.
periods = [(0,1), (1,2), (2,3), (3,4), 
           (4,5), (5,6), (6,7), (7,8),
           (0,2), (0,4),
           (0,8), (0,12), 
           (0,16),
           (0, max_periods)
          ]


def predict_visits(model, frame, feature_cols, min_visits=0.85):
    """Predict visit counts for `frame` with a fitted regressor.

    The model predicts the target relative to `AllVisits`, so `AllVisits` is
    added back before inverting with expm1. Predictions below `min_visits`
    are set to 0. Returns a NumPy array with one value per row of `frame`.
    """
    visits = np.expm1(model.predict(frame[feature_cols]) + frame.AllVisits.values)
    visits[visits < min_visits] = 0
    return visits


# Start from the untouched snapshots so the cell can be re-run safely.
test, test_all = test0.copy(), test_all0.copy()

res = 0
res_den = 0
out = []

# Float placeholders: the column is filled with float predictions below, and
# writing floats into an integer column is deprecated silent upcasting in
# recent pandas.
test['Visits'] = 0.0
test_all['Visits'] = 0.0

threshold = 1.25

# Sites for which the model is NOT refitted on the smape-filtered rows.
no_refit_sites = ['commons.wikimedia.org', 'www.mediawiki.org']

kf = KFold(5, shuffle=False)  # not used by the loop below; built once instead of per site

for site in sites:
    print(site)

    site_rows = train_key.Site == site
    train_norm_site = train_norm[site_rows]
    train_all_norm_site = train_all_norm[site_rows]
    train_key_site = train_key[site_rows]

    test_site = test[test.Site == site].reset_index(drop=True)
    test_site['Date'] = pd.to_datetime(test_site.Date)
    test_site['WeekDay'] = test_site.Date.dt.dayofweek

    test_all_site = test_all[test_all.Site == site].reset_index(drop=True)
    test_all_site['Date'] = pd.to_datetime(test_all_site.Date)
    test_all_site['WeekDay'] = test_all_site.Date.dt.dayofweek

    # NOTE(review): the last argument is the weekday label of column 0 of the
    # training frame. 2016-09-10 is a Saturday (dayofweek 5) and 2017-09-10 a
    # Sunday (dayofweek 6); if column 0 of train_norm / train_all_norm is still
    # that date, 3 and 5 do not match Date.dt.dayofweek - TODO confirm.
    test1 = add_median(test_site, train_norm_site, train_key_site, periods, max_periods, 3)
    test_all1 = add_median(test_all_site, train_all_norm_site, train_key_site, periods, max_periods, 5)

    # Regress on the target relative to the page's overall median.
    test1['Visits_norm'] = test1.Visits_norm - test1.AllVisits

    num_cols = ['median_day_%d_%d' % (w1, w2) for (w1, w2) in periods]

    print('threshold: %0.2f' % threshold)
    res_site = 0
    res_site_den = 0
    for week in test_site.Week.unique():
        test2 = test1[test1.Week == week].reset_index(drop=True)
        n_rows = test2.shape[0]

        # First pass: fit on every row of this site/week.
        lr = HuberRegressor(epsilon=1)
        lr.fit(test2[num_cols], test2.Visits_norm)
        y = predict_visits(lr, test2, num_cols)
        res_site_week0 = smape(test2.Visits_true, y)

        if site not in no_refit_sites:
            # Second pass: refit on the rows smape_mask keeps for this threshold,
            # then score on all rows again.
            mask = smape_mask(test2.Visits_true, y, threshold)
            test3 = test2[mask]
            lr.fit(test3[num_cols], test3.Visits_norm)
            y = predict_visits(lr, test2, num_cols)
            res_site_week = smape(test2.Visits_true, y)
        else:
            res_site_week = res_site_week0

        print(site, week, 'smape: %0.5f' % res_site_week, 'delta: %0.5f' % (res_site_week0 - res_site_week))
        test.loc[(test.Site == site) & (test.Week == week), 'Visits'] = y

        # Row-weighted running sums for the per-site and overall smape.
        res_site += res_site_week * n_rows
        res_site_den += n_rows
        res += res_site_week * n_rows
        res_den += n_rows
        out.append((test2.Visits_true.values, y, n_rows))

        # Apply the model fitted for this week to the matching week of test_all.
        test_all2 = test_all1[test_all1.Week == week]
        y = predict_visits(lr, test_all2, num_cols)
        test_all.loc[(test_all.Site == site) & (test_all.Week == week), 'Visits'] = y

    res_site /= res_site_den
    print('smape %s: %0.5f' % (site, res_site))


res /= res_den
print('smape all: %0.5f' % res)

# Cross-check: recompute the overall smape from the concatenated predictions.
y_true = np.concatenate([true_vals for (true_vals, _, _) in out], axis=0)
y_pred = np.concatenate([pred_vals for (_, pred_vals, _) in out], axis=0)
print('smape all: %0.5f' % smape(y_true, y_pred))


zh.wikipedia.org


100%|██████████| 14/14 [00:11<00:00,  1.12it/s]
100%|██████████| 14/14 [00:14<00:00,  1.10s/it]


threshold: 1.25
zh.wikipedia.org 36 smape: 0.32419 delta: 0.00005
zh.wikipedia.org 37 smape: 0.38304 delta: 0.00008
zh.wikipedia.org 38 smape: 0.41316 delta: 0.00009
zh.wikipedia.org 39 smape: 0.43701 delta: -0.00005
zh.wikipedia.org 40 smape: 0.46697 delta: 0.00022
zh.wikipedia.org 41 smape: 0.47501 delta: 0.00026
zh.wikipedia.org 42 smape: 0.49672 delta: 0.00029
zh.wikipedia.org 43 smape: 0.51233 delta: 0.00020
zh.wikipedia.org 44 smape: 0.53740 delta: 0.00057
zh.wikipedia.org 45 smape: 0.55569 delta: 0.00073
smape zh.wikipedia.org: 0.46823
fr.wikipedia.org


100%|██████████| 14/14 [00:11<00:00,  1.11it/s]
100%|██████████| 14/14 [00:13<00:00,  1.03s/it]


threshold: 1.25
fr.wikipedia.org 36 smape: 0.36587 delta: -0.00003
fr.wikipedia.org 37 smape: 0.39203 delta: 0.00000
fr.wikipedia.org 38 smape: 0.42044 delta: -0.00001
fr.wikipedia.org 39 smape: 0.44099 delta: 0.00022
fr.wikipedia.org 40 smape: 0.53338 delta: 0.00106
fr.wikipedia.org 41 smape: 0.53730 delta: 0.00105
fr.wikipedia.org 42 smape: 0.48152 delta: 0.00041
fr.wikipedia.org 43 smape: 0.49067 delta: 0.00050
fr.wikipedia.org 44 smape: 0.51266 delta: 0.00047
fr.wikipedia.org 45 smape: 0.53196 delta: 0.00064
smape fr.wikipedia.org: 0.47716
en.wikipedia.org


100%|██████████| 14/14 [00:16<00:00,  1.27s/it]
100%|██████████| 14/14 [00:19<00:00,  1.45s/it]


threshold: 1.25
en.wikipedia.org 36 smape: 0.30595 delta: -0.00055
en.wikipedia.org 37 smape: 0.35064 delta: -0.00033
en.wikipedia.org 38 smape: 0.38214 delta: -0.00009
en.wikipedia.org 39 smape: 0.40748 delta: -0.00012
en.wikipedia.org 40 smape: 0.43220 delta: -0.00013
en.wikipedia.org 41 smape: 0.43795 delta: -0.00011
en.wikipedia.org 42 smape: 0.45699 delta: 0.00026
en.wikipedia.org 43 smape: 0.46231 delta: 0.00002
en.wikipedia.org 44 smape: 0.49723 delta: 0.00015
en.wikipedia.org 45 smape: 0.51777 delta: 0.00050
smape en.wikipedia.org: 0.43163
commons.wikimedia.org


100%|██████████| 14/14 [00:05<00:00,  2.18it/s]
100%|██████████| 14/14 [00:08<00:00,  1.66it/s]


threshold: 1.25
commons.wikimedia.org 36 smape: 0.42875 delta: 0.00000
commons.wikimedia.org 37 smape: 0.45759 delta: 0.00000
commons.wikimedia.org 38 smape: 0.49337 delta: 0.00000
commons.wikimedia.org 39 smape: 0.51500 delta: 0.00000
commons.wikimedia.org 40 smape: 0.51808 delta: 0.00000
commons.wikimedia.org 41 smape: 0.53925 delta: 0.00000
commons.wikimedia.org 42 smape: 0.55434 delta: 0.00000
commons.wikimedia.org 43 smape: 0.55783 delta: 0.00000
commons.wikimedia.org 44 smape: 0.57339 delta: 0.00000
commons.wikimedia.org 45 smape: 0.59674 delta: 0.00000
smape commons.wikimedia.org: 0.52874
ru.wikipedia.org


100%|██████████| 14/14 [00:09<00:00,  1.32it/s]
100%|██████████| 14/14 [00:11<00:00,  1.11it/s]


threshold: 1.25
ru.wikipedia.org 36 smape: 0.29053 delta: -0.00005
ru.wikipedia.org 37 smape: 0.33379 delta: 0.00011
ru.wikipedia.org 38 smape: 0.36406 delta: 0.00011
ru.wikipedia.org 39 smape: 0.37845 delta: 0.00004
ru.wikipedia.org 40 smape: 0.38654 delta: 0.00007
ru.wikipedia.org 41 smape: 0.39803 delta: 0.00013
ru.wikipedia.org 42 smape: 0.40962 delta: 0.00023
ru.wikipedia.org 43 smape: 0.41076 delta: 0.00020
ru.wikipedia.org 44 smape: 0.43679 delta: 0.00024
ru.wikipedia.org 45 smape: 0.45382 delta: 0.00018
smape ru.wikipedia.org: 0.39178
www.mediawiki.org


100%|██████████| 14/14 [00:03<00:00,  3.31it/s]
100%|██████████| 14/14 [00:05<00:00,  2.57it/s]


threshold: 1.25
www.mediawiki.org 36 smape: 0.46377 delta: 0.00000
www.mediawiki.org 37 smape: 0.50944 delta: 0.00000
www.mediawiki.org 38 smape: 0.50702 delta: 0.00000
www.mediawiki.org 39 smape: 0.52658 delta: 0.00000
www.mediawiki.org 40 smape: 0.52369 delta: 0.00000
www.mediawiki.org 41 smape: 0.53953 delta: 0.00000
www.mediawiki.org 42 smape: 0.53310 delta: 0.00000
www.mediawiki.org 43 smape: 0.54618 delta: 0.00000
www.mediawiki.org 44 smape: 0.54534 delta: 0.00000
www.mediawiki.org 45 smape: 0.54934 delta: 0.00000
smape www.mediawiki.org: 0.52836
de.wikipedia.org


100%|██████████| 14/14 [00:11<00:00,  1.07it/s]
100%|██████████| 14/14 [00:14<00:00,  1.09s/it]


threshold: 1.25
de.wikipedia.org 36 smape: 0.34341 delta: -0.00020
de.wikipedia.org 37 smape: 0.37379 delta: 0.00030
de.wikipedia.org 38 smape: 0.39926 delta: 0.00003
de.wikipedia.org 39 smape: 0.44283 delta: 0.00010
de.wikipedia.org 40 smape: 0.45284 delta: 0.00017
de.wikipedia.org 41 smape: 0.44777 delta: 0.00012
de.wikipedia.org 42 smape: 0.46865 delta: 0.00025
de.wikipedia.org 43 smape: 0.48986 delta: 0.00025
de.wikipedia.org 44 smape: 0.51478 delta: 0.00046
de.wikipedia.org 45 smape: 0.53889 delta: 0.00049
smape de.wikipedia.org: 0.45271
ja.wikipedia.org


100%|██████████| 14/14 [00:12<00:00,  1.01it/s]
100%|██████████| 14/14 [00:15<00:00,  1.19s/it]


threshold: 1.25
ja.wikipedia.org 36 smape: 0.31766 delta: 0.00016
ja.wikipedia.org 37 smape: 0.37987 delta: 0.00026
ja.wikipedia.org 38 smape: 0.40917 delta: 0.00033
ja.wikipedia.org 39 smape: 0.42912 delta: -0.00031
ja.wikipedia.org 40 smape: 0.46070 delta: 0.00050
ja.wikipedia.org 41 smape: 0.47482 delta: 0.00052
ja.wikipedia.org 42 smape: 0.49238 delta: 0.00057
ja.wikipedia.org 43 smape: 0.50234 delta: 0.00065
ja.wikipedia.org 44 smape: 0.51156 delta: -0.00009
ja.wikipedia.org 45 smape: 0.51865 delta: -0.00006
smape ja.wikipedia.org: 0.45799
es.wikipedia.org


100%|██████████| 14/14 [00:07<00:00,  1.55it/s]
100%|██████████| 14/14 [00:09<00:00,  1.32it/s]


threshold: 1.25
es.wikipedia.org 36 smape: 0.30045 delta: -0.00001
es.wikipedia.org 37 smape: 0.35864 delta: -0.00003
es.wikipedia.org 38 smape: 0.36963 delta: -0.00008
es.wikipedia.org 39 smape: 0.39266 delta: 0.00014
es.wikipedia.org 40 smape: 0.41099 delta: 0.00002
es.wikipedia.org 41 smape: 0.42559 delta: 0.00007
es.wikipedia.org 42 smape: 0.43086 delta: 0.00028
es.wikipedia.org 43 smape: 0.43936 delta: 0.00025
es.wikipedia.org 44 smape: 0.46602 delta: 0.00038
es.wikipedia.org 45 smape: 0.47780 delta: 0.00053
smape es.wikipedia.org: 0.41351
smape all: 0.45309
smape all: 0.45309


  app.launch_new_instance()


In [23]:
# Round the predictions to 3 decimals, then write both prediction files.
for frame in (test, test_all):
    frame['Visits'] = frame['Visits'].round(3)

test.loc[:, ['Page', 'Date', 'Visits']].to_csv('../submissions/pred_10_stage2_sept_10_train.csv', index=False)
test_all.loc[:, ['Id', 'Visits']].to_csv('../submissions/pred_10_stage2_sept_10_test.csv', index=False)

In [24]:
# Preview the first rows of the frame holding true values and predictions.
test.head(n=5)

Unnamed: 0,Page,Date,Visits_true,Week,PageTitle,Site,AccessAgent,Visits_norm,Visits
0,2NE1_zh.wikipedia.org_all-access_spider,2016-09-13,11.0,36,2NE1,zh.wikipedia.org,all-access_spider,2.484907,17.363
1,2PM_zh.wikipedia.org_all-access_spider,2016-09-13,37.0,36,2PM,zh.wikipedia.org,all-access_spider,3.637586,30.356
2,3C_zh.wikipedia.org_all-access_spider,2016-09-13,4.0,36,3C,zh.wikipedia.org,all-access_spider,1.609438,7.523
3,4minute_zh.wikipedia.org_all-access_spider,2016-09-13,11.0,36,4minute,zh.wikipedia.org,all-access_spider,2.484907,12.516
4,52_Hz_I_Love_You_zh.wikipedia.org_all-access_spider,2016-09-13,13.0,36,52_Hz_I_Love_You,zh.wikipedia.org,all-access_spider,2.639057,4.544


In [25]:
# Mean predicted visits on `test`.
test['Visits'].mean()

1212.3880337400985

In [26]:
# Mean predicted visits on `test_all`.
test_all['Visits'].mean()

927.2241236134714