In [1]:

import numpy as np
import pandas as pd 
import seaborn as sns 

import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error
# from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

import time	
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline 
from pylab import rcParams

rcParams['figure.dpi'] = 120
rcParams['savefig.dpi'] = 120 
rcParams['font.size'] = 20


from tqdm import tqdm

In [2]:
df = pd.read_csv('../datasets/sales_train_validation.csv')
print(df.shape)
df.head()

(30490, 1919)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4


In [3]:
submission = pd.read_csv('../datasets/sample_submission.csv')
print(submission.shape)
submission.head()

(60980, 29)


Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,HOBBIES_1_002_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,HOBBIES_1_004_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,HOBBIES_1_005_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
price_df = pd.read_csv('../datasets/sell_prices.csv')
print(price_df.shape)
price_df.head()

(6841121, 4)


Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,CA_1,HOBBIES_1_001,11325,9.58
1,CA_1,HOBBIES_1_001,11326,9.58
2,CA_1,HOBBIES_1_001,11327,8.26
3,CA_1,HOBBIES_1_001,11328,8.26
4,CA_1,HOBBIES_1_001,11329,8.26


In [5]:
cal_df = pd.read_csv('../datasets/calendar.csv')
print(cal_df.shape)
cal_df.head(5)

(1969, 14)


Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1


In [6]:
# Convert the calendar day label to its integer day number ("d_1" -> 1).
# Going through str(...) and taking the last "_"-separated piece keeps the cell
# idempotent: re-running it on already-converted integers is a no-op instead of
# raising AttributeError on int.split.
cal_df["d"] = [int(str(label).split("_")[-1]) for label in cal_df["d"]]  # d_1 to 1
cal_df.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,5,,,,,1,0,1


In [7]:
price_df["id"] = price_df["item_id"] + "_" + price_df["store_id"] + "_validation"

In [8]:
price_df.head()

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price,id
0,CA_1,HOBBIES_1_001,11325,9.58,HOBBIES_1_001_CA_1_validation
1,CA_1,HOBBIES_1_001,11326,9.58,HOBBIES_1_001_CA_1_validation
2,CA_1,HOBBIES_1_001,11327,8.26,HOBBIES_1_001_CA_1_validation
3,CA_1,HOBBIES_1_001,11328,8.26,HOBBIES_1_001_CA_1_validation
4,CA_1,HOBBIES_1_001,11329,8.26,HOBBIES_1_001_CA_1_validation


In [9]:
# Names of the daily unit-sales columns ("d_1", "d_2", ...).
# A prefix test is used instead of str.contains, which does an unanchored regex
# search and would also pick up any column that merely contains "d_" somewhere.
# This matches the `c.find("d_") == 0` selection used by later cells.
days_cols = df.columns[df.columns.str.startswith("d_")].values
# Unit-sales matrix only (one row per series, one column per day).
df_days = df[days_cols]
df_days

Unnamed: 0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,0,0,0,0,0,0,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,0,0,0,0,0,0,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,0,0,0,0,0,0,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,0,0,2,2,0,3,1,4,1,0,...,2,0,0,0,0,0,1,0,0,1
30486,0,0,0,0,0,5,0,1,1,3,...,0,0,0,0,0,0,0,0,1,0
30487,0,6,0,2,2,4,1,8,5,2,...,2,1,0,2,0,1,0,0,1,0
30488,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,3,1,3


In [10]:
# Dollar sales (units sold * sell price) per series for days 1858..1885.
# The per-series totals of these columns are later turned into the WRMSSE weights.
#
# Prices are looked up by series `id` instead of inner-merging them into `df` on
# every iteration: the merge rebuilt the ~1900-column frame 28 times, rebound `df`,
# and silently dropped any series that had no price for a given week. Here `df` is
# left untouched; a missing price yields NaN for that day, which the later
# `.sum(axis=1)` skips.
# Requires cal_df["d"] to already hold integer day numbers and price_df to have
# the "id" column built in the earlier cells.
unit_sales_cols = {}

for day in tqdm(range(1858, 1886)):
    # Week id (wm_yr_wk) the day belongs to; sell prices are stored per week.
    wk_id = cal_df.loc[cal_df["d"] == day, "wm_yr_wk"].iloc[0]
    # Series id -> sell price for that week.
    # NOTE(review): assumes at most one price row per (id, week); Series.map raises
    # on a non-unique index, so duplicates would fail loudly rather than duplicate rows.
    wk_price = price_df.loc[price_df["wm_yr_wk"] == wk_id].set_index("id")["sell_price"]
    unit_sales_cols["unit_sales_" + str(day)] = df["id"].map(wk_price) * df["d_" + str(day)]

unit_sales_df = pd.DataFrame(unit_sales_cols)

100%|██████████| 28/28 [00:21<00:00,  1.29it/s]


In [11]:
df

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_validation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,0,0,2,2,...,2,0,0,0,0,0,1,0,0,1
30486,FOODS_3_824_WI_3_validation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
30487,FOODS_3_825_WI_3_validation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,0,6,0,2,...,2,1,0,2,0,1,0,0,1,0
30488,FOODS_3_826_WI_3_validation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,0,0,1,0,0,1,0,3,1,3


In [12]:
unit_sales_df

Unnamed: 0,unit_sales_1858,unit_sales_1859,unit_sales_1860,unit_sales_1861,unit_sales_1862,unit_sales_1863,unit_sales_1864,unit_sales_1865,unit_sales_1866,unit_sales_1867,...,unit_sales_1876,unit_sales_1877,unit_sales_1878,unit_sales_1879,unit_sales_1880,unit_sales_1881,unit_sales_1882,unit_sales_1883,unit_sales_1884,unit_sales_1885
0,0.00,16.52,0.00,8.26,8.26,0.00,8.26,0.00,0.00,8.26,...,24.78,8.26,24.78,8.26,16.52,16.52,0.00,8.26,8.26,8.26
1,0.00,0.00,0.00,3.97,0.00,0.00,3.97,3.97,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,3.97,3.97,3.97,3.97
2,0.00,0.00,0.00,0.00,0.00,17.82,2.97,2.97,5.94,0.00,...,2.97,0.00,0.00,0.00,0.00,0.00,0.00,2.97,2.97,0.00
3,0.00,0.00,0.00,4.64,0.00,4.64,23.20,13.92,4.64,0.00,...,18.56,9.28,4.64,18.56,4.64,13.92,23.20,0.00,27.84,27.84
4,2.88,5.76,2.88,0.00,0.00,2.88,0.00,5.76,2.88,2.88,...,8.64,5.76,5.76,5.76,8.64,2.88,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,0.00,0.00,5.76,0.00,0.00,5.96,2.98,2.98,0.00,2.98,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
30486,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
30487,3.98,7.96,7.96,7.96,7.96,7.96,7.96,3.98,3.98,7.96,...,7.96,0.00,0.00,7.96,0.00,0.00,0.00,7.96,0.00,3.98
30488,2.56,2.56,2.56,2.56,0.00,2.56,2.56,0.00,0.00,0.00,...,1.28,1.28,1.28,1.28,0.00,2.56,1.28,1.28,2.56,5.12


In [13]:
# Total dollar sales per series: row-wise sum over all the per-day
# "unit_sales_<day>" columns held in unit_sales_df.
df["dollar_sales"] = unit_sales_df.sum(axis=1)

In [14]:
print(df.shape)
print(df.columns)
df.head()

(30490, 1920)
Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd_1',
       'd_2', 'd_3', 'd_4',
       ...
       'd_1905', 'd_1906', 'd_1907', 'd_1908', 'd_1909', 'd_1910', 'd_1911',
       'd_1912', 'd_1913', 'dollar_sales'],
      dtype='object', length=1920)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913,dollar_sales
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,3,0,1,1,1,3,0,1,1,231.28
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,1,0,0,0,0,35.73
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,2,1,1,1,0,1,1,1,44.55
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,5,4,1,0,1,3,7,2,245.92
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,1,0,1,1,2,2,2,4,89.28


In [15]:
# Series weight = its share of the total dollar sales, so the weights over all
# rows of df sum to 1 at this point.
df["weight"] = df["dollar_sales"] / df["dollar_sales"].sum()

In [16]:
print(df.shape)
print(df.columns)
df.head()

(30490, 1921)
Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd_1',
       'd_2', 'd_3', 'd_4',
       ...
       'd_1906', 'd_1907', 'd_1908', 'd_1909', 'd_1910', 'd_1911', 'd_1912',
       'd_1913', 'dollar_sales', 'weight'],
      dtype='object', length=1921)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913,dollar_sales,weight
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,1,1,1,3,0,1,1,231.28,6.3e-05
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,1,0,0,0,0,35.73,1e-05
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,1,0,1,1,1,44.55,1.2e-05
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,5,4,1,0,1,3,7,2,245.92,6.7e-05
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,1,1,2,2,2,4,89.28,2.4e-05


In [17]:
df.drop(columns=["dollar_sales"], inplace=True)
print(df.shape)
print(df.columns)
df.head()

(30490, 1920)
Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd_1',
       'd_2', 'd_3', 'd_4',
       ...
       'd_1905', 'd_1906', 'd_1907', 'd_1908', 'd_1909', 'd_1910', 'd_1911',
       'd_1912', 'd_1913', 'weight'],
      dtype='object', length=1920)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913,weight
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,3,0,1,1,1,3,0,1,1,6.3e-05
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,1,0,0,0,0,1e-05
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,2,1,1,1,0,1,1,1,1.2e-05
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,5,4,1,0,1,3,7,2,6.7e-05
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,1,0,1,1,2,2,2,4,2.4e-05


## ML

In [18]:
# from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor


In [19]:
# train_start = 100
# train_end = 1885
# df.columns


In [20]:
# df_ml_x = df.drop(columns=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',"weight"] + [c for c in df.columns if c.find("d_") == 0 and int(c.split("_")[1])not in range(train_start,train_end+1)])
# df_ml_y = df[[c for c in df.columns if c.find("d_") == 0 and int(c.split("_")[1]) not in range(train_end+1, train_end+28+1)] ]

In [21]:
# regressor = ExtraTreesRegressor(n_estimators=100,max_depth=50,max_features=50,random_state=42)

In [22]:
# regressor.fit(df_ml,df_ml_y)

In [23]:
# pred_df= pd.DataFrame(regressor.predict( 
#     df.drop(columns=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',"weight"] + [c for c in df.columns if c.find("d_") == 0 and int(c.split("_")[1])not in range(train_start+28,train_end+28+1)])))

# pred_df.columns=["F_"+str(d) for d in  range(train_end+1,train_end+28+1)]

# # df+df.join(pred_df)

## Step 2: initialise the forecast columns

In [24]:
# Create one placeholder forecast column per hold-out day (F_1886 .. F_1913),
# all initialised to 0.
for forecast_col in map("F_{}".format, range(1886, 1914)):
    df[forecast_col] = 0

In [25]:
print(df.shape)
# print(df.columns)
df.head()

(30490, 1948)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,F_1904,F_1905,F_1906,F_1907,F_1908,F_1909,F_1910,F_1911,F_1912,F_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
# df[[c for c in df.columns if c.find("d_") == 0 and int(c.split("_")[1])<1885] + ['id'] ].set_index("id").transpose()

In [27]:
# complete_historical_mean_df = df[[c for c in df.columns if c.find("d_") == 0 and int(c.split("_")[1])<1885] + ['id'] ].set_index("id").transpose().mean().reset_index()

In [28]:
# complete_historical_mean_df.head()

In [29]:
# df[[c for c in df.columns if c.find("d_") == 0 ]].sum(axis=1).min() # not have null

In [30]:
# def find_first_non_0(s):
#     assert type(s)==np.ndarray
#     return (s!=0).argmax(axis=0)

In [31]:
# non_0_strt_arr =[]
# hist_arr=np.array(df[[c for c in df.columns if c.find("d_") == 0 ]])
# for i in range(len(df)):
#     non_0_strt_arr.append(find_first_non_0(hist_arr[i,:]))

In [32]:
# test = list(df[[c for c in df.columns if c.find("d_") == 0 ] + ['id']].set_index('id').transpose()['HOBBIES_1_001_CA_1_validation'])

In [33]:
# print(test[non_0_strt_arr[0]])
# print(non_0_strt_arr[0])
# print(sum(test[:non_0_strt_arr[0]] ))
# print(sum(test[non_0_strt_arr[0]: ] ))

In [34]:
# num_non_zero =1885-np.array(non_0_strt_arr)
# num_non_zero

In [35]:
# non_zero_historical_mean_arr = np.array(df[[c for c in df.columns if c.find("d_") == 0 and int(c.split("_")[1])<1885] + ['id'] ].set_index("id").transpose().mean().reset_index()[0])/num_non_zero
# non_zero_historical_mean_arr

In [36]:
# for d in range(1,29):
#     df['F_1_'+str(1885+d)] = list(complete_historical_mean_df[0])
#     df['F_2_'+str(1885+d)] = non_zero_historical_mean_arr

In [37]:
# method_dict={1:"complete hsitorical mean",2:"historical mean after first non-zero"}

In [38]:
# historical_mean_df10= df[[c for c in df.columns if c.find("d_") == 0 and int(c.split("_")[1]) in range(1876,1886) ] + ['id'] ].set_index("id").transpose().mean().reset_index()
# historical_mean_df20= df[[c for c in df.columns if c.find("d_") == 0 and int(c.split("_")[1]) in range(1866,1886) ] + ['id'] ].set_index("id").transpose().mean().reset_index()
# historical_mean_df30= df[[c for c in df.columns if c.find("d_") == 0 and int(c.split("_")[1]) in range(1856,1886) ] + ['id'] ].set_index("id").transpose().mean().reset_index()
# historical_mean_df40= df[[c for c in df.columns if c.find("d_") == 0 and int(c.split("_")[1]) in range(1846,1886) ] + ['id'] ].set_index("id").transpose().mean().reset_index()

In [39]:
# for d in range(1,29):
#     df['F_3_'+str(1885+d)] = list(historical_mean_df10[0])
#     df['F_4_'+str(1885+d)] = list(historical_mean_df20[0])
#     df['F_5_'+str(1885+d)] = list(historical_mean_df30[0])
#     df['F_6_'+str(1885+d)] = list(historical_mean_df40[0])


In [40]:
# for d in range(1,29):
#     df['F_7_'+str(1885+d)] = df["d_"+str(1885+d-28)]
    


In [41]:
# method_dict[3] = "historical mean of recent 10 days"
# method_dict[4] = "historical mean of recent 20 days"
# method_dict[5] = "historical mean of recent 30 days"
# method_dict[6] = "historical mean of recent 40 days"
# method_dict[7] = "same as previous 28days"

## Step 3: aggregate the series to the hierarchy levels

In [42]:
# Level 1 (grand total): sum every daily sales column (d_*) and every forecast
# column (F_*) over all series, giving a single-row frame.
value_cols = [col for col in df.columns if col.startswith("d_") or col.startswith("F_")]
agg_df = df[value_cols].sum().to_frame().T
agg_df

Unnamed: 0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,F_1904,F_1905,F_1906,F_1907,F_1908,F_1909,F_1910,F_1911,F_1912,F_1913
0,32631,31749,23783,25412,19146,29211,28010,37932,32736,25572,...,0,0,0,0,0,0,0,0,0,0


In [43]:
# Tag the grand-total row as aggregation level 1.
agg_df["level"] = 1
# The grand total carries the whole weight of its level; each of the 12 levels
# (level 1, levels 2-11 in level_groupings, and the un-aggregated df) gets 1/12.
agg_df["weight"] = 1/12
# Column layout that every per-level frame appended to agg_df must follow.
column_order = agg_df.columns

In [44]:
# Aggregation level -> columns to group by when summing the series.
# Level 1 (grand total) is built separately; the un-aggregated rows of df form
# the remaining, most detailed level.
level_groupings = {
    2: ["state_id"],
    3: ["store_id"],
    4: ["cat_id"],
    5: ["dept_id"],
    6: ["state_id", "cat_id"],
    7: ["state_id", "dept_id"],
    8: ["store_id", "cat_id"],
    9: ["store_id", "dept_id"],
    10: ["item_id"],
    11: ["item_id", "state_id"],
}

In [45]:
df.groupby(by=level_groupings[11]).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,F_1904,F_1905,F_1906,F_1907,F_1908,F_1909,F_1910,F_1911,F_1912,F_1913
item_id,state_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
FOODS_1_001,CA,6,3,2,3,7,5,8,3,5,2,...,0,0,0,0,0,0,0,0,0,0
FOODS_1_001,TX,0,1,2,2,0,4,0,0,4,3,...,0,0,0,0,0,0,0,0,0,0
FOODS_1_001,WI,0,2,0,1,0,9,2,1,2,5,...,0,0,0,0,0,0,0,0,0,0
FOODS_1_002,CA,3,3,4,4,3,3,0,2,1,1,...,0,0,0,0,0,0,0,0,0,0
FOODS_1_002,TX,0,0,2,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HOUSEHOLD_2_515,TX,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HOUSEHOLD_2_515,WI,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HOUSEHOLD_2_516,CA,0,2,0,1,1,1,0,2,1,2,...,0,0,0,0,0,0,0,0,0,0
HOUSEHOLD_2_516,TX,2,1,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [46]:
# Build aggregation levels 2..11 and stack them under the level-1 row.
#
# Changes from the original loop:
#   * frames are collected and concatenated once; DataFrame.append inside a loop
#     re-copied agg_df on every iteration and no longer exists in pandas >= 2.0;
#   * only the columns that are actually kept are summed, so the string id
#     columns are never aggregated;
#   * the result is rebuilt from the level-1 row only, so re-running the cell
#     does not stack a second copy of levels 2..11.
# NOTE: df["weight"] must still sum to 1 here, i.e. this has to run before the
# later `df["weight"] /= 12` cell.
sum_cols = [c for c in column_order if c != "level"]
level_frames = [agg_df.loc[agg_df["level"] == 1]]

for level in tqdm(level_groupings):
    temp_df = df.groupby(by=level_groupings[level])[sum_cols].sum().reset_index(drop=True)
    temp_df["level"] = level
    # Each of the 12 levels contributes 1/12 of the total weight.
    temp_df["weight"] /= 12
    level_frames.append(temp_df[column_order])

# Row labels are kept as-is (they restart at 0 within each level), as before.
agg_df = pd.concat(level_frames)

del temp_df, level_frames

100%|██████████| 10/10 [00:27<00:00,  2.71s/it]


In [47]:
print(agg_df.shape)
print(agg_df.columns)
agg_df.head(20)

(12350, 1943)
Index(['d_1', 'd_2', 'd_3', 'd_4', 'd_5', 'd_6', 'd_7', 'd_8', 'd_9', 'd_10',
       ...
       'F_1906', 'F_1907', 'F_1908', 'F_1909', 'F_1910', 'F_1911', 'F_1912',
       'F_1913', 'level', 'weight'],
      dtype='object', length=1943)


Unnamed: 0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,F_1906,F_1907,F_1908,F_1909,F_1910,F_1911,F_1912,F_1913,level,weight
0,32631,31749,23783,25412,19146,29211,28010,37932,32736,25572,...,0,0,0,0,0,0,0,0,1,0.083333
0,14195,13805,10108,11047,9925,11322,12251,16610,14696,11822,...,0,0,0,0,0,0,0,0,2,0.036146
1,9438,9630,6778,7381,5912,9006,6226,9440,9376,7319,...,0,0,0,0,0,0,0,0,2,0.023281
2,8998,8314,6897,6984,3309,8883,9533,11882,8664,6431,...,0,0,0,0,0,0,0,0,2,0.023907
0,4337,4155,2816,3051,2630,3276,3450,5437,4340,3157,...,0,0,0,0,0,0,0,0,3,0.009092
1,3494,3046,2121,2324,1942,2288,2629,3729,2957,2218,...,0,0,0,0,0,0,0,0,3,0.008529
2,4739,4827,3785,4232,3817,4369,4703,5456,5581,4912,...,0,0,0,0,0,0,0,0,3,0.013165
3,1625,1777,1386,1440,1536,1389,1469,1988,1818,1535,...,0,0,0,0,0,0,0,0,3,0.00536
4,2556,2687,1822,2258,1694,2734,1691,2820,2887,2174,...,0,0,0,0,0,0,0,0,3,0.006599
5,3852,3937,2731,2954,2492,3439,2588,3772,3657,2932,...,0,0,0,0,0,0,0,0,3,0.008438


In [48]:
# Scale the un-aggregated series weights by 1/12, like every other level.
# NOTE: this is an in-place update and therefore not idempotent - re-running the
# cell divides by 12 again. It must also run only after the aggregation loop
# above, which sums df["weight"] per group and applies its own division by 12.
df["weight"] /= 12


In [49]:
print(df.shape)
print(df.columns)
df.head()

(30490, 1948)
Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd_1',
       'd_2', 'd_3', 'd_4',
       ...
       'F_1904', 'F_1905', 'F_1906', 'F_1907', 'F_1908', 'F_1909', 'F_1910',
       'F_1911', 'F_1912', 'F_1913'],
      dtype='object', length=1948)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,F_1904,F_1905,F_1906,F_1907,F_1908,F_1909,F_1910,F_1911,F_1912,F_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
print(df.shape[0], agg_df.shape[0], df.shape[0] + agg_df.shape[0])


30490 12350 42840


In [51]:
agg_df["weight"].sum() + df["weight"].sum()


0.9999999999999996

## Step 4: score the forecasts (RMSSE per series, then WRMSSE)

In [52]:
h = 28    # forecast horizon (days) used in this notebook
n = 1885  # number of training days used in this notebook
def rmsse(ground_truth, forecast, train_series, axis=1):
    """Root Mean Squared Scaled Error of `forecast` against `ground_truth`.

    The squared forecast error, averaged over the horizon, is scaled by the mean
    squared one-step difference of the training series:

        sqrt( (1/h * sum((y - y_hat)^2)) / (1/(n-1) * sum((x[t] - x[t-1])^2)) )

    The horizon `h` and training length `n` are taken from the inputs themselves
    (their size along `axis`) rather than from the module-level `h`/`n`, so the
    scaling stays correct for any horizon or training length. For this notebook's
    28-day / 1885-day inputs the result is unchanged.

    Parameters
    ----------
    ground_truth, forecast : np.ndarray
        Actual and predicted values. With ``axis=1`` they are 2-D, one series
        per row; with ``axis=0`` the time dimension is the first axis
        (e.g. a single 1-D series).
    train_series : np.ndarray
        Training history laid out the same way; needs at least two observations
        along `axis`.
    axis : int, 0 or 1
        Axis that holds the time dimension.

    Returns
    -------
    np.ndarray or numpy scalar
        One RMSSE value per series. A series whose training history is constant
        has a zero scale, so its value is inf (or nan when the error is also 0).

    Raises
    ------
    AssertionError
        If `axis` is not 0/1, an input is not an ndarray, or the shapes do not
        fit the chosen axis.
    """
    # NOTE(review): the M5 evaluation reportedly measures the scale only from the
    # first non-zero demand onward; leading zeros are not trimmed here - confirm
    # against the competition guide whether they should be.
    assert axis == 0 or axis == 1
    assert (isinstance(ground_truth, np.ndarray) and isinstance(forecast, np.ndarray)
            and isinstance(train_series, np.ndarray))

    if axis == 1:
        # Row-wise evaluation only makes sense on matrices, not 1-D arrays.
        assert ground_truth.ndim == 2 and forecast.ndim == 2 and train_series.ndim == 2

    horizon = ground_truth.shape[axis]
    n_train = train_series.shape[axis]
    assert horizon > 0 and n_train > 1

    numerator = ((ground_truth - forecast)**2).sum(axis=axis)
    # np.diff along `axis` gives x[t] - x[t-1] for both the row-wise and the
    # single-series layout, replacing the two hand-sliced branches.
    denominator = 1/(n_train-1) * (np.diff(train_series, axis=axis) ** 2).sum(axis=axis)
    return (1/horizon * numerator/denominator) ** 0.5

In [53]:
# train_series_cols = days_cols[:-28] # X_train
# ground_truth_cols = df_days[-28:] # y_train
# forecast_cols = df.columns[df.columns.str.contains("F_")].values # y_pred

# Split the daily sales columns into the training history and the final 28 days
# used as ground truth; the forecast columns are the ones prefixed with "F_".
_day_cols = [col for col in df.columns if col.startswith("d_")]
train_series_cols = _day_cols[:-28]
ground_truth_cols = _day_cols[-28:]
forecast_cols = [col for col in df.columns if col.startswith("F_")]

In [54]:
print(len(days_cols))
# days_cols

1913


In [55]:
print(len(train_series_cols))
print(len(ground_truth_cols))
print(len(forecast_cols))

1885
28
28


In [56]:
# Row-wise RMSSE for the un-aggregated series (df) and for every aggregated
# series (agg_df), each scored against its own training history.
for frame in (df, agg_df):
    actual = frame[ground_truth_cols].values
    predicted = frame[forecast_cols].values
    history = frame[train_series_cols].values
    frame["rmsse"] = rmsse(actual, predicted, history)

In [57]:
df

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,F_1905,F_1906,F_1907,F_1908,F_1909,F_1910,F_1911,F_1912,F_1913,rmsse
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.768048
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.352828
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.530433
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.977543
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.161260
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_validation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,0,0,2,2,...,0,0,0,0,0,0,0,0,0,0.513880
30486,FOODS_3_824_WI_3_validation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.608944
30487,FOODS_3_825_WI_3_validation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,0,6,0,2,...,0,0,0,0,0,0,0,0,0,0.954320
30488,FOODS_3_826_WI_3_validation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.328081


In [58]:
agg_df

Unnamed: 0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,F_1907,F_1908,F_1909,F_1910,F_1911,F_1912,F_1913,level,weight,rmsse
0,32631,31749,23783,25412,19146,29211,28010,37932,32736,25572,...,0,0,0,0,0,0,0,1,8.333333e-02,7.249848
0,14195,13805,10108,11047,9925,11322,12251,16610,14696,11822,...,0,0,0,0,0,0,0,2,3.614587e-02,6.776867
1,9438,9630,6778,7381,5912,9006,6226,9440,9376,7319,...,0,0,0,0,0,0,0,2,2.328074e-02,6.093869
2,8998,8314,6897,6984,3309,8883,9533,11882,8664,6431,...,0,0,0,0,0,0,0,2,2.390672e-02,6.968137
0,4337,4155,2816,3051,2630,3276,3450,5437,4340,3157,...,0,0,0,0,0,0,0,3,9.091642e-03,5.391700
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9142,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,11,2.239415e-07,0.000000
9143,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,11,4.478830e-07,1.139349
9144,0,2,0,1,1,1,0,2,1,2,...,0,0,0,0,0,0,0,11,2.295798e-06,1.065186
9145,2,1,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,11,2.430845e-06,0.763624


In [59]:
# for row_idx in range(len(df)):
#     row_df = pd.DataFrame(df.iloc[row_idx]).transpose()
#     train_series = np.array(row_df[train_series_cols].transpose()[row_idx])
#     ground_truth_series = np.array(row_df[ground_truth_cols].transpose()[row_idx])
#     forecast_series = np.array(row_df[forecast_cols].transpose()[row_idx])
#     print(rmsse(ground_truth_series, forecast_series, train_series, axis=0))

In [60]:
# Weighted RMSSE contribution of every series: its weight times its RMSSE.
# Summing the column over df and agg_df gives the overall WRMSSE.
df["wrmsse"] = df["weight"].mul(df["rmsse"])
agg_df["wrmsse"] = agg_df["weight"].mul(agg_df["rmsse"])

In [61]:
df["wrmsse"].sum() + agg_df["wrmsse"].sum()


5.35979273095658

In [None]:
sample_sub = pd.read_csv('../datasets/sample_submission.csv')
print(sample_sub.shape)
sample_sub.head()

In [None]:
sample_sub.tail()

In [None]:
# Sanity check: the first len(df) ids of the sample submission must be the ids of
# df, in the same order. The original bare expression was not the last statement
# of the cell, so its result was computed and silently discarded; asserting makes
# a mismatch stop the notebook. Comparing the raw values avoids any dependence on
# the two frames' row labels.
assert np.array_equal(sample_sub['id'].iloc[:len(df)].values, df['id'].values), \
    "sample submission ids do not line up with df ids"
print(df.shape)


In [None]:
df.head()

In [None]:
sample_sub['id'][len(df):]

In [None]:
df.head()

In [None]:
# Build the submission frame: the series id plus F1..F28, one column per
# forecast day 1886..1913.
# .copy() makes submit_df an independent frame, so adding columns does not write
# into a slice of df (SettingWithCopyWarning).
submit_df = df[['id']].copy()
for i in range(1,29):
    day = 1885 + i
    # The "F_7_<day>" columns only exist when the (currently commented-out)
    # "same as previous 28 days" cell has been run; on a fresh Run All they are
    # missing and the original lookup raised KeyError. Fall back to the
    # "F_<day>" forecast columns that this notebook actually creates and scores.
    method_col = 'F_7_' + str(day)
    source_col = method_col if method_col in df.columns else 'F_' + str(day)
    submit_df['F'+str(i)] = df[source_col]

In [None]:
# Second half of the submission: the same forecasts, with every id renamed from
# "..._validation" to "..._evaluation".
submit_df2 = submit_df.assign(
    id=submit_df['id'].map(lambda series_id: series_id.replace("validation","evaluation"))
)

In [None]:
# Stack the validation rows on top of the evaluation rows with a fresh 0..N-1
# index. pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4
# and removed in 2.0.
# NOTE: re-running this cell stacks submit_df2 again; re-run the cells that build
# submit_df and submit_df2 first.
submit_df = pd.concat([submit_df, submit_df2], ignore_index=True)

In [None]:
submit_df

In [None]:
submit_df.to_csv("submission.csv",index=False)