# shallow learning

In [1]:

import numpy as np
import pandas as pd 
import seaborn as sns  

import matplotlib.pyplot as plt
from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error
# from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

import time	
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline 
from pylab import rcParams

rcParams['figure.dpi'] = 120
rcParams['savefig.dpi'] = 120 
rcParams['font.size'] = 20


from tqdm import tqdm

In [2]:
df = pd.read_csv('../datasets/sales_train_validation.csv')
print(df.shape)
df.head()

(30490, 1919)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4


In [3]:
submission = pd.read_csv('../datasets/sample_submission.csv')
print(submission.shape)
submission.head()

(60980, 29)


Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,...,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,HOBBIES_1_001_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,HOBBIES_1_002_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,HOBBIES_1_004_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,HOBBIES_1_005_CA_1_validation,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
price_df = pd.read_csv('../datasets/sell_prices.csv')
print(price_df.shape)
price_df.head()

(6841121, 4)


Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price
0,CA_1,HOBBIES_1_001,11325,9.58
1,CA_1,HOBBIES_1_001,11326,9.58
2,CA_1,HOBBIES_1_001,11327,8.26
3,CA_1,HOBBIES_1_001,11328,8.26
4,CA_1,HOBBIES_1_001,11329,8.26


In [5]:
cal_df = pd.read_csv('../datasets/calendar.csv')
print(cal_df.shape)
cal_df.head(5)

(1969, 14)


Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,d_1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,d_2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,d_3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,d_4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,d_5,,,,,1,0,1


In [6]:
cal_df["d"]=cal_df["d"].apply(lambda x: int(x.split("_")[1])) # d_1 to 1
cal_df.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,5,,,,,1,0,1


In [7]:
price_df["id"] = price_df["item_id"] + "_" + price_df["store_id"] + "_validation"

In [8]:
price_df.head()

Unnamed: 0,store_id,item_id,wm_yr_wk,sell_price,id
0,CA_1,HOBBIES_1_001,11325,9.58,HOBBIES_1_001_CA_1_validation
1,CA_1,HOBBIES_1_001,11326,9.58,HOBBIES_1_001_CA_1_validation
2,CA_1,HOBBIES_1_001,11327,8.26,HOBBIES_1_001_CA_1_validation
3,CA_1,HOBBIES_1_001,11328,8.26,HOBBIES_1_001_CA_1_validation
4,CA_1,HOBBIES_1_001,11329,8.26,HOBBIES_1_001_CA_1_validation


In [9]:
days_cols = df.columns[df.columns.str.contains("d_")].values
df_days = df[days_cols]
df_days

Unnamed: 0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,0,0,0,0,0,0,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,0,0,0,0,0,0,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,0,0,0,0,0,0,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,0,0,2,2,0,3,1,4,1,0,...,2,0,0,0,0,0,1,0,0,1
30486,0,0,0,0,0,5,0,1,1,3,...,0,0,0,0,0,0,0,0,1,0
30487,0,6,0,2,2,4,1,8,5,2,...,2,1,0,2,0,1,0,0,1,0
30488,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,3,1,3


## step 1
### *12 LEVEL 重要*

In [10]:
unit_sales_df =pd.DataFrame()

for day in tqdm(range(1858, 1886)):
    wk_id = list(cal_df[cal_df["d"]==day]["wm_yr_wk"])[0]
#     print("wk_id",wk_id)
    wk_price_df = price_df[price_df["wm_yr_wk"]==wk_id]
#     print("wk_price_df",wk_price_df)
    df = df.merge(wk_price_df[["sell_price", "id"]], on=["id"], how='inner')
#     df["unit_sales_" + str(day)] = df["sell_price"] * df["d_" + str(day)]
    unit_sales_df["unit_sales_" + str(day)] = df["sell_price"] * df["d_" + str(day)]
    df.drop(columns=["sell_price"], inplace=True)

100%|██████████| 28/28 [00:20<00:00,  1.36it/s]


In [11]:
df

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,3,0,1,1,1,3,0,1,1
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,2,1,1,1,0,1,1,1
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,5,4,1,0,1,3,7,2
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,0,1,1,2,2,2,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_validation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,0,0,2,2,...,2,0,0,0,0,0,1,0,0,1
30486,FOODS_3_824_WI_3_validation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
30487,FOODS_3_825_WI_3_validation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,0,6,0,2,...,2,1,0,2,0,1,0,0,1,0
30488,FOODS_3_826_WI_3_validation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,0,0,1,0,0,1,0,3,1,3


In [13]:
df["dollar_sales"] = unit_sales_df.sum(axis=1)

In [14]:
print(df.shape)
print(df.columns)
df.head()

(30490, 1920)
Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd_1',
       'd_2', 'd_3', 'd_4',
       ...
       'd_1905', 'd_1906', 'd_1907', 'd_1908', 'd_1909', 'd_1910', 'd_1911',
       'd_1912', 'd_1913', 'dollar_sales'],
      dtype='object', length=1920)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913,dollar_sales
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,3,0,1,1,1,3,0,1,1,231.28
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,1,0,0,0,0,35.73
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,2,1,1,1,0,1,1,1,44.55
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,5,4,1,0,1,3,7,2,245.92
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,1,0,1,1,2,2,2,4,89.28


In [15]:
df["weight"] = df["dollar_sales"] / df["dollar_sales"].sum()

In [16]:
print(df.shape)
print(df.columns)
df.head()

(30490, 1921)
Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd_1',
       'd_2', 'd_3', 'd_4',
       ...
       'd_1906', 'd_1907', 'd_1908', 'd_1909', 'd_1910', 'd_1911', 'd_1912',
       'd_1913', 'dollar_sales', 'weight'],
      dtype='object', length=1921)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913,dollar_sales,weight
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,1,1,1,3,0,1,1,231.28,6.3e-05
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,1,0,0,0,0,35.73,1e-05
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,2,1,1,1,0,1,1,1,44.55,1.2e-05
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,5,4,1,0,1,3,7,2,245.92,6.7e-05
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,0,1,1,2,2,2,4,89.28,2.4e-05


In [17]:
df.drop(columns=["dollar_sales"], inplace=True)
df["weight"]/=12
print(df.shape)
print(df.columns)
df.head()

(30490, 1920)
Index(['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'd_1',
       'd_2', 'd_3', 'd_4',
       ...
       'd_1905', 'd_1906', 'd_1907', 'd_1908', 'd_1909', 'd_1910', 'd_1911',
       'd_1912', 'd_1913', 'weight'],
      dtype='object', length=1920)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913,weight
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,3,0,1,1,1,3,0,1,1,5.258191e-06
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,1,0,0,0,0,8.123278e-07
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,2,1,1,1,0,1,1,1,1.012852e-06
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,5,4,1,0,1,3,7,2,5.591034e-06
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,1,0,1,1,2,2,2,4,2.029797e-06


In [18]:
# df.to_csv("m_df.csv")

# ***

In [19]:
# df = pd.read_csv("m_df.csv",index_col=0)
df

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d_1,d_2,d_3,d_4,...,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913,weight
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,3,0,1,1,1,3,0,1,1,5.258191e-06
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,0,0,0,1,0,0,0,0,8.123278e-07
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,2,1,1,1,0,1,1,1,1.012852e-06
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,0,5,4,1,0,1,3,7,2,5.591034e-06
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,0,0,0,...,1,1,0,1,1,2,2,2,4,2.029797e-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_validation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,0,0,2,2,...,0,0,0,0,0,1,0,0,1,4.697087e-07
30486,FOODS_3_824_WI_3_validation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0.000000e+00
30487,FOODS_3_825_WI_3_validation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,0,6,0,2,...,1,0,2,0,1,0,0,1,0,3.167010e-06
30488,FOODS_3_826_WI_3_validation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,0,0,0,0,...,0,1,0,0,1,0,3,1,3,1.018536e-06


In [20]:
# regressor.fit(df_ml,df_ml_y)

In [21]:
# pred_df= pd.DataFrame(regressor.predict( 
#     df.drop(columns=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id',"weight"] + [c for c in df.columns if c.find("d_") == 0 and int(c.split("_")[1])not in range(train_start+28,train_end+28+1)])))

# pred_df.columns=["F_"+str(d) for d in  range(train_end+1,train_end+28+1)]

# # df+df.join(pred_df)

## step 2 
### *agg_df 12 level*

In [22]:
agg_df = pd.DataFrame(
#     df[[c for c in df.columns if c.find("d_") == 0 ]].sum()
    df[days_cols].sum()
    ).transpose()
agg_df

Unnamed: 0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,32631,31749,23783,25412,19146,29211,28010,37932,32736,25572,...,41789,48362,51640,38059,37570,35343,35033,40517,48962,49795


In [23]:
id_cols=["item_id",'dept_id','cat_id','store_id','state_id']
df[id_cols]
for col in id_cols:
    agg_df[col]='all'
agg_df

Unnamed: 0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,d_1909,d_1910,d_1911,d_1912,d_1913,item_id,dept_id,cat_id,store_id,state_id
0,32631,31749,23783,25412,19146,29211,28010,37932,32736,25572,...,35343,35033,40517,48962,49795,all,all,all,all,all


In [24]:
agg_df['level'] =1
agg_df['weight'] =1/12
column_order =agg_df.columns
print(column_order)
agg_df

Index(['d_1', 'd_2', 'd_3', 'd_4', 'd_5', 'd_6', 'd_7', 'd_8', 'd_9', 'd_10',
       ...
       'd_1911', 'd_1912', 'd_1913', 'item_id', 'dept_id', 'cat_id',
       'store_id', 'state_id', 'level', 'weight'],
      dtype='object', length=1920)


Unnamed: 0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,d_1911,d_1912,d_1913,item_id,dept_id,cat_id,store_id,state_id,level,weight
0,32631,31749,23783,25412,19146,29211,28010,37932,32736,25572,...,40517,48962,49795,all,all,all,all,all,1,0.083333


In [25]:
level_groupings = {2: ["state_id"], 3: ["store_id"], 4: ["cat_id"], 5: ["dept_id"], 
              6: ["state_id", "cat_id"], 7: ["state_id", "dept_id"], 8: ["store_id", "cat_id"], 9: ["store_id", "dept_id"],
              10: ["item_id"], 11: ["item_id", "state_id"]}

In [26]:
agg_df

Unnamed: 0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,d_1911,d_1912,d_1913,item_id,dept_id,cat_id,store_id,state_id,level,weight
0,32631,31749,23783,25412,19146,29211,28010,37932,32736,25572,...,40517,48962,49795,all,all,all,all,all,1,0.083333


In [27]:
df.groupby(by=level_groupings[11]).sum() # 12级别分类

Unnamed: 0_level_0,Unnamed: 1_level_0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913,weight
item_id,state_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
FOODS_1_001,CA,6,3,2,3,7,5,8,3,5,2,...,2,27,6,2,2,4,2,3,5,4.990826e-06
FOODS_1_001,TX,0,1,2,2,0,4,0,0,4,3,...,1,2,1,0,0,2,0,1,0,2.342633e-06
FOODS_1_001,WI,0,2,0,1,0,9,2,1,2,5,...,1,1,0,3,1,0,0,12,1,1.273170e-06
FOODS_1_002,CA,3,3,4,4,3,3,0,2,1,1,...,5,2,0,0,2,2,1,2,1,1.293177e-05
FOODS_1_002,TX,0,0,2,0,0,0,0,1,0,0,...,1,0,0,2,1,0,0,0,0,3.017413e-06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
HOUSEHOLD_2_515,TX,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2.239415e-07
HOUSEHOLD_2_515,WI,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,1,1,1,0,4.478830e-07
HOUSEHOLD_2_516,CA,0,2,0,1,1,1,0,2,1,2,...,0,0,0,0,0,1,1,0,1,2.295798e-06
HOUSEHOLD_2_516,TX,2,1,0,0,0,0,1,1,0,0,...,1,1,0,0,1,0,0,0,0,2.430845e-06


In [28]:
for level in tqdm(level_groupings):
    temp_df = df.groupby(by=level_groupings[level]).sum().reset_index()
    temp_df["level"] = level
#     temp_df["weight"] /= 12
    for c in column_order:
        if c not in temp_df.columns:
            temp_df[c] = "all"
    agg_df = agg_df.append(temp_df[column_order])

del temp_df

100%|██████████| 10/10 [00:17<00:00,  1.74s/it]


In [29]:
print(agg_df.shape)
print(agg_df.columns)
agg_df
# agg_df.head(40)

(12350, 1920)
Index(['d_1', 'd_2', 'd_3', 'd_4', 'd_5', 'd_6', 'd_7', 'd_8', 'd_9', 'd_10',
       ...
       'd_1911', 'd_1912', 'd_1913', 'item_id', 'dept_id', 'cat_id',
       'store_id', 'state_id', 'level', 'weight'],
      dtype='object', length=1920)


Unnamed: 0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,d_1911,d_1912,d_1913,item_id,dept_id,cat_id,store_id,state_id,level,weight
0,32631,31749,23783,25412,19146,29211,28010,37932,32736,25572,...,40517,48962,49795,all,all,all,all,all,1,8.333333e-02
0,14195,13805,10108,11047,9925,11322,12251,16610,14696,11822,...,17095,21834,23187,all,all,all,all,CA,2,3.614587e-02
1,9438,9630,6778,7381,5912,9006,6226,9440,9376,7319,...,10615,12266,12282,all,all,all,all,TX,2,2.328074e-02
2,8998,8314,6897,6984,3309,8883,9533,11882,8664,6431,...,12807,14862,14326,all,all,all,all,WI,2,2.390672e-02
0,4337,4155,2816,3051,2630,3276,3450,5437,4340,3157,...,4387,5577,6113,all,all,all,CA_1,all,3,9.091642e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9142,0,0,0,0,0,0,0,0,0,0,...,0,0,0,HOUSEHOLD_2_515,all,all,all,TX,11,2.239415e-07
9143,0,0,0,0,0,0,0,0,0,0,...,1,1,0,HOUSEHOLD_2_515,all,all,all,WI,11,4.478830e-07
9144,0,2,0,1,1,1,0,2,1,2,...,1,0,1,HOUSEHOLD_2_516,all,all,all,CA,11,2.295798e-06
9145,2,1,0,0,0,0,1,1,0,0,...,0,0,0,HOUSEHOLD_2_516,all,all,all,TX,11,2.430845e-06


In [30]:
print(df.shape[0], agg_df.shape[0], df.shape[0] + agg_df.shape[0])


30490 12350 42840


In [31]:
agg_df["weight"].sum()  + df["weight"].sum()


1.0000000000000007

In [32]:
# agg_df.to_csv("agg_df_12level.csv")

In [33]:
# agg_df=pd.read_csv("agg_df_12level.csv",index_col=0)

## ML

In [34]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor
import random


In [35]:
h = 28
def rmsse(ground_truth, forecast, train_series, axis=1, n=1885):
    # assuming input are numpy array or matrices
    assert axis == 0 or axis == 1
    assert type(ground_truth) == np.ndarray and type(forecast) == np.ndarray and type(train_series) == np.ndarray
    
    if axis == 1:
        # using axis == 1 we must guarantee these are matrices and not arrays
        assert ground_truth.shape[1] > 1 and forecast.shape[1] > 1 and train_series.shape[1] > 1
    
    numerator = ((ground_truth - forecast)**2).sum(axis=axis)
    if axis == 1:
        denominator = 1/(n-1) * ((train_series[:, 1:] - train_series[:, :-1]) ** 2).sum(axis=axis)
    else:
        denominator = 1/(n-1) * ((train_series[1:] - train_series[:-1]) ** 2).sum(axis=axis)
    return (1/h * numerator/denominator) ** 0.5

In [36]:
pd.get_dummies(df.drop(columns=["id", "item_id", "weight"]))

Unnamed: 0,d_1,d_2,d_3,d_4,d_5,d_6,d_7,d_8,d_9,d_10,...,store_id_CA_4,store_id_TX_1,store_id_TX_2,store_id_TX_3,store_id_WI_1,store_id_WI_2,store_id_WI_3,state_id_CA,state_id_TX,state_id_WI
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,0,0,2,2,0,3,1,4,1,0,...,0,0,0,0,0,0,1,0,0,1
30486,0,0,0,0,0,5,0,1,1,3,...,0,0,0,0,0,0,1,0,0,1
30487,0,6,0,2,2,4,1,8,5,2,...,0,0,0,0,0,0,1,0,0,1
30488,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1


In [37]:
df = df[["id", "item_id", "dept_id", "cat_id", "store_id", "state_id", "weight"]].join(pd.get_dummies(df.drop(columns=["id", "item_id", "weight"])))

In [38]:
df

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,weight,d_1,d_2,d_3,...,store_id_CA_4,store_id_TX_1,store_id_TX_2,store_id_TX_3,store_id_WI_1,store_id_WI_2,store_id_WI_3,state_id_CA,state_id_TX,state_id_WI
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,5.258191e-06,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,8.123278e-07,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,1.012852e-06,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,5.591034e-06,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,2.029797e-06,0,0,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_validation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,4.697087e-07,0,0,2,...,0,0,0,0,0,0,1,0,0,1
30486,FOODS_3_824_WI_3_validation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,0.000000e+00,0,0,0,...,0,0,0,0,0,0,1,0,0,1
30487,FOODS_3_825_WI_3_validation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,3.167010e-06,0,6,0,...,0,0,0,0,0,0,1,0,0,1
30488,FOODS_3_826_WI_3_validation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,1.018536e-06,0,0,0,...,0,0,0,0,0,0,1,0,0,1


In [39]:
best_s = 100
best_m = None
best_start_date = 1000

In [None]:
for _ in tqdm(range(50)):
    rand_est = random.randint(20, 50)
    rand_depth = random.randint(10, 30)
    rand_start_date = random.randint(1200, 1500)
    
    print(rand_est, rand_depth)
    
    
    average = []
    
    for cv in range(1, 4):
        train_start = rand_start_date - 28 * cv
        train_end = 1885 - 28 * cv
        
        regressor = ExtraTreesRegressor(n_estimators=rand_est, max_depth=rand_depth, random_state=42)
        
        drop_cols = [item for item in [c for c in df.columns if c.find("F_")==0] + ['wrmsse', 'rmsse'] if item in df.columns]
        df.drop(columns=drop_cols, inplace=True)

        regressor.fit(df.drop(columns=["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"] +\
                              [c for c in df.columns if c.find("d_")==0 and int(c.split("_")[1]) not in range(train_start, train_end + 1)]),
              df[[c for c in df.columns if c.find("d_")==0 and int(c.split("_")[1]) in range(train_end + 1, train_end + 28 + 1)]])

        pred_df = pd.DataFrame(regressor.predict(df.drop(columns=["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"] +\
                                       [c for c in df.columns if c.find("d_")==0 and int(c.split("_")[1]) not in range(train_start+28, train_end + 28 + 1)])))
        pred_df.columns = ["F_" + str(d) for d in range(train_end + 28 + 1, train_end + 28 + 28 + 1)]
        df = df.join(pred_df)

        # remake agg_df
        new_agg_df = pd.DataFrame(df[[c for c in df.columns if c.find("d_") == 0 or c.find("F_") == 0]].sum()).transpose()
        new_agg_df["level"] = 1
        new_agg_df["weight"] = 1/12
        column_order = new_agg_df.columns

        for level in level_groupings:
            temp_df = df.groupby(by=level_groupings[level]).sum().reset_index()
            temp_df["level"] = level
            new_agg_df = new_agg_df.append(temp_df[column_order])
        del temp_df

        agg_df = new_agg_df
        
        train_series_cols = [c for c in df.columns if c.find("d_") == 0][:-28]
        ground_truth_cols = [c for c in df.columns if c.find("d_") == 0][-28:]
        forecast_cols = [c for c in df.columns if c.find("F_") == 0]

        df["rmsse"] = rmsse(np.array(df[ground_truth_cols]), 
                np.array(df[forecast_cols]), np.array(df[train_series_cols]))
        agg_df["rmsse"] = rmsse(np.array(agg_df[ground_truth_cols]), 
                np.array(agg_df[forecast_cols]), np.array(agg_df[train_series_cols]))

        df["wrmsse"] = df["weight"] * df["rmsse"]
        agg_df["wrmsse"] = agg_df["weight"] * agg_df["rmsse"]

        print("CV", cv, ":", df["wrmsse"].sum() + agg_df["wrmsse"].sum())

        average.append(df["wrmsse"].sum() + agg_df["wrmsse"].sum())
    
    this_s = np.array(average).mean()
    if this_s < best_s:
        best_s = this_s
        best_m = regressor
        best_start_date = rand_start_date
        
    print(this_s, best_s)

  0%|          | 0/50 [00:00<?, ?it/s]

36 15
CV 1 : 0.7836962141263903
CV 2 : 0.866442054218693


  2%|▏         | 1/50 [05:46<4:42:46, 346.25s/it]

CV 3 : 0.8891817109041317
0.8464399930830716 0.8464399930830716
22 25
CV 1 : 0.7714919907299541
CV 2 : 0.8402991482171679


  4%|▍         | 2/50 [12:22<4:49:04, 361.34s/it]

CV 3 : 0.852875351019692
0.8215554966556047 0.8215554966556047
21 17
CV 1 : 0.7830002809623493
CV 2 : 0.86116527934148


  6%|▌         | 3/50 [17:03<4:24:00, 337.03s/it]

CV 3 : 0.8717597288078816
0.8386417630372369 0.8215554966556047
39 30
CV 1 : 0.7618731983380417
CV 2 : 0.8369803090841645


  8%|▊         | 4/50 [27:22<5:23:14, 421.62s/it]

CV 3 : 0.851505967048588
0.8167864914902646 0.8167864914902646
24 16
CV 1 : 0.7824386336076652
CV 2 : 0.865525607252154


 10%|█         | 5/50 [31:07<4:31:57, 362.60s/it]

CV 3 : 0.8826677139897637
0.843543984949861 0.8167864914902646
22 14
CV 1 : 0.7856550152885491
CV 2 : 0.8690917861240048


 12%|█▏        | 6/50 [34:38<3:52:41, 317.30s/it]

CV 3 : 0.8800017221543924
0.8449161745223154 0.8167864914902646
34 24
CV 1 : 0.7700803977773538
CV 2 : 0.8353904095525989


 14%|█▍        | 7/50 [42:56<4:26:11, 371.43s/it]

CV 3 : 0.8587408384590443
0.8214038819296657 0.8167864914902646
33 12
CV 1 : 0.8039149698585317
CV 2 : 0.8856252263116733


 16%|█▌        | 8/50 [47:11<3:55:31, 336.46s/it]

CV 3 : 0.918104722957546
0.8692149730425837 0.8167864914902646
41 23
CV 1 : 0.7706793423261269
CV 2 : 0.8407679558058683


 18%|█▊        | 9/50 [56:42<4:38:03, 406.93s/it]

CV 3 : 0.8500416636337174
0.8204963205885708 0.8167864914902646
23 23
CV 1 : 0.770843519028543
CV 2 : 0.8517835195696416


 20%|██        | 10/50 [1:02:22<4:17:47, 386.70s/it]

CV 3 : 0.8753600438646376
0.8326623608209408 0.8167864914902646
43 29
CV 1 : 0.7671401261056419
CV 2 : 0.8301220002161538


 22%|██▏       | 11/50 [1:15:11<5:25:55, 501.41s/it]

CV 3 : 0.8536469450725304
0.8169696904647754 0.8167864914902646
50 24
CV 1 : 0.7637172318764425
CV 2 : 0.8429056902599145


 24%|██▍       | 12/50 [1:24:45<5:31:25, 523.31s/it]

CV 3 : 0.8518324234614486
0.8194851151992685 0.8167864914902646
37 20
CV 1 : 0.7728821270468498
CV 2 : 0.8456648163926863


 26%|██▌       | 13/50 [1:31:21<4:59:10, 485.15s/it]

CV 3 : 0.8545152207035448
0.8243540547143603 0.8167864914902646
38 24
CV 1 : 0.7691073927688172
CV 2 : 0.8380511468356258


 28%|██▊       | 14/50 [1:41:02<5:08:16, 513.80s/it]

CV 3 : 0.8495993829958985
0.8189193075334472 0.8167864914902646
42 23
CV 1 : 0.7716166708061559
CV 2 : 0.8375208899078438


 30%|███       | 15/50 [1:52:08<5:26:20, 559.44s/it]

CV 3 : 0.8533155529403691
0.8208177045514562 0.8167864914902646
27 16
CV 1 : 0.7807153088379157
CV 2 : 0.8621200011964143


 32%|███▏      | 16/50 [1:56:20<4:24:47, 467.29s/it]

CV 3 : 0.8824386828393125
0.8417579976245474 0.8167864914902646
39 10
CV 1 : 0.8377971139539478
CV 2 : 0.9095755030040551


 34%|███▍      | 17/50 [2:00:11<3:37:59, 396.34s/it]

CV 3 : 0.9288226067910288
0.8920650745830105 0.8167864914902646
24 30
CV 1 : 0.7667766234109523
CV 2 : 0.8342829311048072


 36%|███▌      | 18/50 [2:09:11<3:54:26, 439.59s/it]

CV 3 : 0.863107868901839
0.8213891411391995 0.8167864914902646
39 28
CV 1 : 0.7662292619027838
CV 2 : 0.8310383076236607


 38%|███▊      | 19/50 [2:21:41<4:35:09, 532.56s/it]

CV 3 : 0.8534850445602403
0.816917538028895 0.8167864914902646
45 15
CV 1 : 0.786155539217597
CV 2 : 0.8629035379913083


 40%|████      | 20/50 [2:28:02<4:03:34, 487.15s/it]

CV 3 : 0.8805113697679154
0.8431901489922735 0.8167864914902646
45 12
CV 1 : 0.8013276022508549
CV 2 : 0.880847306793157


 42%|████▏     | 21/50 [2:33:36<3:33:12, 441.10s/it]

CV 3 : 0.9111469809871373
0.8644406300103831 0.8167864914902646
41 15
CV 1 : 0.7824621171537295
CV 2 : 0.8678859175017335


 44%|████▍     | 22/50 [2:38:49<3:07:56, 402.71s/it]

CV 3 : 0.8943081005780146
0.8482187117444925 0.8167864914902646
44 23
CV 1 : 0.76987859288037
CV 2 : 0.8383131773401781


 46%|████▌     | 23/50 [2:51:08<3:46:42, 503.80s/it]

CV 3 : 0.8659436687556484
0.8247118129920654 0.8167864914902646
25 23
CV 1 : 0.7701785858502376
CV 2 : 0.8443405032834363


 48%|████▊     | 24/50 [2:58:26<3:29:39, 483.83s/it]

CV 3 : 0.865681111134456
0.8267334000893767 0.8167864914902646
38 23
CV 1 : 0.7656104351480494
CV 2 : 0.8397610208506534


In [None]:
# fit the best_m with the closest training set
best_m.fit(df.drop(columns=["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"] +\
                              [c for c in df.columns if c.find("d_")==0 and int(c.split("_")[1]) not in range(best_start_date, 1886)]),
              df[[c for c in df.columns if c.find("d_")==0 and int(c.split("_")[1]) in range(1886, 1914)]])

In [None]:
#x_train
df.drop(columns=["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"] +\
                              [c for c in df.columns if c.find("d_")==0 and int(c.split("_")[1]) not in range(best_start_date, 1886)])

In [None]:
#y-train
df[[c for c in df.columns if c.find("d_")==0 and int(c.split("_")[1]) in range(1886, 1914)]]

In [None]:
#x_pred
df.drop(columns=["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"] +
                               [c for c in df.columns if c.find("d_")==0 and int(c.split("_")[1]) not in range(best_start_date+28, 1914)])

In [None]:
submit_df = df[["id"]]
pred = best_m.predict(df.drop(columns=["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"] +
                               [c for c in df.columns if c.find("d_")==0 and int(c.split("_")[1]) not in range(best_start_date+28, 1914)]))
for i in range(1, 29):
    submit_df["F" + str(i)] = pred[:, i-1]

In [None]:
submit_df2 = submit_df.copy()
submit_df2["id"] = submit_df2["id"].apply(lambda x: x.replace('validation',
                                                              'evaluation'))


In [None]:
submit_df = submit_df.append(submit_df2).reset_index(drop=True)


In [None]:
submit_df.to_csv("submission-v5.csv", index=False)


In [None]:
submit_df