In [1]:
TRAIN_CSV_PATH = r"../datasets/train.csv"
PRICES_CSV_PATH = r"../datasets/prices.csv"
CAL_CSV_PATH = r"../datasets/calendar.csv"
SAMPLE_CSV_PATH = r"../datasets/sample_submission.csv"

EXPORT_PATH = r"../submissions/"

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from statsforecast import StatsForecast
from statsforecast.models import Naive, SeasonalNaive, WindowAverage, SeasonalWindowAverage

from sklearn.metrics import mean_squared_error

  from tqdm.autonotebook import tqdm


In [3]:
df_train = pd.read_csv(TRAIN_CSV_PATH)
df_sample_sub = pd.read_csv(SAMPLE_CSV_PATH)
df_cal = pd.read_csv(CAL_CSV_PATH)

df_dates = pd.DataFrame(columns = ["d"], data = df_train.columns[6:])
df_dates = df_dates.merge(df_cal[["date", "d"]], on = "d", how = "left")

In [12]:
# Convert to appropriate datatypes
df_train["id"] = df_train["id"].astype("string")
df_train["item_id"] = df_train["item_id"].astype("string")
df_train["subcat_id"] = df_train["subcat_id"].astype("string")
df_train["category_id"] = df_train["category_id"].astype("string")
df_train["store_id"] = df_train["store_id"].astype("string")
df_train["region_id"] = df_train["region_id"].astype("string")


# Change dtypes
df_cal["date"] = pd.to_datetime(df_cal["date"])
df_cal["weekday"] = df_cal["weekday"].astype("string")
df_cal["d"] = df_cal["d"].astype("string")
df_cal["wm_yr_wk"] = df_cal["wm_yr_wk"].astype(int)

# Add column with months in string
month_names_ls = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
month_int_ls = list(range(1, 13))
month_names_dict = {month_int_ls[i]: month_names_ls[i] for i in range(len(month_int_ls))}

df_cal["month_name"] = df_cal["month"].replace(month_names_dict)

In [15]:
df_dates["d"] = df_dates["d"].astype("string")
df_dates["date"] = pd.to_datetime(df_dates["date"])

#### Super Naive models (and more):
- Items sold same as previous 21 days

Kaggle score: 2.81

In [4]:
df_sample_sub.iloc[:,1:] = df_train.iloc[:, -21:].values

In [5]:
# df_sample_sub.to_csv(EXPORT_PATH+"/submission_4.csv", header=True, index=False)

- Items sold same as previous 21 days + 1

Kaggle score: 2.95

In [6]:
df_sample_sub.iloc[:,1:] = df_train.iloc[:, -21:].values+1

In [7]:
# df_sample_sub.to_csv(EXPORT_PATH+"/submission_5.csv", header=True, index=False)

### Forecasting using naive methods from statsmodels

From [here](https://forecastegy.com/posts/naive-time-series-forecasting-in-python/#:~:text=the%20older%20ones.-,Seasonal%20Naive%20Forecast,than%20the%20simple%20naive%20model.)

In [16]:
dates_dict = dict(zip(list(df_dates["d"]), list(df_dates["date"])))
df_train_dates = df_train.rename(dates_dict, axis = 1)

df_naive_train = df_train_dates[["id"] + list(df_train_dates.columns[6:])].melt(id_vars = ["id"], var_name= "ds", value_name = "y")
df_naive_train = df_naive_train.rename(columns = {"id":"unique_id"})
df_naive_train.head()

Unnamed: 0,unique_id,ds,y
0,Beauty_1_001_East_1,2011-01-29,0
1,Beauty_1_002_East_1,2011-01-29,0
2,Beauty_1_003_East_1,2011-01-29,0
3,Beauty_1_004_East_1,2011-01-29,0
4,Beauty_1_005_East_1,2011-01-29,0


#### Testing RMSE on train / test validation set 

In [17]:
# Split into train and validation 
train = df_naive_train.loc[df_naive_train['ds'] < '2015-01-29']
valid = df_naive_train.loc[(df_naive_train['ds'] >= '2015-01-29')]
h = valid['ds'].nunique()

In [18]:
model = StatsForecast(models=[Naive(), 
                              SeasonalNaive(season_length=7), 
                              WindowAverage(window_size=7), 
                              SeasonalWindowAverage(window_size=2, season_length=7)],
                      freq='D', n_jobs=-1)
model.fit(train)

StatsForecast(models=[Naive,SeasonalNaive,WindowAverage,SeasWA])

In [19]:
p = model.predict(h=h)
p = p.reset_index().merge(valid, on=['ds', 'unique_id'], how='left')
p.head()

Unnamed: 0,unique_id,ds,Naive,SeasonalNaive,WindowAverage,SeasWA,y
0,Beauty_1_001_Central_1,2015-01-29,0.0,0.0,0.0,0.0,1
1,Beauty_1_001_Central_1,2015-01-30,0.0,0.0,0.0,0.0,0
2,Beauty_1_001_Central_1,2015-01-31,0.0,0.0,0.0,0.0,0
3,Beauty_1_001_Central_1,2015-02-01,0.0,0.0,0.0,0.0,1
4,Beauty_1_001_Central_1,2015-02-02,0.0,0.0,0.0,0.0,0


In [20]:
# Unmelt into 4 different dataframes
df_seaswa = p[["unique_id", "ds", "SeasWA"]].pivot(index = "unique_id", columns = "ds", values = "SeasWA")
df_seaswa.head()

ds,2015-01-29,2015-01-30,2015-01-31,2015-02-01,2015-02-02,2015-02-03,2015-02-04,2015-02-05,2015-02-06,2015-02-07,...,2016-04-21,2016-04-22,2016-04-23,2016-04-24,2016-04-25,2016-04-26,2016-04-27,2016-04-28,2016-04-29,2016-04-30
unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Beauty_1_001_Central_1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Beauty_1_001_Central_2,0.0,1.5,0.0,0.5,0.0,0.0,1.0,0.0,1.5,0.0,...,0.0,1.5,0.0,0.5,0.0,0.0,1.0,0.0,1.5,0.0
Beauty_1_001_Central_3,1.0,0.5,0.0,0.0,0.0,1.0,0.5,1.0,0.5,0.0,...,1.0,0.5,0.0,0.0,0.0,1.0,0.5,1.0,0.5,0.0
Beauty_1_001_East_1,0.5,0.0,1.5,0.5,1.0,0.5,0.0,0.5,0.0,1.5,...,0.5,0.0,1.5,0.5,1.0,0.5,0.0,0.5,0.0,1.5
Beauty_1_001_East_2,0.0,0.0,0.0,0.5,0.0,0.0,0.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.5,0.0,0.0,0.5,0.0,0.0,0.0


In [21]:
print(np.sqrt(mean_squared_error(valid["y"], p["Naive"])))
print(np.sqrt(mean_squared_error(valid["y"], p["SeasonalNaive"])))
print(np.sqrt(mean_squared_error(valid["y"], p["WindowAverage"])))
print(np.sqrt(mean_squared_error(valid["y"], p["SeasWA"])))

4.60855218075943
4.804250660557115
4.49464505136421
4.702569543222306


### Producing 21 day forecast, training on the entire train.csv

In [22]:
df_naive_train.head()

Unnamed: 0,unique_id,ds,y
0,Beauty_1_001_East_1,2011-01-29,0
1,Beauty_1_002_East_1,2011-01-29,0
2,Beauty_1_003_East_1,2011-01-29,0
3,Beauty_1_004_East_1,2011-01-29,0
4,Beauty_1_005_East_1,2011-01-29,0


In [23]:
model_full = StatsForecast(df = df_naive_train, 
                        models=[Naive(), 
                              SeasonalNaive(season_length=7), 
                              WindowAverage(window_size=7), 
                              SeasonalWindowAverage(window_size=2, season_length=7)],
                      freq='D', n_jobs=-1)

preds_full = model_full.forecast(h = 21)

In [24]:
preds_full = preds_full.reset_index()
preds_full.head()

Unnamed: 0,unique_id,ds,Naive,SeasonalNaive,WindowAverage,SeasWA
0,Beauty_1_001_Central_1,2016-05-01,0.0,2.0,0.571429,1.0
1,Beauty_1_001_Central_1,2016-05-02,0.0,0.0,0.571429,0.0
2,Beauty_1_001_Central_1,2016-05-03,0.0,0.0,0.571429,0.5
3,Beauty_1_001_Central_1,2016-05-04,0.0,0.0,0.571429,0.0
4,Beauty_1_001_Central_1,2016-05-05,0.0,2.0,0.571429,1.5


#### Exporting seasonal window average

In [25]:
df_seaswa = preds_full[["unique_id", "ds", "SeasWA"]].pivot(index = "unique_id", columns = "ds", values = "SeasWA")

# Change col names back to day ints
day_to_d = dict(zip(list(df_seaswa.columns), list(df_sample_sub.columns[1:])))
df_seaswa = df_seaswa.rename(day_to_d, axis = 1).reset_index()

# Round up to nearest int
df_seaswa.iloc[:, 1:] = df_seaswa.iloc[:, 1:].apply(np.ceil).astype(int)

df_seaswa.head()

ds,unique_id,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,...,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940
0,Beauty_1_001_Central_1,1,0,1,0,2,0,0,1,0,...,2,0,0,1,0,1,0,2,0,0
1,Beauty_1_001_Central_2,0,0,0,1,0,1,0,0,0,...,0,1,0,0,0,0,1,0,1,0
2,Beauty_1_001_Central_3,0,0,1,1,1,0,0,0,0,...,1,0,0,0,0,1,1,1,0,0
3,Beauty_1_001_East_1,1,1,1,1,3,0,2,1,1,...,3,0,2,1,1,1,1,3,0,2
4,Beauty_1_001_East_2,2,1,1,2,1,1,1,2,1,...,1,1,1,2,1,1,2,1,1,1


In [26]:
# Sort into the original ordering by ID
df_seaswa[["category", "store", "num", "region", "num_2"]] = df_seaswa["unique_id"].str.split("_", expand = True)
df_seaswa["region"] = pd.Categorical(df_seaswa["region"], ["East", "Central", "West"])
df_seaswa = df_seaswa.sort_values(by = ["region", "num_2", "category", "store", "num"])
df_seaswa = df_seaswa.drop(["category", "store", "num", "region", "num_2"], axis =1)

# Rename ID col
df_seaswa = df_seaswa.rename(columns = {"unique_id" : "id"})

df_seaswa.head()

ds,id,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,...,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940
3,Beauty_1_001_East_1,1,1,1,1,3,0,2,1,1,...,3,0,2,1,1,1,1,3,0,2
13,Beauty_1_002_East_1,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,1,0,0,0
23,Beauty_1_003_East_1,2,1,1,1,1,1,2,2,1,...,1,1,2,2,1,1,1,1,1,2
33,Beauty_1_004_East_1,4,2,1,1,2,4,4,4,2,...,2,4,4,4,2,1,1,2,4,4
43,Beauty_1_005_East_1,3,1,1,2,3,2,1,3,1,...,3,2,1,3,1,1,2,3,2,1


In [27]:
# df_seaswa.to_csv(EXPORT_PATH+"/submission_6.csv", header=True, index=False)

#### Exporting window average (lowest RMSE according to the train / test split)

In [28]:
df_wa = preds_full[["unique_id", "ds", "WindowAverage"]].pivot(index = "unique_id", columns = "ds", values = "WindowAverage")
day_to_d = dict(zip(list(df_wa.columns), list(df_sample_sub.columns[1:])))

# Change col names back to original day IDs
df_wa = df_wa.rename(day_to_d, axis = 1).reset_index()

# Round up
df_wa.iloc[:, 1:] = df_wa.iloc[:, 1:].apply(np.ceil).astype(int)

df_wa.head()

ds,unique_id,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,...,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940
0,Beauty_1_001_Central_1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,Beauty_1_001_Central_2,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,Beauty_1_001_Central_3,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,Beauty_1_001_East_1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,Beauty_1_001_East_2,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [29]:
# Sort into the original ordering by ID
df_wa[["category", "store", "num", "region", "num_2"]] = df_wa["unique_id"].str.split("_", expand = True)
df_wa["region"] = pd.Categorical(df_wa["region"], ["East", "Central", "West"])
df_wa = df_wa.sort_values(by = ["region", "num_2", "category", "store", "num"])
df_wa = df_wa.drop(["category", "store", "num", "region", "num_2"], axis =1)

# Rename ID col
df_wa = df_wa.rename(columns = {"unique_id" : "id"})

df_wa.head()

ds,id,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,...,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940
3,Beauty_1_001_East_1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
13,Beauty_1_002_East_1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
23,Beauty_1_003_East_1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
33,Beauty_1_004_East_1,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
43,Beauty_1_005_East_1,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2


In [30]:
# df_wa.to_csv(EXPORT_PATH+"/submission_7.csv", header=True, index=False)

#### Exporting window average (rounding to nearest int instead of rounding up)

In [31]:
df_wa = preds_full[["unique_id", "ds", "WindowAverage"]].pivot(index = "unique_id", columns = "ds", values = "WindowAverage")
day_to_d = dict(zip(list(df_wa.columns), list(df_sample_sub.columns[1:])))

# Change col names back to original day IDs
df_wa = df_wa.rename(day_to_d, axis = 1).reset_index()

# Round up
df_wa.iloc[:, 1:] = df_wa.iloc[:, 1:].round().astype(int)

df_wa.head()

ds,unique_id,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,...,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940
0,Beauty_1_001_Central_1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
1,Beauty_1_001_Central_2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Beauty_1_001_Central_3,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,Beauty_1_001_East_1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
4,Beauty_1_001_East_2,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [32]:
# Sort into the original ordering by ID
df_wa[["category", "store", "num", "region", "num_2"]] = df_wa["unique_id"].str.split("_", expand = True)
df_wa["region"] = pd.Categorical(df_wa["region"], ["East", "Central", "West"])
df_wa = df_wa.sort_values(by = ["region", "num_2", "category", "store", "num"])
df_wa = df_wa.drop(["category", "store", "num", "region", "num_2"], axis =1)

# Rename ID col
df_wa = df_wa.rename(columns = {"unique_id" : "id"})

df_wa.head()

ds,id,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,...,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940
3,Beauty_1_001_East_1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
13,Beauty_1_002_East_1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23,Beauty_1_003_East_1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
33,Beauty_1_004_East_1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
43,Beauty_1_005_East_1,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2


In [33]:
# df_wa.to_csv(EXPORT_PATH+"/submission_10.csv", header=True, index=False)

#### Exporting naive 

In [34]:
df_naive = preds_full[["unique_id", "ds", "Naive"]].pivot(index = "unique_id", columns = "ds", values = "Naive")
day_to_d = dict(zip(list(df_naive.columns), list(df_sample_sub.columns[1:])))

# Change col names back to original day IDs
df_naive = df_naive.rename(day_to_d, axis = 1).reset_index()

# Round up
df_naive.iloc[:, 1:] = df_naive.iloc[:, 1:].apply(np.ceil).astype(int)

In [35]:
# Sort into the original ordering by ID
df_naive[["category", "store", "num", "region", "num_2"]] = df_naive["unique_id"].str.split("_", expand = True)
df_naive["region"] = pd.Categorical(df_naive["region"], ["East", "Central", "West"])
df_naive = df_naive.sort_values(by = ["region", "num_2", "category", "store", "num"])
df_naive = df_naive.drop(["category", "store", "num", "region", "num_2"], axis =1)

# Rename ID col
df_naive = df_naive.rename(columns = {"unique_id" : "id"})

df_naive.head()

ds,id,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,...,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940
3,Beauty_1_001_East_1,3,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
13,Beauty_1_002_East_1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
23,Beauty_1_003_East_1,2,2,2,2,2,2,2,2,2,...,2,2,2,2,2,2,2,2,2,2
33,Beauty_1_004_East_1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
43,Beauty_1_005_East_1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
# df_naive.to_csv(EXPORT_PATH+"/submission_8.csv", header=True, index=False)

#### Exporting seasonal naive

In [37]:
df_s_naive = preds_full[["unique_id", "ds", "SeasonalNaive"]].pivot(index = "unique_id", columns = "ds", values = "SeasonalNaive")
day_to_d = dict(zip(list(df_s_naive.columns), list(df_sample_sub.columns[1:])))

# Change col names back to original day IDs
df_s_naive = df_s_naive.rename(day_to_d, axis = 1).reset_index()

# Round up
df_s_naive.iloc[:, 1:] = df_s_naive.iloc[:, 1:].apply(np.ceil).astype(int)

In [38]:
# Sort into the original ordering by ID
df_s_naive[["category", "store", "num", "region", "num_2"]] = df_s_naive["unique_id"].str.split("_", expand = True)
df_s_naive["region"] = pd.Categorical(df_s_naive["region"], ["East", "Central", "West"])
df_s_naive = df_s_naive.sort_values(by = ["region", "num_2", "category", "store", "num"])
df_s_naive = df_s_naive.drop(["category", "store", "num", "region", "num_2"], axis =1)

# Rename ID col
df_s_naive = df_s_naive.rename(columns = {"unique_id" : "id"})

df_s_naive.head()

ds,id,d_1920,d_1921,d_1922,d_1923,d_1924,d_1925,d_1926,d_1927,d_1928,...,d_1931,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940
3,Beauty_1_001_East_1,1,0,0,0,2,0,3,1,0,...,2,0,3,1,0,0,0,2,0,3
13,Beauty_1_002_East_1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
23,Beauty_1_003_East_1,1,0,0,1,1,0,2,1,0,...,1,0,2,1,0,0,1,1,0,2
33,Beauty_1_004_East_1,2,0,0,1,2,4,1,2,0,...,2,4,1,2,0,0,1,2,4,1
43,Beauty_1_005_East_1,4,1,0,2,3,1,0,4,1,...,3,1,0,4,1,0,2,3,1,0


In [39]:
# df_s_naive.to_csv(EXPORT_PATH+"/submission_9.csv", header=True, index=False)