In [1]:
TRAIN_CSV_PATH = r"../datasets/train.csv"
PRICES_CSV_PATH = r"../datasets/prices.csv"
CAL_CSV_PATH = r"../datasets/calendar.csv"
SAMPLE_CSV_PATH = r"../datasets/sample_submission.csv"

EXPORT_PATH = r"../submissions/"

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from statsforecast import StatsForecast
from statsforecast.models import Naive, SeasonalNaive, WindowAverage, SeasonalWindowAverage, SimpleExponentialSmoothingOptimized, SeasonalExponentialSmoothingOptimized, Holt, HoltWinters

from sklearn.metrics import mean_squared_error

  from tqdm.autonotebook import tqdm


In [69]:
df_train = pd.read_csv(TRAIN_CSV_PATH)
df_sample_sub = pd.read_csv(SAMPLE_CSV_PATH)
df_cal = pd.read_csv(CAL_CSV_PATH)

df_dates = pd.DataFrame(columns = ["d"], data = df_train.columns[6:])
df_dates = df_dates.merge(df_cal[["date", "d"]], on = "d", how = "left")

In [4]:
# Function to change to 0 if negative, if not round to nearest int
def round_positive(s) :
    s[s < 0 ] = 0
    s = s.round().astype(int)

    return s

In [5]:
# Function to convert predictions into submission csv
def convert_to_sub_csv(preds_df, method, is_negative = False) :
    df_converted = preds_df[["unique_id", "ds", method]].pivot(index = "unique_id", columns = "ds", values = method)

    # Change col names back to day ints
    day_to_d = dict(zip(list(df_converted.columns), list(df_sample_sub.columns[1:])))
    df_converted = df_converted.rename(day_to_d, axis = 1).reset_index()

    # Round up to nearest int
    df_converted.iloc[:, 1:] = df_converted.iloc[:, 1:].round().astype(int)    

    # Sort into the original ordering by ID
    df_converted[["category", "store", "num", "region", "num_2"]] = df_converted["unique_id"].str.split("_", expand = True)
    df_converted["region"] = pd.Categorical(df_converted["region"], ["East", "Central", "West"])
    df_converted = df_converted.sort_values(by = ["region", "num_2", "category", "store", "num"])
    df_converted = df_converted.drop(["category", "store", "num", "region", "num_2"], axis =1)

    # Rename ID col
    df_converted = df_converted.rename(columns = {"unique_id" : "id"})

    return df_converted

In [6]:
# Convert to appropriate datatypes
df_train["id"] = df_train["id"].astype("string")
df_train["item_id"] = df_train["item_id"].astype("string")
df_train["subcat_id"] = df_train["subcat_id"].astype("string")
df_train["category_id"] = df_train["category_id"].astype("string")
df_train["store_id"] = df_train["store_id"].astype("string")
df_train["region_id"] = df_train["region_id"].astype("string")


# Change dtypes
df_cal["date"] = pd.to_datetime(df_cal["date"])
df_cal["weekday"] = df_cal["weekday"].astype("string")
df_cal["d"] = df_cal["d"].astype("string")
df_cal["wm_yr_wk"] = df_cal["wm_yr_wk"].astype(int)

# Add column with months in string
month_names_ls = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
month_int_ls = list(range(1, 13))
month_names_dict = {month_int_ls[i]: month_names_ls[i] for i in range(len(month_int_ls))}

df_cal["month_name"] = df_cal["month"].replace(month_names_dict)

In [7]:
df_dates["d"] = df_dates["d"].astype("string")
df_dates["date"] = pd.to_datetime(df_dates["date"])

#### Super Naive models (and more):
- Items sold same as previous 21 days

Kaggle score: 2.81

In [4]:
df_sample_sub.iloc[:,1:] = df_train.iloc[:, -21:].values

In [5]:
# df_sample_sub.to_csv(EXPORT_PATH+"/submission_4.csv", header=True, index=False)

- Items sold same as previous 21 days + 1

Kaggle score: 2.95

In [6]:
df_sample_sub.iloc[:,1:] = df_train.iloc[:, -21:].values+1

In [7]:
# df_sample_sub.to_csv(EXPORT_PATH+"/submission_5.csv", header=True, index=False)

#### Using the averaged differences from the same time period of the past 4 years, then add to 2015 to predict for 2016

In [8]:
dates_dict = dict(zip(list(df_dates["d"]), list(df_dates["date"])))
df_train_dates = df_train.rename(dates_dict, axis = 1)

df_train_dates.head()

Unnamed: 0,id,item_id,subcat_id,category_id,store_id,region_id,2011-01-29 00:00:00,2011-01-30 00:00:00,2011-01-31 00:00:00,2011-02-01 00:00:00,...,2016-04-21 00:00:00,2016-04-22 00:00:00,2016-04-23 00:00:00,2016-04-24 00:00:00,2016-04-25 00:00:00,2016-04-26 00:00:00,2016-04-27 00:00:00,2016-04-28 00:00:00,2016-04-29 00:00:00,2016-04-30 00:00:00
0,Beauty_1_001_East_1,Beauty_1_001,Beauty_1,Beauty,East_1,East,0,0,0,0,...,3,0,1,1,0,0,0,2,0,3
1,Beauty_1_002_East_1,Beauty_1_002,Beauty_1,Beauty,East_1,East,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,Beauty_1_003_East_1,Beauty_1_003,Beauty_1,Beauty,East_1,East,0,0,0,0,...,0,1,1,1,0,0,1,1,0,2
3,Beauty_1_004_East_1,Beauty_1_004,Beauty_1,Beauty,East_1,East,0,0,0,0,...,1,3,7,2,0,0,1,2,4,1
4,Beauty_1_005_East_1,Beauty_1_005,Beauty_1,Beauty,East_1,East,0,0,0,0,...,2,2,2,4,1,0,2,3,1,0


In [12]:
year_1_dates = pd.date_range(start="2011-05-01", end = "2011-05-21")
year_2_dates = pd.date_range(start="2012-05-01", end = "2012-05-21")
year_3_dates = pd.date_range(start="2013-05-01", end = "2013-05-21")
year_4_dates = pd.date_range(start="2014-05-01", end = "2014-05-21")
year_5_dates = pd.date_range(start="2015-05-01", end = "2015-05-21")

year_1_df = df_train_dates.loc[:, year_1_dates]
year_2_df = df_train_dates.loc[:, year_2_dates]
year_3_df = df_train_dates.loc[:, year_3_dates]
year_4_df = df_train_dates.loc[:, year_4_dates]
year_5_df = df_train_dates.loc[:, year_5_dates]

In [23]:
df_train_dates_diff_year = df_train_dates.iloc[:, 6:].diff(365, axis = 1).dropna(axis = 1)
df_train_dates_diff_year["id"] = df_train_dates["id"]

  """Entry point for launching an IPython kernel.


In [25]:
# This contains the difference for the 21 days in year 2 compared to year 1
year_2_df = df_train_dates_diff_year.loc[:, year_2_dates]

# Etc
year_3_df = df_train_dates_diff_year.loc[:, year_3_dates]
year_4_df = df_train_dates_diff_year.loc[:, year_4_dates]
year_5_df = df_train_dates_diff_year.loc[:, year_5_dates]

In [43]:
# Convert back to day_n to add different cols together
year_2_dict = dict(zip(list(year_2_df.columns), list(range(0, 21))))
year_3_dict = dict(zip(list(year_3_df.columns), list(range(0, 21))))
year_4_dict = dict(zip(list(year_4_df.columns), list(range(0, 21))))
year_5_dict = dict(zip(list(year_5_df.columns), list(range(0, 21))))

year_2_df = year_2_df.rename(year_2_dict, axis = 1)
year_3_df = year_3_df.rename(year_3_dict, axis = 1)
year_4_df = year_4_df.rename(year_4_dict, axis = 1)
year_5_df = year_5_df.rename(year_5_dict, axis = 1)

In [49]:
sum_df = year_2_df.add(year_3_df, axis = 1)
sum_df = sum_df.add(year_4_df, axis = 1)
sum_df = sum_df.add(year_5_df, axis = 1)

averaged_df = sum_df.div(4)

In [68]:
df_sample_sub.iloc[:, 1:] = averaged_df.add(year_5_df, axis = 1).round().astype(int).apply(round_positive).values

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11,12,13,14,15,16,17,18,19,20
0,1,1,2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,1,0,0,1,0,0,...,0,1,0,0,0,0,0,4,0,1
2,0,1,0,0,1,0,0,0,1,0,...,0,0,1,0,1,0,1,0,0,1
3,0,6,7,0,0,0,0,2,9,3,...,0,0,0,2,3,0,0,0,0,0
4,2,4,2,2,0,0,0,0,0,0,...,0,0,0,0,0,2,0,0,0,0


In [72]:
# df_sample_sub.to_csv(EXPORT_PATH+"/submission_24.csv", header=True, index=False)

### Forecasting using naive methods from statsmodels

From [here](https://forecastegy.com/posts/naive-time-series-forecasting-in-python/#:~:text=the%20older%20ones.-,Seasonal%20Naive%20Forecast,than%20the%20simple%20naive%20model.)

In [54]:
df_naive_train = df_train_dates[["id"] + list(df_train_dates.columns[6:])].melt(id_vars = ["id"], var_name= "ds", value_name = "y")
df_naive_train = df_naive_train.rename(columns = {"id":"unique_id"})
df_naive_train.head()

Unnamed: 0,unique_id,ds,y
0,Beauty_1_001_East_1,2011-01-29,0
1,Beauty_1_002_East_1,2011-01-29,0
2,Beauty_1_003_East_1,2011-01-29,0
3,Beauty_1_004_East_1,2011-01-29,0
4,Beauty_1_005_East_1,2011-01-29,0


#### Testing RMSE on train / test validation set 

With Naive, SeasonalNaive, WindowAverage, Seasonal Window Average
- seasonal length 7 days

In [7]:
# Split into train and validation 
train = df_naive_train.loc[df_naive_train['ds'] < '2015-01-29']
valid = df_naive_train.loc[(df_naive_train['ds'] >= '2015-01-29')]
h = valid['ds'].nunique()

In [18]:
model = StatsForecast(models=[Naive(), 
                              SeasonalNaive(season_length=7), 
                              WindowAverage(window_size=7), 
                              SeasonalWindowAverage(window_size=2, season_length=7)],
                      freq='D', n_jobs=-1)
model.fit(train)

StatsForecast(models=[Naive,SeasonalNaive,WindowAverage,SeasWA])

In [19]:
p = model.predict(h=h)
p = p.reset_index().merge(valid, on=['ds', 'unique_id'], how='left')
p.head()

Unnamed: 0,unique_id,ds,Naive,SeasonalNaive,WindowAverage,SeasWA,y
0,Beauty_1_001_Central_1,2015-01-29,0.0,0.0,0.0,0.0,1
1,Beauty_1_001_Central_1,2015-01-30,0.0,0.0,0.0,0.0,0
2,Beauty_1_001_Central_1,2015-01-31,0.0,0.0,0.0,0.0,0
3,Beauty_1_001_Central_1,2015-02-01,0.0,0.0,0.0,0.0,1
4,Beauty_1_001_Central_1,2015-02-02,0.0,0.0,0.0,0.0,0


In [21]:
print(np.sqrt(mean_squared_error(valid["y"], p["Naive"])))
print(np.sqrt(mean_squared_error(valid["y"], p["SeasonalNaive"])))
print(np.sqrt(mean_squared_error(valid["y"], p["WindowAverage"])))
print(np.sqrt(mean_squared_error(valid["y"], p["SeasWA"])))

4.60855218075943
4.804250660557115
4.49464505136421
4.702569543222306


### Trying with different seasonal and window lengths
With SeasonalNaive, WindowAverage, Seasonal Window Average
- seasonal length 7 days

In [8]:
model_2 = StatsForecast(models=[SeasonalNaive(season_length=14), 
                              WindowAverage(window_size=14), 
                              SeasonalWindowAverage(window_size=2, season_length=7)],
                      freq='D', n_jobs=-1)
model_2.fit(train)

StatsForecast(models=[SeasonalNaive,WindowAverage,SeasWA])

In [9]:
p_2 = model_2.predict(h=h)
p_2 = p_2.reset_index().merge(valid, on=['ds', 'unique_id'], how='left')
p_2.head()

Unnamed: 0,unique_id,ds,SeasonalNaive,WindowAverage,SeasWA,y
0,Beauty_1_001_Central_1,2015-01-29,0.0,0.0,0.0,1
1,Beauty_1_001_Central_1,2015-01-30,0.0,0.0,0.0,0
2,Beauty_1_001_Central_1,2015-01-31,0.0,0.0,0.0,0
3,Beauty_1_001_Central_1,2015-02-01,0.0,0.0,0.0,1
4,Beauty_1_001_Central_1,2015-02-02,0.0,0.0,0.0,0


In [11]:
print(np.sqrt(mean_squared_error(valid["y"], p_2["SeasonalNaive"])))
print(np.sqrt(mean_squared_error(valid["y"], p_2["WindowAverage"])))
print(np.sqrt(mean_squared_error(valid["y"], p_2["SeasWA"])))

4.869437341538874
4.522386924418987
4.702569543222306


### Producing 21 day forecast, training on the entire train.csv

In [44]:
model_full = StatsForecast(df = df_naive_train, 
                        models=[Naive(), 
                              SeasonalNaive(season_length=7), 
                              WindowAverage(window_size=7), 
                              SeasonalWindowAverage(window_size=2, season_length=7)],
                      freq='D', n_jobs=-1)

In [45]:
preds_full = model_full.forecast(h = 21)
preds_full = preds_full.reset_index()
preds_full.head()

Unnamed: 0,unique_id,ds,Naive,SeasonalNaive,WindowAverage,SeasWA
0,Beauty_1_001_Central_1,2016-05-01,0.0,2.0,0.571429,1.0
1,Beauty_1_001_Central_1,2016-05-02,0.0,0.0,0.571429,0.0
2,Beauty_1_001_Central_1,2016-05-03,0.0,0.0,0.571429,0.5
3,Beauty_1_001_Central_1,2016-05-04,0.0,0.0,0.571429,0.0
4,Beauty_1_001_Central_1,2016-05-05,0.0,2.0,0.571429,1.5


#### Exporting seasonal window average

In [None]:
df_seaswa = convert_to_sub_csv(preds_full, "SeasWA")

In [None]:
# df_seaswa.to_csv(EXPORT_PATH+"/submission_6.csv", header=True, index=False)

In [None]:
df_wa = convert_to_sub_csv(preds_full, "WindowAverage")

In [None]:
# df_wa.to_csv(EXPORT_PATH+"/submission_10.csv", header=True, index=False)

In [None]:
df_naive = convert_to_sub_csv(preds_full, "Naive")

In [None]:
# df_naive.to_csv(EXPORT_PATH+"/submission_8.csv", header=True, index=False)

In [None]:
df_s_naive = convert_to_sub_csv(preds_full, "SeasonalNaive")

In [None]:
# df_naive.to_csv(EXPORT_PATH+"/submission_9.csv", header=True, index=False)

### Trying with ETS models

SimpleExponentialSmoothingOptimized, SeasonalExponentialSmoothingOptimized, Holt, HoltWinters

In [9]:
# Subset the last 50% of the time series because of computation
df_naive_train = df_naive_train.set_index("ds")
df_naive_train = df_naive_train.sort_index()
df_naive_train["ds"] = df_naive_train.index

# Find midpoint
midpoint = df_naive_train["ds"].iloc[0] + ((df_naive_train["ds"].iloc[-1] - df_naive_train["ds"].iloc[0]) / 2)

# Split into train and validation, from midpoint onwards
df_naive_train_subset = df_naive_train.loc[midpoint:]

# find date to split, 80% of subset
test_split_date = ((df_naive_train["ds"].iloc[-1] - midpoint) * 0.8) + midpoint

df_naive_train_subset = df_naive_train_subset.drop("ds", axis = 1)
df_naive_train_subset = df_naive_train_subset.reset_index()

# Split by date
train = df_naive_train_subset.loc[df_naive_train_subset['ds'] < test_split_date]
valid = df_naive_train_subset.loc[(df_naive_train_subset['ds'] >= test_split_date)]

h = valid['ds'].nunique()

In [10]:
print(test_split_date)

2015-10-21 04:48:00


In [9]:
model_3 = StatsForecast(models=[SimpleExponentialSmoothingOptimized(), 
                              SeasonalExponentialSmoothingOptimized(season_length=7)
                              ],
                      freq='D', n_jobs=-1)

p_3 = model_3.fit_predict(h = h, df = train)

In [10]:
p_3 = p_3.reset_index().merge(valid, on=['ds', 'unique_id'], how='left')
p_3.head()

Unnamed: 0,unique_id,ds,SESOpt,SeasESOpt,y
0,Beauty_1_001_Central_1,2015-10-22,0.239553,0.283106,0
1,Beauty_1_001_Central_1,2015-10-23,0.239553,0.352633,0
2,Beauty_1_001_Central_1,2015-10-24,0.239553,0.11898,0
3,Beauty_1_001_Central_1,2015-10-25,0.239553,0.191915,0
4,Beauty_1_001_Central_1,2015-10-26,0.239553,0.224423,0


In [12]:
print(np.sqrt(mean_squared_error(valid["y"], p_3["SESOpt"])))
print(np.sqrt(mean_squared_error(valid["y"], p_3["SeasESOpt"])))

4.529695390799881
4.719031412831159


### Forecasting for the next 21 days using 50% of the train data

In [16]:
p_3_forecast = model_3.forecast(h = 21, df = df_naive_train_subset)

In [17]:
p_3_forecast = p_3_forecast.reset_index()
p_3_forecast.head()

Unnamed: 0,unique_id,ds,SESOpt,SeasESOpt
0,Beauty_1_001_Central_1,2016-05-01,0.322579,0.211212
1,Beauty_1_001_Central_1,2016-05-02,0.322579,0.445361
2,Beauty_1_001_Central_1,2016-05-03,0.322579,0.136753
3,Beauty_1_001_Central_1,2016-05-04,0.322579,0.205991
4,Beauty_1_001_Central_1,2016-05-05,0.322579,0.279417


In [18]:
df_SEASOpt = convert_to_sub_csv(p_3_forecast, "SESOpt")

In [21]:
df_SeasESOpt = convert_to_sub_csv(p_3_forecast, "SeasESOpt")

In [20]:
# df_SEASOpt.to_csv(EXPORT_PATH+"/submission_11.csv", header=True, index=False)

In [22]:
# df_SeasESOpt.to_csv(EXPORT_PATH+"/submission_12.csv", header=True, index=False)

### Trying different ETS models

In [26]:
model_3 = StatsForecast(models=[SeasonalExponentialSmoothingOptimized(season_length=14)],
                      freq='D', n_jobs=-1)

p_3 = model_3.fit_predict(h = h, df = train)

In [27]:
p_3 = p_3.reset_index().merge(valid, on=['ds', 'unique_id'], how='left')
p_3.head()

Unnamed: 0,unique_id,ds,SeasESOpt,y
0,Beauty_1_001_Central_1,2015-10-22,0.174693,0
1,Beauty_1_001_Central_1,2015-10-23,0.470831,0
2,Beauty_1_001_Central_1,2015-10-24,0.054672,0
3,Beauty_1_001_Central_1,2015-10-25,0.219027,0
4,Beauty_1_001_Central_1,2015-10-26,0.259517,0


In [28]:
print(np.sqrt(mean_squared_error(valid["y"], p_3["SeasESOpt"])))

4.694386802773638


In [11]:
model_4 = StatsForecast(models=[HoltWinters(season_length=7)],
                      freq='D', n_jobs=-1)

p_4 = model_4.forecast(h = 21, df = valid)

In [12]:
p_4 = p_4.reset_index()
p_4.head()

Unnamed: 0,unique_id,ds,HoltWinters
0,Beauty_1_001_Central_1,2016-05-01,0.531215
1,Beauty_1_001_Central_1,2016-05-02,0.202442
2,Beauty_1_001_Central_1,2016-05-03,0.274709
3,Beauty_1_001_Central_1,2016-05-04,0.377578
4,Beauty_1_001_Central_1,2016-05-05,0.59292


In [13]:
df_holt_winters = convert_to_sub_csv(p_4, "HoltWinters")
# df_holt_winters.to_csv("../submissions/submission_23.csv", index = False)

Even with seasonal length of 14, still does not outperform window average

### Testing models with differencing by 7 (for window averages)

Window and seasonality = 14

In [12]:
dates_dict = dict(zip(list(df_dates["d"]), list(df_dates["date"])))
df_train_dates = df_train.rename(dates_dict, axis = 1)

# Difference by 7 and then save
df_train_dates_diff = df_train_dates.iloc[:, 6:].diff(periods = 7, axis = 1).iloc[:, 7:]
df_train_dates_diff["id"] = df_train_dates["id"]

In [13]:
df_naive_train_diff = df_train_dates_diff[["id"] + list(df_train_dates_diff.columns[:-1])].melt(id_vars = ["id"], var_name= "ds", value_name = "y")
df_naive_train_diff = df_naive_train_diff.rename(columns = {"id":"unique_id"})
df_naive_train_diff.head()

Unnamed: 0,unique_id,ds,y
0,Beauty_1_001_East_1,2011-02-05,0
1,Beauty_1_002_East_1,2011-02-05,0
2,Beauty_1_003_East_1,2011-02-05,0
3,Beauty_1_004_East_1,2011-02-05,0
4,Beauty_1_005_East_1,2011-02-05,0


In [15]:
# Split into train and validation 
train = df_naive_train_diff.loc[df_naive_train_diff['ds'] < '2015-01-29']
valid = df_naive_train_diff.loc[(df_naive_train_diff['ds'] >= '2015-01-29')]
h = valid['ds'].nunique()

In [37]:
model_4 = StatsForecast(models=[Naive(),
                                SeasonalNaive(season_length=7), 
                              WindowAverage(window_size=7), 
                              SeasonalWindowAverage(window_size=2, season_length=7)],
                      freq='D', n_jobs=-1)

In [38]:
p_4 = model_4.fit_predict(h = h, df = train)
p_4 = p_4.reset_index().merge(valid, on=['ds', 'unique_id'], how='left')
p_4.head()

KeyboardInterrupt: 

In [26]:
p_4[["Naive", "SeasonalNaive", "WindowAverage", "SeasWA"]] = p_4[["Naive", "SeasonalNaive", "WindowAverage", "SeasWA"]].apply(round_positive)

In [27]:
print(np.sqrt(mean_squared_error(valid["y"], p_4["SeasonalNaive"])))
print(np.sqrt(mean_squared_error(valid["y"], p_4["WindowAverage"])))
print(np.sqrt(mean_squared_error(valid["y"], p_4["SeasWA"])))

3.265376671835042
2.8772100978970427
2.926046344787223


### Forecast using entire train set for 21 days

In [39]:
p_4_forecast = model_4.forecast(h = 21, df = df_naive_train_diff)

In [40]:
p_4_forecast = p_4_forecast.reset_index()
p_4_forecast.head()

Unnamed: 0,unique_id,ds,Naive,SeasonalNaive,WindowAverage,SeasWA
0,Beauty_1_001_Central_1,2016-05-01,0.0,2.0,0.285714,1.0
1,Beauty_1_001_Central_1,2016-05-02,0.0,0.0,0.285714,0.0
2,Beauty_1_001_Central_1,2016-05-03,0.0,-1.0,0.285714,0.0
3,Beauty_1_001_Central_1,2016-05-04,0.0,0.0,0.285714,-0.5
4,Beauty_1_001_Central_1,2016-05-05,0.0,1.0,0.285714,0.5


In [41]:
p_4_forecast[["Naive", "SeasonalNaive", "WindowAverage", "SeasWA"]] = p_4_forecast[["Naive", "SeasonalNaive", "WindowAverage", "SeasWA"]].apply(round_positive)
p_4_forecast.head()

Unnamed: 0,unique_id,ds,Naive,SeasonalNaive,WindowAverage,SeasWA
0,Beauty_1_001_Central_1,2016-05-01,0,2,0,1
1,Beauty_1_001_Central_1,2016-05-02,0,0,0,0
2,Beauty_1_001_Central_1,2016-05-03,0,0,0,0
3,Beauty_1_001_Central_1,2016-05-04,0,0,0,0
4,Beauty_1_001_Central_1,2016-05-05,0,1,0,0


In [29]:
df_wa_diff_7 = convert_to_sub_csv(p_4_forecast, "WindowAverage")

In [30]:
# df_wa_diff_7.to_csv(EXPORT_PATH+"/submission_13.csv", header=True, index=False)

In [31]:
df_seaswa_diff_7 = convert_to_sub_csv(p_4_forecast, "SeasWA")

In [32]:
# df_seaswa_diff_7.to_csv(EXPORT_PATH+"/submission_14.csv", header=True, index=False)

In [42]:
df_naive_diff_7 = convert_to_sub_csv(p_4_forecast, "Naive")

# df_naive_diff_7.to_csv(EXPORT_PATH+"/submission_15.csv", header=True, index=False)

### Try by window of 14, data differenced by 7

In [33]:
model_5 = StatsForecast(models=[SeasonalNaive(season_length=14), 
                              WindowAverage(window_size=14), 
                              SeasonalWindowAverage(window_size=2, season_length=14)],
                      freq='D', n_jobs=-1)

p_5_forecast = model_5.forecast(h = 21, df = df_naive_train_diff)

In [34]:
p_5_forecast = p_5_forecast.reset_index()
p_5_forecast.head()

Unnamed: 0,unique_id,ds,SeasonalNaive,WindowAverage,SeasWA
0,Beauty_1_001_Central_1,2016-05-01,0.0,0.142857,0.0
1,Beauty_1_001_Central_1,2016-05-02,0.0,0.142857,0.0
2,Beauty_1_001_Central_1,2016-05-03,1.0,0.142857,0.5
3,Beauty_1_001_Central_1,2016-05-04,-1.0,0.142857,-0.5
4,Beauty_1_001_Central_1,2016-05-05,0.0,0.142857,0.0


In [35]:
p_5_forecast[["SeasonalNaive", "WindowAverage", "SeasWA"]] = p_5_forecast[["SeasonalNaive", "WindowAverage", "SeasWA"]].apply(round_positive)
p_5_forecast.head()

Unnamed: 0,unique_id,ds,SeasonalNaive,WindowAverage,SeasWA
0,Beauty_1_001_Central_1,2016-05-01,0,0,0
1,Beauty_1_001_Central_1,2016-05-02,0,0,0
2,Beauty_1_001_Central_1,2016-05-03,1,0,0
3,Beauty_1_001_Central_1,2016-05-04,0,0,0
4,Beauty_1_001_Central_1,2016-05-05,0,0,0


In [36]:
df_wa_diff_7_14 = convert_to_sub_csv(p_5_forecast, "WindowAverage")
# df_wa_diff_7_14.to_csv(EXPORT_PATH+"/submission_15.csv", header=True, index=False)

#### Different window sizes for window average

In [49]:
wa_3 = StatsForecast(df = df_naive_train, 
                        models=[WindowAverage(window_size=3)],
                      freq='D', n_jobs=-1)

wa_9 = StatsForecast(df = df_naive_train, 
                        models=[WindowAverage(window_size=9)],
                      freq='D', n_jobs=-1)

wa_12 = StatsForecast(df = df_naive_train, 
                        models=[WindowAverage(window_size=12)],
                      freq='D', n_jobs=-1)

wa_14 = StatsForecast(df = df_naive_train, 
                        models=[WindowAverage(window_size=14)],
                      freq='D', n_jobs=-1)

In [50]:
df_wa_3 = wa_3.forecast(h = 21)
df_wa_3 = df_wa_3.reset_index()

In [51]:
df_wa_9 = wa_9.forecast(h = 21)
df_wa_9 = df_wa_9.reset_index()

In [52]:
df_wa_12 = wa_12.forecast(h = 21)
df_wa_12 = df_wa_12.reset_index()

In [53]:
df_wa_14 = wa_14.forecast(h = 21)
df_wa_14 = df_wa_14.reset_index()

In [59]:
wa_3_sub = convert_to_sub_csv(df_wa_3, "WindowAverage")
# wa_3_sub.to_csv(EXPORT_PATH+"/submission_16.csv", header=True, index=False)

In [60]:
wa_9_sub = convert_to_sub_csv(df_wa_9, "WindowAverage")
# wa_9_sub.to_csv(EXPORT_PATH+"/submission_17.csv", header=True, index=False)

In [61]:
wa_12_sub = convert_to_sub_csv(df_wa_12, "WindowAverage")
# wa_12_sub.to_csv(EXPORT_PATH+"/submission_18.csv", header=True, index=False)

In [62]:
wa_14_sub = convert_to_sub_csv(df_wa_14, "WindowAverage")
# wa_14_sub.to_csv(EXPORT_PATH+"/submission_19.csv", header=True, index=False)

## Best model here!

Window average = 21, no differencing

In [55]:
wa_21 = StatsForecast(df = df_naive_train, 
                        models=[WindowAverage(window_size=21)],
                      freq='D', n_jobs=-1)

In [56]:
df_wa_21 = wa_21.forecast(h = 21)
df_wa_21 = df_wa_21.reset_index()

In [57]:
wa_21_sub = convert_to_sub_csv(df_wa_21, "WindowAverage")
# wa_21_sub.to_csv(EXPORT_PATH+"/submission_20.csv", header=True, index=False)

In [12]:
wa_28 = StatsForecast(df = df_naive_train, 
                        models=[WindowAverage(window_size=28)],
                      freq='D', n_jobs=-1)

In [16]:
wa_42 = StatsForecast(df = df_naive_train, 
                        models=[WindowAverage(window_size=42)],
                      freq='D', n_jobs=-1)

In [13]:
wa_56 = StatsForecast(df = df_naive_train, 
                        models=[WindowAverage(window_size=56)],
                      freq='D', n_jobs=-1)

In [14]:
df_wa_56 = wa_56.forecast(h = 21)
df_wa_56 = df_wa_56.reset_index()

In [15]:
wa_56_sub = convert_to_sub_csv(df_wa_56, "WindowAverage")
# wa_56_sub.to_csv(EXPORT_PATH+"/submission_21.csv", header=True, index=False)

In [17]:
df_wa_42 = wa_42.forecast(h = 21)
df_wa_42 = df_wa_42.reset_index()

In [18]:
wa_42_sub = convert_to_sub_csv(df_wa_42, "WindowAverage")
# wa_42_sub.to_csv(EXPORT_PATH+"/submission_21.csv", header=True, index=False)

In [21]:
df_wa_28 = wa_28.forecast(h = 21)
df_wa_28 = df_wa_28.reset_index()

In [22]:
df_wa_28.head()

Unnamed: 0,unique_id,ds,WindowAverage
0,Beauty_1_001_Central_1,2016-05-01,0.357143
1,Beauty_1_001_Central_1,2016-05-02,0.357143
2,Beauty_1_001_Central_1,2016-05-03,0.357143
3,Beauty_1_001_Central_1,2016-05-04,0.357143
4,Beauty_1_001_Central_1,2016-05-05,0.357143


In [24]:
wa_28_sub = convert_to_sub_csv(df_wa_28, "WindowAverage")
# wa_28_sub.to_csv(EXPORT_PATH+"/submission_22.csv", header=True, index=False)