In [None]:
import torch
import random
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import lightning as pl
from pytorch_lightning import loggers 
import tensorflow as tf 
import tensorboard as tb 
# Point TensorFlow's gfile module at TensorBoard's stub implementation.
# NOTE(review): presumably the usual workaround for TensorBoard logging when
# TensorFlow is installed alongside it — confirm it is still required.
tf.io.gfile = tb.compat.tensorflow_stub.io.gfile
# Seed the Python, NumPy, TensorFlow and PyTorch random generators with the same value.
random.seed(30)
np.random.seed(30)
tf.random.set_seed(30)
torch.manual_seed(30)

torch.cuda.manual_seed(30)
# NOTE(review): callbacks are taken from `pytorch_lightning` while the trainer
# package is imported as `lightning` above — verify the two are not mixed in one Trainer.
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import NaNLabelEncoder,GroupNormalizer
from pytorch_forecasting.data.examples import generate_ar_data
# NOTE(review): `RMSE` is imported again on the next line from
# pytorch_forecasting.metrics; that later import is the binding left in scope.
# Verify that torchmetrics actually exports a name `RMSE`.
from torchmetrics import TweedieDevianceScore,RMSE
from pytorch_forecasting.metrics import RMSE
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters
import gc
from tqdm import tqdm
import holidays
# Device string: 'cuda' when PyTorch reports an available GPU, otherwise 'cpu'.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
import pickle

In [None]:
calendar= pd.read_csv("/content/calendar.csv")

# New Section

In [None]:
calendar.head()

In [None]:
calendar.describe()

In [None]:
print(calendar.event_name_1.unique())
print(calendar.event_type_1.unique())
print(calendar.event_name_2.unique())
print(calendar.event_type_2.unique())

In [None]:
calendar.dtypes

In [None]:
calendar.date=pd.to_datetime(calendar.date)


In [None]:
calendar.dtypes

In [None]:
price=pd.read_csv("/content/sell_prices.csv")

In [None]:
price.head()

In [None]:
price.shape

In [None]:
price.dtypes

In [None]:
price.store_id=price.store_id.astype("category")
price.item_id=price.item_id.astype("category")

In [None]:
price.dtypes

In [None]:
df= pd.read_csv("/content/sales_train_evaluation.csv")

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.dtypes

In [None]:
for d in range(1942,1970):
    col = 'd_' + str(d)
    df[col] = 0
    df[col] = df[col].astype(np.int16)


In [None]:
catcols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
df = pd.melt(df,
                  id_vars = catcols,
                  value_vars = [col for col in df.columns if col.startswith("d_")],
                  var_name = "d",
                  value_name = "sales")

In [None]:
df.head()

In [None]:
df.shape

In [None]:
# Extract the integer day number from labels of the form "d_<n>" (the text
# after the first underscore). The pandas string accessor replaces a
# Python-level loop over every row and avoids materialising a full-length
# temporary list of strings.
df['day'] = df['d'].str.split('_').str[1].astype(np.int16)

In [None]:
df_subset=df[df['day']>1441]

In [None]:
df_subset = df_subset.merge(calendar, on= "d", copy = False)
df_subset.head()

In [None]:
df_subset.columns

In [None]:
df_subset = df_subset.merge(price, on = ["store_id", "item_id", "wm_yr_wk"], copy = False)
df_subset.head()

In [None]:
df_subset.columns

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the narrowest dtype covering their range.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are examined; every other column is left untouched.

    Integer columns become the smallest of int8/int16/int32/int64 whose limits
    contain the column's min and max. The limits are inclusive, so a column whose
    extreme value equals a dtype limit (for example 127) still fits that dtype.

    Float columns become float16 or float32 when their range fits, otherwise
    float64. The decision is based on range only, so downcast floats can lose
    precision (float16 keeps roughly three significant decimal digits).

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When true, print the resulting memory usage and the
            percentage saved.

    Returns:
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidate dtypes, ordered from narrowest to widest.
    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_types:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No match (e.g. min/max are NaN for an empty column): leave as is.
        else:
            for candidate in float_types:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max are NaN/inf: use float64.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df
reduce_mem_usage(df_subset)

In [None]:
df_subset["avg_volume_by_state"] = df_subset.groupby(["day", "state_id"], observed=True).sales.transform("mean")
df_subset["avg_volume_by_store"] = df_subset.groupby(["day", "store_id"], observed=True).sales.transform("mean")
df_subset["avg_volume_by_product"] = df_subset.groupby(["day", "item_id"], observed=True).sales.transform("mean")


In [None]:
df_subset["avg_volume_by_dept"] = df_subset.groupby(["day", "dept_id"], observed=True).sales.transform("mean")
df_subset["log_num_sold"] = np.log(df_subset.sales + 1e-8)
df_subset["avg_volume_by_category"] = df_subset.groupby(["day", "cat_id"], observed=True).sales.transform("mean")
gc.collect()


In [None]:
df_subset.dtypes

In [None]:
df_subset["wm_yr_wk"] = df_subset.wm_yr_wk.astype(str).str[-2:].astype("category") 
df_subset["snap_CA"] = df_subset.snap_CA.astype(str).astype("category") 
df_subset["snap_TX"] = df_subset.snap_TX.astype(str).astype("category") 
df_subset["snap_WI"] = df_subset.snap_WI.astype(str).astype("category") 
df_subset["wday"] = df_subset.wday.astype(str).astype("category")
df_subset["month"] = df_subset.month.astype(str).astype("category")
df_subset["item_id"] = df_subset.item_id.astype(str).astype("category")
df_subset["dept_id"] = df_subset.dept_id.astype(str).astype("category")
df_subset["store_id"] = df_subset.store_id.astype(str).astype("category")
df_subset["cat_id"] = df_subset.cat_id.astype(str).astype("category")
df_subset["state_id"] = df_subset.state_id.astype(str).astype("category")
df_subset["sales"] = df_subset.sales.astype(np.float16)

In [None]:
df_subset.isna().sum(0)

In [None]:
# Replace every missing value in the frame with the string "normal".
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0 and
# raises AttributeError there.
df_subset.replace(np.nan, "normal", inplace=True)

In [None]:
df_subset.dtypes

In [None]:
train= df_subset[df_subset['day']<=1941]
test=df_subset[df_subset['day']>1941]

max_prediction_length = 28
max_encoder_length = 472
training_cutoff = train["day"].max() - max_prediction_length

# Build the training dataset from all rows up to `training_cutoff`.
training = TimeSeriesDataSet(
    train[lambda x: x.day <= training_cutoff],
    time_idx="day",
    target="sales",
    group_ids=["store_id", "dept_id", "item_id"],
    # Shortest accepted history is half the forecast horizon.
    min_encoder_length=max_prediction_length // 2,
    # NOTE(review): the lags below shorten every series by the largest lag, so
    # this maximum may never be reached with the available history — confirm.
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    # Static variables must be constant within a series; only the identifier
    # attributes qualify.
    static_categoricals=["state_id", "dept_id", "store_id", "item_id"],
    # The snap_* flags come from the per-day calendar merge, so they change
    # over time and are known in advance. Declaring them static would keep a
    # single value per sample and discard the daily variation.
    time_varying_known_categoricals=[
        "wm_yr_wk", "wday", "month",
        "snap_CA", "snap_TX", "snap_WI",
    ],
    time_varying_known_reals=["day", "sell_price"],
    time_varying_unknown_categoricals=[],
    time_varying_unknown_reals=[
        "sales", "log_num_sold", "avg_volume_by_state",
        "avg_volume_by_store", "avg_volume_by_category", "avg_volume_by_dept", "avg_volume_by_product"
    ],
    # Softplus transformation, normalised separately for every series.
    target_normalizer=GroupNormalizer(
        groups=["store_id", "dept_id", "item_id"], transformation="softplus"
    ),
    # Lagged copies of the target at 7, 14 and 28 steps.
    lags={'sales': [7, 14, 28]},
    add_encoder_length=True,
    add_relative_time_idx=True,
    add_target_scales=True,
    # add_nan=True reserves a category for values not seen during fitting.
    categorical_encoders={
        'item_id': NaNLabelEncoder(add_nan=True),
        'wm_yr_wk': NaNLabelEncoder(add_nan=True),
        'wday': NaNLabelEncoder(add_nan=True),
        'month': NaNLabelEncoder(add_nan=True),
    },
)

# Validation dataset: reuse the training configuration on the full training
# frame; predict=True targets the last max_prediction_length points of each series.
validation = TimeSeriesDataSet.from_dataset(
    training,
    train,
    predict=True,
    stop_randomization=True,
)

# Data loaders for the model; the validation loader uses a ten-times larger batch.
batch_size = 128  # set this between 32 to 128
train_dataloader = training.to_dataloader(
    train=True,
    batch_size=batch_size,
    num_workers=0,
)
val_dataloader = validation.to_dataloader(
    train=False,
    batch_size=batch_size * 10,
    num_workers=0,
)

In [None]:
# Collect the validation targets batch by batch into a single tensor.
actuals = torch.cat([y for x, (y, weight) in iter(val_dataloader)])
# Predictions of the Baseline model, moved to the targets' device so the
# element-wise arithmetic below cannot fail on a CPU/GPU mismatch.
baseline_predictions = Baseline().predict(val_dataloader).to(actuals.device)
# Mean absolute error of the baseline. It is printed explicitly because a bare
# expression in the middle of a cell is evaluated and then discarded.
baseline_mae = (actuals - baseline_predictions).abs().mean().item()
print(f"MAE for naive prediction on validation: {baseline_mae}")

# torchmetrics metrics are called as metric(preds, target) and return a single
# aggregated value; they have no per-element `.loss()` method.
sm = TweedieDevianceScore()
print(f"Tweedie deviance for naive prediction on validation: {sm(baseline_predictions, actuals).item()}")

# Per-series summary: with the metric's default power=0 the unit deviance is
# the squared error, so average it over the horizon for each series and take
# the median across series.
squared_error = (baseline_predictions - actuals) ** 2
print(f"Median loss for naive prediction on validation: {squared_error.mean(dim=1).median().item()}")