# Naive approach random_forest + optuna - holidays

In [1]:
model_name = 'ltsm_pytorch'

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

data_path = "./data/"
df_train = pd.read_csv(data_path + "train.csv")
df_test = pd.read_csv(data_path + "test.csv")

# drop id
df_train.drop('id', axis=1, inplace=True)

df_train.head(10)

Unnamed: 0,date,country,store,product,num_sold
0,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Improve Your Coding,63
1,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Train More LLMs,66
2,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Win Friends and Influence People,9
3,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Win More Kaggle Competitions,59
4,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Write Better,49
5,2017-01-01,Argentina,Kaggle Store,Using LLMs to Improve Your Coding,88
6,2017-01-01,Argentina,Kaggle Store,Using LLMs to Train More LLMs,98
7,2017-01-01,Argentina,Kaggle Store,Using LLMs to Win Friends and Influence People,14
8,2017-01-01,Argentina,Kaggle Store,Using LLMs to Win More Kaggle Competitions,83
9,2017-01-01,Argentina,Kaggle Store,Using LLMs to Write Better,69


### Features

In [3]:
# helper functions
import requests
import holidays

# magical constants

def add_magical_constants(df):
    coef_c = {'Argentina': 4.23, 'Spain': 1.500, 'Japan': 1.14, 'Estonia': 1.62, 'Canada': 0.87}
    for c, country in enumerate(df['country'].unique()):
        df.loc[(df['country'] == country), 'num_sold'] *= coef_c[country]
    return df

# gdp feature

def get_gdp_per_capita(country,year):
    alpha3 = {'Argentina':'ARG','Canada':'CAN','Estonia':'EST','Japan':'JPN','Spain':'ESP'}
    url="https://api.worldbank.org/v2/country/{0}/indicator/NY.GDP.PCAP.CD?date={1}&format=json".format(alpha3[country],year)
    response = requests.get(url).json()
    return response[1][0]['value']

def create_gdp_df(df):
    gdp = []
    for country in df.country.unique():
        row = []
        for year in range(2017,2023):
            row.append(get_gdp_per_capita(country,year))
        gdp.append(row)

    gdp = np.array(gdp)
    gdp /= np.sum(gdp,axis=0)

    rel_gdp_df = pd.DataFrame(gdp,index=df.country.unique(),columns=range(2017,2023))
    return rel_gdp_df

def add_gdp_feature(df):
    rel_gdp_df_0 = create_gdp_df(df)
    rel_gdp_df = rel_gdp_df_0.reset_index(names="country")
    rel_gdp_df = pd.melt(rel_gdp_df, id_vars='country', value_vars=[2017, 2018, 2019, 2020, 2021, 2022])
    rel_gdp_df.columns = ['country', 'year', 'rel_gdp']
    rel_gdp_df['year'] = rel_gdp_df['year'].astype(int)
    df = df.merge(rel_gdp_df, on=['year', 'country'], how='left')
    return df

# holidays feature

def create_holidays_df():
    years = [2017, 2018, 2019, 2020, 2021, 2022, 2023]

    countries = {
        "AR": "Argentina",
        "CA": "Canada",
        "EE": "Estonia",
        "ES": "Spain",
        "JP": "Japan"
    }


    Argentina_holidays = holidays.CountryHoliday('AR', years=years)
    Canada_holidays = holidays.CountryHoliday('CA', years=years)
    Estonia_holidays = holidays.CountryHoliday('EE', years=years)
    Spain_holidays = holidays.CountryHoliday('ES', years=years)
    Japan_holidays = holidays.CountryHoliday('JP', years=years)

    all_holidays = [Argentina_holidays, Canada_holidays, Estonia_holidays, Spain_holidays, Japan_holidays]



    holidays_dfs = []
    for country_holidays in all_holidays:
        holidays_dates = []
        for date in country_holidays:
            holidays_dates.append(date)

        holidays_dates = list(set(holidays_dates))

        hdf = pd.DataFrame(holidays_dates, columns=['date'])
        hdf['country'] = countries[country_holidays.country]

        holidays_dfs.append(hdf)

    holidays_df = pd.concat(holidays_dfs)

    holidays_df['is_holiday'] = True

    holidays_df['date'] = pd.to_datetime(holidays_df['date'])
    return holidays_df

def add_holiday_feature(df):
    holidays_df = create_holidays_df()
    df = df.merge(holidays_df, on=['date', 'country'], how='left')
    df['is_holiday'] = df['is_holiday'].fillna(False)
    df['is_holiday'] = df['is_holiday'].astype(int)
    return df


def transform_date(df):
    df['date'] = pd.to_datetime(df['date'])
    return df

def create_features(df):
    # split date into year, month, day
    df['year'] = df['date'].dt.year
    df['month'] = df['date'].dt.month
    df['day'] = df['date'].dt.day

    # week number
    df['week_number'] = df['date'].dt.isocalendar().week.astype(int)
    
    # day of week
    df['dayofweek'] = df['date'].dt.dayofweek
    df['dayofmonth'] = df['date'].dt.day
    # weekend
    df['weekend'] = (df['date'].dt.weekday >=4).astype(int)

    df['country'] = df['country'].astype('category')
    df['store'] = df['store'].astype('category')
    df['product'] = df['product'].astype('category')

    # remove march 2020 - june 2020
    #df = df[(df['date'] < '2020-03-01') | (df['date'] > '2020-06-30')]

    return df

def encode_cat_variables(df):
    categorical_features = ["country", "store", "product"]
    df = pd.get_dummies(df, columns=categorical_features)
    return df

def seasonality_features(df):
    df['month_sin'] = np.sin(2*np.pi*df.month/12)
    df['month_cos'] = np.cos(2*np.pi*df.month/12)
    df['day_sin'] = np.sin(2*np.pi*df.dayofmonth/31)
    df['day_cos'] = np.cos(2*np.pi*df.dayofmonth/31)
    return df

def SMAPE(A, F):
    return 100/len(A) * np.sum(2 * np.abs(F - A) / (np.abs(A) + np.abs(F)))

In [16]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

import torch
from torch.nn import functional as F
from torch import nn
from torch.utils.data import TensorDataset, DataLoader

from torchvision import datasets, transforms

import lightning.pytorch as pl
from torchmetrics.functional.regression import mean_squared_error

class TimeseriesDataModule(pl.LightningDataModule):
    '''
    PyTorch Lighting DataModule subclass:
    https://pytorch-lightning.readthedocs.io/en/latest/datamodules.html

    Serves the purpose of aggregating all data loading 
      and processing work in one place.
    '''
    
    def __init__(self, seq_len = 1, batch_size = 128, num_workers=0):
        super().__init__()
        self.seq_len = seq_len
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.X_train = None
        self.y_train = None
        self.X_val = None
        self.y_val = None
        self.X_test = None
        self.X_test = None
        self.columns = None
        self.preprocessing = None

    def prepare_data(self):
        pass

    def setup(self, stage=None):
        '''
        Data is resampled to hourly intervals.
        Both 'np.nan' and '?' are converted to 'np.nan'
        'Date' and 'Time' columns are merged into 'dt' index
        '''

        if stage == 'fit' and self.X_train is not None:
            return 
        if stage == 'test' and self.X_test is not None:
            return
        if stage is None and self.X_train is not None and self.X_test is not None:  
            return

        data_path = "./data/"
        df_train = pd.read_csv(data_path + "train.csv")
        df_test = pd.read_csv(data_path + "test.csv")

        # drop id
        df_train.drop('id', axis=1, inplace=True)

        df_train = transform_date(df_train)
        df_train = create_features(df_train)
        df_train = seasonality_features(df_train)
        df_train = add_gdp_feature(df_train)
        df_train = add_holiday_feature(df_train)
        df_train = encode_cat_variables(df_train)


        #df_test_0 = transform_date(df_test)
        #df_test_0 = create_features(df_test_0)
        #df_test_0 = seasonality_features(df_test_0)
        #df_test_0 = add_gdp_feature(df_test_0)
        #df_test_0 = add_holiday_feature(df_test_0)
        #df_test_0 = encode_cat_variables(df_test_0)

        model_features = df_train.columns.tolist()
        model_features.remove("num_sold")
        model_features.remove("date")

        # we pick the last half year of 2021 as validation set

        X_train = df_train[df_train["date"] < "2021-06-01"][model_features]
        y_train = df_train[df_train["date"] < "2021-06-01"]["num_sold"]

        X_val = df_train[df_train["date"] >= "2021-06-01"][model_features]
        y_val = df_train[df_train["date"] >= "2021-06-01"]["num_sold"]

        cat_features_indices = np.where((X_train.dtypes == "category") | (X_train.dtypes == "object"))[0]
        cat_features_indices


        preprocessing = StandardScaler()
        preprocessing.fit(X_train)

        if stage == 'fit' or stage is None:
            self.X_train = preprocessing.transform(X_train)
            self.y_train = y_train.values.reshape((-1, 1))
            self.X_val = preprocessing.transform(X_val)
            self.y_val = y_val.values.reshape((-1, 1))
        

    def train_dataloader(self):
        train_dataset = TimeSeriesDataSet(self.X_train, 
                                          self.y_train, 
                                          seq_len=self.seq_len)
        train_loader = DataLoader(train_dataset, 
                                  batch_size = self.batch_size, 
                                  shuffle = False, 
                                  num_workers = self.num_workers)
        
        return train_loader

    def val_dataloader(self):
        val_dataset = TimeSeriesDataSet(self.X_val, 
                                        self.y_val, 
                                        seq_len=self.seq_len)
        val_loader = DataLoader(val_dataset, 
                                batch_size = self.batch_size, 
                                shuffle = False, 
                                num_workers = self.num_workers)

        return val_loader


ImportError: cannot import name 'TimeSeriesDataSet' from 'torch.utils.data' (c:\Users\Wiktor\AppData\Local\Programs\Python\Python310\lib\site-packages\torch\utils\data\__init__.py)

In [None]:
class LSTMRegressor(pl.LightningModule):
    '''
    Standard PyTorch Lightning module:
    https://pytorch-lightning.readthedocs.io/en/latest/lightning_module.html
    '''
    def __init__(self, 
                 n_features, 
                 hidden_size, 
                 seq_len, 
                 batch_size,
                 num_layers, 
                 dropout, 
                 learning_rate,
                 criterion):
        super(LSTMRegressor, self).__init__()
        self.n_features = n_features
        self.hidden_size = hidden_size
        self.seq_len = seq_len
        self.batch_size = batch_size
        self.num_layers = num_layers
        self.dropout = dropout
        self.criterion = criterion
        self.learning_rate = learning_rate

        self.lstm = nn.LSTM(input_size=n_features, 
                            hidden_size=hidden_size,
                            num_layers=num_layers, 
                            dropout=dropout, 
                            batch_first=True)
        self.linear = nn.Linear(hidden_size, 1)
        
    def forward(self, x):
        # lstm_out = (batch_size, seq_len, hidden_size)
        lstm_out, _ = self.lstm(x)
        y_pred = self.linear(lstm_out[:,-1])
        return y_pred
    
    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)

    def training_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.criterion(y_hat, y)
        result = pl.TrainResult(loss)
        result.log('train_loss', loss)
        return result

    def validation_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.criterion(y_hat, y)
        result = pl.EvalResult(checkpoint_on=loss)
        result.log('val_loss', loss)
        return result
    
    def test_step(self, batch, batch_idx):
        x, y = batch
        y_hat = self(x)
        loss = self.criterion(y_hat, y)
        result = pl.EvalResult()
        result.log('test_loss', loss)
        return result

In [9]:
params = dict(
    seq_len = 24,
    batch_size = 70, 
    criterion = nn.MSELoss(),
    max_epochs = 10,
    n_features = 7,
    hidden_size = 100,
    num_layers = 1,
    dropout = 0.2,
    learning_rate = 0.001,
)

In [14]:
import torch
import torch.nn as nn

from torch.utils.data import Dataset, DataLoader

import lightning.pytorch as pl
from lightning.pytorch.loggers import TensorBoardLogger

logger = TensorBoardLogger("lightning_logs", name="playground-series")

trainer = pl.Trainer(
    max_epochs=params['max_epochs'], 
    precision=16, 
    accelerator="gpu",
    logger=logger,
)

model = LSTMRegressor(
    n_features = params['n_features'],
    hidden_size = params['hidden_size'],
    seq_len = params['seq_len'],
    batch_size = params['batch_size'],
    criterion = params['criterion'],
    num_layers = params['num_layers'],
    dropout = params['dropout'],
    learning_rate = params['learning_rate']
)

dm = TimeseriesDataModule(
    seq_len = params['seq_len'],
    batch_size = params['batch_size']
)

trainer.fit(model, dm)
#trainer.test(model, datamodule=dm)

  rank_zero_warn(
Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


NameError: name 'LSTMRegressor' is not defined