## Let's make a linear model 
We will use mean squared error (MSE) instead of SMAPE.

In [1]:
# Importing helpful libraries
import pandas as pd
import numpy as np
import pickle
import itertools
import gc
import math
import matplotlib.pyplot as plt
import dateutil.easter
from matplotlib.ticker import MaxNLocator, FormatStrFormatter, PercentFormatter
from datetime import datetime, date
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler, QuantileTransformer, LabelEncoder, minmax_scale
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LinearRegression, HuberRegressor, SGDRegressor
from sklearn.svm import SVR, LinearSVR

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the train and test CSV files into two separate frames
original_train_df, original_test_df = map(pd.read_csv, [
    '../input/tabular-playground-series-jan-2022/train.csv',
    '../input/tabular-playground-series-jan-2022/test.csv',
])

In [3]:
# read_csv leaves 'date' as plain strings: parse it to datetime and use it as
# the index as well, while keeping it available as a regular column (drop=False)
for df in (original_train_df, original_test_df):
    df['date'] = pd.to_datetime(df['date'])
    df.set_index('date', drop=False, inplace=True)
original_train_df.head(2)

Unnamed: 0_level_0,row_id,date,country,store,product,num_sold
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-01-01,0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329
2015-01-01,1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520


In [4]:
def smape_loss(y_true, y_pred):
    """Element-wise symmetric mean absolute percentage error (SMAPE), in percent.

    Computes ``200 * |y_true - y_pred| / (|y_true| + |y_pred|)`` for every
    element using NumPy operations; no averaging is performed, so the result
    has the same shape (and container type) as the broadcast inputs.

    Parameters
    ----------
    y_true, y_pred : scalar, numpy.ndarray or pandas.Series
        Ground-truth and predicted values.

    Returns
    -------
    Same kind as the inputs
        Per-element SMAPE in the range [0, 200]. Elements where both the
        truth and the prediction are zero yield 0 instead of NaN.
    """
    numerator = np.abs(y_true - y_pred)
    denominator = np.abs(y_true) + np.abs(y_pred)
    # The denominator is zero only when both values are zero, in which case the
    # numerator is zero as well. Bumping the denominator to 1 there turns the
    # undefined 0/0 into 0 while leaving every other element untouched.
    denominator = denominator + (denominator == 0)
    return numerator / denominator * 200


## Feature Engineering  

In [5]:
def feature_engineering(df):
    """Build the feature matrix used by the linear model.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a datetime ``date`` column and the categorical columns
        ``country``, ``store`` and ``product``.

    Returns
    -------
    pandas.DataFrame
        A new frame sharing ``df``'s index with, in this order:

        * ``daynum`` (days since 2015-01-01) and ``dayofyear``
        * ``wd1`` .. ``wd6``: weekday indicators (weekday 0 is the baseline)
        * ``dec27`` .. ``dec30``: indicators for the 27th to 30th of December
        * ``easter``, ``easter1`` .. ``easter3``: constant-zero columns
          (nothing in this function fills them in)
        * ``daynum2``: ``log(daynum + 80)``
        * one-hot indicators ``Finland``, ``Norway``, ``KaggleRama``,
          ``Kaggle Mug`` and ``Kaggle Sticker`` (last category of each
          variable is left out)
        * for k = 1..99: ``sin{k}``, ``cos{k}`` of the day of year, plus their
          products with the mug and sticker indicators
    """
    month = df.date.dt.month
    day = df.date.dt.day
    weekday = df.date.dt.weekday
    is_december = month == 12

    new_df = pd.DataFrame({
        'daynum': (df.date - datetime(2015, 1, 1)).dt.days,
        'dayofyear': df.date.dt.dayofyear,
        **{f'wd{w}': weekday == w for w in range(1, 7)},
        # Each indicator must test its own day of the month. Previously dec27
        # and dec28 both tested day 29, which produced three identical columns.
        # (Indicators for Dec 26, Dec 31 and a generic year-end flag were
        # commented out in the original and stay disabled.)
        **{f'dec{d}': is_december & (day == d) for d in (27, 28, 29, 30)},
        'easter': 0,
        'easter1': 0,
        'easter2': 0,
        'easter3': 0,
    })
    new_df['daynum2'] = np.log(new_df.daynum + 80)

    # One-hot encoding (no need to encode the last categories)
    for country in ['Finland', 'Norway']:
        new_df[country] = df.country == country
    new_df['KaggleRama'] = df.store == 'KaggleRama'
    for product in ['Kaggle Mug', 'Kaggle Sticker']:
        new_df[product] = df['product'] == product

    # Seasonal variations (Fourier series).
    # The 594 columns are collected in a dict and attached with a single
    # concat: inserting them one by one fragments the frame's internal blocks.
    # Plain NumPy arrays are used so no index alignment is involved.
    day_angle = new_df.dayofyear.to_numpy() / 365 * 2 * math.pi
    is_mug = new_df['Kaggle Mug'].to_numpy()
    is_sticker = new_df['Kaggle Sticker'].to_numpy()
    fourier = {}
    for k in range(1, 100):
        sin_k = np.sin(day_angle * k)
        cos_k = np.cos(day_angle * k)
        fourier[f'sin{k}'] = sin_k
        fourier[f'cos{k}'] = cos_k
        fourier[f'mug_sin{k}'] = sin_k * is_mug
        fourier[f'mug_cos{k}'] = cos_k * is_mug
        fourier[f'sticker_sin{k}'] = sin_k * is_sticker
        fourier[f'sticker_cos{k}'] = cos_k * is_sticker

    return pd.concat([new_df, pd.DataFrame(fourier, index=new_df.index)], axis=1)
# Build the feature matrices; the training frame additionally carries the
# date and the target column
train_df = feature_engineering(original_train_df)
train_df['date'] = original_train_df.date
train_df['num_sold'] = original_train_df.num_sold.astype(np.float32)
test_df = feature_engineering(original_test_df)

# The feature list is exactly the column set of the test frame (no date, no
# target); those columns are cast to float32 in both frames
features = test_df.columns
for df in [train_df, test_df]:
    df[features] = df[features].astype(np.float32)

# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in display() only added a stray "None" to the cell output
train_df.info()
train_df.head(3)

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 26298 entries, 2015-01-01 to 2018-12-31
Columns: 618 entries, daynum to num_sold
dtypes: datetime64[ns](1), float32(617)
memory usage: 62.3 MB


None

Unnamed: 0_level_0,daynum,dayofyear,wd1,wd2,wd3,wd4,wd5,wd6,dec27,dec28,...,sticker_sin98,sticker_cos98,sin99,cos99,mug_sin99,mug_cos99,sticker_sin99,sticker_cos99,date,num_sold
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-01-01,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.0,0.991114,-0.133015,0.991114,-0.133015,0.0,-0.0,2015-01-01,329.0
2015-01-01,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-0.0,0.991114,-0.133015,0.0,-0.0,0.0,-0.0,2015-01-01,520.0
2015-01-01,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.993257,-0.115935,0.991114,-0.133015,0.0,-0.0,0.991114,-0.133015,2015-01-01,146.0
