In [1]:
import math
from functools import lru_cache

import numpy as np
import pandas as pd

In [2]:
# Daily sales per (country, store, product); `date` is parsed to datetime on load.
train_df = pd.read_csv("data/train.csv", parse_dates=['date'])
test_df = pd.read_csv("data/test.csv", parse_dates=['date'])

# Yearly GDP table, indexed by year so it can be looked up as gdp_df.loc[year, column].
gdp_df = pd.read_csv("data/GDP_data_2015_to_2019_Finland_Norway_Sweden.csv").set_index('year')
gdp_df.head()

Unnamed: 0_level_0,GDP_Finland,GDP_Norway,GDP_Sweden
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015,234.44,385.802,505.104
2016,240.608,368.827,515.655
2017,255.017,398.394,541.019
2018,275.58,437.0,555.455
2019,268.782,405.51,533.88


In [3]:
# Preview the first rows of the training data.
train_df.head()

Unnamed: 0,row_id,date,country,store,product,num_sold
0,0,2015-01-01,Finland,KaggleMart,Kaggle Mug,329
1,1,2015-01-01,Finland,KaggleMart,Kaggle Hat,520
2,2,2015-01-01,Finland,KaggleMart,Kaggle Sticker,146
3,3,2015-01-01,Finland,KaggleRama,Kaggle Mug,572
4,4,2015-01-01,Finland,KaggleRama,Kaggle Hat,911


In [4]:
def smape_loss(y_true, y_pred):
    """Element-wise symmetric mean absolute percentage error, in percent.

    Computes ``200 * |y_true - y_pred| / (|y_true| + |y_pred|)`` for each
    pair of values. No averaging is done here; take the mean of the result
    to obtain the SMAPE score.

    Parameters
    ----------
    y_true, y_pred : scalar, numpy array or pandas Series
        Actual and predicted values (broadcastable against each other).

    Returns
    -------
    Same container type as the inputs, with values in ``[0, 200]``.
    Pairs where both values are zero score 0 instead of NaN.
    """
    abs_error = np.abs(y_true - y_pred)
    # Use |y_true| so the measure stays symmetric and bounded for negative inputs.
    scale = np.abs(y_true) + np.abs(y_pred)
    # scale == 0 implies both values are 0, hence abs_error == 0 as well;
    # bumping the denominator to 1 there turns 0/0 (NaN) into 0.
    safe_scale = scale + (scale == 0)
    return abs_error / safe_scale * 200

# Time step feature
https://www.kaggle.com/c/tabular-playground-series-jan-2022/discussion/298831

In [5]:
def time_step_feature(df):
    """Add an integer ``time_step`` column to ``df`` in place.

    ``time_step`` is the number of whole days between each row's ``date``
    and 2015-01-01 (negative for earlier dates).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime64 ``date`` column. Modified in place.

    Returns
    -------
    None
    """
    t0 = pd.Timestamp('2015-01-01')
    # `.dt.days` replaces `.astype('timedelta64[D]').astype(np.int)`:
    # `np.int` was removed in NumPy 1.24 and pandas 2.x rejects casts to
    # day-resolution timedeltas.
    df['time_step'] = (df.date - t0).dt.days.astype(np.int64)

# EDA which makes sense
https://www.kaggle.com/ambrosm/tpsjan22-01-eda-which-makes-sense/notebook

# Official public holidays and unofficial days, Norway, Sweden, Finland
https://www.kaggle.com/c/tabular-playground-series-jan-2022/discussion/298990

In [6]:
# Table of holiday events per country (columns: date, country, event, type).
# Named `holidays_df` rather than `holidays` so it is not overwritten by, and
# does not overwrite, the `holidays` package imported further down the notebook.
holidays_df = pd.read_csv("data/holidays.csv", parse_dates=['date'])
print(holidays_df.head(), end='\n\n')
# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print() (that would emit a stray "None" line).
holidays_df.info()

        date country            event        type
0 2015-01-01  Norway   New Year's Day      public
1 2015-02-08  Norway     Mother's Day  unofficial
2 2015-02-14  Norway  Valentine's Day  unofficial
3 2015-03-20  Norway   Spring Equinox       other
4 2015-03-29  Norway      Palm Sunday  unofficial

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 432 entries, 0 to 431
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   date     432 non-null    datetime64[ns]
 1   country  432 non-null    object        
 2   event    432 non-null    object        
 3   type     432 non-null    object        
dtypes: datetime64[ns](1), object(3)
memory usage: 13.6+ KB
None


# Days until next holiday
https://www.kaggle.com/c/tabular-playground-series-jan-2022/discussion/298411

In [7]:
import holidays

@lru_cache(maxsize=None)
def _holiday_dates_from(country, year):
    """Return the holiday dates of ``country`` for ``year`` and ``year + 1`` as a tuple.

    Cached per (country, year): building the calendar is by far the most
    expensive step and the result is identical for every date in that year.
    """
    return tuple(holidays.CountryHoliday(country, years=[year, year + 1]))


def days_til_next_holi(country, date):
    """Number of days from ``date`` to the next holiday in ``country``.

    Parameters
    ----------
    country : str
        Country identifier understood by ``holidays.CountryHoliday``.
    date : datetime.date
        Reference day.

    Returns
    -------
    int
        0 if ``date`` itself is a holiday, otherwise the day count to the
        first later holiday. The following year is included in the search so
        that dates after the last holiday of a year still find one.

    Raises
    ------
    ValueError
        If no holiday on or after ``date`` exists in the two-year window.
    """
    candidates = _holiday_dates_from(country, date.year)
    next_date = min(day for day in candidates if day >= date)
    return (next_date - date).days

@lru_cache(maxsize=None)
def _holiday_calendar(country, year):
    """Return the holiday calendar of ``country`` for a single ``year``.

    Cached per (country, year) so the calendar is built once instead of once
    per row. The returned object is shared between calls and is only ever
    queried for membership with dates from that same year.
    """
    return holidays.CountryHoliday(country, years=year)


def is_holi(country, date):
    """Tell whether ``date`` is a holiday in ``country``.

    Parameters
    ----------
    country : str
        Country identifier understood by ``holidays.CountryHoliday``.
    date : datetime.date
        Day to check.

    Returns
    -------
    bool
    """
    return date in _holiday_calendar(country, date.year)

def add_holidays(df):
    """Return a copy of ``df`` extended with two holiday features.

    ``df`` needs a ``country`` column and a datetime ``date`` column.
    Added columns:

    * ``isHoliday`` - result of ``is_holi`` for the row's country and day.
    * ``daysTillHoliday`` - result of ``days_til_next_holi`` for the same pair.

    The input frame is left untouched.
    """
    def _holiday_flag(row):
        return is_holi(row['country'], row['date'].date())

    def _days_ahead(row):
        return days_til_next_holi(row['country'], row['date'].date())

    enriched = df.copy()
    enriched['isHoliday'] = enriched.apply(_holiday_flag, axis=1)
    enriched['daysTillHoliday'] = enriched.apply(_days_ahead, axis=1)
    return enriched

# Feature engineering from linear model
https://www.kaggle.com/ambrosm/tpsjan22-03-linear-model

In [8]:
def engineer(df):
    """Build the model feature frame for a raw sales frame.

    ``df`` must provide ``date`` (datetime), ``country``, ``store`` and
    ``product`` columns. The returned frame shares ``df``'s index and holds,
    in this order:

    * ``gdp`` - natural log of the GDP looked up in ``gdp_df`` by year/country
    * ``wd4`` / ``wd56`` - Friday flag and weekend flag
    * ``isHoliday`` / ``daysTillHoliday`` - from ``add_holidays``
    * indicator columns ``Finland``, ``Norway``, ``KaggleRama``,
      ``Kaggle Mug``, ``Kaggle Hat``
    * for k = 0, 1, 2: ``sin{k}``, ``cos{k}`` of the day of year plus their
      products with the mug and hat indicators
    """
    src = df.copy()

    def _gdp_lookup(row):
        # gdp_df is indexed by year, with one 'GDP_<country>' column per country.
        return gdp_df.loc[row.date.year, 'GDP_' + row.country]

    weekday = src.date.dt.weekday
    feats = pd.DataFrame({
        'gdp': np.log(src.apply(_gdp_lookup, axis=1)),
        'wd4': weekday == 4,    # Friday
        'wd56': weekday >= 5,   # Saturday and Sunday
    })

    # add_holidays needs country/date; attach them temporarily, then remove.
    feats = add_holidays(feats.assign(country=src['country'], date=src['date']))
    feats = feats.drop(columns=['country', 'date'])

    # Indicator columns; the omitted level of each category is the baseline.
    feats['Finland'] = src.country == 'Finland'
    feats['Norway'] = src.country == 'Norway'
    feats['KaggleRama'] = src.store == 'KaggleRama'
    feats['Kaggle Mug'] = src['product'] == 'Kaggle Mug'
    feats['Kaggle Hat'] = src['product'] == 'Kaggle Hat'

    # Fourier terms for the yearly seasonality, plus product-specific copies.
    # NOTE(review): k = 0 yields constant sin0 == 0 and cos0 == 1 columns;
    # confirm whether the harmonics were meant to start at k = 1.
    base_angle = src.date.dt.dayofyear / 365 * 2 * math.pi
    product_terms = (('mug', 'Kaggle Mug'), ('hat', 'Kaggle Hat'))
    for k in range(3):
        feats[f'sin{k}'] = np.sin(base_angle * k)
        feats[f'cos{k}'] = np.cos(base_angle * k)
        for prefix, indicator in product_terms:
            feats[f'{prefix}_sin{k}'] = feats[f'sin{k}'] * feats[indicator]
            feats[f'{prefix}_cos{k}'] = feats[f'cos{k}'] * feats[indicator]

    return feats

In [9]:
# Engineer features for both splits; only the training frame carries the target.
new_train_df = engineer(train_df).assign(
    num_sold=train_df['num_sold'].astype(np.float32)
)
new_test_df = engineer(test_df)

# Shown before the cast below, so the indicator columns still print as booleans.
print(new_train_df.head())

# The test frame has no target, so its columns are exactly the feature names.
features = new_test_df.columns
for df in (new_train_df, new_test_df):
    # Convert every feature (booleans included) to float32.
    df[features] = df[features].astype(np.float32)
print(list(features))

      gdp    wd4   wd56  isHoliday  daysTillHoliday  Finland  Norway  \
0  5.4572  False  False       True                0     True   False   
1  5.4572  False  False       True                0     True   False   
2  5.4572  False  False       True                0     True   False   
3  5.4572  False  False       True                0     True   False   
4  5.4572  False  False       True                0     True   False   

   KaggleRama  Kaggle Mug  Kaggle Hat  ...  mug_cos1  hat_sin1  hat_cos1  \
0       False        True       False  ...  0.999852  0.000000  0.000000   
1       False       False        True  ...  0.000000  0.017213  0.999852   
2       False       False       False  ...  0.000000  0.000000  0.000000   
3        True        True       False  ...  0.999852  0.000000  0.000000   
4        True       False        True  ...  0.000000  0.017213  0.999852   

       sin2      cos2  mug_sin2  mug_cos2  hat_sin2  hat_cos2  num_sold  
0  0.034422  0.999407  0.034422  0.9

## Took a while to load new data, save it:

In [10]:
# Write the engineered train/test feature frames to CSV so the slow
# feature-engineering step does not have to be repeated.
# NOTE(review): to_csv is called without index=False, so the row index is
# presumably written as an extra leading column — confirm that is intended
# when these files are read back.
new_train_df.to_csv("Featured_Train.csv")
new_test_df.to_csv("Featured_Test.csv")