In [3]:
import pandas as pd
import numpy as np
import pickle
import itertools
import gc
import math
import matplotlib.pyplot as plt
import dateutil.easter as easter
from matplotlib.ticker import MaxNLocator, FormatStrFormatter, PercentFormatter
from datetime import datetime, date, timedelta
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import GroupKFold
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_percentage_error
import scipy.stats

In [6]:
NO_STORE = True

# Single place to edit when the project folder moves; every input file below
# is resolved relative to it.
DATA_DIR = '/home/onyxia/work/Forecasting_Sticker_Sales'

# Train / test tables
original_train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
original_test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

# GDP-per-capita table, indexed by its 'year' column.
# NOTE(review): the head displayed further down shows 'year' holding 0, 1, 2, ...
# while the calendar years sit in 'GDP_Year' -- confirm 'year' is the intended index.
gdp_df = pd.read_csv(f'{DATA_DIR}/linear_model/filtered_gdp_per_capita.csv').set_index('year')

# CCI indicator table, indexed by (LOCATION, TIME)
cci_df = pd.read_csv(
    f'{DATA_DIR}/linear_model/DP_LIVE_21012022073653464.csv'
).set_index(['LOCATION', 'TIME'])

# The dates are read as strings and must be converted
for df in [original_train_df, original_test_df]:
    df['date'] = pd.to_datetime(df.date)
original_train_df.head(6)

Unnamed: 0,id,date,country,store,product,num_sold
0,0,2010-01-01,Canada,Discount Stickers,Holographic Goose,
1,1,2010-01-01,Canada,Discount Stickers,Kaggle,973.0
2,2,2010-01-01,Canada,Discount Stickers,Kaggle Tiers,906.0
3,3,2010-01-01,Canada,Discount Stickers,Kerneler,423.0
4,4,2010-01-01,Canada,Discount Stickers,Kerneler Dark Mode,491.0
5,5,2010-01-01,Canada,Stickers for Less,Holographic Goose,300.0


In [7]:
# Preview the first rows of each input table, in the order they were loaded
display(original_train_df.head(), gdp_df.head(), cci_df.head())

Unnamed: 0,id,date,country,store,product,num_sold
0,0,2010-01-01,Canada,Discount Stickers,Holographic Goose,
1,1,2010-01-01,Canada,Discount Stickers,Kaggle,973.0
2,2,2010-01-01,Canada,Discount Stickers,Kaggle Tiers,906.0
3,3,2010-01-01,Canada,Discount Stickers,Kerneler,423.0
4,4,2010-01-01,Canada,Discount Stickers,Kerneler Dark Mode,491.0


Unnamed: 0_level_0,GDP_Year,GDP_GDP_Canada,GDP_GDP_Finland,GDP_GDP_Italy,GDP_GDP_Kenya,GDP_GDP_Norway,GDP_GDP_Singapore
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,2010,47562.083425,46459.973255,36000.520118,1080.296184,87693.790066,47236.960235
1,2011,52223.696112,51081.99767,38599.062207,1085.487152,100600.562408,53890.428727
2,2012,52669.089963,47710.790217,35053.526244,1271.815383,101524.141852,55546.488539
3,2013,52635.174958,49878.043244,35549.974697,1354.820833,102913.450844,56967.425794
4,2014,50955.998323,50260.299859,35518.415292,1462.220052,97019.182753,57562.530794


Unnamed: 0_level_0,Unnamed: 1_level_0,INDICATOR,SUBJECT,MEASURE,FREQUENCY,Value,Flag Codes
LOCATION,TIME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
NLD,1973-01,CCI,AMPLITUD,LTRENDIDX,M,101.5028,
NLD,1973-02,CCI,AMPLITUD,LTRENDIDX,M,101.4815,
NLD,1973-03,CCI,AMPLITUD,LTRENDIDX,M,101.3081,
NLD,1973-04,CCI,AMPLITUD,LTRENDIDX,M,101.0173,
NLD,1973-05,CCI,AMPLITUD,LTRENDIDX,M,100.8456,


In [21]:
# Distinct codes found in the 'LOCATION' level (first level) of the CCI index
unique_countries = cci_df.index.get_level_values('LOCATION').unique()
print(f"Number of countries: {unique_countries.size}")
print("Countries:", list(unique_countries))

Number of countries: 44
Countries: ['NLD', 'CHE', 'FRA', 'POL', 'CZE', 'JPN', 'OECDE', 'AUS', 'OECD', 'SWE', 'MEX', 'GBR', 'ZAF', 'USA', 'HUN', 'PRT', 'DNK', 'ESP', 'LUX', 'GRC', 'BRA', 'SVK', 'CHN', 'BEL', 'FIN', 'NZL', 'G-7', 'IDN', 'TUR', 'AUT', 'ITA', 'IRL', 'SVN', 'DEU', 'KOR', 'EST', 'EA19', 'ISR', 'RUS', 'LVA', 'LTU', 'COL', 'CHL', 'CRI']


In [22]:
def mape(y_true, y_pred):
    """Return the mean absolute percentage error, expressed in percent.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted values, aligned element-wise with ``y_true``.

    Returns
    -------
    float
        ``mean_absolute_percentage_error(y_true, y_pred)`` multiplied by 100.
    """
    # Use the function's own arguments (not names from the enclosing scope) and
    # scale the value returned by scikit-learn by 100 to report a percentage.
    return mean_absolute_percentage_error(y_true, y_pred) * 100

### Feature Engineering

In [23]:
# List the column labels of the CCI table (the index levels are not included)
cci_df.columns

Index(['INDICATOR', 'SUBJECT', 'MEASURE', 'FREQUENCY', 'Value', 'Flag Codes'], dtype='object')

In [26]:
# Print the distinct values of each CCI column, one column per line of output
for column_name in ('INDICATOR', 'SUBJECT', 'MEASURE', 'FREQUENCY', 'Value', 'Flag Codes'):
    print(cci_df[column_name].unique())

['CCI']
['AMPLITUD']
['LTRENDIDX']
['M']
[101.5028  101.4815  101.3081  ...  99.4906   99.57128  99.61041]
[nan]


In [24]:
# Countries to look up in the CCI table
countries = ['Canada', 'Finland', 'Italy', 'Kenya', 'Norway', 'Singapore']

# Country name -> ISO Alpha-3 code
country_codes = {
    'Canada': 'CAN', 'Finland': 'FIN', 'Italy': 'ITA',
    'Kenya': 'KEN', 'Norway': 'NOR', 'Singapore': 'SGP',
}

# Values of the 'LOCATION' index level
locations_in_index = cci_df.index.get_level_values('LOCATION')

# One membership flag per country; a name without a code mapping is looked up as-is
has_cci_data = {
    name: country_codes.get(name, name) in locations_in_index
    for name in countries
}

# Split the countries by that flag, keeping their original order
existing_countries = [name for name, found in has_cci_data.items() if found]
missing_countries = [name for name, found in has_cci_data.items() if not found]

print("Existing countries:", existing_countries)
print("Missing countries:", missing_countries)

Existing countries: ['Finland', 'Italy']
Missing countries: ['Canada', 'Kenya', 'Norway', 'Singapore']


In [None]:
def engineer(df):
    """Return a new dataframe with the engineered features"""
    
    def get_gdp(row):
        country = 'GDP_' + row.country
        return gdp_df.loc[row.date.year, country]

    def get_cci(row):
        country = row.country
        time = f"{row.date.year}-{row.date.month:02d}"
        return cci_df.loc[country[:3].upper(), time].Value

    new_df = pd.DataFrame({'gdp': np.log(df.apply(get_gdp, axis=1)),
                           'cci': df.apply(get_cci, axis=1),
                           'wd4': df.date.dt.weekday == 4, # Friday
                           'wd56': df.date.dt.weekday >= 5, # Saturday and Sunday
                          })

    