# feature importance

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Folder holding the CSV files, relative to the notebook.
data_path = "./data/"

# Read both splits in one pass: train first, then test.
df_train, df_test = (
    pd.read_csv(data_path + file_name) for file_name in ("train.csv", "test.csv")
)

# drop id (training frame only; the test frame keeps its id column)
df_train = df_train.drop(columns='id')

df_train.head(10)

Unnamed: 0,date,country,store,product,num_sold
0,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Improve Your Coding,63
1,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Train More LLMs,66
2,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Win Friends and Influence People,9
3,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Win More Kaggle Competitions,59
4,2017-01-01,Argentina,Kaggle Learn,Using LLMs to Write Better,49
5,2017-01-01,Argentina,Kaggle Store,Using LLMs to Improve Your Coding,88
6,2017-01-01,Argentina,Kaggle Store,Using LLMs to Train More LLMs,98
7,2017-01-01,Argentina,Kaggle Store,Using LLMs to Win Friends and Influence People,14
8,2017-01-01,Argentina,Kaggle Store,Using LLMs to Win More Kaggle Competitions,83
9,2017-01-01,Argentina,Kaggle Store,Using LLMs to Write Better,69


### Features

In [21]:
# helper functions
import requests
import holidays

# gdp feature

def get_gdp_per_capita(country, year, timeout=30):
    """Fetch one GDP-per-capita figure from the World Bank API.

    Queries indicator ``NY.GDP.PCAP.CD`` (GDP per capita, current US$) for a
    single country and a single year.

    Parameters
    ----------
    country : str
        Country name; must be one of 'Argentina', 'Canada', 'Estonia',
        'Japan' or 'Spain' (the keys of the ISO alpha-3 lookup below).
    year : int
        Year placed in the ``date`` query parameter.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout a
        stalled connection would block the notebook indefinitely.

    Returns
    -------
    The ``value`` field of the first record in the JSON payload.
    NOTE(review): presumably this can be ``None`` for years the API has no
    figure for -- confirm against the API before relying on a float.

    Raises
    ------
    KeyError
        If ``country`` is not in the alpha-3 lookup.
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    alpha3 = {'Argentina':'ARG','Canada':'CAN','Estonia':'EST','Japan':'JPN','Spain':'ESP'}
    url="https://api.worldbank.org/v2/country/{0}/indicator/NY.GDP.PCAP.CD?date={1}&format=json".format(alpha3[country],year)
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to index an error payload.
    response.raise_for_status()
    payload = response.json()
    # payload[0] is paging metadata; payload[1] is the list of data records.
    return payload[1][0]['value']

def create_gdp_df(df, years=range(2017, 2023)):
    """Build a country x year table of relative GDP per capita.

    For every distinct value of ``df.country`` and every year in ``years`` the
    GDP per capita is fetched with ``get_gdp_per_capita``; each year's column
    is then divided by its own sum, so the shares within a year add up to 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Any frame with a ``country`` column.
    years : iterable of int, optional
        Years to fetch. Defaults to 2017 through 2022 inclusive.

    Returns
    -------
    pandas.DataFrame
        Indexed by country (in order of first appearance in ``df``), one
        column per year. A year for which any country has no figure comes out
        as NaN for all countries, because the column sum is NaN.
    """
    years = list(years)
    countries = df.country.unique()

    # dtype=float turns a missing figure (None) into NaN and guarantees that the
    # in-place true division below works even if every figure is an integer.
    gdp = np.array(
        [[get_gdp_per_capita(country, year) for year in years] for country in countries],
        dtype=float,
    ).reshape(len(countries), len(years))

    # Normalise each year (column) so the countries' shares sum to 1.
    gdp /= np.sum(gdp, axis=0)

    return pd.DataFrame(gdp, index=countries, columns=years)

def add_gdp_feature(df):
    """Attach a ``rel_gdp`` column (relative GDP per capita) to ``df``.

    The country x year table from ``create_gdp_df`` is reshaped to long format
    and left-joined onto ``df`` by ``year`` and ``country``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``country`` and ``year`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with every row of ``df`` plus ``rel_gdp``; rows whose
        (year, country) pair is not in the GDP table get NaN.
    """
    # Build the table from the frame that was passed in, not from a notebook
    # global, so the function also works for the test split or any other frame.
    rel_gdp_wide = create_gdp_df(df)

    # Wide (one column per year) -> long (one row per country/year pair).
    rel_gdp_long = (
        rel_gdp_wide
        .reset_index(names="country")
        .melt(id_vars='country', var_name='year', value_name='rel_gdp')
    )
    # The melted year labels come from column names; make them integers so
    # they match the integer ``year`` column of ``df`` in the join.
    rel_gdp_long['year'] = rel_gdp_long['year'].astype(int)

    return df.merge(rel_gdp_long, on=['year', 'country'], how='left')

# holidays feature

def create_holidays_df(years=None):
    """List the public holidays of the five countries as a long table.

    Parameters
    ----------
    years : iterable of int, optional
        Years to generate holidays for. Defaults to 2017 through 2023.

    Returns
    -------
    pandas.DataFrame
        One row per (holiday date, country) with columns ``date``
        (datetime64), ``country`` (full country name) and ``is_holiday``
        (always True). Rows are ordered by country, then by date, and the
        index is a plain 0..n-1 range.
    """
    if years is None:
        years = [2017, 2018, 2019, 2020, 2021, 2022, 2023]

    # ISO alpha-2 code understood by the holidays package -> country name as
    # it is spelled in the data set.
    countries = {
        "AR": "Argentina",
        "CA": "Canada",
        "EE": "Estonia",
        "ES": "Spain",
        "JP": "Japan"
    }

    # holidays.country_holidays is the supported entry point; the older
    # CountryHoliday alias is deprecated. Iterating a calendar yields its
    # holiday dates, which are already unique per country.
    records = [
        (holiday_date, country_name)
        for code, country_name in countries.items()
        for holiday_date in sorted(holidays.country_holidays(code, years=years))
    ]

    holidays_df = pd.DataFrame(records, columns=['date', 'country'])
    holidays_df['is_holiday'] = True

    # datetime.date -> datetime64 so the column can be joined on a parsed date column.
    holidays_df['date'] = pd.to_datetime(holidays_df['date'])
    return holidays_df

def add_holiday_feature(df):
    """Flag each row of ``df`` with ``is_holiday`` (1 on a public holiday, else 0).

    The holiday table from ``create_holidays_df`` is left-joined on ``date``
    and ``country``; the merged frame is returned and ``df`` is not modified.
    """
    merged = df.merge(create_holidays_df(), on=['date', 'country'], how='left')

    # Rows with no matching holiday come out of the left join as missing.
    holiday_flag = merged['is_holiday'].fillna(False)
    merged['is_holiday'] = holiday_flag.astype(int)
    return merged

In [22]:
import holidays

def transform_date(df):
    """Parse the ``date`` column of ``df`` into pandas datetimes.

    The column is overwritten on the frame that was passed in, and that same
    frame object is returned.
    """
    parsed_dates = pd.to_datetime(df['date'])
    df['date'] = parsed_dates
    return df

def create_features(df, weekend_start=4):
    """Add calendar features to ``df`` and cast the key columns to category.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime ``date`` column and ``country``, ``store``
        and ``product`` columns.
    weekend_start : int, optional
        First day-of-week index (Monday=0 ... Sunday=6) that counts as
        "weekend". The default of 4 means Friday, Saturday and Sunday are all
        flagged; pass 5 for Saturday/Sunday only.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the added columns ``year``, ``month``, ``day``,
        ``week_number`` (ISO week), ``dayofweek``, ``dayofmonth`` and
        ``weekend``. ``day`` and ``dayofmonth`` hold the same values.
    """
    dates = df['date'].dt

    # split date into year, month, day
    df['year'] = dates.year
    df['month'] = dates.month
    df['day'] = dates.day

    # ISO week number: the first days of January can belong to week 52/53
    # of the previous ISO year.
    df['week_number'] = dates.isocalendar().week.astype(int)

    # day of week (Monday=0) and day of month
    df['dayofweek'] = dates.dayofweek
    df['dayofmonth'] = dates.day

    # weekend flag; see ``weekend_start`` for which days are included
    df['weekend'] = (dates.dayofweek >= weekend_start).astype(int)

    for column in ('country', 'store', 'product'):
        df[column] = df[column].astype('category')

    return df

def encode_cat_variables(df):
    """One-hot encode ``country``, ``store`` and ``product``.

    Returns a new frame in which each of the three columns is replaced by one
    indicator column per distinct value, named ``<column>_<value>``.
    """
    return pd.get_dummies(df, columns=["country", "store", "product"])

def seasonality_features(df):
    """Add sine/cosine encodings of month and day-of-month to ``df`` in place.

    ``month`` is mapped onto a 12-step cycle and ``dayofmonth`` onto a 31-step
    cycle (every month is treated as 31 days long). The same frame object is
    returned.
    """
    full_turn = 2*np.pi

    month_angle = full_turn*df['month']/12
    df['month_sin'] = np.sin(month_angle)
    df['month_cos'] = np.cos(month_angle)

    day_angle = full_turn*df['dayofmonth']/31
    df['day_sin'] = np.sin(day_angle)
    df['day_cos'] = np.cos(day_angle)
    return df

In [23]:
# Run the feature pipeline on the training frame. The order matters: the
# calendar columns must exist before the seasonality, GDP and holiday steps
# use them, and one-hot encoding goes last because it replaces `country`,
# which the two joins need as a key.
df_train_0 = (
    df_train
    .pipe(transform_date)
    .pipe(create_features)
    .pipe(seasonality_features)
    .pipe(add_gdp_feature)
    .pipe(add_holiday_feature)
    .pipe(encode_cat_variables)
)
df_train_0.head()

Unnamed: 0,date,num_sold,year,month,day,week_number,dayofweek,dayofmonth,weekend,month_sin,...,country_Japan,country_Spain,store_Kagglazon,store_Kaggle Learn,store_Kaggle Store,product_Using LLMs to Improve Your Coding,product_Using LLMs to Train More LLMs,product_Using LLMs to Win Friends and Influence People,product_Using LLMs to Win More Kaggle Competitions,product_Using LLMs to Write Better
0,2017-01-01,63,2017,1,1,52,6,1,1,0.5,...,False,False,False,True,False,True,False,False,False,False
1,2017-01-01,66,2017,1,1,52,6,1,1,0.5,...,False,False,False,True,False,False,True,False,False,False
2,2017-01-01,9,2017,1,1,52,6,1,1,0.5,...,False,False,False,True,False,False,False,True,False,False
3,2017-01-01,59,2017,1,1,52,6,1,1,0.5,...,False,False,False,True,False,False,False,False,True,False
4,2017-01-01,49,2017,1,1,52,6,1,1,0.5,...,False,False,False,True,False,False,False,False,False,True


In [24]:
# Every column except the target (`num_sold`) and the raw timestamp (`date`)
# is used as a model input; column order is kept as in the frame.
model_features = df_train_0.columns.drop(["num_sold", "date"]).tolist()
model_features

['year',
 'month',
 'day',
 'week_number',
 'dayofweek',
 'dayofmonth',
 'weekend',
 'month_sin',
 'month_cos',
 'day_sin',
 'day_cos',
 'rel_gdp',
 'is_holiday',
 'country_Argentina',
 'country_Canada',
 'country_Estonia',
 'country_Japan',
 'country_Spain',
 'store_Kagglazon',
 'store_Kaggle Learn',
 'store_Kaggle Store',
 'product_Using LLMs to Improve Your Coding',
 'product_Using LLMs to Train More LLMs',
 'product_Using LLMs to Win Friends and Influence People',
 'product_Using LLMs to Win More Kaggle Competitions',
 'product_Using LLMs to Write Better']

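Submissions are scored with SMAPE (symmetric mean absolute percentage error): $\mathrm{SMAPE} = \frac{100}{n} \sum_{t=1}^{n} \frac{2\,|F_t - A_t|}{|A_t| + |F_t|}$, where $A_t$ is the actual value and $F_t$ the forecast.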

In [25]:
def SMAPE(A, F):
    """Symmetric mean absolute percentage error, on a 0-200 scale.

    Computes ``100/n * sum(2*|F - A| / (|A| + |F|))``.

    Parameters
    ----------
    A : array-like
        Actual values.
    F : array-like
        Forecast values, same length as ``A``.

    Returns
    -------
    float
        The score. A pair where actual and forecast are both 0 is a perfect
        prediction and contributes 0 to the sum (instead of the NaN that a
        plain 0/0 division would produce).

    Raises
    ------
    ZeroDivisionError
        If ``A`` is empty.
    """
    # Plain arrays: accepts lists as well, and avoids index alignment when
    # the inputs are pandas objects with different indexes.
    actual = np.asarray(A, dtype=float)
    forecast = np.asarray(F, dtype=float)

    numerator = 2 * np.abs(forecast - actual)
    denominator = np.abs(actual) + np.abs(forecast)

    # Divide only where the denominator is non-zero; the rest stays at 0.
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return 100/len(actual) * np.sum(ratio)

In [26]:
# Print the dtype and non-null count of every model input column.
df_train_0[model_features].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 136950 entries, 0 to 136949
Data columns (total 26 columns):
 #   Column                                                  Non-Null Count   Dtype  
---  ------                                                  --------------   -----  
 0   year                                                    136950 non-null  int32  
 1   month                                                   136950 non-null  int32  
 2   day                                                     136950 non-null  int32  
 3   week_number                                             136950 non-null  int32  
 4   dayofweek                                               136950 non-null  int32  
 5   dayofmonth                                              136950 non-null  int32  
 6   weekend                                                 136950 non-null  int32  
 7   month_sin                                               136950 non-null  float64
 8   month_cos               

In [30]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.ensemble import RandomForestRegressor

import xgboost as xgb

# Keep only the final split produced by TimeSeriesSplit: the validation rows
# are the last block of the frame and the training rows are everything before it.
fold_indices = list(TimeSeriesSplit(n_splits=5).split(df_train_0))
train_index, val_index = fold_indices[-1]

train_fold = df_train_0.iloc[train_index]
val_fold = df_train_0.iloc[val_index]

X_train, y_train = train_fold[model_features], train_fold['num_sold']
X_val, y_val = val_fold[model_features], val_fold['num_sold']

# Positions of any category/object columns among the features
# (computed here but not passed to the model below).
feature_dtypes = X_train.dtypes
cat_features_indices = np.where((feature_dtypes == "category") | (feature_dtypes == "object"))[0]

# NOTE(review): tree_method="gpu_hist" presumably requires a CUDA-enabled
# XGBoost build -- confirm the runtime has one before a fresh run.
model_xgb = xgb.XGBRegressor(tree_method="gpu_hist", enable_categorical=True)

model_xgb.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    verbose=False,
)

# NOTE(review): astype(int) truncates toward zero rather than rounding;
# verify that this is intended before comparing scores.
y_pred = model_xgb.predict(X_val).astype(int)

print(f"SMAPE score: {SMAPE(y_val, y_pred)}")

SMAPE score: 12.133348898565481


In [31]:
from sklearn.ensemble import RandomForestRegressor

# Hyper-parameters for the random forest (the search that produced them is
# not part of this notebook).
best_hyperparams = {'n_estimators': 551,
 'max_depth': 89,
 'bootstrap': True,
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'max_features': 'log2'}

# n_jobs=-1 fits the trees on all available cores; with 551 deep trees a
# single-core fit is slow enough to get interrupted. random_state pins the
# bootstrap samples and feature draws so the score is reproducible on re-run.
model_rfr = RandomForestRegressor(**best_hyperparams, n_jobs=-1, random_state=42)
model_rfr.fit(X_train, y_train)

y_pred = model_rfr.predict(X_val).astype(int)

print(f"SMAPE score: {SMAPE(y_val, y_pred)}")

KeyboardInterrupt: 

### Plot feature importance

In [29]:
# get feature importance from model

# Rank the input features by the importance scores of the fitted XGBoost
# model (`model_xgb`), highest first. To inspect the random forest instead,
# replace `model_xgb` with `model_rfr` once its fit has completed.
feature_importance = (
    pd.DataFrame({'feature': X_train.columns, 'importance': model_xgb.feature_importances_})
    .sort_values(by='importance', ascending=False)
    .reset_index(drop=True)
)
feature_importance

Unnamed: 0,feature,importance
0,store_Kagglazon,0.325822
1,product_Using LLMs to Win Friends and Influenc...,0.146108
2,store_Kaggle Learn,0.129218
3,store_Kaggle Store,0.093561
4,rel_gdp,0.083885
5,country_Argentina,0.049373
6,country_Canada,0.03283
7,product_Using LLMs to Improve Your Coding,0.022907
8,product_Using LLMs to Train More LLMs,0.022709
9,country_Japan,0.015201
