In [1]:
# Implementation of the weighted RMSSE (WRMSSE) metric

In [9]:
import pandas as pd
import numpy as np
from typing import Union
from tqdm.notebook import tqdm

In [3]:
# Load the sales table and preview the first rows of its last 31 columns.
df_train_full = pd.read_csv(
    "/Users/jerem/cours_esiee_paris/E4/kaggle_walmart/data/sales_train_evaluation.csv"
)
df_train_full.head().iloc[:, -31:]

Unnamed: 0,d_1911,d_1912,d_1913,d_1914,d_1915,d_1916,d_1917,d_1918,d_1919,d_1920,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,0,1,1,0,0,0,2,0,3,5,...,2,4,0,0,0,0,3,3,0,1
1,0,0,0,0,1,0,0,0,0,0,...,0,1,2,1,1,0,0,0,0,0
2,1,1,1,0,0,1,1,0,2,1,...,1,0,2,0,0,0,2,3,0,1
3,3,7,2,0,0,1,2,4,1,6,...,1,1,0,4,0,1,3,0,2,6
4,2,2,4,1,0,2,3,1,0,3,...,0,0,0,2,1,0,0,2,1,0


In [4]:

class WRMSSEEvaluator(object):
    """Weighted RMSSE evaluator over the twelve groupings in ``group_ids``.

    For each grouping ``k`` (1-based position in ``self.group_ids``) the
    constructor stores four attributes that ``score`` later combines:

    * ``lv{k}_scale``    -- per-series scale (array, one entry per group),
    * ``lv{k}_train_df`` -- training day columns summed per group,
    * ``lv{k}_valid_df`` -- validation day columns summed per group,
    * ``lv{k}_weight``   -- per-group weights, normalized to sum to 1.
    """

    def __init__(self, train_df: pd.DataFrame, valid_df: pd.DataFrame, calendar: pd.DataFrame, prices: pd.DataFrame):
        """Pre-compute scales, aggregated series and weights for every level.

        Args:
            train_df: id columns (must include ``item_id``, ``dept_id``,
                ``cat_id``, ``store_id``, ``state_id``) plus training day
                columns whose names start with ``d_``. It is not modified.
            valid_df: validation day columns (names starting with ``d_``).
                When it lacks the id columns they are copied from
                ``train_df``, aligned on the index.
            calendar: needs the columns ``d`` and ``wm_yr_wk``.
            prices: needs ``item_id``, ``store_id``, ``wm_yr_wk`` and
                ``sell_price``.
        """
        # Shallow copy: the helper column added below must not leak into the
        # caller's frame (which may itself be a slice of a larger frame).
        # The underlying data is shared, so this costs no extra memory.
        train_df = train_df.copy(deep=False)

        train_y = train_df.loc[:, train_df.columns.str.startswith('d_')]
        train_target_columns = train_y.columns.tolist()

        # The last 28 training days are the ones used to compute the weights.
        weight_columns = train_y.iloc[:, -28:].columns.tolist()

        train_df['all_id'] = 0  # constant key: level 1 aggregates everything

        id_columns = train_df.loc[:, ~train_df.columns.str.startswith('d_')].columns.tolist()
        valid_target_columns = valid_df.loc[:, valid_df.columns.str.startswith('d_')].columns.tolist()

        if not all([c in valid_df.columns for c in id_columns]):
            valid_df = pd.concat([train_df[id_columns], valid_df], axis=1, sort=False)

        self.train_df = train_df
        self.valid_df = valid_df
        self.calendar = calendar
        self.prices = prices

        self.weight_columns = weight_columns
        self.id_columns = id_columns
        self.valid_target_columns = valid_target_columns

        weight_df = self.get_weight_df()

        self.group_ids = (
            'all_id',
            'cat_id',
            'state_id',
            'dept_id',
            'store_id',
            'item_id',
            ['state_id', 'cat_id'],
            ['state_id', 'dept_id'],
            ['store_id', 'cat_id'],
            ['store_id', 'dept_id'],
            ['item_id', 'state_id'],
            ['item_id', 'store_id']
        )

        for i, group_id in enumerate(tqdm(self.group_ids)):
            train_y = train_df.groupby(group_id)[train_target_columns].sum()
            scale = []
            for _, row in train_y.iterrows():
                # Drop the leading zeros, then take the mean squared
                # day-to-day difference of what remains.
                # NOTE(review): an all-zero or constant series yields a scale
                # of 0, which makes `rmsse` divide by zero for that group.
                series = row.values[np.argmax(row.values != 0):]
                scale.append(((series[1:] - series[:-1]) ** 2).mean())
            setattr(self, f'lv{i + 1}_scale', np.array(scale))
            setattr(self, f'lv{i + 1}_train_df', train_y)
            setattr(self, f'lv{i + 1}_valid_df', valid_df.groupby(group_id)[valid_target_columns].sum())

            # Weight of a group = its summed value over the weight columns,
            # normalized so the weights of one level add up to 1.
            lv_weight = weight_df.groupby(group_id)[weight_columns].sum().sum(axis=1)
            setattr(self, f'lv{i + 1}_weight', lv_weight / lv_weight.sum())

    def get_weight_df(self) -> pd.DataFrame:
        """Return the id columns plus units * sell_price for each weight day.

        The result has one row per row of ``self.train_df`` (same order, same
        index) and one column per entry of ``self.weight_columns``.
        """
        day_to_week = self.calendar.set_index('d')['wm_yr_wk'].to_dict()
        weight_df = self.train_df[['item_id', 'store_id'] + self.weight_columns].set_index(['item_id', 'store_id'])
        # Long format: one row per (item, store, day).
        weight_df = weight_df.stack().reset_index().rename(columns={'level_2': 'd', 0: 'value'})
        weight_df['wm_yr_wk'] = weight_df['d'].map(day_to_week)

        weight_df = weight_df.merge(self.prices, how='left', on=['item_id', 'store_id', 'wm_yr_wk'])
        weight_df['value'] = weight_df['value'] * weight_df['sell_price']
        # Back to wide format, then restore the row order of the training frame.
        weight_df = weight_df.set_index(['item_id', 'store_id', 'd']).unstack(level=2)['value']
        weight_df = weight_df.loc[list(zip(self.train_df.item_id, self.train_df.store_id)), :]
        # Reuse the training index so the concat below stays row-aligned even
        # when the training frame does not carry a default RangeIndex.
        weight_df.index = self.train_df.index
        weight_df = pd.concat([self.train_df[self.id_columns], weight_df], axis=1, sort=False)
        return weight_df

    def rmsse(self, valid_preds: pd.DataFrame, lv: int) -> pd.Series:
        """Return the RMSSE of every group of level ``lv``.

        Args:
            valid_preds: predictions already aggregated to level ``lv``.
            lv: 1-based level number.
        """
        valid_y = getattr(self, f'lv{lv}_valid_df')
        score = ((valid_y - valid_preds) ** 2).mean(axis=1)
        scale = getattr(self, f'lv{lv}_scale')
        return (score / scale).map(np.sqrt)

    def score(self, valid_preds: Union[pd.DataFrame, np.ndarray]) -> tuple:
        """Compute the weighted RMSSE of each aggregation level.

        Args:
            valid_preds: predictions with the same shape as the validation
                day columns. A DataFrame must use the validation column names
                and be row-aligned (by index) with the validation frame; an
                ndarray is assumed to be in the validation frame's row order.

        Returns:
            ``(group_ids, all_scores)``: the grouping keys and, in the same
            order, one weighted RMSSE per level. Callers average
            ``all_scores`` to get the overall score.

        Raises:
            AssertionError: if ``valid_preds`` does not have the expected shape.
        """
        expected_shape = self.valid_df[self.valid_target_columns].shape
        assert expected_shape == valid_preds.shape, (
            f'valid_preds has shape {valid_preds.shape}, expected {expected_shape}'
        )

        if isinstance(valid_preds, np.ndarray):
            # Give the array the validation index so the concat below pairs
            # each prediction row with the right id columns.
            valid_preds = pd.DataFrame(
                valid_preds, columns=self.valid_target_columns, index=self.valid_df.index
            )

        valid_preds = pd.concat([self.valid_df[self.id_columns], valid_preds], axis=1, sort=False)

        group_ids = []
        all_scores = []
        for i, group_id in enumerate(self.group_ids):
            lv_scores = self.rmsse(valid_preds.groupby(group_id)[self.valid_target_columns].sum(), i + 1)
            weight = getattr(self, f'lv{i + 1}_weight')
            # Weight each group's RMSSE, then sum within the level.
            lv_scores = pd.concat([weight, lv_scores], axis=1, sort=False).prod(axis=1)
            group_ids.append(group_id)
            all_scores.append(lv_scores.sum())

        return group_ids, all_scores


In [5]:
## public LB rank
def get_lb_rank(score, leaderboard_path="/Users/jerem/cours_esiee_paris/E4/kaggle_walmart/data/datasets_687702_1205782_m5-forecasting-accuracy-publicleaderboard-rank.csv"):
    """Return the leaderboard rank a given score would obtain.

    The rank is one plus the number of leaderboard entries whose ``Score`` is
    less than or equal to ``score``.

    Args:
        score: score to rank.
        leaderboard_path: CSV file with a ``Score`` column; defaults to the
            leaderboard export provided by Kaggle that the notebook uses.

    Returns:
        The 1-based rank as an integer.
    """
    df_lb = pd.read_csv(leaderboard_path)

    return (df_lb.Score <= score).sum() + 1


In [6]:
## Read the auxiliary tables
df_calendar = pd.read_csv("/Users/jerem/cours_esiee_paris/E4/kaggle_walmart/data/calendar.csv")
df_prices = pd.read_csv("/Users/jerem/cours_esiee_paris/E4/kaggle_walmart/data/sell_prices.csv")
df_sample_submission = pd.read_csv("/Users/jerem/cours_esiee_paris/E4/kaggle_walmart/data/sample_submission.csv")
# Record each id's row position so predictions can later be sorted back into this order.
df_sample_submission["order"] = np.arange(len(df_sample_submission))

# Hold out the last 28 columns as the validation window; everything before is training.
df_valid = df_train_full.iloc[:, -28:]
df_train = df_train_full.iloc[:, :-28]

evaluator = WRMSSEEvaluator(df_train, df_valid, df_calendar, df_prices)

HBox(children=(IntProgress(value=0, max=12), HTML(value='')))




In [7]:
## Validation structure
# Dummy predictions with the same layout as the validation actuals
# (columns d_1914 -> d_1941 instead of F1 - F28): the true values plus
# random integer noise drawn from [0, 100).
noise = np.random.randint(0, 100, size=df_valid.shape)
preds_valid = df_valid + noise
preds_valid.head()

Unnamed: 0,d_1914,d_1915,d_1916,d_1917,d_1918,d_1919,d_1920,d_1921,d_1922,d_1923,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,97,22,4,43,22,13,11,26,15,89,...,67,41,7,26,93,23,72,83,18,45
1,35,22,52,63,92,9,62,71,5,38,...,41,3,47,12,18,4,43,11,54,51
2,48,46,8,12,98,2,59,13,30,83,...,31,25,7,27,34,98,39,53,17,41
3,1,60,42,40,11,84,20,18,42,90,...,86,94,23,79,14,29,68,50,37,77
4,72,77,99,67,9,41,48,39,49,99,...,36,19,60,78,41,81,3,5,60,20


In [26]:
# Score: baseline Prophet, without tuning or extra regressors.

# Keep the validation rows only and put them back in sample-submission order.
preds_valid = pd.read_csv("/Users/jerem/cours_esiee_paris/E4/kaggle_walmart/submission/submission_prophet2_ALL.csv")
preds_valid = (
    preds_valid[preds_valid.id.str.contains("validation")]
    .merge(df_sample_submission[["id", "order"]], on="id")
    .sort_values("order")
    .drop(["id", "order"], axis=1)
    .reset_index(drop=True)
)
# Map the horizon columns F1..F28 onto the day labels d_1914..d_1941.
preds_valid = preds_valid.rename(columns={f"F{h}": f"d_{1913 + h}" for h in range(1, 29)})

groups, scores = evaluator.score(preds_valid)

# The overall score is the plain average of the per-level scores.
score_public_lb = np.mean(scores)
score_public_rank = get_lb_rank(score_public_lb)

for i, level_score in enumerate(scores):
    print(f"Score for group {groups[i]}: {round(level_score, 5)}")

print(f"\nPublic LB Score: {round(score_public_lb, 5)}")
print(f"Public LB Rank: {score_public_rank}")

Score for group all_id: 0.81167
Score for group cat_id: 0.81156
Score for group state_id: 0.80682
Score for group dept_id: 0.86382
Score for group store_id: 0.83913
Score for group item_id: 0.99046
Score for group ['state_id', 'cat_id']: 0.82135
Score for group ['state_id', 'dept_id']: 0.85618
Score for group ['store_id', 'cat_id']: 0.85852
Score for group ['store_id', 'dept_id']: 0.88607
Score for group ['item_id', 'state_id']: 0.93917
Score for group ['item_id', 'store_id']: 0.89277

Public LB Score: 0.86479
Public LB Rank: 3559


In [24]:
# Score: LightGBM with lag features and a varying alpha to limit overfitting.

# Keep the validation rows only and put them back in sample-submission order.
preds_valid = pd.read_csv("/Users/jerem/cours_esiee_paris/E4/kaggle_walmart/submission/submission_1_all_data.csv")
preds_valid = (
    preds_valid[preds_valid.id.str.contains("validation")]
    .merge(df_sample_submission[["id", "order"]], on="id")
    .sort_values("order")
    .drop(["id", "order"], axis=1)
    .reset_index(drop=True)
)
# Map the horizon columns F1..F28 onto the day labels d_1914..d_1941.
preds_valid = preds_valid.rename(columns={f"F{h}": f"d_{1913 + h}" for h in range(1, 29)})

groups, scores = evaluator.score(preds_valid)

# The overall score is the plain average of the per-level scores.
score_public_lb = np.mean(scores)
score_public_rank = get_lb_rank(score_public_lb)

for i, level_score in enumerate(scores):
    print(f"Score for group {groups[i]}: {round(level_score, 5)}")

print(f"\nPublic LB Score: {round(score_public_lb, 5)}")
print(f"Public LB Rank: {score_public_rank}")

Score for group all_id: 0.22446
Score for group cat_id: 0.28115
Score for group state_id: 0.31021
Score for group dept_id: 0.36466
Score for group store_id: 0.4162
Score for group item_id: 0.80661
Score for group ['state_id', 'cat_id']: 0.37909
Score for group ['state_id', 'dept_id']: 0.46331
Score for group ['store_id', 'cat_id']: 0.48759
Score for group ['store_id', 'dept_id']: 0.57376
Score for group ['item_id', 'state_id']: 0.81841
Score for group ['item_id', 'store_id']: 0.82496

Public LB Score: 0.49587
Public LB Rank: 1698


In [28]:


# Score: LightGBM with lag features and a varying alpha to limit overfitting
# (this cell scores the file submission_hugo_lightGBM.csv).

# Keep the validation rows only and put them back in sample-submission order.
preds_valid = pd.read_csv("/Users/jerem/cours_esiee_paris/E4/kaggle_walmart/submission/submission_hugo_lightGBM.csv")
preds_valid = (
    preds_valid[preds_valid.id.str.contains("validation")]
    .merge(df_sample_submission[["id", "order"]], on="id")
    .sort_values("order")
    .drop(["id", "order"], axis=1)
    .reset_index(drop=True)
)
# Map the horizon columns F1..F28 onto the day labels d_1914..d_1941.
preds_valid = preds_valid.rename(columns={f"F{h}": f"d_{1913 + h}" for h in range(1, 29)})

groups, scores = evaluator.score(preds_valid)

# The overall score is the plain average of the per-level scores.
score_public_lb = np.mean(scores)
score_public_rank = get_lb_rank(score_public_lb)

for i, level_score in enumerate(scores):
    print(f"Score for group {groups[i]}: {round(level_score, 5)}")

print(f"\nPublic LB Score: {round(score_public_lb, 5)}")
print(f"Public LB Rank: {score_public_rank}")

Score for group all_id: 0.23689
Score for group cat_id: 0.29054
Score for group state_id: 0.32037
Score for group dept_id: 0.37282
Score for group store_id: 0.42771
Score for group item_id: 0.80696
Score for group ['state_id', 'cat_id']: 0.38572
Score for group ['state_id', 'dept_id']: 0.46951
Score for group ['store_id', 'cat_id']: 0.49558
Score for group ['store_id', 'dept_id']: 0.58178
Score for group ['item_id', 'state_id']: 0.819
Score for group ['item_id', 'store_id']: 0.82556

Public LB Score: 0.5027
Public LB Rank: 1789


In [18]:
# Score: mean of the Prophet and LightGBM predictions.

# Keep the validation rows only and put them back in sample-submission order.
preds_valid = pd.read_csv("/Users/jerem/cours_esiee_paris/E4/kaggle_walmart/submission/submission_mean_p_l.csv")
preds_valid = (
    preds_valid[preds_valid.id.str.contains("validation")]
    .merge(df_sample_submission[["id", "order"]], on="id")
    .sort_values("order")
    .drop(["id", "order"], axis=1)
    .reset_index(drop=True)
)
# Map the horizon columns F1..F28 onto the day labels d_1914..d_1941.
preds_valid = preds_valid.rename(columns={f"F{h}": f"d_{1913 + h}" for h in range(1, 29)})

groups, scores = evaluator.score(preds_valid)

# The overall score is the plain average of the per-level scores.
score_public_lb = np.mean(scores)
score_public_rank = get_lb_rank(score_public_lb)

for i, level_score in enumerate(scores):
    print(f"Score for group {groups[i]}: {round(level_score, 5)}")

print(f"\nPublic LB Score: {round(score_public_lb, 5)}")
print(f"Public LB Rank: {score_public_rank}")

Score for group all_id: 0.32808
Score for group cat_id: 0.36123
Score for group state_id: 0.40072
Score for group dept_id: 0.45352
Score for group store_id: 0.47683
Score for group item_id: 0.85941
Score for group ['state_id', 'cat_id']: 0.44934
Score for group ['state_id', 'dept_id']: 0.53176
Score for group ['store_id', 'cat_id']: 0.53543
Score for group ['store_id', 'dept_id']: 0.61711
Score for group ['item_id', 'state_id']: 0.85519
Score for group ['item_id', 'store_id']: 0.8493

Public LB Score: 0.55983
Public LB Rank: 2198


In [12]:
# Load the tuned-Prophet predictions and drop the 'Unnamed: 0' column
# (presumably an index saved along with the CSV -- TODO confirm).
p_w_eval = pd.read_csv(
    "/Users/jerem/cours_esiee_paris/E4/kaggle_walmart/submission/prophet_tuned_without_eval_computed.csv"
).drop(columns='Unnamed: 0')

In [13]:
# Score: tuned Prophet with only the validation part computed (no evaluation part).

# Keep the validation rows only and put them back in sample-submission order.
preds_valid = (
    p_w_eval[p_w_eval.id.str.contains("validation")]
    .merge(df_sample_submission[["id", "order"]], on="id")
    .sort_values("order")
    .drop(["id", "order"], axis=1)
    .reset_index(drop=True)
)
# Map the horizon columns F1..F28 onto the day labels d_1914..d_1941.
preds_valid = preds_valid.rename(columns={f"F{h}": f"d_{1913 + h}" for h in range(1, 29)})

groups, scores = evaluator.score(preds_valid)

# The overall score is the plain average of the per-level scores.
score_public_lb = np.mean(scores)
score_public_rank = get_lb_rank(score_public_lb)

for i, level_score in enumerate(scores):
    print(f"Score for group {groups[i]}: {round(level_score, 5)}")

print(f"\nPublic LB Score: {round(score_public_lb, 5)}")
print(f"Public LB Rank: {score_public_rank}")

Score for group all_id: 0.50495
Score for group cat_id: 0.52624
Score for group state_id: 0.55787
Score for group dept_id: 0.62018
Score for group store_id: 0.6205
Score for group item_id: 1.0094
Score for group ['state_id', 'cat_id']: 0.59602
Score for group ['state_id', 'dept_id']: 0.67504
Score for group ['store_id', 'cat_id']: 0.668
Score for group ['store_id', 'dept_id']: 0.74063
Score for group ['item_id', 'state_id']: 0.96692
Score for group ['item_id', 'store_id']: 0.93122

Public LB Score: 0.70141
Public LB Rank: 3144


In [15]:
# Load the metric-test submission and drop the 'Unnamed: 0' column
# (presumably an index saved along with the CSV -- TODO confirm).
test_metric = pd.read_csv(
    "/Users/jerem/cours_esiee_paris/E4/kaggle_walmart/submission/test_metric_computation.csv"
).drop(columns='Unnamed: 0')

In [16]:
# Metric test: does the public LB computation include the evaluation part, or only the validation part?
# This dataset contains only zeros in the evaluation part, so its score can be compared with the one above.

# Keep the validation rows only and put them back in sample-submission order.
preds_valid = (
    test_metric[test_metric.id.str.contains("validation")]
    .merge(df_sample_submission[["id", "order"]], on="id")
    .sort_values("order")
    .drop(["id", "order"], axis=1)
    .reset_index(drop=True)
)
# Map the horizon columns F1..F28 onto the day labels d_1914..d_1941.
preds_valid = preds_valid.rename(columns={f"F{h}": f"d_{1913 + h}" for h in range(1, 29)})

groups, scores = evaluator.score(preds_valid)

# The overall score is the plain average of the per-level scores.
score_public_lb = np.mean(scores)
score_public_rank = get_lb_rank(score_public_lb)

for i, level_score in enumerate(scores):
    print(f"Score for group {groups[i]}: {round(level_score, 5)}")

print(f"\nPublic LB Score: {round(score_public_lb, 5)}")
print(f"Public LB Rank: {score_public_rank}")

Score for group all_id: 0.50495
Score for group cat_id: 0.52624
Score for group state_id: 0.55787
Score for group dept_id: 0.62018
Score for group store_id: 0.6205
Score for group item_id: 1.0094
Score for group ['state_id', 'cat_id']: 0.59602
Score for group ['state_id', 'dept_id']: 0.67504
Score for group ['store_id', 'cat_id']: 0.668
Score for group ['store_id', 'dept_id']: 0.74063
Score for group ['item_id', 'state_id']: 0.96692
Score for group ['item_id', 'store_id']: 0.93122

Public LB Score: 0.70141
Public LB Rank: 3144


In [17]:
# The score is identical, so the evaluation part does not need to be computed, which saves time.

In [8]:
# Metric test: does the public LB computation include the evaluation part, or only the validation part?
# This dataset contains only zeros in the evaluation part, so its score can be compared with the one above.
# NOTE(review): this heading looks copy-pasted; the file scored here is
# mean_prophet_light_lighthugo.csv, and the saved run failed because that
# file was not found at this path.

# Keep the validation rows only and put them back in sample-submission order.
preds_valid = pd.read_csv("/Users/jerem/cours_esiee_paris/E4/kaggle_walmart/kaggle_walmart/model/mean_prophet_light_lighthugo.csv")
preds_valid = (
    preds_valid[preds_valid.id.str.contains("validation")]
    .merge(df_sample_submission[["id", "order"]], on="id")
    .sort_values("order")
    .drop(["id", "order"], axis=1)
    .reset_index(drop=True)
)
# Map the horizon columns F1..F28 onto the day labels d_1914..d_1941.
preds_valid = preds_valid.rename(columns={f"F{h}": f"d_{1913 + h}" for h in range(1, 29)})

groups, scores = evaluator.score(preds_valid)

# The overall score is the plain average of the per-level scores.
score_public_lb = np.mean(scores)
score_public_rank = get_lb_rank(score_public_lb)

for i, level_score in enumerate(scores):
    print(f"Score for group {groups[i]}: {round(level_score, 5)}")

print(f"\nPublic LB Score: {round(score_public_lb, 5)}")
print(f"Public LB Rank: {score_public_rank}")

FileNotFoundError: [Errno 2] File b'/Users/jerem/cours_esiee_paris/E4/kaggle_walmart/kaggle_walmart/model/mean_prophet_light_lighthugo.csv' does not exist: b'/Users/jerem/cours_esiee_paris/E4/kaggle_walmart/kaggle_walmart/model/mean_prophet_light_lighthugo.csv'

In [13]:
# Load the sample submission and the per-item Fourier output.
submission_sample = pd.read_csv('/Users/jerem/cours_esiee_paris/E4/kaggle_walmart/data/sample_submission.csv')
fourier = pd.read_csv("/Users/jerem/cours_esiee_paris/E4/kaggle_walmart/fourier_each_item.csv")
# Append the sample-submission rows from position 30490 onward (the
# "_evaluation" ids, judging by the saved output -- TODO confirm).
# The two frames have different columns, so the column sort is requested
# explicitly: this keeps the sorted column order the notebook displays and
# avoids pandas' warning about the changing default of `sort`.
fourier_all = pd.concat([fourier, submission_sample.iloc[30490:]], ignore_index=True, sort=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


In [14]:
# Display the combined frame (the notebook shows a truncated view of it).
fourier_all

Unnamed: 0,F1,F10,F11,F12,F13,F14,F15,F16,F17,F18,...,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913,id
0,,,,,,,,,,,...,0.649667,0.632877,1.072808,0.950150,0.314333,0.851343,0.429533,0.114580,0.741544,FOODS_1_001_CA_1_validation
1,,,,,,,,,,,...,1.035258,2.069586,1.170895,0.475853,1.282107,1.138495,1.350810,1.495057,1.139127,FOODS_1_001_CA_2_validation
2,,,,,,,,,,,...,1.055270,1.361907,1.266157,0.705037,0.366896,1.055044,1.559367,0.597559,1.880193,FOODS_1_001_CA_3_validation
3,,,,,,,,,,,...,0.557659,0.304791,0.622486,0.276241,0.216536,0.846667,0.300178,0.257675,0.160741,FOODS_1_001_CA_4_validation
4,,,,,,,,,,,...,0.334654,0.292669,0.999014,0.418376,0.200649,0.935617,0.277866,0.395747,0.376730,FOODS_1_001_TX_1_validation
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60975,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,FOODS_3_823_WI_3_evaluation
60976,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,FOODS_3_824_WI_3_evaluation
60977,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,FOODS_3_825_WI_3_evaluation
60978,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,FOODS_3_826_WI_3_evaluation


In [10]:
# Score: Fourier fit on each item.
# NOTE(review): the saved run of this cell ended with an AssertionError,
# presumably the shape check in evaluator.score. reset_index() adds an
# extra "index" column, and the later cells comment that call out -- confirm
# which columns fourier_each_item.csv actually contains.

preds_valid = pd.read_csv("/Users/jerem/cours_esiee_paris/E4/kaggle_walmart/fourier_each_item.csv")
preds_valid = preds_valid.reset_index()
# Keep the validation rows only and put them back in sample-submission order.
preds_valid = (
    preds_valid[preds_valid.id.str.contains("validation")]
    .merge(df_sample_submission[["id", "order"]], on="id")
    .sort_values("order")
    .drop(["id", "order"], axis=1)
    .reset_index(drop=True)
)
# Map the horizon columns F1..F28 onto the day labels d_1914..d_1941.
preds_valid = preds_valid.rename(columns={f"F{h}": f"d_{1913 + h}" for h in range(1, 29)})

groups, scores = evaluator.score(preds_valid)

# The overall score is the plain average of the per-level scores.
score_public_lb = np.mean(scores)
score_public_rank = get_lb_rank(score_public_lb)

for i, level_score in enumerate(scores):
    print(f"Score for group {groups[i]}: {round(level_score, 5)}")

print(f"\nPublic LB Score: {round(score_public_lb, 5)}")
print(f"Public LB Rank: {score_public_rank}")

AssertionError: 

In [8]:
# Score: Fourier fit on each item.
# NOTE(review): this heading looks copy-pasted; the file scored here is
# submission_prophet_store.csv.

# Keep the validation rows only and put them back in sample-submission order.
preds_valid = pd.read_csv("/Users/jerem/cours_esiee_paris/E4/kaggle_walmart/submission/submission_prophet_store.csv")
preds_valid = (
    preds_valid[preds_valid.id.str.contains("validation")]
    .merge(df_sample_submission[["id", "order"]], on="id")
    .sort_values("order")
    .drop(["id", "order"], axis=1)
    .reset_index(drop=True)
)
# Map the horizon columns F1..F28 onto the day labels d_1914..d_1941.
preds_valid = preds_valid.rename(columns={f"F{h}": f"d_{1913 + h}" for h in range(1, 29)})

groups, scores = evaluator.score(preds_valid)

# The overall score is the plain average of the per-level scores.
score_public_lb = np.mean(scores)
score_public_rank = get_lb_rank(score_public_lb)

for i, level_score in enumerate(scores):
    print(f"Score for group {groups[i]}: {round(level_score, 5)}")

print(f"\nPublic LB Score: {round(score_public_lb, 5)}")
print(f"Public LB Rank: {score_public_rank}")

Score for group all_id: 0.50615
Score for group cat_id: 0.52848
Score for group state_id: 0.55781
Score for group dept_id: 0.6231
Score for group store_id: 0.62012
Score for group item_id: 1.01146
Score for group ['state_id', 'cat_id']: 0.59698
Score for group ['state_id', 'dept_id']: 0.67686
Score for group ['store_id', 'cat_id']: 0.6697
Score for group ['store_id', 'dept_id']: 0.74324
Score for group ['item_id', 'state_id']: 0.96854
Score for group ['item_id', 'store_id']: 0.93169

Public LB Score: 0.70284
Public LB Rank: 3148


In [10]:
# Score: LightGBM combined with Fourier.

# Keep the validation rows only and put them back in sample-submission order.
preds_valid = pd.read_csv("/Users/jerem/cours_esiee_paris/E4/kaggle_walmart/kaggle_walmart/model/submission_with_fourier.csv")
preds_valid = (
    preds_valid[preds_valid.id.str.contains("validation")]
    .merge(df_sample_submission[["id", "order"]], on="id")
    .sort_values("order")
    .drop(["id", "order"], axis=1)
    .reset_index(drop=True)
)
# Map the horizon columns F1..F28 onto the day labels d_1914..d_1941.
preds_valid = preds_valid.rename(columns={f"F{h}": f"d_{1913 + h}" for h in range(1, 29)})

groups, scores = evaluator.score(preds_valid)

# The overall score is the plain average of the per-level scores.
score_public_lb = np.mean(scores)
score_public_rank = get_lb_rank(score_public_lb)

for i, level_score in enumerate(scores):
    print(f"Score for group {groups[i]}: {round(level_score, 5)}")

print(f"\nPublic LB Score: {round(score_public_lb, 5)}")
print(f"Public LB Rank: {score_public_rank}")

Score for group all_id: 7.50597
Score for group cat_id: 7.2123
Score for group state_id: 6.88219
Score for group dept_id: 7.02579
Score for group store_id: 6.52817
Score for group item_id: 2.71747
Score for group ['state_id', 'cat_id']: 6.48894
Score for group ['state_id', 'dept_id']: 6.21863
Score for group ['store_id', 'cat_id']: 5.91125
Score for group ['store_id', 'dept_id']: 5.46313
Score for group ['item_id', 'state_id']: 1.97804
Score for group ['item_id', 'store_id']: 1.412

Public LB Score: 5.44532
Public LB Rank: 4405


In [None]:
def rmsse(y_true, y_pred, y_hist):
    """Root mean squared scaled error of one series.

    The mean squared forecast error is divided by the mean squared one-step
    difference of the history, and the square root of the ratio is returned.

    Args:
        y_true: actual values over the forecast horizon (any 1-D sequence).
        y_pred: predicted values, same length as ``y_true``.
        y_hist: historical values used for scaling; needs at least two points.

    Returns:
        The RMSSE as a float. A constant history makes the scale 0, in which
        case the division follows numpy semantics (inf or nan).
    """
    # Work on plain float arrays: lists do not support element-wise
    # arithmetic, and pandas Series would align on labels, so the shifted
    # difference below would collapse to zeros.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    y_hist = np.asarray(y_hist, dtype=float)
    h, n = len(y_true), len(y_hist)

    numerator = np.sum((y_true - y_pred) ** 2)
    denominator = 1 / (n - 1) * np.sum((y_hist[1:] - y_hist[:-1]) ** 2)

    return np.sqrt(1 / h * numerator / denominator)