In [1]:
import numpy as np
import pandas as pd

from typing import Union
from tqdm.notebook import tqdm_notebook as tqdm

In [2]:
cd C:/Users/minih/python_prac1/library/m5/

C:\Users\minih\python_prac1\library\m5


In [3]:
# Full sales table; every later train/validation split is cut from this frame.
df_train_full = pd.read_csv("sales_train_evaluation.csv")

# Preview: first five rows, restricted to the 31 right-most columns.
df_train_full.head().iloc[:, -31:]

Unnamed: 0,d_1911,d_1912,d_1913,d_1914,d_1915,d_1916,d_1917,d_1918,d_1919,d_1920,...,d_1932,d_1933,d_1934,d_1935,d_1936,d_1937,d_1938,d_1939,d_1940,d_1941
0,0,1,1,0,0,0,2,0,3,5,...,2,4,0,0,0,0,3,3,0,1
1,0,0,0,0,1,0,0,0,0,0,...,0,1,2,1,1,0,0,0,0,0
2,1,1,1,0,0,1,1,0,2,1,...,1,0,2,0,0,0,2,3,0,1
3,3,7,2,0,0,1,2,4,1,6,...,1,1,0,4,0,1,3,0,2,6
4,2,2,4,1,0,2,3,1,0,3,...,0,0,0,2,1,0,0,2,1,0


In [5]:
class WRMSSEEvaluator(object):
    """Scores forecasts with a weighted RMSSE over twelve aggregation levels.

    At construction time the training and validation frames are summed per
    group for every entry of ``group_ids``. For each aggregated series the
    evaluator stores:

    * ``lv{n}_scale``    - mean squared one-step difference of the training
      series after its leading zeros are dropped,
    * ``lv{n}_weight``   - the group's share of ``sales * sell_price`` summed
      over the weight columns (the last 28 ``d_`` columns of the training
      frame), normalised to sum to 1 within the level,
    * ``lv{n}_train_df`` / ``lv{n}_valid_df`` - the aggregated frames,

    where ``n`` is the 1-based position of the level in ``group_ids``.
    """

    def __init__(self, train_df: pd.DataFrame, valid_df: pd.DataFrame, calendar: pd.DataFrame, prices: pd.DataFrame):
        """Pre-compute scales, weights and aggregated actuals for every level.

        Args:
            train_df: One row per series; id columns (``item_id``, ``dept_id``,
                ``cat_id``, ``store_id``, ``state_id``, ...) plus ``d_*`` sales
                columns. The frame is not modified.
            valid_df: ``d_*`` columns holding the actuals to score against,
                row-aligned (by index) with ``train_df``. Missing id columns
                are copied over from ``train_df``.
            calendar: Needs the columns ``d`` and ``wm_yr_wk``.
            prices: Needs ``item_id``, ``store_id``, ``wm_yr_wk`` and
                ``sell_price``.
        """
        train_y = train_df.loc[:, train_df.columns.str.startswith('d_')]
        train_target_columns = train_y.columns.tolist()
        weight_columns = train_y.iloc[:, -28:].columns.tolist()

        # Constant key used for the level-1 ("everything") aggregation.
        # `assign` returns a new frame, so the caller's DataFrame (often a
        # slice of a bigger frame) is neither mutated nor triggers a
        # SettingWithCopyWarning.
        train_df = train_df.assign(all_id=0)

        id_columns = train_df.loc[:, ~train_df.columns.str.startswith('d_')].columns.tolist()
        valid_target_columns = valid_df.loc[:, valid_df.columns.str.startswith('d_')].columns.tolist()

        if not all([c in valid_df.columns for c in id_columns]):
            valid_df = pd.concat([train_df[id_columns], valid_df], axis=1, sort=False)

        self.train_df = train_df
        self.valid_df = valid_df
        self.calendar = calendar
        self.prices = prices

        self.weight_columns = weight_columns
        self.id_columns = id_columns
        self.valid_target_columns = valid_target_columns

        weight_df = self.get_weight_df()

        self.group_ids = (
            'all_id',
            'state_id',
            'store_id',
            'cat_id',
            'dept_id',
            ['state_id', 'cat_id'],
            ['state_id', 'dept_id'],
            ['store_id', 'cat_id'],
            ['store_id', 'dept_id'],
            'item_id',
            ['item_id', 'state_id'],
            ['item_id', 'store_id']
        )

        for i, group_id in enumerate(tqdm(self.group_ids)):
            train_y = train_df.groupby(group_id)[train_target_columns].sum()
            scale = []
            # Iterate over the raw value matrix; building a Series per row
            # (iterrows) is needless overhead here.
            for row in train_y.values:
                # Drop leading zeros. For an all-zero row argmax is 0, so the
                # full row is kept.
                series = row[np.argmax(row != 0):]
                # NOTE: a constant series yields a scale of 0, which makes the
                # RMSSE of that series inf/NaN downstream.
                scale.append(((series[1:] - series[:-1]) ** 2).mean())
            setattr(self, f'lv{i + 1}_scale', np.array(scale))
            setattr(self, f'lv{i + 1}_train_df', train_y)
            setattr(self, f'lv{i + 1}_valid_df', valid_df.groupby(group_id)[valid_target_columns].sum())

            lv_weight = weight_df.groupby(group_id)[weight_columns].sum().sum(axis=1)
            setattr(self, f'lv{i + 1}_weight', lv_weight / lv_weight.sum())

    def get_weight_df(self) -> pd.DataFrame:
        """Build the per-series value table (units sold * sell price).

        Returns:
            A frame with the id columns of ``train_df`` followed by one column
            per weight day, row-aligned with ``train_df``.
        """
        day_to_week = self.calendar.set_index('d')['wm_yr_wk'].to_dict()
        weight_df = self.train_df[['item_id', 'store_id'] + self.weight_columns].set_index(['item_id', 'store_id'])
        # Wide -> long so each (item, store, day) row can be joined to its weekly price.
        weight_df = weight_df.stack().reset_index().rename(columns={'level_2': 'd', 0: 'value'})
        weight_df['wm_yr_wk'] = weight_df['d'].map(day_to_week)

        weight_df = weight_df.merge(self.prices, how='left', on=['item_id', 'store_id', 'wm_yr_wk'])
        weight_df['value'] = weight_df['value'] * weight_df['sell_price']
        # Long -> wide again; unstack sorts the rows, so restore train_df's order next.
        weight_df = weight_df.set_index(['item_id', 'store_id', 'd']).unstack(level=2)['value']
        series_keys = list(zip(self.train_df.item_id, self.train_df.store_id))
        weight_df = weight_df.loc[series_keys, :]
        # Re-use train_df's own index (not a fresh 0..n-1 one) so the axis=1
        # concat below stays row-aligned for any index.
        weight_df.index = self.train_df.index
        weight_df = pd.concat([self.train_df[self.id_columns], weight_df], axis=1, sort=False)
        return weight_df

    def rmsse(self, valid_preds: pd.DataFrame, lv: int) -> pd.Series:
        """Return the RMSSE of every series at aggregation level ``lv``.

        Args:
            valid_preds: Predictions already aggregated to level ``lv``, with
                the same index and columns as ``lv{lv}_valid_df``.
            lv: 1-based level number.
        """
        valid_y = getattr(self, f'lv{lv}_valid_df')
        score = ((valid_y - valid_preds) ** 2).mean(axis=1)
        scale = getattr(self, f'lv{lv}_scale')
        return (score / scale).map(np.sqrt)

    def score(self, valid_preds: Union[pd.DataFrame, np.ndarray]) -> tuple:
        """Compute the weighted RMSSE of ``valid_preds`` for every level.

        Args:
            valid_preds: Series-level predictions with the same shape as the
                validation target columns. An ndarray is matched to the
                validation rows by position. A DataFrame is matched by index
                label, so it must carry the same index as the validation
                frame.

        Returns:
            ``(group_ids, all_scores)``: the grouping key of each level and the
            corresponding weighted score, in the same order. Average
            ``all_scores`` to obtain a single number.
        """
        assert self.valid_df[self.valid_target_columns].shape == valid_preds.shape

        if isinstance(valid_preds, np.ndarray):
            # Give the array the validation frame's index so the concat below
            # lines rows up even when that index is not 0..n-1.
            valid_preds = pd.DataFrame(valid_preds, columns=self.valid_target_columns, index=self.valid_df.index)

        valid_preds = pd.concat([self.valid_df[self.id_columns], valid_preds], axis=1, sort=False)

        group_ids = []
        all_scores = []
        for i, group_id in enumerate(self.group_ids):
            lv_scores = self.rmsse(valid_preds.groupby(group_id)[self.valid_target_columns].sum(), i + 1)
            weight = getattr(self, f'lv{i + 1}_weight')
            lv_scores = pd.concat([weight, lv_scores], axis=1, sort=False).prod(axis=1)
            group_ids.append(group_id)
            all_scores.append(lv_scores.sum())

        return group_ids, all_scores

In [7]:
def get_lb_rank(score, leaderboard_path="m5-forecasting-accuracy-publicleaderboard.csv"):
    """Return the 1-based rank `score` would get on a leaderboard CSV.

    Args:
        score: The score to place; lower is treated as better.
        leaderboard_path: CSV file with a ``Score`` column. Defaults to the
            public leaderboard export in the working directory.

    Returns:
        Number of leaderboard entries whose score is less than or equal to
        `score`, plus one (existing entries win ties).
    """
    df_lb = pd.read_csv(leaderboard_path)

    return int((df_lb["Score"] <= score).sum()) + 1

In [8]:
## reading data
# Auxiliary tables, read from the current working directory.
df_calendar = pd.read_csv("calendar.csv")
df_prices = pd.read_csv("sell_prices.csv")
df_sample_submission = pd.read_csv("sample_submission.csv")

# Remember each row's position in the sample submission so prediction files
# can be sorted into that order later.
df_sample_submission["order"] = range(len(df_sample_submission))

# The 28 right-most columns are the actuals to score against;
# everything to their left is the training input.
df_valid = df_train_full.iloc[:, -28:]
df_train = df_train_full.iloc[:, :-28]

evaluator = WRMSSEEvaluator(
    train_df=df_train,
    valid_df=df_valid,
    calendar=df_calendar,
    prices=df_prices,
)

HBox(children=(FloatProgress(value=0.0, max=12.0), HTML(value='')))




In [10]:
# Sanity check: show the per-series table the evaluator derives its weights from
# (id columns followed by units sold * sell price for each weight day).
evaluator.get_weight_df()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,all_id,d_1886,d_1887,d_1888,...,d_1904,d_1905,d_1906,d_1907,d_1908,d_1909,d_1910,d_1911,d_1912,d_1913
0,HOBBIES_1_001_CA_1_evaluation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,0,8.26,0.0,0.00,...,8.38,25.14,0.00,8.38,8.38,8.38,25.14,0.00,8.38,8.38
1,HOBBIES_1_002_CA_1_evaluation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,0,3.97,0.0,0.00,...,0.00,0.00,0.00,0.00,0.00,3.97,0.00,0.00,0.00,0.00
2,HOBBIES_1_003_CA_1_evaluation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,0,0.00,0.0,0.00,...,5.94,2.97,5.94,2.97,2.97,2.97,0.00,2.97,2.97,2.97
3,HOBBIES_1_004_CA_1_evaluation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,0,0.00,0.0,0.00,...,4.64,0.00,23.20,18.56,4.64,0.00,4.64,13.92,32.48,9.28
4,HOBBIES_1_005_CA_1_evaluation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,0,2.88,0.0,11.52,...,5.76,2.88,2.88,0.00,2.88,2.88,5.76,5.76,5.76,11.52
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,FOODS_3_823_WI_3_evaluation,FOODS_3_823,FOODS_3,FOODS,WI_3,WI,0,0.00,0.0,0.00,...,5.96,0.00,0.00,0.00,0.00,0.00,2.98,0.00,0.00,2.98
30486,FOODS_3_824_WI_3_evaluation,FOODS_3_824,FOODS_3,FOODS,WI_3,WI,0,0.00,0.0,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,2.48,0.00
30487,FOODS_3_825_WI_3_evaluation,FOODS_3_825,FOODS_3,FOODS,WI_3,WI,0,3.98,0.0,0.00,...,7.96,3.98,0.00,7.96,0.00,3.98,0.00,0.00,3.98,0.00
30488,FOODS_3_826_WI_3_evaluation,FOODS_3_826,FOODS_3,FOODS,WI_3,WI,0,3.84,0.0,0.00,...,0.00,0.00,1.28,0.00,0.00,1.28,0.00,3.84,1.28,3.84


In [53]:
# Load a submission and inflate every forecast by a constant factor.
FORECAST_MULTIPLIER = 1.05

preds_valid = pd.read_csv("submission_6_03_3.csv")

# Every column except "id" holds forecasts; scale them in one vectorised
# operation instead of a per-column, per-element apply.
forecast_columns = [column for column in preds_valid.columns if column != "id"]
preds_valid[forecast_columns] = preds_valid[forecast_columns] * FORECAST_MULTIPLIER

In [54]:
# Keep only the validation-phase rows of the submission.
preds_valid = preds_valid[preds_valid.id.str.contains("validation")]

preds_valid = (
    preds_valid
    .merge(df_sample_submission[["id", "order"]], on="id")
    .sort_values("order")
    .drop(["id", "order"], axis=1)
    # evaluator.score() joins predictions to the actuals by index label, so
    # the sorted rows need a fresh 0..n-1 index; otherwise the join silently
    # puts them back in file order.
    .reset_index(drop=True)
)

# Submission columns F1..F28 correspond to days d_1914..d_1941.
preds_valid = preds_valid.rename(columns={f"F{i}": f"d_{1913 + i}" for i in range(1, 29)})

In [55]:
# Per-level weighted scores for the prepared predictions.
groups, scores = evaluator.score(preds_valid)

# The headline number is the plain average over all levels;
# look up where it would land on the public leaderboard.
score_public_lb = np.mean(scores)
score_public_rank = get_lb_rank(score_public_lb)

for group, group_score in zip(groups, scores):
    print(f"Score for group {group}: {round(group_score, 5)}")

print(f"\nPublic LB Score: {round(score_public_lb, 5)}")
print(f"Public LB Rank: {score_public_rank}")

Score for group all_id: 0.27618
Score for group state_id: 0.3621
Score for group store_id: 0.49279
Score for group cat_id: 0.36774
Score for group dept_id: 0.45927
Score for group ['state_id', 'cat_id']: 0.45668
Score for group ['state_id', 'dept_id']: 0.53662
Score for group ['store_id', 'cat_id']: 0.5801
Score for group ['store_id', 'dept_id']: 0.65517
Score for group item_id: 0.87315
Score for group ['item_id', 'state_id']: 0.86394
Score for group ['item_id', 'store_id']: 0.85557

Public LB Score: 0.56494
Public LB Rank: 8672


In [32]:
# Load another submission and keep only its validation-phase rows.
preds_valid = pd.read_csv("new_submittion4.csv")
preds_valid = preds_valid[preds_valid.id.str.contains("validation")]

preds_valid = (
    preds_valid
    .merge(df_sample_submission[["id", "order"]], on="id")
    .sort_values("order")
    .drop(["id", "order"], axis=1)
    # evaluator.score() joins predictions to the actuals by index label, so
    # the sorted rows need a fresh 0..n-1 index; otherwise the join silently
    # puts them back in file order.
    .reset_index(drop=True)
)

# Submission columns F1..F28 correspond to days d_1914..d_1941.
preds_valid = preds_valid.rename(columns={f"F{i}": f"d_{1913 + i}" for i in range(1, 29)})

In [33]:
# Per-level weighted scores for the prepared predictions.
groups, scores = evaluator.score(preds_valid)

# The headline number is the plain average over all levels;
# look up where it would land on the public leaderboard.
score_public_lb = np.mean(scores)
score_public_rank = get_lb_rank(score_public_lb)

for group, group_score in zip(groups, scores):
    print(f"Score for group {group}: {round(group_score, 5)}")

print(f"\nPublic LB Score: {round(score_public_lb, 5)}")
print(f"Public LB Rank: {score_public_rank}")

Score for group all_id: 0.29662
Score for group state_id: 0.36528
Score for group store_id: 0.45811
Score for group cat_id: 0.34339
Score for group dept_id: 0.45004
Score for group ['state_id', 'cat_id']: 0.43419
Score for group ['state_id', 'dept_id']: 0.53171
Score for group ['store_id', 'cat_id']: 0.53132
Score for group ['store_id', 'dept_id']: 0.62222
Score for group item_id: 0.82475
Score for group ['item_id', 'state_id']: 0.83321
Score for group ['item_id', 'store_id']: 0.83494

Public LB Score: 0.54382
Public LB Rank: 7921
