In [None]:
import sys

In [None]:
import joblib
import kaggle
import numpy as np
import pandas as pd
import scipy.stats as stats

In [None]:
# Directory appended to sys.path so the project's `package` modules can be imported.
src_dir = "../../src"
# Message attached to any Kaggle submission made at the end of this notebook.
description = ""
# Set to True to upload the accuracy submission file to Kaggle.
accuracy = False
# Set to True to upload the uncertainty submission file to Kaggle.
uncertainty = False

In [None]:
# Make modules under src_dir importable for the rest of the session.
sys.path.append(src_dir)

In [None]:
from package.constants import *
from package.preprocessing import *

In [None]:
# Load the sample submission; its "id" column later fixes the row set of the
# accuracy submission. sample_submission_path is not defined in this notebook
# (presumably it comes from the star imports — verify).
sample_submission = pd.read_csv(sample_submission_path)

In [None]:
# Load the long-format predictions. The cells below read the "id" and "d"
# columns plus the column named by `target`. prediction_path is not defined in
# this notebook (presumably it comes from the star imports — verify).
prediction = pd.read_parquet(prediction_path)

In [None]:
# Drop the first two characters of the day label (presumably a "d_" prefix —
# verify), parse the rest as an integer and express it as an offset from the
# first day after the training window.
d = prediction["d"].str[2:].astype("int") - (train_days + 1)

# Rows whose offset falls inside the first `evaluation_days` window are the
# ones relabelled as "_validation" below.
is_valid = d < evaluation_days

# Fold every offset into the range [0, evaluation_days) so both windows share
# the same horizon index.
d = d % evaluation_days
prediction["d"] = d

validation_ids = prediction.loc[is_valid, "id"]
prediction.loc[is_valid, "id"] = validation_ids.str.replace(
    "_evaluation", "_validation"
)

In [None]:
# Reshape long predictions to wide: one row per id, one column per horizon index.
submission = prediction.pivot(index="id", columns="d", values=target)

In [None]:
# Move "id" out of the index so it becomes the first regular column.
submission = submission.reset_index()

In [None]:
# Rename the columns to "id", "F1", ..., "F<evaluation_days>".
submission.columns = ["id", *(f"F{day}" for day in range(1, evaluation_days + 1))]

In [None]:
# Left-join onto the sample submission ids so the output has exactly those
# rows, in that order; ids without a prediction end up with NaN forecasts.
submission = sample_submission[["id"]].merge(submission, how="left")

In [None]:
# Write the accuracy submission without the DataFrame index column.
submission.to_csv(submission_accuracy_path, index=False)

In [None]:
# Keep only the "_evaluation" rows as the basis for the uncertainty submission.
is_eval = submission["id"].str.endswith("_evaluation")
# Take an explicit copy: later cells add columns to this frame, and mutating a
# boolean-mask slice raises SettingWithCopyWarning with ambiguous semantics.
submission = submission[is_eval].copy()

In [None]:
# The return value is discarded, so create_ids presumably adds the hierarchy
# key columns (item_id, dept_id, cat_id, store_id, state_id) to `submission`
# in place; those columns are grouped on below — verify against the helper.
create_ids(submission)

In [None]:
def get_ratios(coef=0.15):
    """Build the multiplier applied to a point forecast for each quantile in ``qs``.

    Each quantile is mapped to its logit, scaled by ``coef`` and passed through
    the standard normal CDF. The result is divided by its entry at position 4
    (the 0.5 quantile in the ``qs`` array defined in this notebook), so that
    entry becomes 1.

    Parameters
    ----------
    coef : float, default 0.15
        Scale applied to the logits; larger values spread the multipliers
        further from 1.

    Returns
    -------
    pandas.Series
        Multipliers rounded to 3 decimals, indexed by the values of ``qs``.
    """
    scaled_logits = coef * np.log(qs / (1 - qs))
    cdf_values = stats.norm.cdf(scaled_logits)
    normalized = cdf_values / cdf_values[4]

    return pd.Series(normalized, index=qs).round(3)


def quantile_coefs(q, level):
    """Look up the multipliers for the quantiles ``q`` at an aggregation level.

    Parameters
    ----------
    q : array-like of float
        Quantile labels; each must be present in the index of the ratio Series.
    level : str or tuple of str
        Key into ``level_coef_dict``.

    Returns
    -------
    numpy.ndarray
        One multiplier per entry of ``q``, in the same order.
    """
    return level_coef_dict[level].loc[q].values


def get_group_preds(pred, level):
    """Aggregate point forecasts over one grouping column and expand to quantiles.

    Parameters
    ----------
    pred : pandas.DataFrame
        Must contain the column named by ``level`` and the forecast columns
        listed in ``cols``.
    level : str
        Column to group by; also the key used to look up the quantile
        multipliers in ``level_coef_dict``.

    Returns
    -------
    pandas.DataFrame
        Columns ``["id"] + cols`` with ``len(qs)`` rows per group. Ids have
        the form ``<group>_X_<q>_evaluation``; when ``level == "id"`` the
        existing ``_evaluation`` text is removed from the key and the id is
        ``<key>_<q>_evaluation``.
    """
    grouped = pred.groupby(level)[cols].sum()

    # One quantile label per output row: all groups for qs[0], then all groups
    # for qs[1], and so on. This matches the row order of the stacked copies.
    quantiles = np.repeat(qs, len(grouped))
    # Stack one copy per quantile. The count is derived from qs rather than
    # hard-coded so the rows stay aligned with `quantiles` if qs changes.
    df = pd.concat([grouped] * len(qs), axis=0, sort=False)

    df.reset_index(inplace=True)

    # Scale every forecast row by the multiplier of its quantile.
    df[cols] *= quantile_coefs(quantiles, level)[:, None]

    # Read the group keys before "id" is (re)assigned below.
    keys = df[level].values
    if level != "id":
        df["id"] = [
            f"{key}_X_{quantile:.3f}_evaluation"
            for key, quantile in zip(keys, quantiles)
        ]
    else:
        df["id"] = [
            f"{key.replace('_evaluation', '')}_{quantile:.3f}_evaluation"
            for key, quantile in zip(keys, quantiles)
        ]

    return df[["id"] + list(cols)]


def get_couple_group_preds(pred, level1, level2):
    """Aggregate point forecasts over a pair of grouping columns and expand to quantiles.

    Parameters
    ----------
    pred : pandas.DataFrame
        Must contain the columns named by ``level1`` and ``level2`` and the
        forecast columns listed in ``cols``.
    level1, level2 : str
        Columns to group by; the tuple ``(level1, level2)`` is the key used to
        look up the quantile multipliers in ``level_coef_dict``.

    Returns
    -------
    pandas.DataFrame
        Columns ``["id"] + cols`` with ``len(qs)`` rows per group pair. Ids
        have the form ``<level1 key>_<level2 key>_<q>_evaluation``.
    """
    grouped = pred.groupby([level1, level2])[cols].sum()

    # One quantile label per output row, in the same order as the stacked copies.
    quantiles = np.repeat(qs, len(grouped))
    # Stack one copy per quantile. The count is derived from qs rather than
    # hard-coded so the rows stay aligned with `quantiles` if qs changes.
    df = pd.concat([grouped] * len(qs), axis=0, sort=False)

    df.reset_index(inplace=True)

    # Scale every forecast row by the multiplier of its quantile.
    df[cols] *= quantile_coefs(quantiles, (level1, level2))[:, None]
    df["id"] = [
        f"{key1}_{key2}_{quantile:.3f}_evaluation"
        for key1, key2, quantile in zip(
            df[level1].values, df[level2].values, quantiles
        )
    ]

    return df[["id"] + list(cols)]


# Constant key so that grouping by "all_id" aggregates every series into one total.
# NOTE(review): this value becomes the id prefix for that level ("0_X_<q>_...");
# confirm it matches the ids expected in the uncertainty submission.
submission["all_id"] = 0

# Quantile levels for which a forecast row is produced.
qs = np.array([0.005, 0.025, 0.165, 0.25, 0.5, 0.75, 0.835, 0.975, 0.995])

# Spread coefficient per aggregation level, passed to get_ratios. The values
# below range from 0.03 to 0.25 and are probably suboptimal for now.
level_coef_dict = {
    "all_id": get_ratios(coef=0.03),
    "state_id": get_ratios(coef=0.04),
    "store_id": get_ratios(coef=0.05),
    "cat_id": get_ratios(coef=0.04),
    "dept_id": get_ratios(coef=0.05),
    ("state_id", "cat_id"): get_ratios(coef=0.05),
    ("state_id", "dept_id"): get_ratios(coef=0.07),
    ("store_id", "cat_id"): get_ratios(coef=0.07),
    ("store_id", "dept_id"): get_ratios(coef=0.08),
    "item_id": get_ratios(coef=0.11),
    ("state_id", "item_id"): get_ratios(coef=0.15),
    "id": get_ratios(coef=0.25),
}

# Single-column aggregation levels and two-column combinations to emit.
levels = ["id", "item_id", "dept_id", "cat_id", "store_id", "state_id", "all_id"]
couples = [
    ("state_id", "item_id"),
    ("state_id", "dept_id"),
    ("store_id", "dept_id"),
    ("state_id", "cat_id"),
    ("store_id", "cat_id"),
]
# Forecast columns F1..F28.
cols = [f"F{i}" for i in range(1, 29)]

# Quantile forecasts for every level, stacked into one frame.
df = [get_group_preds(submission, level) for level in levels]
df += [
    get_couple_group_preds(submission, level1, level2) for level1, level2 in couples
]

df = pd.concat(df, axis=0, sort=False)

df.reset_index(drop=True, inplace=True)

# Duplicate every row: the first half is relabelled as "_validation" below,
# the second half keeps its "_evaluation" ids.
df = pd.concat([df, df], axis=0, sort=False)

df.reset_index(drop=True, inplace=True)

is_valid = df.index < len(df.index) // 2
# The pattern is anchored with "$", so it must be treated as a regular
# expression. regex=True is passed explicitly because pandas >= 2.0 defaults
# to literal matching, which would leave these ids unchanged.
df.loc[is_valid, "id"] = df.loc[is_valid, "id"].str.replace(
    "_evaluation$", "_validation", regex=True
)

submission = df

In [None]:
# Write the uncertainty submission without the DataFrame index column.
submission.to_csv(submission_uncertainty_path, index=False)

In [None]:
# Upload the accuracy file only when the `accuracy` flag in the config cell is set.
if accuracy:
    kaggle.api.competition_submit(
        submission_accuracy_path,
        message=description,
        competition="m5-forecasting-accuracy",
    )

In [None]:
# Upload the uncertainty file only when the `uncertainty` flag in the config cell is set.
if uncertainty:
    kaggle.api.competition_submit(
        submission_uncertainty_path,
        message=description,
        competition="m5-forecasting-uncertainty",
    )