In [None]:
import pathlib
import sys

In [None]:
import joblib
import kaggle
import numpy as np
import pandas as pd
import scipy.stats as stats

In [None]:
# Project root, given relative to the notebook's working directory; the data,
# model and source paths defined in the next cell are all built from it.
root_dir = "../.."

In [None]:
# Resolve the project layout once so later cells only refer to named paths.
root_dir_path = pathlib.Path(root_dir)

# Local source tree.
src_dir_path = root_dir_path.joinpath("src")

# Raw competition inputs.
data_dir_path = root_dir_path.joinpath("data")
raw_dir_path = data_dir_path.joinpath("raw")
sample_submission_path = raw_dir_path.joinpath("sample_submission.csv")

# Model artefacts: the fitted model, its predictions and the file to upload.
models_dir_path = root_dir_path.joinpath("models")
model_path = models_dir_path.joinpath("lgbm_reg.joblib")
prediction_path = models_dir_path.joinpath("prediction.parquet")
submission_path = models_dir_path.joinpath("submission_uncertainty.csv.gz")

In [None]:
# Put the project's src directory on the import path so the local `package`
# import in the following cell can be resolved (presumably `package` lives
# under src - confirm against the repository layout).
sys.path.append(str(src_dir_path))

In [None]:
from package.constants import *

In [None]:
# Load the sample submission; its "id" column is used later as the left side
# of the merge with the pivoted predictions.
sample_submission = pd.read_csv(sample_submission_path)

In [None]:
# Load the saved predictions. Later cells read its "id" and "d" columns and
# the column named by `target`.
prediction = pd.read_parquet(prediction_path)

In [None]:
# Drop the two-character prefix of each day label, read the rest as an integer
# and shift it so that day `train_days + 1` becomes offset 0.
d = prediction["d"].str[2:].astype("int") - (train_days + 1)

# Offsets at or beyond `test_days` belong to the second window; flag them
# before folding every offset back into the range [0, test_days).
is_evaluation = d >= test_days
d = d % test_days

prediction["d"] = d

# Rows of the second window get the "_evaluation" flavour of their id.
prediction.loc[is_evaluation, "id"] = (
    prediction.loc[is_evaluation, "id"].str.replace("_validation", "_evaluation")
)

In [None]:
# Reshape the long predictions to wide form: one row per id, one column per
# day offset, cells filled from the `target` column.
submission = prediction.pivot(index="id", columns="d", values=target)

In [None]:
# Move the pivot index ("id") back into an ordinary column.
submission = submission.reset_index()

In [None]:
# Relabel the columns positionally: "id" first, then F1 .. F<test_days>.
submission.columns = ["id", *(f"F{day}" for day in range(1, test_days + 1))]

In [None]:
# Left-join the predictions onto the sample submission ids so that every id of
# the sample file is kept, in the sample file's order.
submission = pd.merge(left=sample_submission[["id"]], right=submission, how="left")

In [None]:
# This process will need to be fixed later: only the rows whose id ends in
# "_validation" are kept here.
is_validation = submission["id"].str.endswith("_validation")
# Take an explicit copy. The next cell adds columns to `submission`, and
# assigning into the result of a boolean-mask selection makes pandas emit a
# SettingWithCopyWarning.
submission = submission[is_validation].copy()

In [None]:
# Split each id into two captured parts: the first group
# (<word>_<digits>_<digits>) is stored as item_id and the second
# (<word>_<digits>) as store_id; the trailing "_\w+" consumes the rest.
submission[["item_id", "store_id"]] = submission["id"].str.extract(
    r"(\w+_\d+_\d+)_(\w+_\d+)_\w+"
)
# dept_id: the part of item_id that precedes a final "_<digits>" component.
submission["dept_id"] = submission["item_id"].str.extract(r"(\w+_\d+)_\d+")
# cat_id: the part of dept_id that precedes a final "_<digits>" component.
submission["cat_id"] = submission["dept_id"].str.extract(r"(\w+)_\d+")
# state_id: the part of store_id that precedes a final "_<digits>" component.
submission["state_id"] = submission["store_id"].str.extract(r"(\w+)_\d+")

In [None]:
def get_ratios(coef=0.15, quantiles=None):
    """Return one multiplicative ratio per quantile, equal to 1 at the median.

    Each quantile ``q`` is mapped to ``norm.cdf(coef * log(q / (1 - q)))`` and
    the result is divided by the value obtained for ``q = 0.5``.

    Parameters
    ----------
    coef : float
        Scale applied to the logit of each quantile before the normal CDF.
    quantiles : array-like of float, optional
        Quantile levels to evaluate. Defaults to the module-level ``qs``.

    Returns
    -------
    pandas.Series
        Ratios rounded to three decimals, indexed by quantile level.
    """
    levels = np.asarray(qs if quantiles is None else quantiles, dtype=float)
    scaled_logit = np.log(levels / (1 - levels)) * coef

    # The logit of 0.5 is exactly 0, so the median's value is norm.cdf(0)
    # whatever `coef` is. Normalising by it directly avoids relying on the
    # median sitting at a fixed position of the quantile grid.
    median_value = stats.norm.cdf(0.0)
    ratios = stats.norm.cdf(scaled_logit) / median_value

    return pd.Series(ratios, index=levels).round(3)


def quantile_coefs(q, level):
    """Look up the ratios of one aggregation level for the given quantiles.

    ``level`` is a key of ``level_coef_dict`` and ``q`` holds quantile labels
    found in that entry's index. The matching ratios are returned as an array
    in the order of ``q``.
    """
    level_ratios = level_coef_dict[level]
    selected = level_ratios.loc[q]

    return selected.values


def get_group_preds(pred, level):
    """Aggregate the forecasts to one level and expand them to every quantile.

    The forecast columns listed in the module-level ``cols`` are summed per
    value of ``level``. The aggregated table is then repeated once per entry
    of the module-level ``qs`` and each copy is scaled by that quantile's
    ratio for ``level`` (see ``quantile_coefs``).

    Parameters
    ----------
    pred : pandas.DataFrame
        Must contain the ``level`` column and every column in ``cols``.
    level : str
        Column to group by.

    Returns
    -------
    pandas.DataFrame
        Columns ``["id"] + cols``. Ids are ``<group>_X_<q>_validation``, or
        ``<id without "_validation">_<q>_validation`` when ``level == "id"``.
    """
    totals = pred.groupby(level)[cols].sum()

    # `q` labels the stacked rows: the first len(totals) rows belong to qs[0],
    # the next len(totals) rows to qs[1], and so on. The number of stacked
    # copies therefore has to be len(qs), not a fixed count.
    q = np.repeat(qs, len(totals))
    df = pd.concat([totals] * len(qs), axis=0, sort=False).reset_index()

    # The (n, 1) column of ratios broadcasts across all forecast columns.
    df[cols] = df[cols] * quantile_coefs(q, level)[:, None]

    group_keys = df[level].values
    if level != "id":
        df["id"] = [
            f"{key}_X_{quantile:.3f}_validation"
            for key, quantile in zip(group_keys, q)
        ]
    else:
        # Series ids already end in "_validation"; strip it before inserting
        # the quantile so the suffix appears only once.
        df["id"] = [
            f"{key.replace('_validation', '')}_{quantile:.3f}_validation"
            for key, quantile in zip(group_keys, q)
        ]

    return df[["id"] + list(cols)]


def get_couple_group_preds(pred, level1, level2):
    """Aggregate the forecasts to a pair of levels and expand to every quantile.

    The forecast columns listed in the module-level ``cols`` are summed per
    ``(level1, level2)`` combination. The aggregated table is then repeated
    once per entry of the module-level ``qs`` and each copy is scaled by that
    quantile's ratio for the ``(level1, level2)`` key (see ``quantile_coefs``).

    Parameters
    ----------
    pred : pandas.DataFrame
        Must contain both level columns and every column in ``cols``.
    level1, level2 : str
        Columns to group by, in this order.

    Returns
    -------
    pandas.DataFrame
        Columns ``["id"] + cols`` with ids of the form
        ``<level1 value>_<level2 value>_<q>_validation``.
    """
    totals = pred.groupby([level1, level2])[cols].sum()

    # `q` labels the stacked rows block by block, so the number of stacked
    # copies has to be len(qs), not a fixed count.
    q = np.repeat(qs, len(totals))
    df = pd.concat([totals] * len(qs), axis=0, sort=False).reset_index()

    # The (n, 1) column of ratios broadcasts across all forecast columns.
    df[cols] = df[cols] * quantile_coefs(q, (level1, level2))[:, None]

    df["id"] = [
        f"{first}_{second}_{quantile:.3f}_validation"
        for first, second, quantile in zip(df[level1].values, df[level2].values, q)
    ]

    return df[["id"] + list(cols)]


# Constant key shared by every row, so grouping by "_all_" yields one total.
submission["_all_"] = "Total"

# Quantile levels for which scaled forecasts are produced.
qs = np.array([0.005, 0.025, 0.165, 0.25, 0.5, 0.75, 0.835, 0.975, 0.995])

# Spread coefficient passed to get_ratios for each aggregation level (single
# column name) or pair of levels (tuple). The values below span 0.05 to 0.3
# and are probably suboptimal for now.
level_coef_dict = {
    "id": get_ratios(coef=0.3),
    "item_id": get_ratios(coef=0.15),
    "dept_id": get_ratios(coef=0.08),
    "cat_id": get_ratios(coef=0.07),
    "store_id": get_ratios(coef=0.08),
    "state_id": get_ratios(coef=0.07),
    "_all_": get_ratios(coef=0.05),
    ("state_id", "item_id"): get_ratios(coef=0.19),
    ("state_id", "dept_id"): get_ratios(coef=0.1),
    ("store_id", "dept_id"): get_ratios(coef=0.11),
    ("state_id", "cat_id"): get_ratios(coef=0.08),
    ("store_id", "cat_id"): get_ratios(coef=0.1),
}

# Single-column aggregation levels, handled by get_group_preds.
levels = ["id", "item_id", "dept_id", "cat_id", "store_id", "state_id", "_all_"]
# Two-column aggregation levels, handled by get_couple_group_preds.
couples = [
    ("state_id", "item_id"),
    ("state_id", "dept_id"),
    ("store_id", "dept_id"),
    ("state_id", "cat_id"),
    ("store_id", "cat_id"),
]
# Forecast columns F1 .. F<test_days>. Derived from `test_days` so the list
# always matches the column names given to `submission` earlier in the notebook.
cols = [f"F{i}" for i in range(1, test_days + 1)]

# Quantile forecasts for every single level, then for every pair of levels.
frames = []

for level in levels:
    frames.append(get_group_preds(submission, level))

for level1, level2 in couples:
    frames.append(get_couple_group_preds(submission, level1, level2))

df = pd.concat(frames, axis=0, sort=False).reset_index(drop=True)

# Stack a second copy of all rows below the first; this copy becomes the
# "_evaluation" half of the file.
df = pd.concat([df, df], axis=0, sort=False).reset_index(drop=True)

is_second_half = df.index >= len(df.index) // 2

# The pattern is anchored with "$", so it must be treated as a regular
# expression. pandas >= 2.0 defaults to regex=False, which would search for a
# literal "$" and leave every id unchanged.
df.loc[is_second_half, "id"] = df.loc[is_second_half, "id"].str.replace(
    r"_validation$", "_evaluation", regex=True
)

submission = df

In [None]:
# Write the submission without the index column. The target path ends in
# ".csv.gz", so pandas' default compression inference gzips the output.
submission.to_csv(submission_path, index=False)

In [None]:
# Load the fitted model; in this notebook it is only used to put its
# best_score into the submission message.
# NOTE: joblib.load unpickles the file, so only load artefacts you trust.
model = joblib.load(model_path)

In [None]:
# Upload the submission file to the competition, using the model's best_score
# (converted to a string) as the submission message.
# NOTE(review): this reads `model.best_score`; confirm the loaded object
# exposes that attribute under this exact name (some estimator wrappers use
# `best_score_` instead).
kaggle.api.competition_submit(
    submission_path,
    message=str(model.best_score),
    competition="m5-forecasting-uncertainty",
)