In [None]:
%pip install matplotlib
%pip install darts
%pip install catboost

In [None]:
from datetime import datetime
from darts import TimeSeries
from darts.models import CatBoostModel
import pandas as pd
import numpy as np
import os
import kaggle_metric
import utils

In [None]:
# One seed for the whole notebook: it initialises NumPy's global RNG here and
# is passed to each CatBoostModel as `random_state` further down.
random_state = 42
np.random.seed(seed=random_state)

In [None]:
# Build the receivals frame with the project helper; the columns used later
# in this notebook are "rm_id", "date_arrival" and "net_weight".
receivals_csv = "./data/kernel/receivals.csv"
df = utils.create_df(receivals_csv)
df.head()

In [None]:
# One TimeSeries per rm_id: grouped on "rm_id", indexed by "date_arrival",
# carrying "net_weight" as the value column.
series_lst = TimeSeries.from_group_dataframe(
    df,
    group_cols="rm_id",
    time_col="date_arrival",
    value_cols="net_weight",
)

In [None]:
# Split every series into a training part and a validation part (split point
# 0.8 passed to `split_after`) and index the full series and both parts by the
# rm_id stored in the series' static covariates.
series_dict = {}
train_dict = {}
val_dict = {}
for series in series_lst:
    static_covs = series.static_covariates
    if static_covs is None or "rm_id" not in static_covs:
        # Previously this case bound rm_id to None and then failed on
        # `None.values` with an unhelpful AttributeError; fail loudly instead.
        raise ValueError(
            "Series has no 'rm_id' static covariate, so it cannot be keyed by rm_id."
        )
    # The covariate column holds one value per series; take it as the dict key.
    rm_id = static_covs["rm_id"].values[0]

    train_i, val_i = series.split_after(0.8)
    series_dict[rm_id] = series
    train_dict[rm_id] = train_i
    val_dict[rm_id] = val_i

In [None]:
# Fit one CatBoost forecaster per rm_id on its training split, forecast the
# validation horizon, and score the cumulative forecasts with the Kaggle metric.
model_dict = {}
loss = []  # never filled in this cell; kept so the name stays defined

eval_frames = []
for key, train_series in train_dict.items():
    print(f"Training model for rm_id: {key}")
    val_series = val_dict[key]
    # Ground truth is the running total of the validation values; it is the
    # same whether or not the model below can be trained.
    actual_cumulative = val_series.cumsum().values().flatten()

    model = CatBoostModel(
        lags=30, use_static_covariates=False, random_state=random_state
    )
    try:
        model.fit(train_series)
        # Registered right after fitting, so the model is kept for the final
        # forecast even if the validation prediction below fails.
        model_dict[key] = model
        forecast = model.predict(len(val_series), series=train_series)
        # Negative daily forecasts are clipped to 0 before accumulating.
        predicted_cumulative = forecast.values().clip(min=0).cumsum()
        # Cumulative forecasts below 100 are reported as 0 (threshold taken
        # from the original code; its rationale is not documented here).
        predicted_cumulative[predicted_cumulative < 100] = 0
    except Exception as e:
        # Best effort: a series that cannot be modelled is scored as all zeros.
        predicted_cumulative = [0] * len(val_series)
        print(f"Error training model for rm_id {key}: {e}")

    eval_frames.append(
        pd.DataFrame(
            {
                "predicted_weight": predicted_cumulative,
                "weight": actual_cumulative,
            }
        )
    )

# Stack all series and give every row a unique ID for the metric.
df_eval = pd.concat(eval_frames).reset_index(drop=True)
df_eval["ID"] = df_eval.index
score = kaggle_metric.score(
    df_eval[["ID", "weight"]], df_eval[["ID", "predicted_weight"]]
)
print(f"Validation score: {score}")

In [None]:
# Overlay actual and predicted cumulative weights for every validation row.
ax = df_eval.plot(y="weight", label="Actual")
df_eval.plot(y="predicted_weight", ax=ax, label="Predicted", alpha=0.5)
# Label the figure so it can be read on its own; the trailing semicolon hides
# the repr of the returned list.
ax.set(
    title="Validation: actual vs. predicted cumulative weight",
    xlabel="Validation row (all rm_id series concatenated)",
    ylabel="Cumulative net weight",
);

In [None]:
# Forecast every rm_id listed in the prediction mapping and attach a cumulative
# predicted weight to each of its mapping rows.
prediction_mapping = pd.read_csv("./data/prediction_mapping.csv")
# Parse both window bounds as UTC, then drop the timezone and the time of day
# so they are plain calendar dates.
for date_col in ("forecast_start_date", "forecast_end_date"):
    prediction_mapping[date_col] = (
        pd.to_datetime(prediction_mapping[date_col], utc=True)
        .dt.tz_localize(None)
        .dt.normalize()
    )
prediction_mapping = prediction_mapping.sort_values(["rm_id", "forecast_end_date"])
max_date = pd.Timestamp("2025-05-31")
prediction_frames = []
for rm_id, group in prediction_mapping.groupby("rm_id"):
    print(f"Processing rm_id {rm_id}")
    if rm_id not in model_dict:
        print(f"No model for rm_id {rm_id}, setting predicted_weight to 0")
        predicted = 0
    else:
        last_train = series_dict[rm_id].time_index[-1]
        # Using `.days` as the step count presumes one step per day - confirm
        # the series frequency.
        n_steps = (max_date - last_train).days - 1
        print(
            f"Predicting {n_steps} steps for rm_id {rm_id} because last known date is {last_train}"
        )
        preds = model_dict[rm_id].predict(n_steps, series=series_dict[rm_id])
        # Clip negative forecasts to 0, accumulate, then zero out running
        # totals below 100 (same post-processing as in validation).
        predicted = preds.values().clip(min=0).cumsum()
        predicted[predicted < 100] = 0
        # NOTE(review): the values are attached positionally, so n_steps must
        # equal the number of mapping rows for this rm_id or pandas raises a
        # length-mismatch ValueError - confirm all series end on the same date.
    # `assign` returns a new frame instead of writing into the groupby slice,
    # which avoids SettingWithCopyWarning.
    prediction_frames.append(group.assign(predicted_weight=predicted))

df_final = pd.concat(prediction_frames)

In [None]:
# Write the predictions to a timestamped CSV so earlier submissions are kept.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
submission_dir = "./data/submission/"
submission_path = f"{submission_dir}submission_{timestamp}.csv"
# exist_ok=True creates the folder when needed and does nothing if it already
# exists, replacing the separate (and racy) existence check.
os.makedirs(submission_dir, exist_ok=True)

df_final[["ID", "predicted_weight"]].to_csv(submission_path, index=False)