In [1]:
# Load the imports
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from pathlib import Path

In [2]:
# -*- coding: utf-8 -*-
"""
Helper file to locally compute Datathon 2025 Metrics.
This file is intended to be used by participants to test the metrics
using custom train/validation splits and also to generate submission files.

Metrics supported:
- Metric 1 (Phase 1-a): 0 actuals
- Metric 2 (Phase 1-b): 6 actuals
"""

# ------------------------------------------------------------------
# Metric 1 (Phase 1-a)
# ------------------------------------------------------------------

def _compute_pe_phase1a(group: pd.DataFrame) -> float:
    """Return the Metric 1 (Phase 1-a) prediction error of a single series.

    :param group: Rows of one (country, brand, bucket) group. The columns
        ``months_postgx``, ``volume_actual``, ``volume_predict`` and
        ``avg_vol`` are read.
    :return: Weighted error normalised by ``avg_vol``; NaN when ``avg_vol``
        (taken from the first row) is zero or NaN.
    """
    norm = group["avg_vol"].iloc[0]
    if np.isnan(norm) or norm == 0:
        return np.nan

    months = group["months_postgx"]

    def window(first: int, last: int) -> pd.DataFrame:
        """Rows whose months_postgx lies in the inclusive range [first, last]."""
        return group.loc[months.between(first, last)]

    def pointwise_error(first: int, last: int) -> float:
        """sum(|actual - pred|) over the window."""
        rows = window(first, last)
        return (rows["volume_actual"] - rows["volume_predict"]).abs().sum()

    def aggregate_error(first: int, last: int) -> float:
        """|sum(actual) - sum(pred)| over the window."""
        rows = window(first, last)
        return abs(rows["volume_actual"].sum() - rows["volume_predict"].sum())

    # (weight, error function, first month, last month, number of months)
    components = (
        (0.2, pointwise_error, 0, 23, 24),
        (0.5, aggregate_error, 0, 5, 6),
        (0.2, aggregate_error, 6, 11, 6),
        (0.1, aggregate_error, 12, 23, 12),
    )
    return sum(
        weight * error(first, last) / (n_window * norm)
        for weight, error, first, last, n_window in components
    )


def _metric1(df_actual: pd.DataFrame, df_pred: pd.DataFrame, df_aux: pd.DataFrame) -> float:
    """Compute the Metric 1 (Phase 1-a) weighted prediction error.

    Actuals and predictions are inner-joined on (country, brand_name,
    months_postgx) and enriched with ``df_aux``. Only series whose earliest
    merged ``months_postgx`` is 0 are scored. Bucket 1 series carry weight 2,
    bucket 2 series weight 1, each averaged over the number of series in the
    bucket.

    :param df_actual: Actual volume data
    :param df_pred: Predicted volume data
    :param df_aux: Auxiliary data with buckets and avg_vol
    :return: Weighted PE total. NaN when no series qualifies at all; a bucket
        without any series contributes 0 instead of raising
        ``ZeroDivisionError``.
    """
    merged = df_actual.merge(
        df_pred,
        on=["country", "brand_name", "months_postgx"],
        how="inner",
        suffixes=("_actual", "_predict")
    ).merge(df_aux, on=["country", "brand_name"], how="left")

    merged["start_month"] = merged.groupby(["country", "brand_name"])["months_postgx"].transform("min")
    merged = merged[merged["start_month"] == 0].copy()

    # Iterate the groups instead of groupby.apply: this works on an empty frame
    # and avoids the deprecated "apply operated on the grouping columns" path.
    pe_results = pd.DataFrame(
        [
            (country, brand_name, bucket, _compute_pe_phase1a(series))
            for (country, brand_name, bucket), series
            in merged.groupby(["country", "brand_name", "bucket"])
        ],
        columns=["country", "brand_name", "bucket", "PE"],
    )
    if pe_results.empty:
        return float("nan")

    def bucket_term(bucket_id: int, weight: float) -> float:
        """weight / n_series * sum(PE) for one bucket; 0 when the bucket is empty."""
        pe = pe_results.loc[pe_results["bucket"] == bucket_id, "PE"]
        n_series = len(pe)  # one row per (country, brand_name) in the bucket
        if n_series == 0:
            return 0.0
        # Series.sum skips NaN PEs while n_series still counts them.
        return (weight / n_series) * pe.sum()

    return bucket_term(1, 2) + bucket_term(2, 1)


def compute_metric1(
    df_actual: pd.DataFrame,
    df_pred: pd.DataFrame,
    df_aux: pd.DataFrame) -> float:
    """Public entry point for Metric 1 (Phase 1-a).

    :param df_actual: Actual volume data
    :param df_pred: Predicted volume data
    :param df_aux: Auxiliary data with buckets and avg_vol
    :return: Metric 1 value rounded to 4 decimal places
    """
    raw_score = _metric1(df_actual, df_pred, df_aux)
    return round(raw_score, 4)


# ------------------------------------------------------------------
# Metric 2 (Phase 1-b)
# ------------------------------------------------------------------

def _compute_pe_phase1b(group: pd.DataFrame) -> float:
    """Return the Metric 2 (Phase 1-b) prediction error of a single series.

    :param group: Rows of one country-brand-bucket group. The columns
        ``months_postgx``, ``volume_actual``, ``volume_predict`` and
        ``avg_vol`` are read.
    :return: Weighted error normalised by ``avg_vol``; NaN when ``avg_vol``
        (taken from the first row) is zero or NaN.
    """
    norm = group["avg_vol"].iloc[0]
    if np.isnan(norm) or norm == 0:
        return np.nan

    months = group["months_postgx"]

    def window(first: int, last: int) -> pd.DataFrame:
        """Rows whose months_postgx lies in the inclusive range [first, last]."""
        return group.loc[months.between(first, last)]

    def pointwise_error(first: int, last: int) -> float:
        """sum(|actual - pred|) over the window."""
        rows = window(first, last)
        return (rows["volume_actual"] - rows["volume_predict"]).abs().sum()

    def aggregate_error(first: int, last: int) -> float:
        """|sum(actual) - sum(pred)| over the window."""
        rows = window(first, last)
        return abs(rows["volume_actual"].sum() - rows["volume_predict"].sum())

    # (weight, error function, first month, last month, number of months)
    components = (
        (0.2, pointwise_error, 6, 23, 18),
        (0.5, aggregate_error, 6, 11, 6),
        (0.3, aggregate_error, 12, 23, 12),
    )
    return sum(
        weight * error(first, last) / (n_window * norm)
        for weight, error, first, last, n_window in components
    )


def _metric2(df_actual: pd.DataFrame, df_pred: pd.DataFrame, df_aux: pd.DataFrame) -> float:
    """Compute the Metric 2 (Phase 1-b) weighted prediction error.

    Actuals and predictions are inner-joined on (country, brand_name,
    months_postgx) and enriched with ``df_aux``. Only series whose earliest
    merged ``months_postgx`` is 6 are scored. Bucket 1 series carry weight 2,
    bucket 2 series weight 1, each averaged over the number of series in the
    bucket.

    :param df_actual: Actual volume data
    :param df_pred: Predicted volume data
    :param df_aux: Auxiliary data with buckets and avg_vol
    :return: Weighted PE total. NaN when no series qualifies at all; a bucket
        without any series contributes 0 instead of raising
        ``ZeroDivisionError``.
    """
    merged_data = df_actual.merge(
        df_pred,
        on=["country", "brand_name", "months_postgx"],
        how="inner",
        suffixes=("_actual", "_predict")
    ).merge(df_aux, on=["country", "brand_name"], how="left")

    merged_data["start_month"] = merged_data.groupby(["country", "brand_name"])["months_postgx"].transform("min")
    merged_data = merged_data[merged_data["start_month"] == 6].copy()

    # Iterate the groups instead of groupby.apply: this works on an empty frame
    # and avoids the deprecated "apply operated on the grouping columns" path.
    pe_results = pd.DataFrame(
        [
            (country, brand_name, bucket, _compute_pe_phase1b(series))
            for (country, brand_name, bucket), series
            in merged_data.groupby(["country", "brand_name", "bucket"])
        ],
        columns=["country", "brand_name", "bucket", "PE"],
    )
    if pe_results.empty:
        return float("nan")

    def bucket_term(bucket_id: int, weight: float) -> float:
        """weight / n_series * sum(PE) for one bucket; 0 when the bucket is empty."""
        pe = pe_results.loc[pe_results["bucket"] == bucket_id, "PE"]
        n_series = len(pe)  # one row per (country, brand_name) in the bucket
        if n_series == 0:
            return 0.0
        # Series.sum skips NaN PEs while n_series still counts them.
        return (weight / n_series) * pe.sum()

    return bucket_term(1, 2) + bucket_term(2, 1)


def compute_metric2(
    df_actual: pd.DataFrame,
    df_pred: pd.DataFrame,
    df_aux: pd.DataFrame) -> float:
    """Public entry point for Metric 2 (Phase 1-b).

    :param df_actual: Actual volume data
    :param df_pred: Predicted volume data
    :param df_aux: Auxiliary data with buckets and avg_vol
    :return: Metric 2 value rounded to 4 decimal places
    """
    raw_score = _metric2(df_actual, df_pred, df_aux)
    return round(raw_score, 4)

In [3]:
# Load datasets
df_train_full = pd.read_csv("../data/processed/df_train_merged_filled.csv")
df_test_full = pd.read_csv("../data/processed/df_test_merged_filled.csv")

# One-hot-encoding categorical features.
# The encoder is fitted ONCE on the training data and reused for the test data,
# so both frames get exactly the same one-hot columns. Categories that only
# occur in train become all-zero columns in test, and categories unseen during
# fit are ignored (handle_unknown="ignore"); no column has to be patched in by hand.
onehot_source_cols = ["month", "ther_area", "main_package", "biological", "small_molecule", "bucket"]
# `bucket` is encoded but the raw column is kept as well.
onehot_drop_cols = ["month", "ther_area", "main_package", "biological", "small_molecule"]

encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
encoder.fit(df_train_full[onehot_source_cols])
onehot_names = encoder.get_feature_names_out()

encoded = encoder.transform(df_train_full[onehot_source_cols])
encoded_df = pd.DataFrame(encoded, columns=onehot_names, index=df_train_full.index)
df_train_full = df_train_full.drop(onehot_drop_cols, axis=1).join(encoded_df)

encoded = encoder.transform(df_test_full[onehot_source_cols])
encoded_df = pd.DataFrame(encoded, columns=onehot_names, index=df_test_full.index)
df_test_full = df_test_full.drop(onehot_drop_cols, axis=1).join(encoded_df)

In [4]:
# Tag every row with its origin (1 = train, 0 = test), then stack both frames
# so the time-series features can be computed in a single pass.
df_train_full = df_train_full.assign(is_train=1)
df_test_full = df_test_full.assign(is_train=0)

df_all = pd.concat([df_train_full, df_test_full], ignore_index=True)

In [5]:
# Order rows chronologically inside each (country, brand_name) series and keep
# a reusable groupby handle for the feature cells that follow.
time_cols = ["country", "brand_name", "months_postgx"]
df_all = df_all.sort_values(by=time_cols)

group = df_all.groupby(by=["country", "brand_name"])

In [6]:
# Lag features: `volume` shifted back by n rows within each series.
# `last_year_volume` is the value 12 rows (months) earlier in the same group.
for lag_name, lag_steps in {
    "lag_1": 1,
    "lag_2": 2,
    "lag_3": 3,
    "last_year_volume": 12,
}.items():
    df_all[lag_name] = group["volume"].shift(lag_steps)

In [7]:
# Rolling averages (past-only, per group)

# Rolling mean of *past* values: shift by 1 so the current row is excluded,
# then average the previous 3 / 6 rows (min_periods=1 allows shorter windows
# at the start of a series).
# `transform` returns a Series indexed exactly like df_all, so the result can
# be assigned directly and does not depend on how `apply` shapes its index
# (group keys prepended or not) in a given pandas version.
roll3 = group["volume"].transform(
    lambda s: s.shift(1).rolling(window=3, min_periods=1).mean()
)
roll6 = group["volume"].transform(
    lambda s: s.shift(1).rolling(window=6, min_periods=1).mean()
)

df_all["roll3_past"] = roll3
df_all["roll6_past"] = roll6

In [8]:
# Group-level mean: average `volume` over all rows of each (country, brand_name)
# series, broadcast back onto every row of that series.
# NOTE(review): the mean spans every month of the series, including the rows
# whose `volume` is later used as the training target — confirm this look-ahead
# is intended.
df_all["group_volume_mean"] = group["volume"].transform("mean")

In [9]:
# The first rows of every series have no history, so their lag / rolling
# features are NaN; replace those gaps with 0.0.
lag_cols = ["lag_1", "lag_2", "lag_3", "roll3_past", "roll6_past", "last_year_volume"]

df_all[lag_cols] = df_all[lag_cols].fillna(value=0.0)

In [10]:
# Undo the concat: recover the train / test frames through the origin flag and
# drop the helper column again.
df_train_full = df_all.loc[df_all["is_train"].eq(1)].drop(columns="is_train")
df_test_full = df_all.loc[df_all["is_train"].eq(0)].drop(columns="is_train")

In [11]:
# Save the feature-engineered train/test frames as .csv files (index omitted).
# These two files are read back later when the n_gxs model is built.
df_train_full.to_csv("../data/processed/df_train_unscaled.csv", index=False)
df_test_full.to_csv("../data/processed/df_test_unscaled.csv", index=False)

In [12]:
# Feature selection for the volume model.
# Every entry — active or commented out — ends with a comma, so toggling a
# candidate can never fuse two adjacent string literals into one bogus name.
numeric_features = [
    "months_postgx",
    # "n_gxs",
    # "hospital_rate",
    # "mean_generic_erosion",
    "lag_1",
    # "lag_2",
    # "lag_3",
    "roll3_past",
    "roll6_past",
    "group_volume_mean",
    # "last_year_volume",
]

categorical_onehot_features = [
    # ther_area
    "ther_area_Anti-infectives",
    # "ther_area_Antineoplastic_and_immunology",
    # "ther_area_Cardiovascular_Metabolic",
    # "ther_area_Dermatology",
    "ther_area_Endocrinology_and_Metabolic_Disease",
    # "ther_area_Haematology",
    # "ther_area_Muscoskeletal_Rheumatology_and_Osteology",
    # "ther_area_Nervous_system",
    # "ther_area_Obstetrics_Gynaecology",
    "ther_area_Others",
    # "ther_area_Parasitology",
    "ther_area_Respiratory_and_Immuno-inflammatory",
    "ther_area_Sensory_organs",
    # "ther_area_Systemic_Hormones",

    # main_package
    "main_package_PATCH",
    # "main_package_PILL",

    # biological / small molecule flags
    # "small_molecule_False",
    # "small_molecule_True",
]

extra_features = [
    # main_package
    # "main_package_CREAM",
    # "main_package_EYE DROP",
    "main_package_INJECTION",
    # "main_package_Others",

    # biological / small molecule flags
    "biological_False",
    # "biological_True",

    # Only for split to create df_aux
    # "country",
    # "brand_name",
    # "avg_vol",
    # "bucket",
    # "bucket_1",
    # "bucket_2",
]

# Column order here is the column order the model is trained and queried with.
features = numeric_features + categorical_onehot_features + extra_features
target = "volume"

In [13]:
# One-hot columns fed to the n_gxs model.
# Candidates currently switched off:
#   ther_area: Antineoplastic_and_immunology, Dermatology,
#              Endocrinology_and_Metabolic_Disease, Haematology, Nervous_system,
#              Obstetrics_Gynaecology, Others, Parasitology, Sensory_organs,
#              Systemic_Hormones
gxs_categorical_onehot_features = [
    # therapeutic area
    "ther_area_Anti-infectives",
    "ther_area_Cardiovascular_Metabolic",
    "ther_area_Muscoskeletal_Rheumatology_and_Osteology",
    "ther_area_Respiratory_and_Immuno-inflammatory",
    # packaging
    "main_package_PATCH",
    "main_package_PILL",
    # small-molecule flags
    "small_molecule_False",
    "small_molecule_True",
]

In [14]:
# Feature matrix and log1p-transformed target (predictions are mapped back to
# the original scale with expm1 further down).
X = df_train_full[features]
y = np.log1p(df_train_full[target])

# Split for validation sets: a shuffled, row-level 70/30 hold-out.
# NOTE(review): rows of one (country, brand_name) series can end up on both
# sides of this split while the features contain lags / rolling means of
# `volume`, so validation scores may be optimistic — consider a group- or
# time-based split.
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.30,
    random_state=42,
    shuffle=True
)

In [15]:
# Volume model: gradient-boosted trees on the log1p target.
# Hyperparameters are collected in one dict so they can be inspected or reused.
xgb_params = dict(
    # boosting schedule
    n_estimators=1200,
    learning_rate=0.07,
    early_stopping_rounds=100,
    # tree shape
    max_depth=8,
    min_child_weight=9,
    gamma=0,
    # row / column sampling
    subsample=0.5,
    colsample_bytree=0.9,
    # regularisation
    reg_lambda=10,
    reg_alpha=1,
    # objective, evaluation and engine
    objective="reg:squarederror",
    eval_metric="rmse",
    tree_method="hist",
    random_state=42,
)
model = XGBRegressor(**xgb_params)

# The validation split doubles as the early-stopping monitor.
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=False
)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.9
,device,
,early_stopping_rounds,100
,enable_categorical,False


In [16]:
# Evaluate on validation set.
# y_val and y_pred are both in log1p(volume) space, so RMSE / MAE / R² below
# describe the error of the log-transformed target, not of raw volumes.
y_pred = model.predict(X_val)

rmse = np.sqrt(mean_squared_error(y_val, y_pred))
mae  = mean_absolute_error(y_val, y_pred)
r2   = r2_score(y_val, y_pred)

print("RMSE:", rmse)
print("MAE:",  mae)
print("R²:",   r2)


# ---- Compute the competition metrics on the validation set (disabled) ----
# NOTE(review): `prediction` and `df_aux` are not defined anywhere in the
# visible notebook; build them before re-enabling the lines below.
# m1 = compute_metric1(X_val, prediction, df_aux)
# m2 = compute_metric2(X_val, prediction, df_aux)

# print(f"Metric 1 - Phase 1-a (local validation): {m1}")
# print(f"Metric 2 - Phase 1-b (local validation): {m2}")

RMSE: 0.29549219133353566
MAE: 0.15947061338559618
R²: 0.9895326889120424


In [17]:
# Randomized hyperparameter search: candidate values per XGBoost parameter.
param_grid = dict(
    n_estimators=[800, 1000, 1200, 1500],
    learning_rate=[0.03, 0.05, 0.07, 0.1],
    max_depth=[5, 6, 7, 8],
    min_child_weight=[5, 7, 9, 11],
    subsample=[0.5, 0.6, 0.7, 0.8],
    colsample_bytree=[0.7, 0.8, 0.9, 1.0],
    # extra regularisation knobs
    gamma=[0, 0.1, 0.3],
    reg_lambda=[1, 3, 5, 10],
    reg_alpha=[0, 0.1, 0.5, 1],
)

# 30 sampled configurations, 3-fold CV, scored by (negated) RMSE.
# The search wraps the volume `model` defined above.
search = RandomizedSearchCV(
    model,
    param_grid,
    n_iter=30,
    scoring="neg_root_mean_squared_error",
    cv=3,
    random_state=42,
    verbose=2,
)

# Fitting is disabled; uncomment to run the search.
# search.fit(
#     X_train, y_train,
#     eval_set=[(X_val, y_val)],
#     verbose=False
# )
# print("BEST:", search.best_params_)

In [18]:
# Predictions on test set
X_test = df_test_full[features]
y_test_pred = model.predict(X_test)

# The model predicts log1p(volume); expm1 maps back to the original scale.
# This assignment replaces the `volume` column of every test row with the
# prediction before the frame is written to disk.
df_test_full[target] = np.expm1(y_test_pred)
df_test_full.to_csv("../data/processed/df_pred_unscaled.csv", index=False)

In [19]:
# Prepare the submission template: it is enriched with the feature columns of
# the prediction frame wherever rows can be matched.
submission_example = pd.read_csv("../examples_and_template/submission_example.csv")
pred_unscaled = pd.read_csv("../data/processed/df_pred_unscaled.csv")

# One-hot calendar-month columns, in calendar order.
existing_months = [
    "month_Jan", "month_Feb", "month_Mar", "month_Apr",
    "month_May", "month_Jun", "month_Jul", "month_Aug",
    "month_Sep", "month_Oct", "month_Nov", "month_Dec",
]

# All remaining columns copied over from the prediction frame.
existing_cols = [
    # raw numeric descriptors
    "hospital_rate", "avg_vol", "vol_norm", "mean_generic_erosion",
    # therapeutic area one-hots
    "ther_area_Anti-infectives",
    "ther_area_Antineoplastic_and_immunology",
    "ther_area_Cardiovascular_Metabolic",
    "ther_area_Dermatology",
    "ther_area_Endocrinology_and_Metabolic_Disease",
    "ther_area_Haematology",
    "ther_area_Muscoskeletal_Rheumatology_and_Osteology",
    "ther_area_Nervous_system",
    "ther_area_Obstetrics_Gynaecology",
    "ther_area_Others",
    "ther_area_Parasitology",
    "ther_area_Respiratory_and_Immuno-inflammatory",
    "ther_area_Sensory_organs",
    "ther_area_Systemic_Hormones",
    # packaging one-hots
    "main_package_CREAM", "main_package_EYE DROP", "main_package_INJECTION",
    "main_package_Others", "main_package_PATCH", "main_package_PILL",
    # molecule flags
    "biological_False", "biological_True",
    "small_molecule_False", "small_molecule_True",
    # bucket one-hots
    "bucket_1", "bucket_2",
    # time-series features
    "lag_1", "lag_2", "lag_3",
    "roll3_past", "roll6_past",
    "group_volume_mean", "last_year_volume",
    # raw bucket label
    "bucket",
]

# Create every required column that the template lacks, initialised to NaN.
missing_cols = [
    name for name in existing_months + existing_cols
    if name not in submission_example.columns
]
for name in missing_cols:
    submission_example[name] = np.nan

# Walk the submission rows in order and copy feature values from the prediction
# frame. Each iteration builds full boolean masks over both frames, so the cost
# grows with len(submission_example) * len(pred_unscaled).
# NOTE(review): the fallback branch writes to EVERY submission row of the
# (country, brand_name) pair, including rows that an earlier iteration filled
# from an exact match, so the final values depend on row order — confirm this
# is intended.
for row in submission_example.itertuples(index=False):
    country = row.country
    brand_name = row.brand_name
    months_postgx = row.months_postgx

    # ----- exact match on country + brand_name + months_postgx -----
    mask_pred_3 = (
        (pred_unscaled["country"] == country) &
        (pred_unscaled["brand_name"] == brand_name) &
        (pred_unscaled["months_postgx"] == months_postgx)
    )
    match_3 = pred_unscaled[mask_pred_3]
    # same mask on submission_example
    mask_sub_3 = (
        (submission_example["country"] == country) &
        (submission_example["brand_name"] == brand_name) &
        (submission_example["months_postgx"] == months_postgx)
    )

    if not match_3.empty:
        # Use the first matching prediction row as the source of values.
        first3 = match_3.iloc[0]

        # fill only rows that match this exact triple
        # (month one-hots and all other feature columns)
        submission_example.loc[mask_sub_3, existing_months] = (
            first3[existing_months].values
        )
        submission_example.loc[mask_sub_3, existing_cols] = (
            first3[existing_cols].values
        )

    else:
        # ----- fallback: match only on country + brand_name -----
        mask_pred_2 = (
            (pred_unscaled["country"] == country) &
            (pred_unscaled["brand_name"] == brand_name)
        )
        match_2 = pred_unscaled[mask_pred_2]

        if not match_2.empty:
            # First prediction row of the pair, in pred_unscaled's row order.
            first2 = match_2.iloc[0]

            mask_sub_2 = (
                (submission_example["country"] == country) &
                (submission_example["brand_name"] == brand_name)
            )

            # Fill all submission rows of this pair; the month one-hots are
            # left untouched in this branch.
            submission_example.loc[mask_sub_2, existing_cols] = (
                first2[existing_cols].values
            )

# n_gxs starts out empty here; it is filled with model predictions further down.
submission_example["n_gxs"] = np.nan
submission_example.to_csv("../submissions/submission_prep.csv", index=False)

In [20]:
# Calendar-month imputation.
# Fallback for groups without any fully-filled month row: months_postgx == 0 is
# assumed to fall in June.
default_start_month = "month_Jun"

month_to_idx = {m: i for i, m in enumerate(existing_months)}
n_months = len(existing_months)

def infer_months_for_group(group: pd.DataFrame) -> pd.DataFrame:
    """Rebuild the one-hot calendar-month columns of one (country, brand_name) group.

    The calendar month of every row is derived from ``months_postgx`` relative
    to an anchor:

    * Case A - the group has a row whose month columns are all non-null: that
      row's encoded month, shifted back by the row's own ``months_postgx``,
      gives the calendar month of ``months_postgx == 0``.
    * Case B - no such row exists: ``months_postgx == 0`` is assumed to be
      ``default_start_month``.

    :param group: Rows of one group; needs ``months_postgx`` and every column
        listed in ``existing_months``.
    :return: A copy of ``group`` with the month columns overwritten by a 0.0/1.0
        one-hot encoding. The input frame is not modified.
    """
    # Work on a copy: groupby.apply hands over a slice of the parent frame.
    group = group.copy()

    mask_full = group[existing_months].notna().all(axis=1)

    if mask_full.any():
        base_row = group.loc[mask_full].iloc[0]
        # The encoded month is the column holding the maximum (1.0).
        base_month_col = base_row[existing_months].astype(float).idxmax()
        # Anchor on months_postgx == 0. The base row is not guaranteed to be the
        # month-0 row, so its own offset has to be subtracted.
        zero_idx = month_to_idx[base_month_col] - int(base_row["months_postgx"])
    else:
        zero_idx = month_to_idx[default_start_month]

    # Calendar month index of each row (Python's % keeps the result in 0..11
    # even when the left-hand side is negative).
    offset = group["months_postgx"].astype(int)
    month_idx = (zero_idx + offset) % n_months

    # Build one-hot encoding into the existing_months columns
    for i, col in enumerate(existing_months):
        group[col] = (month_idx == i).astype(float)

    return group

# Applying it to the whole dataframe, one (country, brand_name) group at a time.
# group_keys=False keeps the group labels out of the result's index.
# NOTE(review): the function receives the grouping columns as part of each
# group; newer pandas versions emit a DeprecationWarning for that.
submission_example = (
    submission_example
    .groupby(["country", "brand_name"], group_keys=False)
    .apply(infer_months_for_group)
)

# Overwrite the prepared submission file with the month-completed frame.
submission_example.to_csv("../submissions/submission_prep.csv", index=False)


  .apply(infer_months_for_group)


In [21]:
# Creating prediction model for imputation of missing n_gxs values

# Load datasets (the feature-engineered frames written to disk earlier)
df_train_full = pd.read_csv("../data/processed/df_train_unscaled.csv")
df_test_full = pd.read_csv("../data/processed/df_test_unscaled.csv")

# Every entry — active or commented out — ends with a comma, so re-enabling a
# candidate cannot fuse it with the following string literal.
gxs_numeric_features = [
    "months_postgx",
    # "hospital_rate",
    # "mean_generic_erosion",
    "lag_1",
    "lag_2",
    "lag_3",
    "roll3_past",
    "roll6_past",
    "group_volume_mean",
    "last_year_volume",
]

gxs_features = gxs_numeric_features + gxs_categorical_onehot_features
gxs_target = "n_gxs"

gxs_X = df_train_full[gxs_features]
gxs_y = df_train_full[gxs_target]

# Split for validation sets: a shuffled, row-level 70/30 hold-out.
gxs_X_train, gxs_X_val, gxs_y_train, gxs_y_val = train_test_split(
    gxs_X, gxs_y,
    test_size=0.30,
    random_state=42,
    shuffle=True
)

In [22]:
# n_gxs model: gradient-boosted trees predicting the number of generics.
# Hyperparameters are collected in one dict so they can be inspected or reused.
gxs_xgb_params = dict(
    # boosting schedule
    n_estimators=1000,
    learning_rate=0.07,
    early_stopping_rounds=100,
    # tree shape
    max_depth=8,
    min_child_weight=5,
    gamma=0.1,
    # row / column sampling
    subsample=0.8,
    colsample_bytree=1.0,
    # regularisation
    reg_lambda=1,
    reg_alpha=0.5,
    # objective, evaluation and engine
    objective="reg:squarederror",
    eval_metric="rmse",
    tree_method="hist",
    random_state=42,
)
gxs_model = XGBRegressor(**gxs_xgb_params)

# The validation split doubles as the early-stopping monitor.
gxs_model.fit(
    gxs_X_train, gxs_y_train,
    eval_set=[(gxs_X_val, gxs_y_val)],
    verbose=False
)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,1.0
,device,
,early_stopping_rounds,100
,enable_categorical,False


In [23]:
# Evaluate the n_gxs model on its validation set.
# The target is untransformed here, so RMSE is in units of n_gxs.
gxs_y_pred = gxs_model.predict(gxs_X_val)

gxs_rmse = np.sqrt(mean_squared_error(gxs_y_val, gxs_y_pred))
gxs_r2 = r2_score(gxs_y_val, gxs_y_pred)

print("gxs_RMSE:", gxs_rmse)
print("gxs_R²:", gxs_r2)

gxs_RMSE: 1.5627974236694089
gxs_R²: 0.7740153364001294


In [24]:
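# Hyperparameter tuning through randomized search (disabled).
# NOTE: `search` was built around the volume `model`; rebuild it with
# `estimator=gxs_model` before enabling the call below.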
# search.fit(
#     gxs_X_train, gxs_y_train,
#     eval_set=[(gxs_X_val, gxs_y_val)], 
#     verbose=False
# )

# print("BEST:", search.best_params_)

In [25]:
# Predictions on submission set for n_gxs: fill the (so far empty) n_gxs column
# of the prepared submission frame with the model's output.
# gxs_model is a regressor, so the stored values are continuous, not integer counts.
gxs_X_sub_prep = submission_example[gxs_features]
gxs_y_sub_prep = gxs_model.predict(gxs_X_sub_prep)

submission_example[gxs_target] = gxs_y_sub_prep

submission_example.to_csv("../submissions/submission_prep.csv", index=False)

In [None]:
# Volume predictions for the submission rows.
sub_prep = pd.read_csv("../submissions/submission_prep.csv")
vol_X_sub_prep = sub_prep[features]
vol_y_sub_prep = model.predict(vol_X_sub_prep)

# The model works in log1p space; expm1 brings predictions back to volumes.
sub_prep[target] = np.expm1(vol_y_sub_prep)

# Repair preparation: work on a chronologically ordered copy (sort_values
# returns a new frame, so sub_prep itself keeps its row order).
df = sub_prep.sort_values(["country", "brand_name", "months_postgx"])

# Negative volumes become NaN so they can be interpolated afterwards.
df["volume"] = df["volume"].where(~(df["volume"] < 0))

# Global fallback values, taken from the groups that do have valid volumes:
# the smallest first / last non-null volume across all groups.
grouped = df.groupby(["country", "brand_name"])["volume"]

first_valid = grouped.first().dropna()
last_valid = grouped.last().dropna()

if first_valid.empty:
    global_min_start = 0.0
else:
    global_min_start = first_valid.min()

if last_valid.empty:
    global_min_end = 0.0
else:
    global_min_end = last_valid.min()


# Repairs the `volume` column of a single (country, brand_name) group.
def fix_negative_volumes(group, min_start, min_end):
    """Return a copy of ``group`` whose negative volumes have been replaced.

    Negative entries are treated as missing. If at least one valid volume
    remains, the gaps are filled by linear interpolation (edges take the nearest
    valid value). If nothing valid remains, the column becomes an evenly spaced
    ramp from ``min_start`` to ``min_end``.
    """
    repaired = group.copy()

    # Negatives -> NaN; values that are already NaN stay NaN.
    volume = repaired["volume"].mask(repaired["volume"] < 0)

    if volume.isna().all():
        # Nothing usable in this group: fall back to the global ramp.
        repaired["volume"] = np.linspace(min_start, min_end, len(repaired))
        return repaired

    repaired["volume"] = volume.interpolate(method="linear", limit_direction="both")
    return repaired


# Apply per (country, brand_name); group_keys=False keeps the original row
# index on the result.
df_fixed = (
    df.groupby(["country", "brand_name"], group_keys=False)
      .apply(fix_negative_volumes, min_start=global_min_start, min_end=global_min_end)
)

# Insert fixed volumes back into the submission dataframe.
# The assignment aligns on the row index, so the sorted order of df_fixed does
# not matter here.
sub_prep["volume"] = df_fixed["volume"]

sub_prep.to_csv("../submissions/submission_prep.csv", index=False)
required_columns = ["country", "brand_name", "months_postgx", "volume"]

# Final submission file: only the four required columns are written.
final_submission = sub_prep[required_columns]
final_submission.to_csv("../submissions/final_submission.csv", index=False)

In [27]:
# Feature importance extraction for the volume model: per-feature score of the
# fitted booster using the "gain" importance type.
model.get_booster().get_score(importance_type="gain")

{'months_postgx': 4.76936149597168,
 'lag_1': 61.83859634399414,
 'roll3_past': 134.41693115234375,
 'roll6_past': 5.761993408203125,
 'group_volume_mean': 226.92921447753906,
 'ther_area_Anti-infectives': 0.7941064238548279,
 'ther_area_Endocrinology_and_Metabolic_Disease': 0.5390650629997253,
 'ther_area_Others': 0.9380091428756714,
 'ther_area_Respiratory_and_Immuno-inflammatory': 0.48132774233818054,
 'ther_area_Sensory_organs': 0.378600150346756,
 'main_package_PATCH': 0.04217873141169548,
 'main_package_INJECTION': 2.2692606449127197,
 'biological_False': 0.7054699659347534}

In [28]:
# Feature importance extraction for n_gxs model: per-feature score of the
# fitted booster using the "gain" importance type.
gxs_model.get_booster().get_score(importance_type="gain")


{'months_postgx': 52.113216400146484,
 'lag_1': 19.71152687072754,
 'lag_2': 13.525495529174805,
 'lag_3': 13.049736022949219,
 'roll3_past': 35.059539794921875,
 'roll6_past': 17.984695434570312,
 'group_volume_mean': 54.09865188598633,
 'last_year_volume': 12.360663414001465,
 'ther_area_Anti-infectives': 60.62411117553711,
 'ther_area_Cardiovascular_Metabolic': 37.40120315551758,
 'ther_area_Muscoskeletal_Rheumatology_and_Osteology': 36.840396881103516,
 'ther_area_Respiratory_and_Immuno-inflammatory': 39.66255187988281,
 'main_package_PATCH': 144.18020629882812,
 'main_package_PILL': 69.47817993164062,
 'small_molecule_False': 92.97964477539062}

In [29]:
# Ensemble: average the volumes of two earlier submissions.
best = pd.read_csv("../submissions/final_submission_best.csv")
eight = pd.read_csv("../submissions/final_submission_8.csv")

# Pair the rows on their identifying keys instead of on row position, so a
# different row order in the two files cannot average unrelated rows.
# Assumes both files carry the submission columns
# (country, brand_name, months_postgx, volume).
ensemble_keys = ["country", "brand_name", "months_postgx"]
paired = best[ensemble_keys + ["volume"]].merge(
    eight[ensemble_keys + ["volume"]],
    on=ensemble_keys,
    how="left",
    suffixes=("", "_eight"),
    validate="one_to_one",  # duplicate keys in either file are an error
    indicator=True,
)
unmatched = int((paired["_merge"] != "both").sum())
if unmatched:
    raise ValueError(
        f"{unmatched} row(s) of final_submission_best.csv have no counterpart "
        "in final_submission_8.csv"
    )

# A one-to-one left merge keeps the row order and count of `best`, so the
# averaged values can be written back positionally.
best["volume"] = ((paired["volume"] + paired["volume_eight"]) / 2.0).to_numpy()
best.to_csv("../submissions/final_submission_ensemble.csv", index=False)