In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor

# Scenario 1

In [2]:
# --------------- loading train data ---------------
df = pd.read_csv("merged_cleaned_train_data.csv")
#df = df.drop(columns=["Unnamed: 0"])

df.shape
print("Unique months_postgx in train:", sorted(df["months_postgx"].unique()))

Unique months_postgx in train: [-24, -23, -22, -21, -20, -19, -18, -17, -16, -15, -14, -13, -12, -11, -10, -9, -8, -7, -6, -5, -4, -3, -2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23]


In [3]:
# --------------------- calculating avg_vol 12 months BEFORE/PRE generic entry ---------------------------------------
pre = df[df["months_postgx"].between(-12, -1)].copy() # pre-generic baseline which is the previous year before generic entry (forecast setup from novartis)

avg_vol = (pre.groupby(["country", "brand_name"])["volume"].mean().rename("avg_vol").reset_index())

df = df.merge(avg_vol, on=["country", "brand_name"], how="left")
df

Unnamed: 0,country,brand_name,month,months_postgx,volume,n_gxs,ther_area,hospital_rate,main_package,biological,small_molecule,avg_vol
0,COUNTRY_B6AE,BRAND_1C1E,Jul,-24,272594.392100,0,Muscoskeletal_Rheumatology_and_Osteology,0.002088,PILL,False,True,524817.004042
1,COUNTRY_B6AE,BRAND_1C1E,Aug,-23,351859.310300,0,Muscoskeletal_Rheumatology_and_Osteology,0.002088,PILL,False,True,524817.004042
2,COUNTRY_B6AE,BRAND_1C1E,Sep,-22,447953.481300,0,Muscoskeletal_Rheumatology_and_Osteology,0.002088,PILL,False,True,524817.004042
3,COUNTRY_B6AE,BRAND_1C1E,Oct,-21,411543.292400,0,Muscoskeletal_Rheumatology_and_Osteology,0.002088,PILL,False,True,524817.004042
4,COUNTRY_B6AE,BRAND_1C1E,Nov,-20,774594.454200,0,Muscoskeletal_Rheumatology_and_Osteology,0.002088,PILL,False,True,524817.004042
...,...,...,...,...,...,...,...,...,...,...,...,...
93739,COUNTRY_C89B,BRAND_E7D1,Jul,19,6079.701119,3,Anti-infectives,46.870110,INJECTION,False,True,14029.295487
93740,COUNTRY_C89B,BRAND_E7D1,Aug,20,6763.451949,3,Anti-infectives,46.870110,INJECTION,False,True,14029.295487
93741,COUNTRY_C89B,BRAND_E7D1,Sep,21,5983.521634,3,Anti-infectives,46.870110,INJECTION,False,True,14029.295487
93742,COUNTRY_C89B,BRAND_E7D1,Oct,22,7089.474661,3,Anti-infectives,46.870110,INJECTION,False,True,14029.295487


In [4]:
# ----------------- post generic filtering train (SCENARIO 1: months 0 to 23) ---------------------------
train_df = df[df["months_postgx"].between(0, 23)].copy() # filtering to only have months 0 to 23 (post-generic entry)
print("post generic months shape:", train_df.shape)

post generic months shape: (46872, 12)


In [5]:
# --------------------------------- feature engineering ---------------------------
# cat cols
cat_cols = ["month", "ther_area", "main_package"]
for col in cat_cols:
    train_df[col] = train_df[col].astype(str) # converting cat to string

# numeric cols
num_cols = ["months_postgx", "n_gxs", "hospital_rate", "avg_vol"]
train_df["hospital_rate"] = train_df["hospital_rate"].fillna(train_df["hospital_rate"].median()) # filling nans with median ??

# boolean like cols
bool_cols = ["biological", "small_molecule"]
for col in bool_cols:
    train_df[col] = train_df[col].astype(float) # converting to float

# ----------------= ONE-HOT ENCODING cat cols --------------------------------
X = pd.get_dummies(train_df[num_cols + bool_cols + cat_cols], drop_first=True)

feature_cols = X.columns

# target col
y = train_df["volume"].values

print("Feature matrix shape:", X.shape)

Feature matrix shape: (46872, 36)


In [6]:
# --------------------- data splitting and model training --------------------
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

model = RandomForestRegressor(n_estimators=300, max_depth=None, n_jobs=-1, random_state=42,)

model.fit(X_train, y_train)

print("Train R2:", model.score(X_train, y_train))
print("Val R2:", model.score(X_val, y_val))


Train R2: 0.9920374334507885
Val R2: 0.9708202736009313


### S1 Predictions on Test data

In [7]:
# ------------ loading test data -----------------
df_vol_test = pd.read_csv("df_volume_test1.csv")
df_gen_test = pd.read_csv("df_generics_test1.csv")
df_med_test = pd.read_csv("df_medicine_info_test1.csv")


# avg_vol from -12 to -1 months pre generics for test
pre_t = df_vol_test[df_vol_test["months_postgx"].between(-12, -1)].copy()

avg_vol_test = (pre_t.groupby(["country", "brand_name"])["volume"].mean().rename("avg_vol").reset_index())

In [8]:
# ------------------------- test prediction forecast (0-23 post-generic entry) -------------------------
# mergin datasets to match structure of model
test_horizon = (df_gen_test.merge(df_med_test, on=["country", "brand_name"], how="left")
    .merge(avg_vol_test, on=["country", "brand_name"], how="left"))

# adding month col from df_colume_test
test_horizon = test_horizon.merge(df_vol_test[["country", "brand_name", "months_postgx", "month"]],
    on=["country", "brand_name", "months_postgx"], how="left")

In [9]:
#------------------------- feature engineering test the same as train --------------------------
# SAME PROCESS AS DONE IN TRAIN SETS
for col in cat_cols:
    test_horizon[col] = test_horizon[col].astype(str)

test_horizon["hospital_rate"] = test_horizon["hospital_rate"].fillna(test_horizon["hospital_rate"].median())

for col in bool_cols:
    test_horizon[col] = test_horizon[col].astype(float)

# one-hot encoding 
X_test = pd.get_dummies(test_horizon[num_cols + bool_cols + cat_cols], drop_first=True)
X_test = X_test.reindex(columns=feature_cols, fill_value=0)

In [10]:
#-------------- predicting volumns for test set months 0 to 23 -----------------
preds = model.predict(X_test)
test_horizon["volume"] = preds

In [11]:
# test_df for s2
test_df = test_horizon.copy()
test_df = test_df.rename(columns={"volume": "volume_s1"})   # renaming to volume_s1 for s2 input

In [12]:
# ----------------- submission compilation for scenario 1 --------------------------
s1 = test_horizon[["country", "brand_name", "months_postgx", "volume"]].copy()
s1 = s1[s1["months_postgx"].between(0, 23)]

s1 = s1.sort_values(["country", "brand_name", "months_postgx"]).reset_index(drop=True)

print(s1.head())

s1.to_csv("s1_randomforest_baseline_vol_pred.csv", index=False)

        country  brand_name  months_postgx        volume
0  COUNTRY_0024  BRAND_31BE              0  59992.848903
1  COUNTRY_0024  BRAND_31BE              1  55904.373833
2  COUNTRY_0024  BRAND_31BE              2  51726.099351
3  COUNTRY_0024  BRAND_31BE              3  48870.090055
4  COUNTRY_0024  BRAND_31BE              4  42042.588756


# Scenario 2

In [13]:
# filtered months_postgx 0 to 23 and pivot to each country/brand_name cols
pivot_train = train_df.pivot_table(index=["country", "brand_name"], columns="months_postgx", values="volume")

needed_months = list(range(0, 24)) # keeping only months that have 0 to 23
pivot_train = pivot_train.dropna(subset=needed_months)

pivot_train.columns = [f"vol_{m}" for m in pivot_train.columns] # renaming month vol col

X_S2 = pivot_train[[f"vol_{m}" for m in range(0, 6)]].values # selecting first 6 months as input 

y_S2 = pivot_train[[f"vol_{m}" for m in range(6, 24)]].values # selecting 7 to 24 months as target 

# training multi-output RF model for s2
s2_model = MultiOutputRegressor(
    RandomForestRegressor(
        n_estimators=300,
        max_depth=None,
        n_jobs=-1,
        random_state=42,
    )
)

s2_model.fit(X_S2, y_S2)

### S2 Predictions on Test data

In [14]:
# ------------------- s2 pred ---------------------------------
# SAME PREPROCESSIGN IN TRAIN DATA
pivot_test_s1 = test_df.pivot_table(index=["country", "brand_name"],columns="months_postgx",values="volume_s1")

needed_early = list(range(0, 6))
pivot_test_s1 = pivot_test_s1.dropna(subset=needed_early)

pivot_test_s1.columns = [f"vol_{m}" for m in pivot_test_s1.columns]

X_S2_test = pivot_test_s1[[f"vol_{m}" for m in range(0, 6)]].values

print("S2 test input shape:", X_S2_test.shape)

# PREDICTING volumes for months 6 to 23 on test
s2_pred = s2_model.predict(X_S2_test)

months_future = list(range(6, 24))
s2_pred_df = pd.DataFrame(s2_pred,index=pivot_test_s1.index,columns=[f"vol_{m}" for m in months_future])

# transform back to long form structure
s2_pred_long = (s2_pred_df
      .reset_index()
      .melt(id_vars=["country", "brand_name"], var_name="vol_month", value_name="volume_s2")
)

# extracting numeric months_postgx
s2_pred_long["months_postgx"] = s2_pred_long["vol_month"].str.replace("vol_", "").astype(int)
s2_pred_long = s2_pred_long.drop(columns=["vol_month"])


S2 test input shape: (340, 6)


# Combining s1 and s2 predictions

In [15]:
# s1 pred
test_df["volume_final"] = test_df["volume_s1"]

# merging s2 preds (only for months 6 to 23, only for brands that had 0 to 5)
test_df = test_df.merge(
    s2_pred_long,
    on=["country", "brand_name", "months_postgx"],
    how="left"
)

# overwriting s1 preds where ther is s2 pred. (so months 0 to 5 is s1 pred, and months 6 to 23 should have s2 pred)
test_df.loc[test_df["volume_s2"].notnull(), "volume_final"] = test_df.loc[
    test_df["volume_s2"].notnull(), "volume_s2"
]

In [16]:
# ------------------------- final submission file --------------------------
submission = test_df[["country", "brand_name", "months_postgx", "volume_final"]].copy()
submission = submission[submission["months_postgx"].between(0, 23)]

submission = submission.rename(columns={"volume_final": "volume"}) # renaming for format
submission = submission.sort_values(["country", "brand_name", "months_postgx"]).reset_index(drop=True)

print(submission.head())

submission.to_csv("submission_s1_s2_combined.csv", index=False)

        country  brand_name  months_postgx        volume
0  COUNTRY_0024  BRAND_31BE              0  59992.848903
1  COUNTRY_0024  BRAND_31BE              1  55904.373833
2  COUNTRY_0024  BRAND_31BE              2  51726.099351
3  COUNTRY_0024  BRAND_31BE              3  48870.090055
4  COUNTRY_0024  BRAND_31BE              4  42042.588756
