## XGBoost Synthetic Dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
np.random.seed(42)

In [3]:
products = [
    "ORS_Solution",        # temp-sensitive (heatwaves)
    "Paracetamol_500mg",   # flu season + outbreak flags
    "Antibiotic_A",        # doctor-driven + irregular demand
    "Sunscreen",           # summer + weather
    "Travel_Antidiarrheal" # travel/holiday sensitive
]

In [4]:
start_date = "2022-01-01"
end_date   = "2024-12-31"   # 3 years
dates = pd.date_range(start=start_date, end=end_date, freq="D")

In [5]:
# Create master dataset
data = []
for date in dates:
    temp = 20 + 10*np.sin(2*np.pi*date.timetuple().tm_yday/365) + np.random.normal(0,2)  # seasonal temp
    holiday = 1 if date.weekday() in [5,6] else 0  # weekend as holiday
    outbreak = 1 if (date.month in [1,2,12] and np.random.rand() < 0.05) else 0  # flu outbreak chance
    
    for product in products:
        # Base sales per product
        if product == "ORS_Solution":
            sales = 30 + 2*temp + 10*holiday + np.random.normal(0,5)
        elif product == "Paracetamol_500mg":
            sales = 40 + 15*outbreak + np.random.normal(0,8)
        elif product == "Antibiotic_A":
            sales = 25 + np.random.poisson(3) + np.random.normal(0,4)
        elif product == "Sunscreen":
            sales = 20 + 3*max(temp-25,0) + np.random.normal(0,5)
        elif product == "Travel/Antidiarrheal":
            sales = 15 + 12*holiday + np.random.normal(0,3)
        
        sales = float(np.ravel(sales)[0])  
        data.append([date, product, temp, holiday, outbreak, max(0, int(sales))])

In [6]:
df = pd.DataFrame(data, columns=["date","product","temperature","holiday","outbreak","sales"])

In [7]:
df.shape

(5480, 6)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5480 entries, 0 to 5479
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date         5480 non-null   datetime64[ns]
 1   product      5480 non-null   object        
 2   temperature  5480 non-null   float64       
 3   holiday      5480 non-null   int64         
 4   outbreak     5480 non-null   int64         
 5   sales        5480 non-null   int64         
dtypes: datetime64[ns](1), float64(1), int64(3), object(1)
memory usage: 257.0+ KB


In [9]:
df.sample(10)

Unnamed: 0,date,product,temperature,holiday,outbreak,sales
1457,2022-10-19,Antibiotic_A,9.946125,0,0,31
3932,2024-02-26,Antibiotic_A,29.790525,0,0,31
4801,2024-08-18,Paracetamol_500mg,15.291859,1,0,39
4490,2024-06-17,ORS_Solution,20.483677,0,0,74
1114,2022-08-11,Travel_Antidiarrheal,15.485726,0,0,11
2554,2023-05-26,Travel_Antidiarrheal,28.748027,0,0,31
4979,2024-09-22,Travel_Antidiarrheal,10.585379,1,0,20
731,2022-05-27,Paracetamol_500mg,27.665481,0,0,35
1929,2023-01-21,Travel_Antidiarrheal,22.031891,1,0,26
602,2022-05-01,Antibiotic_A,28.321157,1,0,35


In [10]:
train_df = df[df["date"] <= "2024-11-30"]
test_df  = df[(df["date"] >= "2024-12-01") & (df["date"] <= "2024-12-30")]

In [11]:
train_df.shape, test_df.shape

((5325, 6), (150, 6))

In [12]:
train_df.to_csv("C:/Users/Admin/IIS/Data/Synthetic/Training/xgb_data.csv", index=False)

In [13]:
test_df.to_csv("C:/Users/Admin/IIS/Data/Synthetic/Test/xbg_data.csv", index=False)