## LSTM Synthetic Dataset

In [15]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random

In [16]:
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

In [17]:
PRODUCTS = [
    "Paracetamol_500mg",
    "Vitamin_C",
    "Antibiotic_A",
    "Insulin_Supplies",
    "PainRelief_Gel"
]

In [18]:
start_date = pd.to_datetime("2022-01-01")
end_date = pd.to_datetime("2024-12-31") # ~3 years
dates = pd.date_range(start_date, end_date, freq='D')
n = len(dates)

In [19]:
all_products_data = []

In [20]:
for product in PRODUCTS:
    # Base demand and weekly/annual seasonality
    base = 50 + 5 * np.sin(2 * np.pi * dates.dayofyear / 365.25)
    weekly = 5 * np.sin(2 * np.pi * dates.dayofweek / 7)
    trend = np.linspace(0, 10, n)
    
    # Promotions: product-specific pattern
    promotion = np.zeros(n, dtype=int)
    for year in [2022, 2023, 2024]:
        year_dates = pd.date_range(f"{year}-01-01", f"{year}-12-31", freq='D')
        for _ in range(15):
            start = np.random.choice(year_dates[:-10])
            length = np.random.randint(3, 10)
            idx_start = (start - start_date).days
            promotion[idx_start: idx_start+length] = 1
    
    # Holidays (same across products)
    holiday_dates = [
        pd.to_datetime("2022-01-01"), pd.to_datetime("2022-12-25"),
        pd.to_datetime("2023-01-01"), pd.to_datetime("2023-11-12"),
        pd.to_datetime("2024-01-01"), pd.to_datetime("2024-11-01")
    ]
    holiday_flag = np.isin(dates, holiday_dates).astype(int)
    
    # Noise & promotion/holiday uplift
    noise = np.random.normal(0, 3, n)
    promo_uplift = promotion * (15 + np.random.normal(0, 5, n))
    holiday_uplift = holiday_flag * 20
    
    # Product-specific irregularity
    if product == "Paracetamol_500mg":
        spike = np.random.randint(0, 20, n)
    elif product == "PainRelief_Gel":
        spike = promotion * np.random.randint(10, 30, n)
    else:
        spike = np.zeros(n)
    
    sales = base + weekly + trend + promo_uplift + holiday_uplift + noise + spike
    sales = np.clip(sales.round().astype(int), a_min=0, a_max=None)
    
    df_product = pd.DataFrame({
        "date": dates,
        "product": product,
        "sales": sales,
        "promotion": promotion,
        "holiday": holiday_flag
    })
    
    all_products_data.append(df_product)

In [21]:
df_all = pd.concat(all_products_data).reset_index(drop=True)

In [22]:
df_all.sample(10)

Unnamed: 0,date,product,sales,promotion,holiday
2979,2024-02-27,Antibiotic_A,76,1,0
4691,2022-11-04,PainRelief_Gel,47,0,0
5316,2024-07-21,PainRelief_Gel,91,1,0
191,2022-07-11,Paracetamol_500mg,82,1,0
2833,2023-10-04,Antibiotic_A,56,0,0
1661,2023-07-20,Vitamin_C,68,1,0
3968,2023-11-12,Insulin_Supplies,44,0,0
687,2023-11-19,Paracetamol_500mg,52,0,0
2187,2024-12-27,Vitamin_C,59,0,0
608,2023-09-01,Paracetamol_500mg,67,0,0


In [23]:
df_all.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5480 entries, 0 to 5479
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   date       5480 non-null   datetime64[ns]
 1   product    5480 non-null   object        
 2   sales      5480 non-null   int64         
 3   promotion  5480 non-null   int64         
 4   holiday    5480 non-null   int64         
dtypes: datetime64[ns](1), int64(3), object(1)
memory usage: 214.2+ KB


In [24]:
df_all.shape

(5480, 5)

In [25]:
train_df = df_all[df_all["date"] <= "2024-11-30"]
test_df  = df_all[(df_all["date"] >= "2024-12-01") & (df_all["date"] <= "2024-12-30")]

In [26]:
train_df.shape, test_df.shape   

((5325, 5), (150, 5))

In [None]:
train_df.to_csv("C:/Users/Admin/NIYOJAN/Data/Synthetic/Training/Lstm_data.csv", index=False)

In [None]:
test_df.to_csv("C:/Users/Admin/NIYOJAN/Data/Synthetic/Test/Lstm_data.csv", index=False)