## Prophet Synthetic Dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
plt.rcParams['figure.figsize'] = (12, 6)
sns.set_style('whitegrid')

daily sales for 'Cough Syrup' over 3 years

In [3]:
def generate_synthetic_sales(product_name, start_date='2022-01-01', periods=3*365, freq='D'):
    """
    Generates synthetic daily sales data for a product with seasonal effects:
    - ORS: summer peak
    - Cough Syrup: winter peak
    - Vitamin C: cold/flu spikes in winter
    - Sunscreen: summer peak
    - Antihistamine: pollen/allergy season (spring)
    """
    dates = pd.date_range(start=start_date, periods=periods, freq=freq)
    n = len(dates)
    
    # Base level
    base_level = {
        'ORS_Solution': 15, 'Cough_Syrup': 20, 'Vitamin_C': 18,
        'Sunscreen': 10, 'Antihistamine': 12
    }.get(product_name, 15)
    
    # Seasonal strength
    seasonal_strength = 0.6
    weekly_strength = 0.2
    promo_spike_strength = 30
    noise_std = 3
    
    day_of_year = dates.dayofyear.values
    day_of_week = dates.dayofweek.values
    
    # Define yearly seasonality per product
    if product_name in ['Cough_Syrup', 'Vitamin_C']:
        # Winter peak: Nov-Feb
        yearly = seasonal_strength * (1 + np.cos((2*np.pi*(day_of_year - 15))/365))
    elif product_name in ['ORS_Solution', 'Sunscreen']:
        # Summer peak: May-Aug
        yearly = seasonal_strength * (1 + np.cos((2*np.pi*(day_of_year - 200))/365))
    elif product_name == 'Antihistamine':
        # Spring peak: Mar-May
        yearly = seasonal_strength * (1 + np.cos((2*np.pi*(day_of_year - 100))/365))
    else:
        yearly = np.zeros(n)
    
    # Weekly seasonality
    weekly = weekly_strength * (1 + 0.5 * ((day_of_week >= 0) & (day_of_week <= 4)).astype(float))
    
    # Trend: slight upward
    trend = 0.01 * np.arange(n) / 365.0
    
    # Base signal
    raw_signal = base_level * (1 + yearly + weekly + trend)
    
    # Promotional spikes
    promo = np.zeros(n)
    rng = np.random.default_rng(RANDOM_SEED)
    for i, dt in enumerate(dates):
        # chance of promo in peak season
        if ((product_name in ['Cough_Syrup', 'Vitamin_C']) and dt.month in [11,12,1,2]) or \
           ((product_name in ['ORS_Solution','Sunscreen']) and dt.month in [5,6,7,8]) or \
           ((product_name == 'Antihistamine') and dt.month in [3,4,5]):
            if rng.random() < 0.03:
                promo_length = rng.integers(2,7)
                promo[i:i+promo_length] += promo_spike_strength * (1 + rng.random())
        # occasional random promo
        elif rng.random() < 0.005:
            promo_length = rng.integers(1,4)
            promo[i:i+promo_length] += promo_spike_strength*0.6
    
    # Random anomalies
    anomalies = np.zeros(n)
    for i in range(n):
        if rng.random() < 0.002:
            anomalies[i] = -0.7 * raw_signal[i]
    
    # Noise
    noise = rng.normal(0, noise_std, n)
    
    y = raw_signal + promo + anomalies + noise
    y = np.clip(y, 0, None)
    y = np.round(y).astype(int)
    
    df = pd.DataFrame({
        'ds': dates,
        'y': y,
        'product': product_name
    })
    
    # Add regressors
    df['month'] = df['ds'].dt.month
    df['day_of_year'] = df['ds'].dt.dayofyear
    # Winter flag
    df['is_winter'] = df['month'].isin([11,12,1,2]).astype(int)
    # Synthetic temp (summer=hot, winter=cold)
    df['temp'] = 25 + 8*np.sin(2*np.pi*(df['day_of_year'] - 200)/365) + np.random.normal(0,1,len(df))
    
    return df

In [4]:
# Generate for all products
products = ['ORS_Solution', 'Cough_Syrup', 'Vitamin_C', 'Sunscreen', 'Antihistamine']
all_data = pd.concat([generate_synthetic_sales(p) for p in products], ignore_index=True)

In [5]:
all_data.sample(10)

Unnamed: 0,ds,y,product,month,day_of_year,is_winter,temp
914,2024-07-03,41,ORS_Solution,7,185,0,22.861878
1929,2024-04-14,33,Cough_Syrup,4,105,0,18.97501
4794,2023-02-19,28,Antihistamine,2,50,1,20.604043
2308,2022-04-29,36,Vitamin_C,4,119,0,17.658715
88,2022-03-30,25,ORS_Solution,3,89,0,16.928151
3339,2022-02-24,12,Sunscreen,2,55,1,21.983368
312,2022-11-09,26,ORS_Solution,11,313,1,33.271208
4873,2023-05-09,26,Antihistamine,5,129,0,17.71402
4851,2023-04-17,28,Antihistamine,4,107,0,19.552639
1375,2022-10-08,32,Cough_Syrup,10,281,0,32.666467


In [6]:
all_data.shape

(5475, 7)

In [7]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5475 entries, 0 to 5474
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   ds           5475 non-null   datetime64[ns]
 1   y            5475 non-null   int64         
 2   product      5475 non-null   object        
 3   month        5475 non-null   int32         
 4   day_of_year  5475 non-null   int32         
 5   is_winter    5475 non-null   int64         
 6   temp         5475 non-null   float64       
dtypes: datetime64[ns](1), float64(1), int32(2), int64(2), object(1)
memory usage: 256.8+ KB


In [8]:
train_df = all_data[all_data["ds"] <= "2024-11-30"]
test_df  = all_data[(all_data["ds"] >= "2024-12-01") & (all_data["ds"] <= "2024-12-30")]

In [14]:
print(test_df['ds'].min(), test_df['ds'].max())

2024-12-01 00:00:00 2024-12-30 00:00:00


In [9]:
train_df.shape, test_df.shape

((5325, 7), (150, 7))

In [None]:
train_df.to_csv("C:/Users/Admin/NIYOJAN/Data/Synthetic/Training/prophet_data.csv", index=False)

In [None]:
test_df.to_csv("C:/Users/Admin/NIYOJAN/Data/Synthetic/Test/prophet_data.csv", index=False)