In [3]:
import pandas as pd
import numpy as np

In [9]:
# Load the preprocessed sales/purchase-price table and convert the
# SalesDate column to datetimes so the `.dt` accessor can be used on it.
df = pd.read_csv('../data/preprocessed/sales_and_purchase_prices.csv').assign(
    SalesDate=lambda frame: pd.to_datetime(frame['SalesDate'])
)

In [11]:
# Keep only the rows whose sale date falls in January or February
# (of any year present in the data).
sale_month = df['SalesDate'].dt.month
jan_feb = df[sale_month.isin([1, 2])]

In [13]:
# One row per distinct (Brand, Description) pair seen in January/February.
unique_products = jan_feb[['Brand', 'Description']].drop_duplicates()

# Randomly keep half of the products. The fixed random_state makes the
# selection identical on every run, so downstream outputs are reproducible
# after Restart Kernel -> Run All.
sampled_products = unique_products.sample(frac=0.5, random_state=42)

In [18]:
# Restrict the January/February rows to the sampled products (inner join on
# the product key), then average SalesQuantity per product.
product_keys = ['Brand', 'Description']
filtered = pd.merge(jan_feb, sampled_products, on=product_keys)
mean_sales = (
    filtered
    .groupby(product_keys)['SalesQuantity']
    .mean()
    .reset_index()
)


In [20]:
# Daily calendar (default frequency is one day) covering the two-year
# window for which synthetic sales are generated, both endpoints included.
date_range = pd.date_range(start='2016-03-01', end='2018-02-28')

In [22]:
# Generate synthetic daily sales for every product in `mean_sales` over
# `date_range`. For each (product, date) the expected quantity is the
# product's mean SalesQuantity scaled by:
#   * 0.65 on Saturdays/Sundays (weekday >= 5), and
#   * a random month-dependent factor drawn uniformly from
#       Jun-Aug: [1.3, 1.6)   Nov-Dec: [1.4, 1.8)
#       Jan-Feb: [0.8, 1.0)   other months: [0.9, 1.1)
# The actual quantity is a Poisson draw around that expectation; days with
# a zero draw produce no record.

# Seeded generator so the synthetic data set is identical on every run.
rng = np.random.default_rng(42)

# Everything below depends only on the calendar, so it is computed once
# rather than once per product.
day_labels = date_range.strftime('%Y-%m-%d')
weekend_factor = np.where(date_range.weekday >= 5, 0.65, 1.0)
months = date_range.month
month_groups = [
    months.isin([6, 7, 8]),
    months.isin([11, 12]),
    months.isin([1, 2]),
]
factor_low = np.select(month_groups, [1.3, 1.4, 0.8], default=0.9)
factor_high = np.select(month_groups, [1.6, 1.8, 1.0], default=1.1)

synthetic_data = []

for product in mean_sales.itertuples(index=False):
    # One independent seasonal factor and one Poisson draw per date.
    seasonal = weekend_factor * rng.uniform(factor_low, factor_high)
    quantities = rng.poisson(product.SalesQuantity * seasonal)

    # Poisson draws are never negative, so only zeros need to be dropped.
    sold = quantities > 0
    for day, qty in zip(day_labels[sold], quantities[sold]):
        synthetic_data.append({
            'Brand': product.Brand,
            'Description': product.Description,
            'SalesDate': day,
            'SalesQuantity': int(qty)
        })

In [24]:
# Collect the generated records into a DataFrame. The columns are listed
# explicitly so the frame (and the CSV written from it) keeps the expected
# schema and column order even when no records were generated.
df_synthetic = pd.DataFrame(
    synthetic_data,
    columns=['Brand', 'Description', 'SalesDate', 'SalesQuantity'],
)


In [None]:
# Write the synthetic sales to a CSV in the current working directory,
# leaving out the DataFrame index column.
output_path = 'synthetic_sales_2_years.csv'
df_synthetic.to_csv(output_path, index=False)