In [2]:
# read cleaned_sales_data.csv file and display first 5 rows
import pandas as pd
df_sales = pd.read_csv("cleaned_sales_data.csv")
df_sales.head()

Unnamed: 0,system_date,order_type,food_name,qty,gross_price,discount_rate,total_discount_price,total_price
0,2025-01-01,delivery,Veggie Roll,3,1568.42,0.0,206.74,1361.68
1,2025-01-01,takeaway,Deluxe Pizza 19,4,393.44,0.2,64.81,328.63
2,2025-01-01,takeaway,Sweet Chicken 19,2,10601.82,0.0,1016.21,9585.61
3,2025-01-01,takeaway,Veggie Roll 9,1,1540.33,0.25,217.19,1235.51
4,2025-01-02,table,Spicy Chicken 19,3,2264.26,0.25,1006.74,1257.52


In [3]:
# Summaries to capture distribution patterns for synthetic data generation
df_sales['system_date'] = pd.to_datetime(df_sales['system_date'])
summary = pd.Series({
    'row_count': len(df_sales),
    'date_start': df_sales['system_date'].min().date(),
    'date_end': df_sales['system_date'].max().date(),
    'daily_orders_median': df_sales.groupby('system_date')['qty'].sum().median(),
    'avg_items_per_order': df_sales['qty'].mean(),
    'avg_discount_rate': df_sales['discount_rate'].mean(),
    'avg_gross_price': df_sales['gross_price'].mean(),
    'avg_total_price': df_sales['total_price'].mean()
}).to_frame(name='value')
summary

Unnamed: 0,value
row_count,1021
date_start,2025-01-01
date_end,2025-09-10
daily_orders_median,11.0
avg_items_per_order,2.513222
avg_discount_rate,0.120911
avg_gross_price,2843.607179
avg_total_price,1980.226768


In [4]:
# Distribution breakdowns used for backfilling historical years
order_type_share = df_sales['order_type'].value_counts(normalize=True).to_frame('share')
weekday_pattern = df_sales.groupby(df_sales['system_date'].dt.day_name())['total_price'].sum().reindex(
    ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
).to_frame('revenue').fillna(0)
monthly_volume = df_sales.groupby(df_sales['system_date'].dt.month)['qty'].sum().rename('total_qty')
top_menu = df_sales.groupby('food_name')['total_price'].sum().sort_values(ascending=False).head(10).to_frame('revenue')
order_type_share, weekday_pattern, monthly_volume, top_menu

(               share
 order_type          
 delivery    0.384917
 table       0.380020
 takeaway    0.235064,
                revenue
 system_date           
 Monday       317415.90
 Tuesday      383831.00
 Wednesday    295294.54
 Thursday     243939.70
 Friday       285450.46
 Saturday     250171.51
 Sunday       245708.42,
 system_date
 1    290
 2    239
 3    284
 4    334
 5    325
 6    383
 7    354
 8    285
 9     72
 Name: total_qty, dtype: int64,
                      revenue
 food_name                   
 Veggie Roll        103696.03
 Crispy Pasta 12     96253.36
 Spicy Sandwich 19   94707.69
 Spicy Chicken 19    93193.26
 Herb Drink 9        89231.29
 Spicy Burger        80890.76
 Deluxe Pizza        77196.57
 Herb Burger         69213.22
 Sweet Chicken 19    61321.98
 Item50              57698.35)

In [5]:
# Inspect discount consistency to inform synthetic calculations
df_sales['discount_ratio_effective'] = df_sales['total_discount_price'] / df_sales['gross_price']
discount_summary = df_sales[['discount_rate','discount_ratio_effective']].describe(percentiles=[0.25,0.5,0.75])
discount_summary

Unnamed: 0,discount_rate,discount_ratio_effective
count,1021.0,1021.0
mean,0.120911,0.282455
std,0.150899,0.17908
min,0.0,0.001582
25%,0.0,0.15128
50%,0.0,0.241947
75%,0.25,0.362366
max,0.5,1.026563


In [8]:
# Generate 2020-2024 synthetic history aligned with observed 2025 patterns
import numpy as np
rng = np.random.default_rng(20241214)
base_rows = df_sales.copy().reset_index(drop=True)
base_rows['discount_ratio_effective'] = base_rows['total_discount_price'] / base_rows['gross_price']
base_rows['discount_ratio_effective'] = base_rows['discount_ratio_effective'].replace([np.inf, -np.inf], np.nan).fillna(0.0).clip(0, 0.95)
row_weights = np.ones(len(base_rows), dtype=float)
row_weights = row_weights / row_weights.sum()
daily_counts = df_sales.groupby('system_date').size()
base_lambda = daily_counts.mean()
weekday_counts = df_sales.groupby(df_sales['system_date'].dt.dayofweek).size()
weekday_factors = (weekday_counts / weekday_counts.mean()).to_dict()
monthly_counts = df_sales.groupby(df_sales['system_date'].dt.month).size()
monthly_baseline = monthly_counts.mean()
month_factors = {}
month_price_lift = {1:0.92,2:0.92,3:0.94,4:0.97,5:0.99,6:1.01,7:1.02,8:1.02,9:1.01,10:1.03,11:1.04,12:1.07}
for m in range(1, 13):
    raw_val = monthly_counts.get(m)
    if raw_val is None or np.isnan(raw_val):
        if m == 4:
            raw_val = monthly_baseline * 1.08  # Sinhala and Tamil New Year uplift
        elif m == 12:
            raw_val = monthly_baseline * 1.18  # Festive surge
        elif m in (10, 11):
            raw_val = monthly_baseline * 0.95
        else:
            raw_val = monthly_baseline
    month_factors[m] = raw_val / monthly_baseline
discount_rate_probs = df_sales['discount_rate'].value_counts(normalize=True).sort_index()
discount_ratio_stats = (
    df_sales.groupby('discount_rate')['discount_ratio_effective']
    .agg(['mean', 'std'])
    .replace([np.inf, -np.inf], np.nan)
    .fillna({'std': 0.05})
    .to_dict('index')
)
fallback_ratio = {
    'mean': base_rows['discount_ratio_effective'].mean(),
    'std': base_rows['discount_ratio_effective'].std() or 0.1
}
year_intensity = {2024: 0.95, 2023: 0.9, 2022: 0.85, 2021: 0.78, 2020: 0.62}
price_deflators = {2024: 0.93, 2023: 0.88, 2022: 0.83, 2021: 0.77, 2020: 0.72}
order_type_profile = {
    2024: {'delivery': 0.39, 'table': 0.37, 'takeaway': 0.24},
    2023: {'delivery': 0.41, 'table': 0.35, 'takeaway': 0.24},
    2022: {'delivery': 0.43, 'table': 0.33, 'takeaway': 0.24},
    2021: {'delivery': 0.44, 'table': 0.31, 'takeaway': 0.25},
    2020: {'delivery': 0.48, 'table': 0.27, 'takeaway': 0.25}
}
def adjust_for_events(base_expected, current_date):
    adjusted = base_expected
    if current_date.year == 2020 and pd.Timestamp('2020-03-20') <= current_date <= pd.Timestamp('2020-06-15'):
        adjusted *= 0.45  # national lockdown period
    if current_date.year == 2022 and pd.Timestamp('2022-04-01') <= current_date <= pd.Timestamp('2022-08-15'):
        adjusted *= 0.75  # economic crisis dampening demand
    if current_date.month == 12 and current_date.day >= 10:
        adjusted *= 1.25  # festive lift
    if current_date.month == 4 and 10 <= current_date.day <= 17:
        adjusted *= 1.18  # Avurudu demand spike
    return adjusted
def sample_discount_ratio(rate):
    stats = discount_ratio_stats.get(rate, fallback_ratio)
    ratio = rng.normal(stats['mean'], stats['std'] / 1.8 if stats['std'] else 0.06)
    return float(np.clip(ratio, 0.01, 0.95))
synthetic_records = []
target_dates = pd.date_range('2020-01-10', '2024-12-31', freq='D')
for current_date in target_dates:
    year = current_date.year
    weekday_factor = weekday_factors.get(current_date.weekday(), 1.0)
    month_factor = month_factors.get(current_date.month, 1.0)
    expected_orders = base_lambda * year_intensity[year] * weekday_factor * month_factor
    expected_orders = adjust_for_events(expected_orders, current_date)
    lam = max(1.2, expected_orders)
    order_count = max(1, rng.poisson(lam))
    sampled_idx = rng.choice(base_rows.index, size=order_count, replace=True, p=row_weights)
    sampled = base_rows.loc[sampled_idx].copy().reset_index(drop=True)
    sampled['system_date'] = current_date
    profile = order_type_profile[year]
    sampled['order_type'] = rng.choice(list(profile.keys()), size=order_count, p=list(profile.values()))
    qty_noise = rng.normal(0, 0.45, size=order_count)
    sampled['qty'] = np.clip(np.rint(sampled['qty'] + qty_noise), 1, 8).astype(int)
    discount_rates = rng.choice(discount_rate_probs.index.to_numpy(), size=order_count, p=discount_rate_probs.values)
    sampled['discount_rate'] = np.clip(discount_rates + rng.normal(0, 0.015, size=order_count), 0, 0.5)
    ratios = np.array([sample_discount_ratio(rate) for rate in sampled['discount_rate']])
    price_factor = price_deflators[year] * month_price_lift.get(current_date.month, 1.0)
    price_noise = rng.normal(1.0, 0.05, size=order_count)
    gross_prices = sampled['gross_price'].to_numpy() * price_factor * price_noise
    sampled['gross_price'] = np.round(np.clip(gross_prices, 120, None), 2)
    sampled['total_discount_price'] = np.round(sampled['gross_price'] * ratios, 2)
    sampled['total_price'] = np.round(sampled['gross_price'] - sampled['total_discount_price'], 2)
    synthetic_records.append(sampled[['system_date','order_type','food_name','qty','gross_price','discount_rate','total_discount_price','total_price']])
historical_df = pd.concat(synthetic_records, ignore_index=True)
historical_df.sort_values('system_date', inplace=True)
historical_df.reset_index(drop=True, inplace=True)
historical_df.head()

Unnamed: 0,system_date,order_type,food_name,qty,gross_price,discount_rate,total_discount_price,total_price
0,2020-01-10,delivery,Sweet Pasta 12,5,6624.6,0.260369,3083.27,3541.33
1,2020-01-10,takeaway,Sweet Roll 9,4,2077.09,0.24833,711.3,1365.79
2,2020-01-11,delivery,Item50,1,3963.95,0.022974,1461.42,2502.53
3,2020-01-12,table,Crispy Pasta 12,1,1770.43,0.274482,17.7,1752.73
4,2020-01-12,table,Deluxe Pizza,4,1334.02,0.226195,324.49,1009.53


In [9]:
# Quick validation of synthetic history vs baseline
hist_summary = pd.Series({
    'row_count': len(historical_df),
    'date_start': historical_df['system_date'].min().date(),
    'date_end': historical_df['system_date'].max().date(),
    'avg_daily_orders': historical_df.groupby('system_date').size().mean(),
    'avg_qty': historical_df['qty'].mean(),
    'avg_gross_price': historical_df['gross_price'].mean(),
    'avg_total_price': historical_df['total_price'].mean(),
    'delivery_share': (historical_df['order_type'] == 'delivery').mean(),
    'table_share': (historical_df['order_type'] == 'table').mean(),
    'takeaway_share': (historical_df['order_type'] == 'takeaway').mean()
}).to_frame('value')
hist_summary

Unnamed: 0,value
row_count,6867
date_start,2020-01-10
date_end,2024-12-31
avg_daily_orders,3.777228
avg_qty,2.509684
avg_gross_price,2286.539211
avg_total_price,1631.693111
delivery_share,0.422892
table_share,0.328236
takeaway_share,0.248871


In [10]:
# Persist generated history
from pathlib import Path
output_path = Path('historical_sales_2020_2024.csv')
historical_df.to_csv(output_path, index=False)
output_path, historical_df.shape

(PosixPath('historical_sales_2020_2024.csv'), (6867, 8))

In [11]:
# Merge historical backfill with actual 2025 data
combined_df = pd.concat([historical_df, df_sales], ignore_index=True)
combined_df['system_date'] = pd.to_datetime(combined_df['system_date'])
combined_df.sort_values('system_date', inplace=True)
combined_df.reset_index(drop=True, inplace=True)
combined_summary = pd.Series({
    'row_count': len(combined_df),
    'date_start': combined_df['system_date'].min().date(),
    'date_end': combined_df['system_date'].max().date(),
    'avg_daily_orders': combined_df.groupby('system_date').size().mean(),
    'avg_qty': combined_df['qty'].mean(),
    'avg_gross_price': combined_df['gross_price'].mean(),
    'avg_total_price': combined_df['total_price'].mean(),
    'delivery_share': (combined_df['order_type'] == 'delivery').mean(),
    'table_share': (combined_df['order_type'] == 'table').mean(),
    'takeaway_share': (combined_df['order_type'] == 'takeaway').mean()
}).to_frame('value')
combined_summary

Unnamed: 0,value
row_count,7888
date_start,2020-01-10
date_end,2025-09-10
avg_daily_orders,3.855327
avg_qty,2.510142
avg_gross_price,2358.644484
avg_total_price,1676.806303
delivery_share,0.417977
table_share,0.334939
takeaway_share,0.247084


In [12]:
# Export merged dataset with historical coverage
combined_path = Path('sales_with_history_2020_2025.csv')
combined_df.to_csv(combined_path, index=False)
combined_path, combined_df.shape

(PosixPath('sales_with_history_2020_2025.csv'), (7888, 9))