In [None]:
import pandas as pd
import numpy as np

In [None]:
# Parse order_date as datetime on load: the imputation helpers below use the
# `.dt` accessor on this column, which raises AttributeError on the plain
# strings read_csv produces by default.
ts_df = pd.read_csv('data/processed_data.csv', parse_dates=['order_date'])

In [None]:
# Total quantity per (ean_code, order_date) pair, returned as a flat frame
# with the grouping keys kept as ordinary columns.
imputation_df = ts_df.groupby(['ean_code', 'order_date'], as_index=False)['quantity'].sum()

In [None]:
# Preview the first rows of the per-product, per-date quantity totals.
imputation_df.head()

In [None]:
def calculate_annual_growth(imputation_df, ean_codes, year_from, year_to):
    """Return the relative change in mean quantity between two years, per product.

    Parameters
    ----------
    imputation_df : DataFrame with ``ean_code``, ``order_date`` (datetime)
        and ``quantity`` columns.
    ean_codes : iterable of product codes to evaluate.
    year_from, year_to : calendar years to compare.

    Returns
    -------
    dict mapping each code to ``mean(year_to) / mean(year_from) - 1``.
    The rate is 0 whenever either yearly mean is missing (no rows for that
    year) or equal to zero.
    """
    rates_by_ean = {}
    for code in ean_codes:
        product_rows = imputation_df.loc[imputation_df['ean_code'] == code]
        row_years = product_rows['order_date'].dt.year

        base_mean = product_rows.loc[row_years == year_from, 'quantity'].mean()
        target_mean = product_rows.loc[row_years == year_to, 'quantity'].mean()

        # Both means must be real, non-zero numbers for the ratio to be used.
        usable = all(pd.notna(m) and m != 0 for m in (base_mean, target_mean))
        rates_by_ean[code] = (target_mean / base_mean) - 1 if usable else 0
    return rates_by_ean

In [None]:
def impute_last_3_months_2022(imputation_df, ean_codes):
    """Synthesize Oct–Dec 2022 rows from Oct–Dec 2021 scaled by Jan–Sep growth.

    For each product, the growth rate is the ratio of the mean Jan–Sep 2022
    quantity to the mean Jan–Sep 2021 quantity, minus 1. Every Oct–Dec 2021
    row is then shifted one year forward and its quantity multiplied by
    ``1 + growth``.

    Parameters
    ----------
    imputation_df : DataFrame with ``ean_code``, ``order_date`` (datetime)
        and ``quantity`` columns.
    ean_codes : iterable of product codes to impute.

    Returns
    -------
    DataFrame with columns ``ean_code``, ``order_date``, ``quantity``; the
    columns are present even when no rows are produced.

    Notes
    -----
    Growth falls back to 0 when either Jan–Sep mean is missing or the 2021
    mean is zero, so products with gaps get unscaled values instead of NaN.
    """
    columns = ['ean_code', 'order_date', 'quantity']
    results = []
    for ean in ean_codes:
        df_ean = imputation_df[imputation_df['ean_code'] == ean]
        years = df_ean['order_date'].dt.year
        months = df_ean['order_date'].dt.month

        mean_2021 = df_ean.loc[(years == 2021) & (months <= 9), 'quantity'].mean()
        mean_2022 = df_ean.loc[(years == 2022) & (months <= 9), 'quantity'].mean()

        # NaN is truthy, so a plain `if mean_2021` check would let a missing
        # mean through and poison every imputed quantity with NaN.
        if pd.notna(mean_2021) and pd.notna(mean_2022) and mean_2021 != 0:
            growth_2022 = (mean_2022 / mean_2021) - 1
        else:
            growth_2022 = 0

        oct_dec_2021 = df_ean[(years == 2021) & (months >= 10)]
        for order_date, quantity in zip(oct_dec_2021['order_date'], oct_dec_2021['quantity']):
            results.append({
                'ean_code': ean,
                'order_date': order_date + pd.DateOffset(years=1),
                'quantity': quantity * (1 + growth_2022),
            })
    # Explicit columns keep downstream concat/merge working on an empty result.
    return pd.DataFrame(results, columns=columns)

In [None]:
def impute_full_year(imputation_df, ean_codes, year_to_impute, prev_year_data, growth_rates):
    """Build a daily series for ``year_to_impute`` by scaling the prior year's values.

    For every product in ``ean_codes`` and every calendar day of
    ``year_to_impute``, the quantity recorded for that product exactly one
    year earlier in ``prev_year_data`` is multiplied by ``1 + growth``.
    Days without a reference row get a quantity of 0.

    Parameters
    ----------
    imputation_df : not used by this function; kept so existing calls work.
    ean_codes : iterable of product codes to impute.
    year_to_impute : calendar year to generate (1 Jan through 31 Dec).
    prev_year_data : DataFrame with ``ean_code``, ``order_date`` and
        ``quantity`` columns holding the reference year.
    growth_rates : dict mapping product code to growth rate; codes missing
        from the dict use a rate of 0.

    Returns
    -------
    DataFrame with columns ``ean_code``, ``order_date``, ``quantity`` and
    one row per product per day.
    """
    columns = ['ean_code', 'order_date', 'quantity']
    forecast_dates = pd.date_range(start=f"{year_to_impute}-01-01", end=f"{year_to_impute}-12-31")
    # The reference day for each target day is the same for every product,
    # so work it out once instead of once per product.
    date_pairs = [(d, d - pd.DateOffset(years=1)) for d in forecast_dates]

    results = []
    for ean in ean_codes:
        growth = growth_rates.get(ean, 0)
        prev_data_ean = prev_year_data[prev_year_data['ean_code'] == ean]

        # One dictionary lookup per day replaces a scan of the product's
        # frame for every day. keep='first' matches the earlier behaviour of
        # taking the first row when a date appears more than once.
        first_per_date = prev_data_ean.drop_duplicates(subset='order_date', keep='first')
        qty_by_date = dict(zip(first_per_date['order_date'], first_per_date['quantity']))

        for d, ref_date in date_pairs:
            if ref_date in qty_by_date:
                imputed_qty = qty_by_date[ref_date] * (1 + growth)
            else:
                imputed_qty = 0
            results.append({'ean_code': ean, 'order_date': d, 'quantity': imputed_qty})

    # Explicit columns keep the schema stable when ean_codes is empty.
    return pd.DataFrame(results, columns=columns)

In [None]:
# NOTE(review): top_products is not defined in any cell visible in this
# notebook; it has to exist in the kernel already — confirm where it comes from.
ean_codes = top_products
# Synthetic Oct–Dec 2022 rows for the selected products.
oct_dec_2022_imputed = impute_last_3_months_2022(imputation_df, ean_codes)

In [None]:
# Reference data for 2022: the observed 2022 rows of the selected products
# followed by the imputed Oct–Dec 2022 rows.
prev_year_2022 = pd.concat(
    [
        imputation_df.loc[
            lambda df: (df['order_date'].dt.year == 2022) & df['ean_code'].isin(ean_codes)
        ],
        oct_dec_2022_imputed,
    ],
    ignore_index=True,
)

In [None]:
# Step 1: 2021 -> 2022 growth per product, taken from the observed data.
growth_2021_2022 = calculate_annual_growth(imputation_df, ean_codes, 2021, 2022)
# Step 2: impute every day of 2023 from the 2022 reference rows.
imputed_2023 = impute_full_year(imputation_df, ean_codes, 2023, prev_year_2022, growth_2021_2022)
# Step 3: 2022 -> 2023 growth, measured on observed data plus the imputed 2023 rows.
# NOTE(review): the 2022 side of this rate comes from imputation_df only;
# oct_dec_2022_imputed is not part of combined_for_growth — confirm that is intended.
combined_for_growth = pd.concat([imputation_df, imputed_2023], ignore_index=True)
growth_2022_2023 = calculate_annual_growth(combined_for_growth, ean_codes, 2022, 2023)
# Manual override for a single product, currently disabled:
# growth_2022_2023['6902395722403'] = 0.52
# Step 4: impute every day of 2024 from the imputed 2023 rows.
imputed_2024 = impute_full_year(imputation_df, ean_codes, 2024, imputed_2023, growth_2022_2023)

In [None]:
# Print each product code with its 2021 -> 2022 growth rate.
for key, value in growth_2021_2022.items():
    print(key, value)

In [None]:
# Print each product code with its 2022 -> 2023 growth rate.
for key, value in growth_2022_2023.items():
    print(key, value)

In [None]:
# Spot-check the three imputed frames: first rows of Oct–Dec 2022 and 2023,
# last rows of 2024.
print(oct_dec_2022_imputed.head())
print(imputed_2023.head())
print(imputed_2024.tail())

In [None]:
# Despite the name, all_actuals_df holds only imputed rows: Oct–Dec 2022,
# all of 2023 and all of 2024.
all_actuals_df = pd.concat([oct_dec_2022_imputed, imputed_2023, imputed_2024], ignore_index=True)
# NOTE(review): forecast_df is not created in any cell visible in this
# notebook; this line also modifies it in place — confirm where it comes from.
forecast_df['order_date'] = pd.to_datetime(forecast_df['order_date'])
# Give the forecast column the same name as the imputed one so the merge
# suffixes can tell them apart.
forecast_df_renamed = forecast_df.rename(columns={'predicted_quantity': 'quantity'})

In [None]:
# Pair every imputed row with the forecast for the same product and date.
# The inner join keeps only keys present on both sides; both quantity columns
# are then cast to int (which truncates any fractional part) and the rows are
# ordered by product and date.
merged_df = (
    all_actuals_df
    .merge(
        forecast_df_renamed[['ean_code', 'order_date', 'quantity']],
        on=['ean_code', 'order_date'],
        how='inner',
        suffixes=('_actual', '_forecast'),
    )
    .astype({'quantity_actual': int, 'quantity_forecast': int})
    .sort_values(by=['ean_code', 'order_date'])
    .reset_index(drop=True)
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style('whitegrid')

products_to_plot = top_products  # Use your top_products list

# One figure per product: imputed ("Actual") and forecast quantities over time.
for product in products_to_plot:
    product_data = merged_df[merged_df['ean_code'] == product]
    fig, ax = plt.subplots(figsize=(12, 5))

    for column, series_label, marker in (
        ('quantity_actual', 'Actual', 'o'),
        ('quantity_forecast', 'Forecast', 'x'),
    ):
        ax.plot(product_data['order_date'], product_data[column], label=series_label, marker=marker)

    ax.set_title(f'Actual vs Forecasted Quantity for EAN: {product}')
    ax.set_xlabel('Date')
    ax.set_ylabel('Quantity')
    ax.legend()
    fig.tight_layout()
    plt.show()

In [None]:
# Show the first 50 merged rows.
merged_df.head(50)

In [None]:
def smooth_group(group):
    """Return the 5-row rolling mean of ``quantity_actual``.

    ``group`` may be a DataFrame containing a ``quantity_actual`` column or
    that column itself as a Series. ``min_periods=1`` means the first rows
    are averaged over however many values are available, so the result has
    no leading NaNs.
    """
    quantities = group['quantity_actual'] if isinstance(group, pd.DataFrame) else group
    return quantities.rolling(window=5, min_periods=1).mean()


# transform always returns a Series aligned to merged_df's index. groupby.apply
# does not guarantee that: its result shape depends on how the groups are laid
# out, and recent pandas versions warn when apply touches the grouping column.
merged_df['quantity_actual_smoothed'] = (
    merged_df.groupby('ean_code')['quantity_actual'].transform(smooth_group)
)

In [None]:
# Show the first 825 merged rows, now including the smoothed column.
merged_df.head(825)

In [None]:
# Error metrics of the forecast against the imputed "actual" quantities.
# They are computed with NumPy (imported at the top of the notebook) because
# the scikit-learn metric helpers are not imported in any visible cell, which
# makes the cell fail with NameError on a fresh kernel.
actual_values = merged_df['quantity_actual'].to_numpy(dtype=float)
forecast_values = merged_df['quantity_forecast'].to_numpy(dtype=float)
residuals = actual_values - forecast_values

mae = np.mean(np.abs(residuals))
mse = np.mean(residuals ** 2)
rmse = np.sqrt(mse)

# R² = 1 - SS_res / SS_tot. When the actuals are constant SS_tot is 0 and the
# ratio is undefined; report 1.0 for a perfect fit and 0.0 otherwise.
ss_res = np.sum(residuals ** 2)
ss_tot = np.sum((actual_values - actual_values.mean()) ** 2)
if ss_tot > 0:
    r2 = 1 - ss_res / ss_tot
else:
    r2 = 1.0 if ss_res == 0 else 0.0

print("Accuracy of combined imputed actuals vs forecast:")
print(f"MAE: {mae:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"R²: {r2:.3f}")

In [None]:
# Stack the rows of agg_df with the forecast rows (predicted_quantity renamed
# to quantity), keep the three shared columns, and order by product and date.
combined_df = (
    pd.concat(
        [
            agg_df.reset_index()[['ean_code', 'order_date', 'quantity']],
            forecast_df.rename(columns={'predicted_quantity': 'quantity'})[
                ['ean_code', 'order_date', 'quantity']
            ],
        ],
        ignore_index=True,
    )
    .sort_values(by=['ean_code', 'order_date'])
    .reset_index(drop=True)
)
combined_df.head(2011)


In [None]:
# Write the combined frame to 5year_data.csv in the working directory,
# without the index column.
combined_df.to_csv('5year_data.csv', index=False)

In [None]:
# Preview the first rows of the forecast frame.
forecast_df.head()

Unnamed: 0_level_0,index,ean_code,quantity,ean_code_encoded
order_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2022-10-01,0,8904362500005,435,4
2022-10-02,1,8904362500005,432,4
2022-10-03,2,8904362500005,458,4
2022-10-04,3,8904362500005,492,4
2022-10-05,4,8904362500005,568,4
