In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('/content/train.csv')
df['Date'] = pd.to_datetime(df['Order Date'], dayfirst=True)
df = df.sort_values('Date').reset_index(drop=True)

daily_sales = df.groupby('Date')['Sales'].sum().reset_index()
daily_sales.set_index('Date', inplace=True)
daily_sales = daily_sales.asfreq('D')
daily_sales['Sales'] = daily_sales['Sales'].fillna(daily_sales['Sales'].median())
daily_sales['Sales'] = np.log1p(daily_sales['Sales'])  # Log transform
daily_sales.reset_index(inplace=True)

# Feature Engineering
daily_sales['dayofweek'] = daily_sales['Date'].dt.dayofweek
daily_sales['day']       = daily_sales['Date'].dt.day
daily_sales['month']     = daily_sales['Date'].dt.month
daily_sales['quarter']   = daily_sales['Date'].dt.quarter
daily_sales['year']      = daily_sales['Date'].dt.year
daily_sales['is_weekend'] = daily_sales['dayofweek'].isin([5, 6]).astype(int)


# Data Augmentation

augmented_data = daily_sales.copy()

noise = np.random.normal(loc=0.0, scale=0.05, size=len(daily_sales))
aug_1 = daily_sales.copy()
aug_1['Sales'] = aug_1['Sales'] + noise

aug_2 = daily_sales.copy()
aug_2['Sales'] = aug_2['Sales'].rolling(window=3, min_periods=1).mean()

aug_3 = daily_sales.copy()
aug_3['Sales'] = aug_3['Sales'].shift(1).fillna(method='bfill')

n_original = len(daily_sales)
n_aug = int(0.3 * n_original)

augmented = pd.concat([
    aug_1.sample(n=n_aug // 3, random_state=42),
    aug_2.sample(n=n_aug // 3, random_state=42),
    aug_3.sample(n=n_aug - 2 * (n_aug // 3), random_state=42)
])


daily_sales_augmented = pd.concat([daily_sales, augmented]).sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Data augmentation complete. New size: {len(daily_sales_augmented)} rows ({len(daily_sales)} original + {len(daily_sales_augmented) - len(daily_sales)} augmented)")


Data augmentation complete. New size: 1895 rows (1458 original + 437 augmented)


  aug_3['Sales'] = aug_3['Sales'].shift(1).fillna(method='bfill')


In [2]:
import pandas as pd
import numpy as np
import os

df = pd.read_csv('/content/train.csv')
df['Date'] = pd.to_datetime(df['Order Date'], dayfirst=True)
df = df.sort_values('Date').reset_index(drop=True)

daily_sales = df.groupby('Date')['Sales'].sum().reset_index()
daily_sales.set_index('Date', inplace=True)
daily_sales = daily_sales.asfreq('D')
daily_sales['Sales'] = daily_sales['Sales'].fillna(daily_sales['Sales'].median())
daily_sales['Sales'] = np.log1p(daily_sales['Sales'])  # log transform
daily_sales.reset_index(inplace=True)

daily_sales['dayofweek']  = daily_sales['Date'].dt.dayofweek
daily_sales['day']        = daily_sales['Date'].dt.day
daily_sales['month']      = daily_sales['Date'].dt.month
daily_sales['quarter']    = daily_sales['Date'].dt.quarter
daily_sales['year']       = daily_sales['Date'].dt.year
daily_sales['is_weekend'] = daily_sales['dayofweek'].isin([5,6]).astype(int)


aug_noise = daily_sales.copy()
aug_noise['Sales'] += np.random.normal(0, 0.05, size=len(aug_noise))
aug_noise['label'] = 'gaussian_noise'


aug_roll = daily_sales.copy()
aug_roll['Sales'] = aug_roll['Sales'].rolling(window=3, min_periods=1).mean()
aug_roll['label'] = 'rolling_mean'

aug_shift = daily_sales.copy()
aug_shift['Sales'] = aug_shift['Sales'].shift(1).fillna(method='bfill')
aug_shift['label'] = 'time_shift'


augmented_all = pd.concat([aug_noise, aug_roll, aug_shift])
augmented_all = augmented_all.sample(frac=1, random_state=42).reset_index(drop=True)


target_size = int(0.3 * len(daily_sales))
augmented_final = augmented_all.iloc[:target_size]


os.makedirs("Augmented Data", exist_ok=True)


augmented_final.to_csv("Augmented Data/augmented_data.csv", index=False)


augmented_final[['Date', 'label']].to_csv("Augmented Data/augmented_labels.csv", index=False)

print(" Augmented data saved to 'Augmented Data/' folder.")


 Augmented data saved to 'Augmented Data/' folder.


  aug_shift['Sales'] = aug_shift['Sales'].shift(1).fillna(method='bfill')


In [3]:
from google.colab import files
files.download("Augmented Data/augmented_data.csv")
files.download("Augmented Data/augmented_labels.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>