In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv("retail_store_inventory.csv")

# 1. Convert Date to Datetime objects
df['Date'] = pd.to_datetime(df['Date'])

# 2. Filter for MVP: Select a single category and sort the data
# Sorting is CRITICAL for time series feature creation (lags and targets)
MVP_CATEGORY = 'Groceries'
df_mvp = df[df['Category'] == MVP_CATEGORY].sort_values(['Product ID', 'Date']).copy()

# 3. Feature & Target Engineering (Grouped by Product ID)
# We must apply these time-based calculations *per product* to avoid data leakage
# (i.e., we don't want Product P0001's lag to use P0002's data)

def create_ts_features(group):
    # --- LAG FEATURES (The Past) ---
    # Lag 7: Sales from 7 days ago
    group['Lag_Sales_D-7'] = group['Units Sold'].shift(7)
    # Lag 1: Inventory from 1 day ago (the stock the model "knows" about today)
    group['Lag_Inventory_D-1'] = group['Inventory Level'].shift(1)

    # --- TARGET VARIABLE (The Future) ---
    # Target: Total Sales for the next 7 days (This is what the model must predict)
    # The rolling sum is calculated over the next 7 periods (6 periods after the current day + the current day)
    # We use .shift(-6) to align the 7-day future sum with the current row's date
    group['Target_Sales_D+7'] = group['Units Sold'].rolling(
        window=7, closed='left'
    ).sum().shift(-6)

    # We must also create a simple time feature (Day of Week)
    group['Day_of_Week'] = group['Date'].dt.dayofweek
    return group

df_processed = df_mvp.groupby('Product ID', group_keys=False).apply(create_ts_features)

# Clean up rows that now have missing values due to lagging/shifting
# (These are the first 7 rows of each time series that cannot have a full history/future)
df_processed.dropna(subset=['Target_Sales_D+7', 'Lag_Sales_D-7', 'Lag_Inventory_D-1'], inplace=True)

# Display the resulting head to see the new features
print("--- Processed Data with Features & Target ---")
print(df_processed[['Date', 'Product ID', 'Units Sold', 'Inventory Level', 'Lag_Sales_D-7', 'Lag_Inventory_D-1', 'Target_Sales_D+7', 'Day_of_Week']].head(10).to_markdown(index=False, numalign="left", stralign="left"))

# Check final shape and data types
print("\n--- Processed Data Info ---")
print(df_processed.info())

  df_processed = df_mvp.groupby('Product ID', group_keys=False).apply(create_ts_features)


--- Processed Data with Features & Target ---
| Date                | Product ID   | Units Sold   | Inventory Level   | Lag_Sales_D-7   | Lag_Inventory_D-1   | Target_Sales_D+7   | Day_of_Week   |
|:--------------------|:-------------|:-------------|:------------------|:----------------|:--------------------|:-------------------|:--------------|
| 2022-01-05 00:00:00 | P0001        | 147          | 238               | 127             | 85                  | 941                | 2             |
| 2022-01-06 00:00:00 | P0001        | 101          | 185               | 104             | 238                 | 1163               | 3             |
| 2022-01-07 00:00:00 | P0001        | 97           | 227               | 81              | 185                 | 1018               | 4             |
| 2022-01-08 00:00:00 | P0001        | 341          | 349               | 67              | 227                 | 1088               | 5             |
| 2022-01-09 00:00:00 | P0001        | 155      

In [3]:
df_processed.to_csv('ProcessedRetailDataset.csv', index=False)

In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Load the dataset
df = pd.read_csv("retail_store_inventory.csv")

# 1. Convert Date to Datetime objects
df['Date'] = pd.to_datetime(df['Date'])

# 2. Select Multiple Categories for Scaling Up
SELECTED_CATEGORIES = ['Groceries', 'Toys', 'Clothing']
df_combined = df[df['Category'].isin(SELECTED_CATEGORIES)].sort_values(['Product ID', 'Date']).copy()

# 3. Feature & Target Engineering (Adding D-1 and D-2 Lags)
def create_ts_features_v2(group):
    # --- LAG FEATURES (The Past & Present) ---
    # Primary Lags for immediate momentum (NEW)
    group['Lag_Sales_D-1'] = group['Units Sold'].shift(1)
    group['Lag_Sales_D-2'] = group['Units Sold'].shift(2)

    # Existing Lags
    group['Lag_Sales_D-7'] = group['Units Sold'].shift(7)
    group['Lag_Inventory_D-1'] = group['Inventory Level'].shift(1)

    # --- ROLLING STATISTICAL FEATURE ---
    group['Rolling_Mean_7D'] = group['Units Sold'].shift(1).rolling(window=7).mean()

    # --- TARGET VARIABLE (The Future) ---
    # Target: Total Sales for the next 7 days (D+1 to D+7)
    group['Target_Sales_D+7'] = group['Units Sold'].rolling(
        window=7, closed='left'
    ).sum().shift(-6)

    return group

# Apply the new feature creation function
df_processed_combined = df_combined.groupby('Product ID', group_keys=False).apply(create_ts_features_v2)

# 4. Create Categorical ID Feature
le = LabelEncoder()
df_processed_combined['Category_ID'] = le.fit_transform(df_processed_combined['Category'])

# 5. Clean up rows that now have missing values due to lagging/rolling
df_processed_combined.dropna(subset=[
    'Target_Sales_D+7',
    'Lag_Sales_D-7',
    'Lag_Sales_D-1',
    'Lag_Sales_D-2',
    'Lag_Inventory_D-1',
    'Rolling_Mean_7D'
], inplace=True)

# 6. Save the processed DataFrame to a new CSV file
output_file_name = "Combined_3_Categories_Processed_Data_V2.csv"
df_processed_combined.to_csv(output_file_name, index=False)

  df_processed_combined = df_combined.groupby('Product ID', group_keys=False).apply(create_ts_features_v2)


In [5]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Load the original dataset
df = pd.read_csv("retail_store_inventory.csv")

# 1. Convert Date to Datetime objects
df['Date'] = pd.to_datetime(df['Date'])

# 2. Prepare ALL Categories and sort the data
# Sorting by Product ID and Date is crucial for correct lag calculations
df_all = df.sort_values(['Product ID', 'Date']).copy()

# 3. Feature & Target Engineering Function
def create_ts_features_v3(group):
    """Calculates lag, rolling mean, and future target sales for a single product group."""

    # --- LAG FEATURES (The Past & Present) ---
    # Lag D-1 and D-2 for immediate momentum (to minimize phase lag)
    group['Lag_Sales_D-1'] = group['Units Sold'].shift(1)
    group['Lag_Sales_D-2'] = group['Units Sold'].shift(2)

    # Weekly lag for seasonality
    group['Lag_Sales_D-7'] = group['Units Sold'].shift(7)

    # Inventory from previous day (The Present)
    group['Lag_Inventory_D-1'] = group['Inventory Level'].shift(1)

    # --- ROLLING STATISTICAL FEATURE ---
    # 7-day Rolling Mean of Sales (shifted by 1 to prevent data leakage)
    group['Rolling_Mean_7D'] = group['Units Sold'].shift(1).rolling(window=7).mean()

    # --- TARGET VARIABLE (The Future) ---
    # Target: Total Sales for the next 7 days (Label is placed on the date the prediction is made)
    # Uses 'closed=left' so the sum includes sales from the current row + next 6 rows.
    group['Target_Sales_D+7'] = group['Units Sold'].rolling(
        window=7, closed='left'
    ).sum().shift(-6)

    return group

# Apply the feature creation function across ALL product IDs
df_processed_all = df_all.groupby('Product ID', group_keys=False).apply(create_ts_features_v3)

# 4. Create Categorical ID Feature
# This is used for the Embedding layer in the final neural network
le = LabelEncoder()
df_processed_all['Category_ID'] = le.fit_transform(df_processed_all['Category'])

# 5. Clean up rows with NaN values resulting from the lags (first 7-8 rows of each time series)
df_processed_all.dropna(subset=[
    'Target_Sales_D+7',
    'Lag_Sales_D-7',
    'Lag_Sales_D-1',
    'Lag_Sales_D-2',
    'Lag_Inventory_D-1',
    'Rolling_Mean_7D'
], inplace=True)

# 6. Save the final processed DataFrame
output_file_name = "Combined_5_Categories_Processed_Data_V3.csv"
df_processed_all.to_csv(output_file_name, index=False)

print(f"The final, complete dataset has been generated and saved as '{output_file_name}'.")

  df_processed_all = df_all.groupby('Product ID', group_keys=False).apply(create_ts_features_v3)


The final, complete dataset has been generated and saved as 'Combined_5_Categories_Processed_Data_V3.csv'.
