### Load the dataset

In [2]:
import os
from pathlib import Path

import pandas as pd


In [3]:
# Location of the processed weekly sales CSV. Set the WEEKLY_SALES_CSV
# environment variable to point at your own copy of the file; when it is not
# set, the original author's local path is used so existing runs are unchanged.
WEEKLY_SALES_CSV = Path(
    os.environ.get(
        'WEEKLY_SALES_CSV',
        r'C:\Users\wwwsu\Desktop\All folders\Logistics_demand\data\processed\weekly_aggregated_sales.csv',
    )
)

# Read the weekly item-store sales table and preview the first rows.
weekly_data = pd.read_csv(WEEKLY_SALES_CSV)
print(weekly_data.head())

       item_id store_id  wm_yr_wk  sales  sell_price  month  year
0  FOODS_1_001     CA_1     11101     10         2.0      1  2011
1  FOODS_1_001     CA_1     11101     10         2.0      2  2011
2  FOODS_1_001     CA_1     11102      6         2.0      2  2011
3  FOODS_1_001     CA_1     11103     10         2.0      2  2011
4  FOODS_1_001     CA_1     11104     13         2.0      2  2011


In [4]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() only adds a stray "None" line.
weekly_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10183660 entries, 0 to 10183659
Data columns (total 7 columns):
 #   Column      Dtype  
---  ------      -----  
 0   item_id     object 
 1   store_id    object 
 2   wm_yr_wk    int64  
 3   sales       int64  
 4   sell_price  float64
 5   month       int64  
 6   year        int64  
dtypes: float64(1), int64(4), object(2)
memory usage: 543.9+ MB
None


### Encode Categorical Variables

An LSTM cannot directly handle strings, so we convert `item_id` and `store_id` to integer indices. Later, the model will use embeddings for these.

In [9]:

# Build each categorical exactly once. Converting a string column with millions
# of rows to 'category' is expensive, and reusing the same categorical for both
# the codes and the code -> label mapping guarantees the two stay in sync.
item_categorical = weekly_data['item_id'].astype('category')
store_categorical = weekly_data['store_id'].astype('category')

# Integer codes used as model inputs (embedding indices)
weekly_data['item_id_cat'] = item_categorical.cat.codes
weekly_data['store_id_cat'] = store_categorical.cat.codes

# Save mappings (integer code -> original id) if needed for inference
item_id_mapping = dict(enumerate(item_categorical.cat.categories))
store_id_mapping = dict(enumerate(store_categorical.cat.categories))

# The temporary categoricals are no longer needed once codes and mappings exist
del item_categorical, store_categorical

print(f"Number of unique items: {weekly_data['item_id_cat'].nunique()}")
print(f"Number of unique stores: {weekly_data['store_id_cat'].nunique()}")


Number of unique items: 3049
Number of unique stores: 10


### Feature Engineering and Scaling

We will use the following features:

Numeric: sales, sell_price, month, year

Categorical (encoded): item_id_cat, store_id_cat

Scale numeric features using MinMaxScaler to normalize values between 0 and 1.

In [14]:
!pip install scikit-learn


Collecting scikit-learn
  Obtaining dependency information for scikit-learn from https://files.pythonhosted.org/packages/62/27/585859e72e117fe861c2079bcba35591a84f801e21bc1ab85bce6ce60305/scikit_learn-1.6.1-cp312-cp312-win_amd64.whl.metadata
  Using cached scikit_learn-1.6.1-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Obtaining dependency information for joblib>=1.2.0 from https://files.pythonhosted.org/packages/da/d3/13ee227a148af1c693654932b8b0b02ed64af5e1f7406d56b088b57574cd/joblib-1.5.0-py3-none-any.whl.metadata
  Downloading joblib-1.5.0-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Obtaining dependency information for threadpoolctl>=3.1.0 from https://files.pythonhosted.org/packages/32/d5/f9a850d79b0851d1d4ef6456097579a9005b31fea68726a4ae5f2d82ddd9/threadpoolctl-3.6.0-py3-none-any.whl.metadata
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.6.1


[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [15]:
from sklearn.preprocessing import MinMaxScaler

# Sort data by item, store, and time to keep order
weekly_data = weekly_data.sort_values(['item_id_cat', 'store_id_cat', 'wm_yr_wk'])

# Select numeric features to scale
numeric_features = ['sales', 'sell_price', 'month', 'year']

# NOTE(review): every scaler is fitted on the full history, so its min/max also
# reflect any weeks that are later held out for validation/testing - confirm
# this is acceptable, or fit on the training period only.
scalers = {}
for feature in numeric_features:
    scaler = MinMaxScaler()
    # fit_transform returns an (n_rows, 1) float64 array; flatten it to 1-D and
    # store it as float32 to halve the memory of each scaled column (values are
    # in [0, 1], and the frame has millions of rows).
    weekly_data[feature + '_scaled'] = (
        scaler.fit_transform(weekly_data[[feature]]).ravel().astype('float32')
    )
    scalers[feature] = scaler  # Save scaler for inverse transform later

print(weekly_data[[f + '_scaled' for f in numeric_features]].head())


   sales_scaled  sell_price_scaled  month_scaled  year_scaled
0      0.002826           0.018544      0.000000          0.0
1      0.002826           0.018544      0.090909          0.0
2      0.001695           0.018544      0.090909          0.0
3      0.002826           0.018544      0.090909          0.0
4      0.003673           0.018544      0.090909          0.0


### Create Supervised Learning Sequences (Sliding Windows)
We will create sliding windows of `n_steps` consecutive weeks of features (e.g., 10 weeks); each window is used to predict the sales of the week that follows it.

Important: Since we have multiple time series (item-store pairs), we create sequences per item-store and then combine them.

In [16]:
!pip install numpy




[notice] A new release of pip is available: 23.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [17]:
import numpy as np

# Look-back window: how many past weeks are fed to the model per prediction
n_steps = 10

def create_sequences(df, n_steps, feature_cols=None, dtype=np.float32):
    """Build sliding-window samples for every item-store time series.

    Rows are grouped by ``(item_id_cat, store_id_cat)`` and ordered by
    ``wm_yr_wk`` inside each group. Every run of ``n_steps`` consecutive rows
    becomes one input window; its target is the first feature column of the
    row immediately after the window. Groups with ``n_steps`` rows or fewer
    contribute no samples.

    The outputs are preallocated once and filled with vectorised slice copies,
    instead of appending one Python object per window and converting the lists
    afterwards (that approach raised ``MemoryError`` on the full dataset).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``item_id_cat``, ``store_id_cat``, ``wm_yr_wk`` and one
        ``<name>_scaled`` column for each name in ``feature_cols``.
    n_steps : int
        Number of past rows in each input window; must be >= 1.
    feature_cols : sequence of str, optional
        Base feature names (without the ``_scaled`` suffix). Defaults to the
        notebook-level ``numeric_features`` list. The first entry is the
        prediction target.
    dtype : numpy dtype, optional
        dtype of ``X_numeric`` and ``y``. Defaults to ``np.float32`` to halve
        memory use; pass ``np.float64`` to keep double precision.

    Returns
    -------
    X_numeric : numpy.ndarray, shape (n_samples, n_steps, n_features)
        Feature windows.
    X_item : numpy.ndarray, shape (n_samples,)
        ``item_id_cat`` code of each sample.
    X_store : numpy.ndarray, shape (n_samples,)
        ``store_id_cat`` code of each sample.
    y : numpy.ndarray, shape (n_samples,)
        First feature column at the step following each window.

    Raises
    ------
    ValueError
        If ``n_steps`` is smaller than 1.
    """
    if n_steps < 1:
        raise ValueError(f"n_steps must be >= 1, got {n_steps}")

    if feature_cols is None:
        # Fall back to the feature list defined in the scaling cell
        feature_cols = numeric_features
    scaled_cols = [f + '_scaled' for f in feature_cols]

    # Group by item-store
    grouped = df.groupby(['item_id_cat', 'store_id_cat'])

    # A group of length T yields max(T - n_steps, 0) windows; knowing the total
    # up front lets each output be allocated exactly once.
    n_windows = int((grouped.size() - n_steps).clip(lower=0).sum())

    X_numeric = np.empty((n_windows, n_steps, len(scaled_cols)), dtype=dtype)
    X_item = np.empty(n_windows, dtype=df['item_id_cat'].dtype)
    X_store = np.empty(n_windows, dtype=df['store_id_cat'].dtype)
    y = np.empty(n_windows, dtype=dtype)

    start = 0
    for (item_cat, store_cat), group in grouped:
        n_group_windows = len(group) - n_steps
        if n_group_windows <= 0:
            continue  # series too short to produce a single window

        # Stable sort: rows sharing the same wm_yr_wk keep their incoming order
        group = group.sort_values('wm_yr_wk', kind='stable')
        features = group[scaled_cols].to_numpy(dtype=dtype)

        stop = start + n_group_windows
        # Window w holds rows w .. w+n_steps-1, so position k of every window
        # is the contiguous slice features[k : k + n_group_windows].
        for k in range(n_steps):
            X_numeric[start:stop, k, :] = features[k:k + n_group_windows]

        X_item[start:stop] = item_cat
        X_store[start:stop] = store_cat
        y[start:stop] = features[n_steps:, 0]  # first feature at next time step
        start = stop

    # Every preallocated row must have been written
    assert start == n_windows, "window count mismatch while filling sequences"

    return X_numeric, X_item, X_store, y

# Build the model inputs and target from the scaled weekly data
X_numeric, X_item, X_store, y = create_sequences(df=weekly_data, n_steps=n_steps)

# Report the shape of each array as a quick sanity check
print(f"Numeric input shape: {X_numeric.shape}")
print(f"Item input shape: {X_item.shape}")
print(f"Store input shape: {X_store.shape}")
print(f"Target shape: {y.shape}")


MemoryError: 