In [7]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout

# Load data
stores = pd.read_csv('stores.csv')
train = pd.read_csv('train.csv')
features = pd.read_csv('features.csv')

In [8]:
# Merge datasets
data = pd.merge(train, stores, how='left', on='Store')
data = pd.merge(data, features, how='left', on=['Store', 'Date'])

# Data preprocessing
data['Date'] = pd.to_datetime(data['Date'])
data = data.sort_values(by='Date')
data = data.fillna(0)  # Fill missing values with 0 for simplicity


In [9]:
# Feature engineering
# Add lag features for weekly sales
#data['Lag_1_Weekly_Sales'] = data.groupby(['Store', 'Dept'])['Weekly_Sales'].shift(1)
#data['Lag_2_Weekly_Sales'] = data.groupby(['Store', 'Dept'])['Weekly_Sales'].shift(2)


In [10]:
# Normalization
scaler = MinMaxScaler()
data[['Temperature', 'Fuel_Price',
      'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment']] = \
    scaler.fit_transform(data[[
                               'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3',
                               'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment']])



In [11]:
#scaler = MinMaxScaler()
#data[['Weekly_Sales', 'Lag_1_Weekly_Sales', 'Lag_2_Weekly_Sales', 'Temperature', 'Fuel_Price',
#      'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment']] = \
#   scaler.fit_transform(data[['Weekly_Sales', 'Lag_1_Weekly_Sales', 'Lag_2_Weekly_Sales',
#                               'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3',
#                               'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment']])


In [12]:
data=data.fillna(0)
data.head()
data['Date'] = data['Date'].dt.day

In [13]:
categorical_columns = ['IsHoliday_x', 'Type', 'IsHoliday_y']
data[categorical_columns] = data[categorical_columns].astype('category').apply(lambda x: x.cat.codes)

In [14]:
data.head()

Unnamed: 0,Store,Dept,Date,Weekly_Sales,IsHoliday_x,Type,Size,Temperature,Fuel_Price,MarkDown1,MarkDown2,MarkDown3,MarkDown4,MarkDown5,CPI,Unemployment,IsHoliday_y
0,1,1,5,24924.5,0,0,151315,0.434149,0.0501,0.0,0.002536,0.000205,0.0,0.0,0.8405,0.405118,0
277665,29,5,5,15552.08,0,1,93638,0.258513,0.158317,0.0,0.002536,0.000205,0.0,0.0,0.054008,0.592774,0
277808,29,6,5,3200.22,0,1,93638,0.258513,0.158317,0.0,0.002536,0.000205,0.0,0.0,0.054008,0.592774,0
277951,29,7,5,10820.05,0,1,93638,0.258513,0.158317,0.0,0.002536,0.000205,0.0,0.0,0.054008,0.592774,0
278094,29,8,5,20055.64,0,1,93638,0.258513,0.158317,0.0,0.002536,0.000205,0.0,0.0,0.054008,0.592774,0


In [15]:
# Sequence preparation
X = data[['Temperature', 'Fuel_Price',
           'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment']].values
y = data['Weekly_Sales'].values

X = X.reshape(X.shape[0], 1, X.shape[1])


In [16]:
y

array([24924.5 , 15552.08,  3200.22, ...,  3128.17,  5740.14,  1076.8 ])

In [17]:
# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Build LSTM model
model = Sequential()
model.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(units=50, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(units=50))
model.add(Dropout(0.2))
model.add(Dense(units=1))
model.compile(optimizer='adam', loss='mean_squared_error')


In [18]:
# Train the model
model.fit(X_train, y_train, epochs=1, batch_size=32, validation_data=(X_val, y_val))




<keras.src.callbacks.History at 0x21833c6f950>

In [19]:
# ... (previous code)

# Load the test data
test_data = pd.read_csv('test.csv')
# Make predictions on the test dataset
test_data = pd.merge(test_data, stores, how='left', on='Store')
test_data = pd.merge(test_data, features, how='left', on=['Store', 'Date'])
test_data['Date'] = pd.to_datetime(test_data['Date'])
test_data = test_data.sort_values(by='Date')
test_data = test_data.fillna(0)



test_data[['Temperature', 'Fuel_Price',
           'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment']] = \
    scaler.fit_transform(test_data[[
                                'Temperature', 'Fuel_Price', 'MarkDown1', 'MarkDown2', 'MarkDown3',
                                'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment']])





In [20]:

test_data=test_data.fillna(0)
test_data.head()
test_data['Date'] = test_data['Date'].dt.day

In [21]:
categorical_columns = ['IsHoliday_x', 'Type', 'IsHoliday_y']
test_data[categorical_columns] = test_data[categorical_columns].astype('category').apply(lambda x: x.cat.codes)

In [22]:
test_data['Temperature'].unique()

array([0.])

In [23]:
X_test = test_data[['Temperature', 'Fuel_Price',
           'MarkDown1', 'MarkDown2', 'MarkDown3', 'MarkDown4', 'MarkDown5', 'CPI', 'Unemployment']].values

X_test = X_test.reshape(X_test.shape[0], 1, X_test.shape[1])


In [24]:
predictions = model.predict(X_test)

# Reshape predictions and apply inverse_transform
predictions = predictions.reshape(-1, 1)




In [25]:
X_test

array([[[0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.]],

       ...,

       [[0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.]],

       [[0., 0., 0., ..., 0., 0., 0.]]])