In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import TimeseriesGenerator

In [None]:
train = pd.read_csv("/content/train.csv", parse_dates=["date"])
train

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.000,0
1,1,2013-01-01,1,BABY CARE,0.000,0
2,2,2013-01-01,1,BEAUTY,0.000,0
3,3,2013-01-01,1,BEVERAGES,0.000,0
4,4,2013-01-01,1,BOOKS,0.000,0
...,...,...,...,...,...,...
3000883,3000883,2017-08-15,9,POULTRY,438.133,0
3000884,3000884,2017-08-15,9,PREPARED FOODS,154.553,1
3000885,3000885,2017-08-15,9,PRODUCE,2419.729,148
3000886,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8


In [25]:
test = pd.read_csv("/content/test.csv", parse_dates=["date"])
test

Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2
3,3000891,2017-08-16,1,BEVERAGES,20
4,3000892,2017-08-16,1,BOOKS,0
...,...,...,...,...,...
28507,3029395,2017-08-31,9,POULTRY,1
28508,3029396,2017-08-31,9,PREPARED FOODS,0
28509,3029397,2017-08-31,9,PRODUCE,1
28510,3029398,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,9


In [26]:
stores = pd.read_csv("/content/stores.csv")
stores

Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4
5,6,Quito,Pichincha,D,13
6,7,Quito,Pichincha,D,8
7,8,Quito,Pichincha,D,8
8,9,Quito,Pichincha,B,6
9,10,Quito,Pichincha,C,15


In [27]:
oil = pd.read_csv("/content/oil.csv", parse_dates=["date"])
oil

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.20
...,...,...
1213,2017-08-25,47.65
1214,2017-08-28,46.40
1215,2017-08-29,46.46
1216,2017-08-30,45.96


In [28]:
holidays = pd.read_csv("/content/holidays_events.csv", parse_dates=["date"])
holidays

Unnamed: 0,date,type,locale,locale_name,description,transferred
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False
...,...,...,...,...,...,...
345,2017-12-22,Additional,National,Ecuador,Navidad-3,False
346,2017-12-23,Additional,National,Ecuador,Navidad-2,False
347,2017-12-24,Additional,National,Ecuador,Navidad-1,False
348,2017-12-25,Holiday,National,Ecuador,Navidad,False


In [29]:
transactions = pd.read_csv("/content/transactions.csv", parse_dates=["date"])
transactions

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922
...,...,...,...
83483,2017-08-15,50,2804
83484,2017-08-15,51,1573
83485,2017-08-15,52,2255
83486,2017-08-15,53,932



**Convert date columns to datetime format before merging**

In [31]:
for df in [train, test, oil, holidays, transactions]:
    df["date"] = pd.to_datetime(df["date"])

**Fill missing oil prices**

In [32]:
oil["dcoilwtico"] = oil["dcoilwtico"].ffill().bfill()

**Merge datasets**

In [33]:
train = train.merge(stores, on="store_nbr", how="left")
train = train.merge(oil, on="date", how="left")

if "transactions" in transactions.columns:
    train = train.merge(transactions, on=["date", "store_nbr"], how="left")
else:
    print("Column 'transactions' not found. Skipping merge.")

train = train.merge(holidays, on="date", how="left")



**Fill NaNs**

In [34]:
train.fillna(0, inplace=True)


**Feature Engineering**

In [35]:
train["day"] = train["date"].dt.day
train["month"] = train["date"].dt.month
train["year"] = train["date"].dt.year
train["day_of_week"] = train["date"].dt.dayofweek


**Encode categorical features**

In [36]:
encoder = LabelEncoder()
train["family"] = encoder.fit_transform(train["family"])
train["type_x"] = encoder.fit_transform(train["type_x"])
train["type_y"] = train["type_y"].astype("str")
train["type_y"] = encoder.fit_transform(train["type_y"])

**Normalize data**

In [37]:
scaler_features = MinMaxScaler()
scaler_target = MinMaxScaler()

train[features] = scaler_features.fit_transform(train[features])
train[target] = scaler_target.fit_transform(train[target])


**Ensure time-series order**

In [38]:

train = train.sort_values(by=["store_nbr", "family", "date"])

**Prepare data for LSTM**

In [None]:

sequence_length = 10
X = train[features].values
y = train[target].values

generator = TimeseriesGenerator(X, y, length=sequence_length, batch_size=64)


**Build LSTM Model**

In [None]:

model = Sequential([
    LSTM(50, activation="relu", return_sequences=True, input_shape=(sequence_length, len(features))),
    Dropout(0.2),
    LSTM(50, activation="relu"),
    Dense(25, activation="relu"),
    Dense(1)
])



  super().__init__(**kwargs)


**Compile model with lower learning rate**

In [None]:
model.compile(optimizer=Adam(learning_rate=0.01), loss="mse")

**Train the model**

In [None]:
model.fit(generator, epochs=10, verbose=1)


Epoch 1/10


  self._warn_if_super_not_called()


[1m47725/47725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m183s[0m 4ms/step - loss: 1.2271e-04
Epoch 2/10
[1m47725/47725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m181s[0m 4ms/step - loss: 1.1033e-04
Epoch 3/10
[1m47725/47725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 4ms/step - loss: 1.1414e-04
Epoch 4/10
[1m47725/47725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 4ms/step - loss: 1.1093e-04
Epoch 5/10
[1m47725/47725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 4ms/step - loss: 1.1865e-04
Epoch 6/10
[1m47725/47725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 4ms/step - loss: 1.0907e-04
Epoch 7/10
[1m47725/47725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 4ms/step - loss: 1.1605e-04
Epoch 8/10
[1m47725/47725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 4ms/step - loss: 1.1607e-04
Epoch 9/10
[1m47725/47725[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 4ms/step - loss: 1.1329e-04
Epoch 10/10


<keras.src.callbacks.history.History at 0x79562f5be2d0>

**Save the model**

In [None]:
model.save("sales_prediction_lstm.h5")

print("Model training complete!")



Model training complete!
