<a href="https://colab.research.google.com/github/Ptuancuong/TH-TimeSeries.csv/blob/main/LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# --------- 1. Import thư viện cần thiết ---------
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error

# --------- 2. Bước 1: Đọc dữ liệu và tạo cột 'date' ---------
df = pd.read_csv('/content/store.csv')  # file favorita_monthly đã đổi tên thành store.csv
df['date'] = pd.to_datetime(dict(year=df['year'], month=df['month'], day=1))
df = df.sort_values('date').reset_index(drop=True)

# --------- 3. Bước 2: Chọn một chuỗi (store_nbr, family) ---------
combo_counts = (
    df.groupby(['store_nbr', 'family'])
      .size()
      .reset_index(name='count')
      .sort_values('count', ascending=False)
)
store_nbr, family = combo_counts.iloc[0][['store_nbr', 'family']]

df_series = df[(df['store_nbr'] == store_nbr) & (df['family'] == family)].copy()
df_series = df_series[['date', 'sales_monthly']].reset_index(drop=True)

# --------- 4. Bước 3: Scale sales_monthly chỉ fit trên TRAIN ---------
sales_values = df_series['sales_monthly'].values.reshape(-1, 1)  # (n_total, 1)

n_total = len(sales_values)
n_train = int(0.7 * n_total)
n_val   = int(0.15 * n_total)
n_test  = n_total - n_train - n_val

train_values = sales_values[:n_train]            # (n_train, 1)
scaler = MinMaxScaler().fit(train_values)         # chỉ fit trên train

sales_scaled = scaler.transform(sales_values).flatten()  # (n_total,)

# --------- 5. Bước 4: Tạo dataset (windows) cho LSTM ---------
B = 24  # backcast length (24 tháng vào)
H = 6   # horizon (6 tháng ra)

X_list, y_list = [], []
for i in range(n_total - B - H + 1):
    X_list.append(sales_scaled[i : i + B])           # (B,)
    y_list.append(sales_scaled[i + B : i + B + H])   # (H,)

X_all = np.stack(X_list, axis=0)  # (N, 24)
y_all = np.stack(y_list, axis=0)  # (N,  6)

# --------- 6. Bước 5: Chia train/val/test windows ---------
N = X_all.shape[0]                          # số window
n_train_window = int(0.7 * N)
n_val_window   = int(0.15 * N)
n_test_window  = N - n_train_window - n_val_window

X_train = X_all[:n_train_window]
y_train = y_all[:n_train_window]
X_val   = X_all[n_train_window : n_train_window + n_val_window]
y_val   = y_all[n_train_window : n_train_window + n_val_window]
X_test  = X_all[n_train_window + n_val_window :]
y_test  = y_all[n_train_window + n_val_window :]

# Chuyển sang PyTorch DataLoader
train_ds = TensorDataset(torch.tensor(X_train, dtype=torch.float32),
                         torch.tensor(y_train, dtype=torch.float32))
val_ds   = TensorDataset(torch.tensor(X_val, dtype=torch.float32),
                         torch.tensor(y_val, dtype=torch.float32))
test_ds  = TensorDataset(torch.tensor(X_test, dtype=torch.float32),
                         torch.tensor(y_test, dtype=torch.float32))

train_loader = DataLoader(train_ds, batch_size=8, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=8, shuffle=False)
test_loader  = DataLoader(test_ds,  batch_size=8, shuffle=False)

# --------- 7. Bước 6: Định nghĩa kiến trúc LSTM ---------
class LSTMNet(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_length):
        super().__init__()
        # input_size = 1 (mỗi bước là một giá trị scaled)
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size,
                            num_layers=num_layers,
                            batch_first=True)
        # Sau khi lấy hidden cuối, map xuống output_length
        self.fc = nn.Linear(hidden_size, output_length)

    def forward(self, x):
        # x shape = (batch, B), reshape thành (batch, B, 1)
        x = x.unsqueeze(-1)  # (batch, B, 1)
        # Khởi tạo ẩn và trạng thái tế bào ban đầu bằng zero
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size, device=x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size, device=x.device)
        # out shape = (batch, B, hidden_size)
        # h_n shape = (num_layers, batch, hidden_size)
        out, (h_n, c_n) = self.lstm(x, (h0, c0))
        # Lấy hidden state cuối của tầng cuối (tương đương h_n[-1])
        last_hidden = h_n[-1]  # (batch, hidden_size)
        y_hat = self.fc(last_hidden)  # (batch, output_length)
        return y_hat

# --------- 8. Bước 7: Khởi tạo model và di chuyển lên device ---------
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Tham số LSTM mẫu: hidden_size=64, num_layers=2
lstm_params = {
    'input_size': 1,
    'hidden_size': 64,
    'num_layers': 2,
    'output_length': H
}

model = LSTMNet(**lstm_params).to(device)

# --------- 9. Bước 8: Thiết lập optimizer, loss, early stopping ---------
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)

patience = 5
max_epochs = 50

# --------- 10. Bước 9: Vòng huấn luyện + early stopping ---------
best_val_rmse = float('inf')
best_state = None
wait = 0

for epoch in range(1, max_epochs + 1):
    model.train()
    for xb, yb in train_loader:
        xb = xb.to(device)  # (batch, B)
        yb = yb.to(device)  # (batch, H)
        optimizer.zero_grad()
        y_pred = model(xb)  # (batch, H)
        loss = criterion(y_pred, yb)
        loss.backward()
        optimizer.step()

    model.eval()
    val_preds, val_trues = [], []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb = xb.to(device)
            yb = yb.to(device)
            y_pred = model(xb)
            val_preds.append(y_pred.cpu().numpy())
            val_trues.append(yb.cpu().numpy())

    val_preds = np.concatenate(val_preds, axis=0)  # (n_val_windows, H)
    val_trues = np.concatenate(val_trues, axis=0)  # (n_val_windows, H)
    val_rmse = np.sqrt(mean_squared_error(val_trues.flatten(), val_preds.flatten()))
    print(f"Epoch {epoch}/{max_epochs} - Val RMSE: {val_rmse:.4f}")

    if val_rmse < best_val_rmse - 1e-6:
        best_val_rmse = val_rmse
        best_state = model.state_dict()
        wait = 0
    else:
        wait += 1
        if wait >= patience:
            print(f"Early stopping at epoch {epoch}")
            break

model.load_state_dict(best_state)

# --------- 11. Bước 10: Đánh giá trên tập TEST ---------
model.eval()
test_preds, test_trues = [], []
with torch.no_grad():
    for xb, yb in test_loader:
        xb = xb.to(device)
        yb = yb.to(device)
        y_pred = model(xb)
        test_preds.append(y_pred.cpu().numpy())
        test_trues.append(yb.cpu().numpy())

test_preds = np.concatenate(test_preds, axis=0)  # (n_test_windows, H)
test_trues = np.concatenate(test_trues, axis=0)  # (n_test_windows, H)

# Flatten và inverse scale về giá trị gốc
test_preds_flat = test_preds.flatten().reshape(-1, 1)
test_trues_flat = test_trues.flatten().reshape(-1, 1)
test_preds_inv = scaler.inverse_transform(test_preds_flat).flatten()
test_trues_inv = scaler.inverse_transform(test_trues_flat).flatten()

test_mape = mean_absolute_percentage_error(test_trues_inv, test_preds_inv)
test_rmse = np.sqrt(mean_squared_error(test_trues_inv, test_preds_inv))
print(f"Test MAPE: {test_mape:.4f}, Test RMSE: {test_rmse:.2f}")

# --------- 12. Bước 11: Dự báo 6 tháng tương lai ---------
# Lấy 24 giá trị cuối cùng của sales_scaled làm input
sales_scaled_full = sales_scaled
x_input = torch.tensor(sales_scaled_full[-B:], dtype=torch.float32).unsqueeze(0).to(device)  # shape (1, B)
model.eval()
with torch.no_grad():
    future_scaled = model(x_input).cpu().numpy().flatten()  # (H,)
future_preds = scaler.inverse_transform(future_scaled.reshape(-1, 1)).flatten()
print("Forecast for the next 6 months:", future_preds)


Epoch 1/50 - Val RMSE: 0.3595
Epoch 2/50 - Val RMSE: 0.3467
Epoch 3/50 - Val RMSE: 0.3337
Epoch 4/50 - Val RMSE: 0.3194
Epoch 5/50 - Val RMSE: 0.3032
Epoch 6/50 - Val RMSE: 0.2838
Epoch 7/50 - Val RMSE: 0.2591
Epoch 8/50 - Val RMSE: 0.2265
Epoch 9/50 - Val RMSE: 0.1880
Epoch 10/50 - Val RMSE: 0.1911
Epoch 11/50 - Val RMSE: 0.2634
Epoch 12/50 - Val RMSE: 0.2396
Epoch 13/50 - Val RMSE: 0.1863
Epoch 14/50 - Val RMSE: 0.1615
Epoch 15/50 - Val RMSE: 0.1581
Epoch 16/50 - Val RMSE: 0.1609
Epoch 17/50 - Val RMSE: 0.1671
Epoch 18/50 - Val RMSE: 0.1772
Epoch 19/50 - Val RMSE: 0.1867
Epoch 20/50 - Val RMSE: 0.1883
Early stopping at epoch 20
Test MAPE: 0.3415, Test RMSE: 29.89
Forecast for the next 6 months: [58.485577 54.768143 56.693386 46.893032 48.606804 51.869053]
