In [4]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [7]:
df = pd.read_csv('train.csv', sep=',')
df

Unnamed: 0,id,date,country,store,product,num_sold
0,0,2010-01-01,Canada,Discount Stickers,Holographic Goose,
1,1,2010-01-01,Canada,Discount Stickers,Kaggle,973.0
2,2,2010-01-01,Canada,Discount Stickers,Kaggle Tiers,906.0
3,3,2010-01-01,Canada,Discount Stickers,Kerneler,423.0
4,4,2010-01-01,Canada,Discount Stickers,Kerneler Dark Mode,491.0
...,...,...,...,...,...,...
230125,230125,2016-12-31,Singapore,Premium Sticker Mart,Holographic Goose,466.0
230126,230126,2016-12-31,Singapore,Premium Sticker Mart,Kaggle,2907.0
230127,230127,2016-12-31,Singapore,Premium Sticker Mart,Kaggle Tiers,2299.0
230128,230128,2016-12-31,Singapore,Premium Sticker Mart,Kerneler,1242.0


In [17]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Wczytaj dane
data = pd.read_csv('train.csv')
data['date'] = pd.to_datetime(data['date'])

data = data.dropna()

# Rozbij daty na cechy
data['year'] = data['date'].dt.year
data['month'] = data['date'].dt.month
data['day'] = data['date'].dt.day
data['weekday'] = data['date'].dt.weekday

# One-hot encoding dla kategorii
encoder = OneHotEncoder()
categories = encoder.fit_transform(data[['country', 'store', 'product']]).toarray()

# Połącz cechy
features = np.hstack([
    data[['year', 'month', 'day', 'weekday']].values,
    categories,
    data['num_sold'].values.reshape(-1, 1)
])

# Tworzenie sekwencji
def create_sequences(data, sequence_length):
    sequences = []
    for i in range(len(data) - sequence_length):
        sequences.append(data[i:i + sequence_length])
    return np.array(sequences)

sequence_length = 30
X = create_sequences(features, sequence_length)
y = data['num_sold'][sequence_length:].values

print(X.shape)  # [num_samples, sequence_length, num_features]
print(y.shape)  # [num_samples]


(221229, 30, 19)
(221229,)


In [18]:
import numpy as np

def mean_absolute_percentage_error(y_true, y_pred):
    """Oblicza MAPE. Dodaje eps, by unikać dzielenia przez zero."""
    eps = 1e-10  # Mała wartość zapobiegająca dzieleniu przez zero
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / (y_true + eps))) * 100


In [19]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

# Dane wejściowe (X) i etykiety (y)
X_tensor = torch.tensor(X, dtype=torch.float32)
y_tensor = torch.tensor(y, dtype=torch.float32)

# Tworzenie datasetu i loadera
dataset = TensorDataset(X_tensor, y_tensor)
loader = DataLoader(dataset, batch_size=64, shuffle=True)

# Definicja modelu LSTM
class SalesLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(SalesLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        _, (hn, _) = self.lstm(x)  # hn to ostatni ukryty stan
        return self.fc(hn[-1])     # Przekształcenie do wymiaru wyjściowego

def evaluate_model(model, loader):
    model.eval()  # Ustawienie modelu w tryb ewaluacji
    true_values = []
    predictions = []
    
    with torch.no_grad():  # Wyłączenie gradientów dla szybszej ewaluacji
        for X_batch, y_batch in loader:
            y_pred = model(X_batch).squeeze()
            true_values.extend(y_batch.numpy())
            predictions.extend(y_pred.numpy())
    
    # Obliczenie MAPE
    mape = mean_absolute_percentage_error(true_values, predictions)
    return mape

# Model
input_size = 19  # Liczba cech
hidden_size = 50  # Liczba neuronów w warstwie ukrytej
num_layers = 2    # Liczba warstw LSTM
output_size = 1   # Jedna wartość przewidywana (num_sold)
model = SalesLSTM(input_size, hidden_size, num_layers, output_size)

# Loss i optymalizator
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Trenowanie modelu
for epoch in range(10):  # Liczba epok
    model.train()  # Ustawienie modelu w tryb treningowy
    for X_batch, y_batch in loader:
        optimizer.zero_grad()
        predictions = model(X_batch).squeeze()
        loss = criterion(predictions, y_batch)
        loss.backward()
        optimizer.step()

    # Ocena modelu na zbiorze treningowym (lub walidacyjnym)
    train_mape = evaluate_model(model, loader)
    print(f'Epoch {epoch+1}, Loss: {loss.item()}, Train MAPE: {train_mape:.2f}%')

Epoch 1, Loss: 1347450.625, Train MAPE: 236.03%
Epoch 2, Loss: 421107.03125, Train MAPE: 100.21%
Epoch 3, Loss: 365866.59375, Train MAPE: 110.76%
Epoch 4, Loss: 824197.5, Train MAPE: 140.93%
Epoch 5, Loss: 800457.25, Train MAPE: 139.04%
Epoch 6, Loss: 220519.984375, Train MAPE: 151.12%
Epoch 7, Loss: 290839.96875, Train MAPE: 154.33%


KeyboardInterrupt: 

array([ 681.,  627.,  340., ..., 2299., 1242., 1622.])

In [25]:
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset

scaler_X = MinMaxScaler()
X_scaled = scaler_X.fit_transform(X.reshape(-1, X.shape[-1])).reshape(X.shape)

y_scaled = y
# 2. Tworzenie tensora PyTorch
X_tensor = torch.tensor(X_scaled, dtype=torch.float32)
y_tensor = torch.tensor(y_scaled, dtype=torch.float32)

# 3. Tworzenie datasetu i loadera
dataset = TensorDataset(X_tensor, y_tensor)
loader = DataLoader(dataset, batch_size=64, shuffle=True)

# 4. Definicja modelu LSTM
class SalesLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(SalesLSTM, self).__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        _, (hn, _) = self.lstm(x)  # hn to ostatni ukryty stan
        return self.fc(hn[-1])     # Przekształcenie do wymiaru wyjściowego

# 5. Inicjalizacja modelu
input_size = 19  # Liczba cech
hidden_size = 50  # Liczba neuronów w warstwie ukrytej
num_layers = 2    # Liczba warstw LSTM
output_size = 1   # Jedna wartość przewidywana (num_sold)
model = SalesLSTM(input_size, hidden_size, num_layers, output_size)

# 6. Loss i optymalizator
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# 7. Funkcja do obliczania MAPE
def mean_absolute_percentage_error(y_true, y_pred):
    eps = 1e-10  # Mała wartość zapobiegająca dzieleniu przez zero
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / (y_true + eps))) * 100

# 8. Funkcja ewaluacji modelu i obliczania MAPE
def evaluate_model(model, loader):
    model.eval()  # Ustawienie modelu w tryb ewaluacji
    true_values = []
    predictions = []
    
    with torch.no_grad():  # Wyłączenie gradientów dla szybszej ewaluacji
        for X_batch, y_batch in loader:
            y_pred = model(X_batch).squeeze()
            true_values.extend(y_batch.numpy())
            predictions.extend(y_pred.numpy())
    
    # Obliczenie MAPE
    mape = mean_absolute_percentage_error(true_values, predictions)
    return mape

# 9. Trenowanie modelu
num_epochs = 10
for epoch in range(num_epochs):  # Liczba epok
    model.train()  # Ustawienie modelu w tryb treningowy
    for X_batch, y_batch in loader:
        optimizer.zero_grad()
        predictions = model(X_batch).squeeze()
        loss = criterion(predictions, y_batch)
        loss.backward()
        optimizer.step()

    # Ocena modelu na zbiorze treningowym (lub walidacyjnym)
    train_mape = evaluate_model(model, loader)
    print(f'Epoch {epoch+1}, Loss: {loss.item()}, Train MAPE: {train_mape:.2f}%')

Epoch 1, Loss: 1074921.75, Train MAPE: 235.48%
Epoch 2, Loss: 550978.25, Train MAPE: 408.76%
Epoch 3, Loss: 777644.875, Train MAPE: 583.14%
Epoch 4, Loss: 309161.65625, Train MAPE: 744.21%
Epoch 5, Loss: 413642.875, Train MAPE: 858.66%
Epoch 6, Loss: 433414.8125, Train MAPE: 899.96%
Epoch 7, Loss: 368106.90625, Train MAPE: 906.34%
Epoch 8, Loss: 891496.375, Train MAPE: 906.73%
Epoch 9, Loss: 487629.875, Train MAPE: 908.21%
Epoch 10, Loss: 513807.03125, Train MAPE: 907.10%


In [24]:
y_scaled.shape()

TypeError: 'tuple' object is not callable