In [1]:
import numpy as np
import pandas as pd
import time
import datetime as datetime
import matplotlib.pyplot as plt
import math
import pickle
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as Ft
import random
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from joblib import dump, load

In [2]:
data = pd.read_csv('drive/MyDrive/data_processed_plant_item_10_03_22.csv')
print(data.shape)

(365785, 11)


In [3]:
print(data.shape[0] / 35)

10451.0


In [4]:
print(data.head())

   Unnamed: 0       ParentItemID PlantID  Volume        Date  time_idx  month  \
0        6636  AI19565015TWICA22    5M01       4  2019-02-01         0      2   
1        6637  AI19565015TWICA22    5M01       0  2019-03-01         1      3   
2        6638  AI19565015TWICA22    5M01       0  2019-04-01         2      4   
3        6639  AI19565015TWICA22    5M01       0  2019-05-01         3      5   
4        6640  AI19565015TWICA22    5M01      52  2019-06-01         4      6   

   log_volume  avg_volume_by_material  avg_volume_by_plant  timeseries  
0    1.386294                2.222222           165.928938          80  
1  -18.420681                0.000000           248.715325          80  
2  -18.420681                0.000000           146.664384          80  
3  -18.420681                0.444444           151.811216          80  
4    3.951244               15.555556           193.533818          80  


In [43]:
def data_scaler(data, val_period):
  data = data.drop(['month', 'time_idx'], axis=1)
  train = data.iloc[0:data.shape[0] - val_period]
  test = data.iloc[data.shape[0] - val_period::]
  scaler = StandardScaler()
  train_scaled = scaler.fit_transform(train)
  test_scaled = scaler.transform(test)
  timeseries_scaled = np.append(train_scaled, test_scaled, axis=0)
  return scaler, timeseries_scaled


def reshaper(data, cols_to_keep, val_period):
  timeseries_list = list(data['timeseries'].unique())
  print('Number of timeseries:', len(timeseries_list))
  time_index_len = len(data['time_idx'].unique())
  LSTM_data = np.zeros((len(timeseries_list), time_index_len, len(cols_to_keep) - 3))
  scaler_dict = {}
  for i in range(len(timeseries_list)):
    series_num = timeseries_list[i]
    timeseries_data = data[data['timeseries'] == series_num]
    timeseries_data = timeseries_data.drop('timeseries', axis=1)
    scaler, timeseries_scaled = data_scaler(timeseries_data, val_period)
    scaler_dict[i] = scaler
    LSTM_data[i, :, :] = timeseries_scaled
  return scaler_dict, LSTM_data


def data_preprocess(data, cols_to_keep, val_period):
  data = data[cols_to_keep]
  scaler_dict, LSTM_data = reshaper(data, cols_to_keep, val_period)
  print('Shape of LSTM data: Num sequences', LSTM_data.shape[0], 'Len sequence:', LSTM_data.shape[1], 'Num vars:', LSTM_data.shape[2])
  return LSTM_data, scaler_dict

In [44]:
COLS_TO_KEEP = ['timeseries', 'time_idx', 'month', 'avg_volume_by_material', 'avg_volume_by_plant', 'Volume']
VAL_PERIOD = 6
max_time_idx = np.max(data['time_idx'].unique())
train_data = data[data['time_idx'] <= max_time_idx - VAL_PERIOD]
val_data = data[data['time_idx'] >= VAL_PERIOD]
LSTM_train, _ = data_preprocess(train_data, COLS_TO_KEEP, VAL_PERIOD)
LSTM_val, val_scaler_dict = data_preprocess(val_data, COLS_TO_KEEP, VAL_PERIOD)

Number of timeseries: 10451
Shape of LSTM data: Num sequences 10451 Len sequence: 29 Num vars: 3
Number of timeseries: 10451
Shape of LSTM data: Num sequences 10451 Len sequence: 29 Num vars: 3


In [54]:
# Build LSTM
torch.manual_seed(42)
class LSTM(nn.Module):
    def __init__(self):
        super(LSTM, self).__init__()
        self.lstm1 = nn.LSTM(input_size=3, hidden_size=10, num_layers=1,
                             batch_first=True)
        #self.fc1 = nn.Linear(15, 10)
        #self.tanh = nn.Tanh()
        self.fc1 = nn.Linear(10, 6)
        self.initialise_weights()


    def forward(self, x):
        output, (_, _) = self.lstm1(x)
        x = output[:, -1, :]
        x = self.fc1(x)
        #x = self.tanh(x)
        #x = self.fc2(x)
        return x


    def initialise_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                torch.nn.init.uniform_(m.weight, -0.1, 0.1)

In [55]:
# Train LSTM
LSTM =LSTM()
running_loss = 0
count = 0
max_epochs = 100
batchSize = 32
optimiser = torch.optim.Adam(LSTM.parameters(), lr=0.005)
criterion = nn.MSELoss()
optimiser.zero_grad()
trainloader = torch.utils.data.DataLoader(LSTM_train, batch_size=batchSize,
                                          shuffle=True)

print('')
print('Training LSTM...')
print('')

for epoch in range(1, max_epochs):
    for batch in trainloader:
        optimiser.zero_grad()
        y = batch[:, -6::, -1]
        x = batch[:, 0:-6, :]
        y_pred = LSTM(x.float())
        y_pred = np.squeeze(y_pred)
        loss = criterion(y_pred, y.float())
        loss.backward()
        optimiser.step()
        loss = loss.item()
        running_loss += loss
        count += 1
    average_loss = (running_loss / batchSize) / count
    print('Average loss at epoch', epoch, ':', round(average_loss, 5))
    running_loss = 0
    count = 0


Training LSTM...

Average loss at epoch 1 : 494.73167
Average loss at epoch 2 : 492.60898
Average loss at epoch 3 : 492.92896
Average loss at epoch 4 : 490.63687
Average loss at epoch 5 : 490.07539
Average loss at epoch 6 : 489.75852
Average loss at epoch 7 : 489.39249
Average loss at epoch 8 : 492.02473
Average loss at epoch 9 : 490.19761
Average loss at epoch 10 : 489.92491
Average loss at epoch 11 : 489.26649
Average loss at epoch 12 : 488.48499
Average loss at epoch 13 : 486.7539
Average loss at epoch 14 : 485.21904
Average loss at epoch 15 : 484.86884
Average loss at epoch 16 : 485.64772
Average loss at epoch 17 : 484.77164
Average loss at epoch 18 : 483.85816
Average loss at epoch 19 : 483.16324
Average loss at epoch 20 : 482.18953
Average loss at epoch 21 : 481.12419
Average loss at epoch 22 : 480.77497
Average loss at epoch 23 : 479.86689
Average loss at epoch 24 : 478.90189
Average loss at epoch 25 : 478.24922
Average loss at epoch 26 : 483.96911
Average loss at epoch 27 : 48

In [56]:
# Get test predictions
print('')
print('Testing LSTM...')
print('')

testloader = torch.utils.data.DataLoader(LSTM_val, batch_size=np.shape(LSTM_val)[0],
                                         shuffle=False)

with torch.no_grad():
    for batch in testloader:
        optimiser.zero_grad()
        y = batch[:, -6::, -1]
        x = batch[:, 0:-6, :]
        y_pred = LSTM(x.float())
        y_pred = np.squeeze(y_pred)
        loss = criterion(y_pred, y.float())


Testing LSTM...



In [66]:
def get_true_preds(y_preds, scaler_dict):
  true_y_preds = np.zeros((y_preds.shape[0], y_preds.shape[1]))
  for i in range(y_preds.shape[0]):
    dummy_array = np.zeros((6, 3))
    pred = y_preds[i, :]
    dummy_array[:, -1] = pred
    scaler = scaler_dict[i]
    true_pred = scaler.inverse_transform(dummy_array)[:, -1]
    true_pred[true_pred < 0] = 0
    true_y_preds[i, :] = true_pred
  return true_y_preds


true_y_preds = get_true_preds(y_pred, val_scaler_dict)

In [67]:
print(true_y_preds[0:5])

[[141.04963653   0.           0.           0.           0.
    0.        ]
 [  0.           0.           0.           0.           0.
    0.        ]
 [  0.           0.           0.           0.           0.
    0.        ]
 [  0.           0.           0.           0.           0.
    0.        ]
 [  0.           0.           0.           0.           0.
    0.        ]]


In [68]:
np.save('drive/MyDrive/LSTM_predictions', true_y_preds)