Imports

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn.preprocessing import StandardScaler
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [41]:
# # Import data
# data = pd.read_csv("combined3_102_sorted.csv")
# data2 = pd.read_csv("102_with_weather.csv")
# # scaler = MinMaxScaler()
# # scaled_data = scaler.fit_transform(data[['delay', 'scheduled_time', 'day', 'day_of_year']])

# weather = data2["Weather"].values
# encoder = LabelEncoder()
# unique = encoder.fit_transform(weather).reshape(-1,1)
# unique += 1

# # Data prep and seperation
# data = data[['delay', 'scheduled_time', 'day', 'day_of_year']].values
# data = np.concatenate((data, unique), axis=1)

# Obtain data
df = pd.read_csv("filename.csv")
df = df.sample(n=62700, random_state=42)
data = df[['delay','stop_id','scheduled_time','vehicle_id','day_of_year']]
embeds = df[['day','Weather']]
data = data.fillna(0) # Fill with 0s if nan

# Z score normalization
data_scaler = StandardScaler() # Own seperate scaler for the delay, so we can inverse transform the output
scaler = StandardScaler()

data['delay'] = data_scaler.fit_transform(data[['delay']])
data[['stop_id','scheduled_time','vehicle_id','day_of_year']] = scaler.fit_transform(data[['stop_id','scheduled_time','vehicle_id','day_of_year']])

data = pd.concat([data, embeds], axis=1)
print(data.head())

# Split into train and test
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

# Sort so there is some sort of order in the sequences
train_set = train_set.sort_values(by=['day_of_year','scheduled_time'])
test_set = test_set.sort_values(by=['day_of_year','scheduled_time'])






           delay   stop_id  scheduled_time  vehicle_id  day_of_year  day  \
84893  -0.865890 -1.033712        1.434993   -1.200873     0.476832    4   
114424 -0.364519  1.316584        0.061886   -0.205904     0.652537    0   
11449   0.918059  0.442568       -0.585123    0.465941    -0.122634    2   
27538  -0.396583  2.261599       -0.483204    0.523527     0.022065    2   
57351  -0.279985 -0.066663       -0.536361    0.318775     0.301126    1   

        Weather  
84893         3  
114424        4  
11449        13  
27538        13  
57351        13  


In [43]:
# Setting device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")
batch_size = 120

# Batches
train_data = torch.tensor(train_set.values, dtype=torch.float32).to(device)
test_data = torch.tensor(test_set.values, dtype=torch.float32).to(device)

train_batchs = DataLoader(dataset=train_data, batch_size=batch_size, shuffle=False,pin_memory=True)
test_batchs = DataLoader(dataset=test_data, batch_size=batch_size, shuffle=False, pin_memory=True)

# Sequence creator
def createSequences(data, seq_length):
    x, y = [], []
    for i in range(len(data) - seq_length):
        x_data = data[i:(seq_length+i)]
        y_data = data[seq_length+i][0]
        if len(x_data) < seq_length:
            for i in range (seq_length - len(x_data)):
                x_data.append(torch.zeros([1, 7]))
        x.append(x_data)
        y.append(y_data)
    return torch.stack(x, dim=0), torch.stack(y, dim=0)

Device: cpu


Normal LSTM

In [6]:
class LSTM(nn.Module):
    def __init__(self, inputdim, outputdim, layerdim, dropout):
        super(LSTM, self).__init__()
        self.layerdim = layerdim
        self.lstm1 = nn.LSTM(inputdim, 108, layerdim, batch_first=True)
        self.batchnorm = nn.BatchNorm1d(108)
        self.dropout = nn.Dropout(dropout)
        self.lstm2 = nn.LSTM(108, 56, layerdim, batch_first=True)
        self.layers = nn.Sequential(
            nn.Linear(56,32),
            nn.ReLU(),
            nn.Linear(32, outputdim)
        )
    
    def forward(self, x, h1=None, c1=None, h2=None, c2=None):
        if h1 is None or c1 is None or h2 is None or c2 is None:
            h1 = torch.zeros(self.layerdim, x.size(0), 108)
            c1 = torch.zeros(self.layerdim, x.size(0), 108)
            h2 = torch.zeros(self.layerdim, x.size(0), 56)
            c2 = torch.zeros(self.layerdim, x.size(0), 56)
        
        out,(h1, c1) = self.lstm1(x, (h1,c1))

        # Batch Normalization
        batch_size, seq_len, hidd_size = out.shape
        out = out.reshape(batch_size * seq_len, hidd_size)
        out = self.batchnorm(out)
        out = out.reshape(batch_size, seq_len, hidd_size)

        # Dropout between layers
        out = self.dropout(out)
        # Second LSTM
        out, (h2, c2) = self.lstm2(out, (h2, c2))
        # Dense layers 
        out = self.layers(out)
        out = out[:, -1, :]
        return out, h1, c1, h2, c2

BiLSTM

In [7]:
class BiLSTM(nn.Module):
    def __init__(self, inputdim, outputdim, layerdim, dropout):
        super(BiLSTM, self).__init__()
        self.layerdim = layerdim
        self.embedding = nn.Embedding(num_embeddings=25, embedding_dim=1)

        self.lstm1 = nn.LSTM(inputdim, 108, layerdim, batch_first=True, bidirectional=True)
        self.batchnorm = nn.BatchNorm1d(216)
        self.dropout = nn.Dropout(dropout)
        self.lstm2 = nn.LSTM(108 * 2, 56, layerdim, batch_first=True, bidirectional=True)
        self.layers = nn.Sequential(
            nn.Linear(56 * 2,56),
            nn.ReLU(),
            nn.Linear(56, 32),
            nn.ReLU(),
            nn.Linear(32, outputdim)
        )
    
    def forward(self, x, h1=None, c1=None, h2=None, c2=None):
        if h1 is None or c1 is None or h2 is None or c2 is None:
            h1 = torch.zeros(self.layerdim*2, x.size(0), 108)
            c1 = torch.zeros(self.layerdim*2, x.size(0), 108)
            h2 = torch.zeros(self.layerdim*2, x.size(0), 56)
            c2 = torch.zeros(self.layerdim*2, x.size(0), 56)
        
        emb = x[:, :, 4].to(torch.long)

        embed = self.embedding(emb).to(torch.float32)
        x = x[:, :, :4]
        
        x = torch.cat([embed, x], dim=2)
        out,(h1, c1) = self.lstm1(x, (h1,c1))

        # Batch Normilization
        batch_size, seq_len, hidd_size = out.shape
        out = out.reshape(batch_size * seq_len, hidd_size)
        out = self.batchnorm(out)
        out = out.reshape(batch_size, seq_len, hidd_size)

        # Drop out between layers
        out = self.dropout(out)
        # Second LSTM layer
        out, (h2, c2) = self.lstm2(out, (h2, c2))
        out = self.layers(out)
        out = out[:, -1, :56]
        return out, h1, c1, h2, c2

Attention

In [45]:
class AttentionBiLSTM(nn.Module):
    def __init__(self, inputdim, hiddendim1, hiddendim2, outputdim, numheads, layerdim, dropout):
        super(AttentionBiLSTM, self).__init__()
        self.layerdim = layerdim
        
        self.embedding1 = nn.Embedding(num_embeddings=7, embedding_dim=1)
        self.embedding2 = nn.Embedding(num_embeddings=25, embedding_dim=1)

        self.lstm1 = nn.LSTM(inputdim, hiddendim1, layerdim, batch_first=True, bidirectional=True)
        self.batchnorm = nn.BatchNorm1d(hiddendim1*2)
        self.batchnorm2 = nn.BatchNorm1d(hiddendim2*2)
        self.dropout = nn.Dropout(dropout)
        self.lstm2 = nn.LSTM(hiddendim1*2, hiddendim2, layerdim, batch_first=True, bidirectional=True)
        self.attention = nn.MultiheadAttention(embed_dim=120, num_heads=numheads, batch_first=True)
        self.layers = nn.Sequential(
            nn.Linear(120,60),
            nn.BatchNorm1d(60),
            nn.ReLU(),
            nn.Linear(60,30),
            nn.Dropout(dropout),
            nn.BatchNorm1d(30),
            nn.ReLU(),
            nn.Linear(30, outputdim)
        )
    
    def forward(self, x, h1=None, c1=None, h2=None, c2=None):
        if h1 is None or c1 is None or h2 is None or c2 is None:
            h1 = torch.zeros(self.layerdim*2, x.size(0), 120)
            c1 = torch.zeros(self.layerdim*2, x.size(0), 120)
            h2 = torch.zeros(self.layerdim*2, x.size(0), 60)
            c2 = torch.zeros(self.layerdim*2, x.size(0), 60)
        
        # Embeddings
        emb1 = x[:, :, 5].to(torch.long)
        emb2 = x[:, :, 6].to(torch.long)
        
        embed1 = self.embedding1(emb1).to(torch.float32)
        embed2 = self.embedding2(emb2).to(torch.float32)
        x = x[:, :, :5]
        
        x = torch.cat([x, embed1, embed2], dim=2)

        # First LSTM
        out,(h1, c1) = self.lstm1(x, (h1,c1))


        # Drop out between layers
        out = self.dropout(out)

        # print("First ", out.mean().item(), "std", out.std().item())
        # Batch Normilization
        batch_size, seq_len, hidd_size = out.shape
        out = out.reshape(batch_size * seq_len, hidd_size)
        out = self.batchnorm(out)
        out = out.reshape(batch_size, seq_len, hidd_size)


        # Second LSTM layer
        out, (h2, c2) = self.lstm2(out, (h2, c2))
        # print("second ", out.mean().item(), "std", out.std().item())

        # Add attention layer
        out, attn_weights = self.attention(query=out, key=out,value=out)
        
        # print("after attention ", out.mean().item(), "std", out.std().item())

        # last time step output
        out = out[:, -1, :]
        
        out = self.batchnorm2(out)
        # Final dense layers
        out = self.dropout(out)
        out = self.layers(out)
        # print("last ", out.mean().item(), "std", out.std().item())
        return out, h1, c1, h2, c2

model

In [46]:
# Model
# model = LSTM(inputdim=4, outputdim=1, layerdim=1, dropout=0.2)  # NON Bidirectional
# model = BiLSTM(inputdim=5, outputdim=1, layerdim=1, dropout=0.2)  # Bi Directional
model = AttentionBiLSTM(inputdim=7, hiddendim1=120, hiddendim2=60, outputdim=1, numheads=30, layerdim=1, dropout=0.2).to(device) # Bi Directional with Attention
# loss_fcn = nn.SmoothL1Loss()
loss_fcn = nn.MSELoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)


In [47]:
# Training
# print(len(train_batchs))
print(next(model.parameters()).device)
h1, c1, h2, c2 = None, None, None, None
epochs = 10
model.train()
for epoch in range(epochs):
    epoch_loss = 0

    for batch in train_batchs:
        # print(batch.device)
        optimizer.zero_grad() # Reset your gradient
        # Create sequences
        X_train, y_train = createSequences(batch, 30)
        y_train = y_train.reshape(-1,1)
        X_train = X_train.float()
        # print(X_train.shape, y_train.shape

        # print(X_train.device)
        # Train
        pred, h1, c1, h2, c2 = model(X_train, h1, c1, h2, c2)
        # print(pred)
        loss = loss_fcn(pred, y_train)
        
        loss.backward(retain_graph=True)  
        # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0) # gradient clipping

        epoch_loss += loss.item()

        h1 = h1.detach()
        c1 = c1.detach()
        h2 = h2.detach()
        c2 = c2.detach()
        
        optimizer.step()
    
    print(f"Epoch {epoch+1}/{epochs}, loss {epoch_loss:.5f}")


cpu
Epoch 1/10, loss 432.71337
Epoch 2/10, loss 422.13685
Epoch 3/10, loss 416.02741
Epoch 4/10, loss 407.18045
Epoch 5/10, loss 402.20538
Epoch 6/10, loss 397.55653
Epoch 7/10, loss 389.99403
Epoch 8/10, loss 381.32036
Epoch 9/10, loss 369.33726
Epoch 10/10, loss 360.47917


In [48]:
y_test_list, y_pred_list = [], []
h1, c1, h2, c2 = None, None, None, None
test_loss = 0
model.eval()
with torch.no_grad():
    for batch in test_batchs:
        X_test, y_test= createSequences(batch, 30)

        # Pad with zeros
        if len(X_test) < batch_size-30:
            # zeros = torch.zeros([batch_size-30 - len(X_test),30, 6])
            # X_test = torch.cat([X_test, zeros], dim=0)     

            # test_zeros = torch.zeros([batch_size- 30 - len(y_test)])
            # y_test = torch.cat([y_test, test_zeros], dim=0)
            break
        
        y_test = y_test.reshape(-1,1)
        X_test = X_test.float()

        # print(X_test.shape)

        y_pred, h1, c1, h2, c2 = model(X_test, h1, c1, h2, c2)
       
        test_loss += loss_fcn(y_pred, y_test).item()

        y_pred_list.append(y_pred)
        y_test_list.append(y_test)

y_pred_list = np.array(y_pred_list).flatten().reshape(-1,1)
y_test_list = np.array(y_test_list).flatten().reshape(-1,1)

y_pred_list = data_scaler.inverse_transform(y_pred_list)
y_test_list = data_scaler.inverse_transform(y_test_list)
# test_loss = inverse_Z_Score(test_loss)

data_verify = pd.DataFrame(y_test_list.tolist(), columns=["Test"])
data_predicted = pd.DataFrame(y_pred_list.tolist(),columns=['Predictions'])

final_output = pd.concat([data_verify, data_predicted], axis=1)
final_output['difference'] = final_output['Test'] - final_output['Predictions']
final_output = final_output.round(3)
final_output.to_csv('Protoype outputs.csv', index=False)
print(final_output)

total_difference = np.sum(np.abs(final_output['difference']))
pred_dev = np.std(final_output['Predictions'])
pred_mean = np.mean(final_output['Predictions'])

print(f"Loss: {test_loss:.5f}")
print(f"Average Difference: {total_difference/len(final_output['difference']):.5f}")
print(f"Standard deviation for the predictions: {pred_dev:.5f}")
print(f"Prediction mean: {pred_mean:.5f}")
# print(final_output['Predictions'])

       Test  Predictions  difference
0    -157.0       76.987    -233.987
1      15.0       65.810     -50.810
2     -18.0       64.943     -82.943
3     -59.0       40.583     -99.583
4     -60.0       49.773    -109.773
...     ...          ...         ...
9355  -30.0       57.321     -87.321
9356    0.0       58.347     -58.347
9357    0.0       58.623     -58.623
9358  -92.0       59.117    -151.117
9359  -95.0       59.649    -154.648

[9360 rows x 3 columns]
Loss: 100.10737
Average Difference: 183.66653
Standard deviation for the predictions: 54.36328
Prediction mean: 98.21492
