In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import optuna
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset

'''# Load Dataset
file_path = "filename.csv"
df = pd.read_csv(file_path)

df = df.sample(n=20000, random_state=42)

# Features & Target
features = ["vehicle_id", "stop_id", "scheduled_time", "actual_time", "day", "day_of_year", "Weather"]
target = "delay"

X = df[features].fillna(0).values  # Fill missing values if any
y = df[target].values

# Normalize Features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train-Test Split (80%-20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to PyTorch Tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

# Create DataLoaders
batch_size = 64
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Define Bi-LSTM Model with Dropout and Batch Normalization
class BiLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, dropout_rate):
        super(BiLSTM, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.bn = nn.BatchNorm1d(2 * hidden_dim)
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(2 * hidden_dim, 1)  # Output 1 value (delay prediction)

    def forward(self, x):
        x = x.unsqueeze(1)  # Reshape for LSTM [batch, seq_len=1, features]
        lstm_out, _ = self.lstm(x)
        lstm_out = lstm_out[:, -1, :]  # Take last time step output
        lstm_out = self.bn(lstm_out)
        lstm_out = self.dropout(lstm_out)
        out = self.fc(lstm_out)
        return out

# Define Objective Function for Bayesian Optimization
def objective(trial):
    hidden_dim = trial.suggest_int("hidden_dim", 16, 128)
    dropout_rate = trial.suggest_float("dropout", 0.1, 0.5)
    learning_rate = trial.suggest_float("lr", 1e-4, 1e-2, log=True)

    model = BiLSTM(input_dim=X_train.shape[1], hidden_dim=hidden_dim, dropout_rate=dropout_rate)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Train Model
    model.train()
    for epoch in range(10):  # Train for 10 epochs
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluate on Test Set
    model.eval()
    with torch.no_grad():
        test_preds = model(X_test_tensor)
        test_loss = criterion(test_preds, y_test_tensor).item()
    
    return test_loss  # Minimize MSE Loss

# Run Bayesian Optimization
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=20)  # Run for 20 trials

# Get Best Hyperparameters
best_params = study.best_params
print("Best Hyperparameters:", best_params)

# Train Final Model with Best Hyperparameters
best_model = BiLSTM(input_dim=X_train.shape[1], hidden_dim=best_params["hidden_dim"], dropout_rate=best_params["dropout"])
optimizer = optim.Adam(best_model.parameters(), lr=best_params["lr"])
criterion = nn.MSELoss()

# Training Loop
num_epochs = 50
for epoch in range(num_epochs):
    best_model.train()
    epoch_loss = 0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")

# Evaluate Final Model
best_model.eval()
with torch.no_grad():
    final_preds = best_model(X_test_tensor)
    final_loss = criterion(final_preds, y_test_tensor).item()

print(f"Final Test Loss (MSE): {final_loss:.4f}")'''

'# Load Dataset\nfile_path = "filename.csv"\ndf = pd.read_csv(file_path)\n\ndf = df.sample(n=20000, random_state=42)\n\n# Features & Target\nfeatures = ["vehicle_id", "stop_id", "scheduled_time", "actual_time", "day", "day_of_year", "Weather"]\ntarget = "delay"\n\nX = df[features].fillna(0).values  # Fill missing values if any\ny = df[target].values\n\n# Normalize Features\nscaler = StandardScaler()\nX = scaler.fit_transform(X)\n\n# Train-Test Split (80%-20%)\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)\n\n# Convert to PyTorch Tensors\nX_train_tensor = torch.tensor(X_train, dtype=torch.float32)\ny_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)\nX_test_tensor = torch.tensor(X_test, dtype=torch.float32)\ny_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)\n\n# Create DataLoaders\nbatch_size = 64\ntrain_dataset = TensorDataset(X_train_tensor, y_train_tensor)\ntest_dataset = TensorDataset(X_test_tensor, 

Lower Learning Rate Bound (1e-5 to 1e-3)	Prevents instability in updates

Hidden Dim Range (32-96)	Prevents overfitting from large models

Dropout Before BatchNorm	More effective regularization

Smaller Dropout Range (0.2-0.4)	Keeps useful information

Increased Batch Size (128)	Stabilizes gradients

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import optuna
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset

file_path = "filename.csv"
df = pd.read_csv(file_path)

df = df.sample(n=50000, random_state=42)

features = ["vehicle_id", "stop_id", "scheduled_time", "day", "day_of_year", "Weather"]
target = "delay"

X = df[features].fillna(0).values  # Fill missing values if any
y = df[target].values

#Normalization
scaler = StandardScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

#Here I convert the data to PyTorch Tensors. Also, I tried to move tensors to GPU for faster training
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1).to(device)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1).to(device)

#Wraps datasets into TensorDataset objects
#Also batches data (from the dropout paper)
batch_size = 128 
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

#Define the BiLSTM model
#
class BiLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, dropout_rate):
        super(BiLSTM, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout_rate)  # Dropout before batch norm
        self.bn = nn.BatchNorm1d(2 * hidden_dim)
        self.fc = nn.Linear(2 * hidden_dim, 1)  # Output 1 value (delay prediction)

    def forward(self, x):
        x = x.unsqueeze(1)  # Reshape for LSTM [batch, seq_len=1, features]
        lstm_out, _ = self.lstm(x)
        lstm_out = lstm_out[:, -1, :]  # Take last time step output
        lstm_out = self.dropout(lstm_out)  # Apply dropout
        lstm_out = self.bn(lstm_out)  # Batch normalization
        out = self.fc(lstm_out)
        return out

#Define Objective Function for Bayesian Optimization. I used optuna because chatGPT recommended it. Seems to be good. Otherwise, I can use BayesSearchCV.
#Runs 5 training epochs per trial to quickly evaluate hyperparameters.
#Returns MSE loss as the optimization target.
def objective(trial):
    hidden_dim = trial.suggest_int("hidden_dim", 32, 96)  # Reduced upper limit
    dropout_rate = trial.suggest_float("dropout", 0.2, 0.4)  # Better range
    learning_rate = trial.suggest_float("lr", 1e-5, 1e-3, log=True)  # Lower max lr

    model = BiLSTM(input_dim=X_train.shape[1], hidden_dim=hidden_dim, dropout_rate=dropout_rate).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Train Model (Only 5 epochs for speed)
    model.train()
    for epoch in range(5):
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluate on Test Set
    model.eval()
    with torch.no_grad():
        test_preds = model(X_test_tensor)
        test_loss = criterion(test_preds, y_test_tensor).item()
    
    return test_loss  # Minimize MSE Loss

#I ran 10 trials to find the best hyperparameters
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)  # Reduced to 10 trials

#Get Best Hyperparameters
best_params = study.best_params
print("Best Hyperparameters:", best_params)

#Train Final Model with Best Hyperparameters
best_model = BiLSTM(input_dim=X_train.shape[1], hidden_dim=best_params["hidden_dim"], dropout_rate=best_params["dropout"]).to(device)
#Initializes the Adam optimizer with the best hyperparameters found by Bayesian optimization.
optimizer = optim.Adam(best_model.parameters(), lr=best_params["lr"])
criterion = nn.MSELoss()

# Training Loop
num_epochs = 50  #Might be overfitting?
for epoch in range(num_epochs):
    best_model.train()
    epoch_loss = 0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")

#Evaluate Final Model
best_model.eval()
with torch.no_grad():
    final_preds = best_model(X_test_tensor)
    final_loss = criterion(final_preds, y_test_tensor).item()

print(f"Final Test Loss (MSE): {final_loss:.4f}")

[I 2025-01-31 14:28:27,768] A new study created in memory with name: no-name-01ffb94a-ae53-4efe-adc5-a958c4af7fd5
[I 2025-01-31 14:28:42,443] Trial 0 finished with value: 132006.53125 and parameters: {'hidden_dim': 71, 'dropout': 0.3695921266185635, 'lr': 8.386963754610721e-05}. Best is trial 0 with value: 132006.53125.
[I 2025-01-31 14:28:54,763] Trial 1 finished with value: 129694.5390625 and parameters: {'hidden_dim': 84, 'dropout': 0.3228789702969039, 'lr': 0.0001340818041813791}. Best is trial 1 with value: 129694.5390625.
[I 2025-01-31 14:29:07,155] Trial 2 finished with value: 121205.578125 and parameters: {'hidden_dim': 83, 'dropout': 0.26485704225349616, 'lr': 0.0002358154738175493}. Best is trial 2 with value: 121205.578125.
[I 2025-01-31 14:29:17,514] Trial 3 finished with value: 118734.6328125 and parameters: {'hidden_dim': 60, 'dropout': 0.3813602196079009, 'lr': 0.00030542572445757814}. Best is trial 3 with value: 118734.6328125.
[I 2025-01-31 14:29:31,020] Trial 4 finish

Best Hyperparameters: {'hidden_dim': 71, 'dropout': 0.20344230250433148, 'lr': 0.0008526008726960454}
Epoch 1/50, Loss: 41736327.2500
Epoch 2/50, Loss: 38309068.8086
Epoch 3/50, Loss: 34710026.3477
Epoch 4/50, Loss: 33570935.8711
Epoch 5/50, Loss: 33289395.4375
Epoch 6/50, Loss: 33143050.3320
Epoch 7/50, Loss: 33013247.7188
Epoch 8/50, Loss: 32942009.1797
Epoch 9/50, Loss: 32857708.7656
Epoch 10/50, Loss: 32816788.5000
Epoch 11/50, Loss: 32754266.6133
Epoch 12/50, Loss: 32723470.0703
Epoch 13/50, Loss: 32666991.4824
Epoch 14/50, Loss: 32545641.3555
Epoch 15/50, Loss: 32502949.4961
Epoch 16/50, Loss: 32413435.3750
Epoch 17/50, Loss: 32420363.8789
Epoch 18/50, Loss: 32280328.2969
Epoch 19/50, Loss: 32245303.2969
Epoch 20/50, Loss: 32117850.8535
Epoch 21/50, Loss: 32110246.2891
Epoch 22/50, Loss: 32061190.8047
Epoch 23/50, Loss: 31868108.8418
Epoch 24/50, Loss: 31815936.5430
Epoch 25/50, Loss: 31713113.1035
Epoch 26/50, Loss: 31780600.6992
Epoch 27/50, Loss: 31530830.8574
Epoch 28/50, Los

REDUCED EPOCH (50 to 15) TO SEE IF OVERFITTING

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import optuna
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset

# Load Dataset
file_path = "filename.csv"
df = pd.read_csv(file_path)

df = df.sample(n=20000, random_state=42)

# Features & Target
features = ["vehicle_id", "stop_id", "scheduled_time", "day", "day_of_year", "Weather"]
target = "delay"

X = df[features].fillna(0).values  # Fill missing values if any
y = df[target].values

# Normalize Features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Train-Test Split (80%-20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert to PyTorch Tensors
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).view(-1, 1).to(device)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).view(-1, 1).to(device)

# Create DataLoaders
batch_size = 128  # Increased batch size for stability
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Define Optimized Bi-LSTM Model
class BiLSTM(nn.Module):
    def __init__(self, input_dim, hidden_dim, dropout_rate):
        super(BiLSTM, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout_rate)  # Dropout before batch norm
        self.bn = nn.BatchNorm1d(2 * hidden_dim)
        self.fc = nn.Linear(2 * hidden_dim, 1)  # Output 1 value (delay prediction)

    def forward(self, x):
        x = x.unsqueeze(1)  # Reshape for LSTM [batch, seq_len=1, features]
        lstm_out, _ = self.lstm(x)
        lstm_out = lstm_out[:, -1, :]  # Take last time step output
        lstm_out = self.dropout(lstm_out)  # Apply dropout
        lstm_out = self.bn(lstm_out)  # Batch normalization
        out = self.fc(lstm_out)
        return out

# Define Objective Function for Bayesian Optimization
def objective(trial):
    hidden_dim = trial.suggest_int("hidden_dim", 32, 96)  # Reduced upper limit
    dropout_rate = trial.suggest_float("dropout", 0.2, 0.4)  # Better range
    learning_rate = trial.suggest_float("lr", 1e-5, 1e-3, log=True)  # Lower max lr

    model = BiLSTM(input_dim=X_train.shape[1], hidden_dim=hidden_dim, dropout_rate=dropout_rate).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    # Train Model (Only 5 epochs for speed)
    model.train()
    for epoch in range(5):
        for batch_X, batch_y in train_loader:
            optimizer.zero_grad()
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

    # Evaluate on Test Set
    model.eval()
    with torch.no_grad():
        test_preds = model(X_test_tensor)
        test_loss = criterion(test_preds, y_test_tensor).item()
    
    return test_loss  # Minimize MSE Loss

# Run Bayesian Optimization
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)  # Reduced to 10 trials

# Get Best Hyperparameters
best_params = study.best_params
print("Best Hyperparameters:", best_params)

# Train Final Model with Best Hyperparameters (15 epochs for better learning)
best_model = BiLSTM(input_dim=X_train.shape[1], hidden_dim=best_params["hidden_dim"], dropout_rate=best_params["dropout"]).to(device)
optimizer = optim.Adam(best_model.parameters(), lr=best_params["lr"])
criterion = nn.MSELoss()

# Training Loop
num_epochs = 15  # Reduced from 50 to avoid overfitting
for epoch in range(num_epochs):
    best_model.train()
    epoch_loss = 0
    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()
        outputs = best_model(batch_X)
        loss = criterion(outputs, batch_y)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")

# Evaluate Final Model
best_model.eval()
with torch.no_grad():
    final_preds = best_model(X_test_tensor)
    final_loss = criterion(final_preds, y_test_tensor).item()

print(f"Final Test Loss (MSE): {final_loss:.4f}")

[I 2025-01-31 14:32:47,657] A new study created in memory with name: no-name-6e0e31e5-963d-4415-aefa-dfce65a3a687
[I 2025-01-31 14:32:53,927] Trial 0 finished with value: 162494.40625 and parameters: {'hidden_dim': 92, 'dropout': 0.24392271372279434, 'lr': 1.4368823640642128e-05}. Best is trial 0 with value: 162494.40625.
[I 2025-01-31 14:32:57,558] Trial 1 finished with value: 162465.421875 and parameters: {'hidden_dim': 35, 'dropout': 0.2440503454236962, 'lr': 2.625622702034492e-05}. Best is trial 1 with value: 162465.421875.
[I 2025-01-31 14:33:02,816] Trial 2 finished with value: 153268.515625 and parameters: {'hidden_dim': 91, 'dropout': 0.29517088354898396, 'lr': 0.0005056394121858034}. Best is trial 2 with value: 153268.515625.
[I 2025-01-31 14:33:06,843] Trial 3 finished with value: 160515.734375 and parameters: {'hidden_dim': 46, 'dropout': 0.31909395448807154, 'lr': 0.0003356216927087496}. Best is trial 2 with value: 153268.515625.
[I 2025-01-31 14:33:12,565] Trial 4 finished

Best Hyperparameters: {'hidden_dim': 91, 'dropout': 0.29517088354898396, 'lr': 0.0005056394121858034}
Epoch 1/15, Loss: 16110318.1641
Epoch 2/15, Loss: 16042438.3477
Epoch 3/15, Loss: 15889039.3828
Epoch 4/15, Loss: 15625438.8477
Epoch 5/15, Loss: 15240629.8398
Epoch 6/15, Loss: 14768443.8125
Epoch 7/15, Loss: 14276772.4102
Epoch 8/15, Loss: 13842328.8125
Epoch 9/15, Loss: 13491036.2891
Epoch 10/15, Loss: 13222277.5781
Epoch 11/15, Loss: 13030676.9492
Epoch 12/15, Loss: 12927351.0156
Epoch 13/15, Loss: 12826640.8242
Epoch 14/15, Loss: 12799336.8125
Epoch 15/15, Loss: 12751756.0508
Final Test Loss (MSE): 132760.0781
