In [19]:
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

df =  pd.read_csv('California_Houses.csv')
df.dropna(inplace=True)

In [24]:
df = (df - df.mean()) / df.std()
df = df[(np.abs(df) < 4).all(axis=1)]

In [25]:
import pandas as pd
import numpy as np
from scipy.stats import boxcox

# Adjust and transform
for column in df.columns:
    # Ensure all data is positive by shifting the column if necessary
    min_value = df[column].min()
    shift = -min_value + 1 if min_value <= 0 else 0
    try:
        if shift > 0:
            df[column], fitted_lambda = boxcox(df[column] + shift)
        else:
            df[column], fitted_lambda = boxcox(df[column])
        print(f"Transformed {column} with lambda: {fitted_lambda}")
    except ValueError:
        print(f"Column {column} cannot be transformed. It may contain zero or negative values after adjustment.")


Transformed Y with lambda: 0.8607471157190429
Transformed Median_Income with lambda: 0.960220475167138
Transformed Median_Age with lambda: 0.9799683619027184
Transformed Tot_Rooms with lambda: 0.9981594230014806
Transformed Tot_Bedrooms with lambda: 0.9954765433151639
Transformed Population with lambda: 0.98965430677062
Transformed Households with lambda: 1.0003037457599773
Transformed Latitude with lambda: 0.7268953160836403
Transformed Longitude with lambda: 1.0857071618165173
Transformed Distance_to_coast with lambda: 0.36338880601400225
Transformed Distance_to_LA with lambda: 0.29445961769220624
Transformed Distance_to_SanDiego with lambda: 0.6823721769637111
Transformed Distance_to_SanJose with lambda: 0.9824361962141578
Transformed Distance_to_SanFrancisco with lambda: 1.0010415989248287


In [17]:
# Assuming 'df' is your DataFrame and 'Y' is the target column
X = df.drop('Y', axis=1)
y = df['Y']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

models = {
    'Linear Regression': LinearRegression(),
    'KNN': KNeighborsRegressor(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'XGBoost': XGBRegressor(),
    'LightGBM': LGBMRegressor(),
    'CatBoost': CatBoostRegressor(verbose=0)  # verbose=0 to keep output clean
}

# Dictionary to store results
results = {}

for name, model in models.items():
    start_time = time.time()
    model.fit(X_train_scaled, y_train)
    training_time = time.time() - start_time

    start_time = time.time()
    y_pred = model.predict(X_test_scaled)
    inference_time = time.time() - start_time

    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    results[name] = {'RMSE': rmse, 'Training Time': training_time, 'Inference Time': inference_time}

# Display results
for model, metrics in results.items():
    print(f"{model}: RMSE = {metrics['RMSE']:.3f}, Training Time = {metrics['Training Time']:.3f}s, Inference Time = {metrics['Inference Time']:.3f}s")


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001881 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3113
[LightGBM] [Info] Number of data points in the train set: 16124, number of used features: 13
[LightGBM] [Info] Start training from score 0.761867
Linear Regression: RMSE = 0.136, Training Time = 0.008s, Inference Time = 0.000s
KNN: RMSE = 0.121, Training Time = 0.073s, Inference Time = 0.243s
Decision Tree: RMSE = 0.137, Training Time = 0.256s, Inference Time = 0.003s
Random Forest: RMSE = 0.098, Training Time = 28.587s, Inference Time = 0.131s
XGBoost: RMSE = 0.097, Training Time = 0.362s, Inference Time = 0.006s
LightGBM: RMSE = 0.098, Training Time = 0.211s, Inference Time = 0.015s
CatBoost: RMSE = 0.093, Training Time = 5.457s, Inference Time = 0.004s


In [34]:
# Assuming 'df' is your DataFrame and 'Y' is the target column
X = df.drop('Y', axis=1)
y = df['Y']

# Data preprocessing
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [27]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import mean_squared_error

# Build the model
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(1)  # Output layer for regression; no activation function
])

# Compile the model
model.compile(optimizer=Adam(), loss='mean_squared_error', metrics=['mean_squared_error'])

# Train the model
start_time = time.time()
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)
training_time = time.time() - start_time
print(f"Training completed in: {training_time:.3f} seconds")

# Evaluate the model
start_time = time.time()
loss, mse = model.evaluate(X_test, y_test)
inference_time = time.time() - start_time
print(f"Inference time for evaluation: {inference_time:.3f} seconds")
print(f"Test MSE: {mse}")

# Make predictions and calculate MSE manually
start_time = time.time()
predictions = model.predict(X_test)
inference_time_per_sample = (time.time() - start_time) / len(X_test)
mse_manual = mean_squared_error(y_test, predictions)
print(f"Inference time per sample: {inference_time_per_sample:.6f} seconds")
print(f"Calculated Test MSE: {mse_manual}")

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - loss: 1.2557 - mean_squared_error: 1.2557 - val_loss: 0.2185 - val_mean_squared_error: 0.2185
Epoch 2/50
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.1980 - mean_squared_error: 0.1980 - val_loss: 0.1762 - val_mean_squared_error: 0.1762
Epoch 3/50
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.1681 - mean_squared_error: 0.1681 - val_loss: 0.1664 - val_mean_squared_error: 0.1664
Epoch 4/50
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.1701 - mean_squared_error: 0.1701 - val_loss: 0.1579 - val_mean_squared_error: 0.1579
Epoch 5/50
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.1533 - mean_squared_error: 0.1533 - val_loss: 0.1608 - val_mean_squared_error: 0.1608
Epoch 6/50
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.1512 

In [30]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, add
from tensorflow.keras.optimizers import Adam

def build_rln_model(input_dim):
    # Input layer
    inputs = Input(shape=(input_dim,))
    
    # First layer
    x = Dense(64, activation='relu')(inputs)
    
    # Residual block 1
    x1 = Dense(64, activation='relu')(x)
    x1 = Dense(64, activation='relu')(x1)
    block_1_out = add([x, x1])  # Adding input of the block to its output

    # Residual block 2
    x2 = Dense(64, activation='relu')(block_1_out)
    x2 = Dense(64, activation='relu')(x2)
    block_2_out = add([block_1_out, x2])  # Adding input of the block to its output

    # Output layer for regression
    outputs = Dense(1)(block_2_out)
    
    model = Model(inputs, outputs)
    model.compile(optimizer=Adam(), loss='mse')
    
    return model

# Assuming input_dim is the number of features in X_train
model = build_rln_model(input_dim=X_train.shape[1])

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

# Evaluate the model
mse = model.evaluate(X_test, y_test)
print(f'Mean Squared Error: {mse}')


Epoch 1/50
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 1.3604 - val_loss: 0.2142
Epoch 2/50
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.1864 - val_loss: 0.1708
Epoch 3/50
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.1640 - val_loss: 0.1627
Epoch 4/50
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.1544 - val_loss: 0.1663
Epoch 5/50
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.1535 - val_loss: 0.1771
Epoch 6/50
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.1436 - val_loss: 0.1432
Epoch 7/50
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 0.1382 - val_loss: 0.1432
Epoch 8/50
[1m390/390[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.1337 - val_loss: 0.1420
Epoch 9/50
[1m390/390[0m [32m━━━━━━━━

In [37]:
X = df.drop('Y', axis=1).values
y = df['Y'].values

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Reshape y_train and y_test to be 2D
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)


In [39]:
from pytorch_tabnet.tab_model import TabNetRegressor
import torch

# Set up TabNet Regressor
# Note: Make sure to adjust the hyperparameters according to your specific needs
model = TabNetRegressor(optimizer_fn=torch.optim.Adam,
                        optimizer_params=dict(lr=2e-2),
                        scheduler_params={"step_size":50, "gamma":0.9},
                        scheduler_fn=torch.optim.lr_scheduler.StepLR,
                        mask_type='sparsemax'  # This will be used for sparse features
                        )

# Fit the model
max_epochs = 30 if not torch.cuda.is_available() else 30  # Adjust epochs based on your GPU availability
model.fit(
    X_train=X_train, y_train=y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    eval_name=['train', 'valid'],
    eval_metric=['rmse'],
    max_epochs=max_epochs,
    patience=50,  # Early stopping patience
    batch_size=1024,  # Adjust based on your GPU capacity
    virtual_batch_size=128,  # Size of the mini batches for Ghost Batch Normalization
    num_workers=0,  # Based on your system: can increase if you have more CPU cores
    drop_last=False
)

# Predicting and Evaluating
predictions = model.predict(X_test)
mse = np.mean((y_test - predictions)**2)
print(f'Mean Squared Error: {mse}')




epoch 0  | loss: 4.06955 | train_rmse: 1.68216 | valid_rmse: 1.66649 |  0:00:02s
epoch 1  | loss: 0.38353 | train_rmse: 1.2345  | valid_rmse: 1.21493 |  0:00:05s
epoch 2  | loss: 0.29676 | train_rmse: 1.01229 | valid_rmse: 1.02259 |  0:00:07s
epoch 3  | loss: 0.26111 | train_rmse: 0.84398 | valid_rmse: 0.86743 |  0:00:09s
epoch 4  | loss: 0.24028 | train_rmse: 0.82805 | valid_rmse: 0.84625 |  0:00:12s
epoch 5  | loss: 0.22179 | train_rmse: 0.7278  | valid_rmse: 0.7438  |  0:00:14s
epoch 6  | loss: 0.20889 | train_rmse: 0.69366 | valid_rmse: 0.69468 |  0:00:17s
epoch 7  | loss: 0.20865 | train_rmse: 0.62879 | valid_rmse: 0.63297 |  0:00:19s
epoch 8  | loss: 0.19562 | train_rmse: 0.58878 | valid_rmse: 0.58628 |  0:00:21s
epoch 9  | loss: 0.18968 | train_rmse: 0.54019 | valid_rmse: 0.53812 |  0:00:24s
epoch 10 | loss: 0.18495 | train_rmse: 0.53108 | valid_rmse: 0.5292  |  0:00:26s
epoch 11 | loss: 0.18026 | train_rmse: 0.58089 | valid_rmse: 0.57294 |  0:00:28s
epoch 12 | loss: 0.17908 | t



Mean Squared Error: 0.13819212327366764


In [48]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from pytorch_tabnet.tab_model import TabNetRegressor
import torch
import time  # Import time to measure training and inference time

X = df.drop('Y', axis=1).values
y = df['Y'].values.reshape(-1, 1)

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Initialize TabNetRegressor
model = TabNetRegressor(verbose=1, optimizer_fn=torch.optim.Adam,
                        optimizer_params=dict(lr=2e-2))

# Start timing for training
start_time = time.time()
model.fit(
    X_train=X_train_scaled, y_train=y_train,
    eval_set=[(X_test_scaled, y_test)],
    eval_name=['test'],
    eval_metric=['rmse'],
    max_epochs=30,
    patience=50,  # Early stopping
    batch_size=256,  # Mini-batch size for training
    virtual_batch_size=128  # Size of the mini-batches for the "Ghost Batch Normalization"
)
training_time = time.time() - start_time
print(f"Training completed in: {training_time:.3f} seconds")

# Start timing for inference
start_time = time.time()
preds = model.predict(X_test_scaled)
inference_time = time.time() - start_time
print(f"Inference completed in: {inference_time:.3f} seconds")

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, preds)
print(f"Test MSE: {mse}")




epoch 0  | loss: 1.73203 | test_rmse: 0.61274 |  0:00:02s
epoch 1  | loss: 0.3013  | test_rmse: 0.50425 |  0:00:05s
epoch 2  | loss: 0.24849 | test_rmse: 0.47658 |  0:00:08s
epoch 3  | loss: 0.22508 | test_rmse: 0.44943 |  0:00:11s
epoch 4  | loss: 0.21512 | test_rmse: 0.48218 |  0:00:14s
epoch 5  | loss: 0.20837 | test_rmse: 0.44457 |  0:00:17s
epoch 6  | loss: 0.20282 | test_rmse: 0.43617 |  0:00:20s
epoch 7  | loss: 0.2091  | test_rmse: 0.43699 |  0:00:22s
epoch 8  | loss: 0.19905 | test_rmse: 0.46584 |  0:00:25s
epoch 9  | loss: 0.21054 | test_rmse: 0.44556 |  0:00:28s
epoch 10 | loss: 0.19675 | test_rmse: 0.42091 |  0:00:30s
epoch 11 | loss: 0.18898 | test_rmse: 0.41133 |  0:00:33s
epoch 12 | loss: 0.18459 | test_rmse: 0.42966 |  0:00:36s
epoch 13 | loss: 0.18579 | test_rmse: 0.41667 |  0:00:39s
epoch 14 | loss: 0.18434 | test_rmse: 0.3992  |  0:00:41s
epoch 15 | loss: 0.17325 | test_rmse: 0.39603 |  0:00:44s
epoch 16 | loss: 0.17265 | test_rmse: 0.39341 |  0:00:48s
epoch 17 | los



Training completed in: 86.503 seconds
Inference completed in: 0.263 seconds
Test MSE: 0.14101276557535194


In [49]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import time

# Example DataFrame setup (assuming df is predefined)
Y = df['Y'].values  # Target variable
X = df.drop('Y', axis=1).values  # Features

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Scaling features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Converting to PyTorch tensors
X_train_torch = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_torch = torch.tensor(y_train.reshape(-1, 1), dtype=torch.float32)
X_test_torch = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_torch = torch.tensor(y_test.reshape(-1, 1), dtype=torch.float32)

# DataLoader setup
train_data = DataLoader(TensorDataset(X_train_torch, y_train_torch), batch_size=64, shuffle=True)
test_data = DataLoader(TensorDataset(X_test_torch, y_test_torch), batch_size=64, shuffle=False)

# Model setup
class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(X_train.shape[1], 50)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(50, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

model = SimpleNN()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Training the model with time measurement
start_time = time.time()
epochs = 100
for epoch in range(epochs):
    for inputs, targets in train_data:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
training_time = time.time() - start_time
print(f"Training completed in: {training_time:.3f} seconds")

# Inference with time measurement
model.eval()
start_time = time.time()
with torch.no_grad():
    predictions = []
    for inputs, targets in test_data:
        outputs = model(inputs)
        predictions.append(outputs)
predictions = torch.cat(predictions)
inference_time = time.time() - start_time
print(f"Inference completed in: {inference_time:.3f} seconds")

# Calculate Mean Squared Error
mse = criterion(predictions, y_test_torch)
print(f"Test MSE: {mse.item()}")


Training completed in: 41.608 seconds
Inference completed in: 0.039 seconds
Test MSE: 0.1275930404663086


In [50]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd
import numpy as np
import time

# Assuming df is your DataFrame and 'Y' is your target column
Y = df['Y'].values
X = df.drop('Y', axis=1).values

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Scaling features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_torch = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_torch = torch.tensor(y_train.reshape(-1, 1), dtype=torch.float32)
X_test_torch = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_torch = torch.tensor(y_test.reshape(-1, 1), dtype=torch.float32)

# DataLoader setup
train_data = DataLoader(TensorDataset(X_train_torch, y_train_torch), batch_size=64, shuffle=True)
test_data = DataLoader(TensorDataset(X_test_torch, y_test_torch), batch_size=64, shuffle=False)

# Define a neural network with stochastic gates (dropout)
class StochasticNet(nn.Module):
    def __init__(self):
        super(StochasticNet, self).__init__()
        self.fc1 = nn.Linear(X_train.shape[1], 100)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.5)  # 50% dropout
        self.fc2 = nn.Linear(100, 50)
        self.fc3 = nn.Linear(50, 1)

    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.relu(x)
        x = self.fc3(x)
        return x

model = StochasticNet()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Train the model with timing
start_time = time.time()
epochs = 100
model.train()
for epoch in range(epochs):
    for inputs, targets in train_data:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
training_time = time.time() - start_time
print(f"Training completed in: {training_time:.3f} seconds")

# Inference with timing
start_time = time.time()
model.eval()
with torch.no_grad():
    predictions = []
    for inputs, _ in test_data:
        outputs = model(inputs)
        predictions.append(outputs)
predictions = torch.cat(predictions)
inference_time = time.time() - start_time
print(f"Inference completed in: {inference_time:.3f} seconds")

# Calculate Mean Squared Error
mse = criterion(predictions, y_test_torch)
print(f"Test MSE: {mse.item()}")


Training completed in: 94.881 seconds
Inference completed in: 0.081 seconds
Test MSE: 0.14129550755023956


In [56]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import DataLoader, TensorDataset
import time

# Sample data preparation
Y = df['Y'].values
X = df.drop('Y', axis=1).values

# Splitting data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Scaling features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensors
X_train_torch = torch.tensor(X_train_scaled, dtype=torch.float32)
y_train_torch = torch.tensor(y_train.reshape(-1, 1), dtype=torch.float32)
X_test_torch = torch.tensor(X_test_scaled, dtype=torch.float32)
y_test_torch = torch.tensor(y_test.reshape(-1, 1), dtype=torch.float32)

# DataLoader
train_loader = DataLoader(TensorDataset(X_train_torch, y_train_torch), batch_size=32, shuffle=True)
test_loader = DataLoader(TensorDataset(X_test_torch, y_test_torch), batch_size=32, shuffle=False)


In [57]:
from torch import nn

class SimpleTransformer(nn.Module):
    def __init__(self, input_dim, num_heads, num_layers, dropout_rate=0.1):
        super(SimpleTransformer, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, input_dim),
            nn.Dropout(dropout_rate),
            nn.TransformerEncoderLayer(d_model=input_dim, nhead=num_heads, dropout=dropout_rate),
            nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model=input_dim, nhead=num_heads), num_layers=num_layers),
            nn.Linear(input_dim, 1)
        )

    def forward(self, x):
        return self.layers(x)

model = SimpleTransformer(input_dim=X_train_scaled.shape[1], num_heads=2, num_layers=2)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.MSELoss()


AssertionError: embed_dim must be divisible by num_heads

In [58]:
df.to_csv("df", index=False)