In [55]:
import polars as pl
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import StandardScaler
from typing import Tuple
from sklearn.metrics import root_mean_squared_error

In [49]:
class LinearModel(nn.Module):
    def __init__(
            self,
            n_features: int,
    ) -> None:
        super().__init__()
        self.fc1 = nn.Linear(
            in_features=n_features,
            out_features=1,
        )  # Just 1 fully connected layer without activation, i.e. a linear regression.

    def forward(
        self,  
        X: torch.Tensor,  
    ) -> torch.Tensor:
        y = self.fc1(X)
        return y
        
class CustomDataset(Dataset):
    def __init__(
            self,
            X: torch.Tensor,
            y: torch.Tensor,
        ) -> None:
        self.X = X
        self.y = y

    def __len__(
            self
    ) -> int:
        return self.X.shape[0]
    
    def __getitem__(
            self,
            idx: int,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        X_item = self.X[idx,:]
        y_item = self.y[idx,:]
        return X_item, y_item

In [50]:
X_train_raw = pl.read_parquet(
    "/home/thomas/repos/simplify_deployment/data/potential_features/mi_variables/train_X.parquet"
)
X_test_raw = pl.read_parquet(
    "/home/thomas/repos/simplify_deployment/data/potential_features/mi_variables/test_X.parquet"
)
y_train_raw = pl.read_parquet(
    "/home/thomas/repos/simplify_deployment/data/potential_features/mi_variables/train_y.parquet"
)
y_test_raw = pl.read_parquet(
    "/home/thomas/repos/simplify_deployment/data/potential_features/mi_variables/test_y.parquet"
)

X_scaler = StandardScaler()
y_scaler = StandardScaler()

X_train = X_scaler.fit_transform(X_train_raw.select(pl.exclude("datetime_utc")))
X_test = X_scaler.transform(X_test_raw.select(pl.exclude("datetime_utc")))

y_train = y_scaler.fit_transform(y_train_raw.select(pl.exclude("datetime_utc")))
y_test = y_scaler.transform(y_test_raw.select(pl.exclude("datetime_utc")))


In [51]:
epochs = 1000
lr = 1e-4
batch_size = 100

dataloader = DataLoader(
    CustomDataset(
        torch.Tensor(X_train).float(),
        torch.Tensor(y_train).float(),
    ),
    batch_size=batch_size,
    shuffle=True,
)
model_rmse = LinearModel(
    n_features=X_train.shape[1],
)
optimizer = torch.optim.Adam(
    model_rmse.parameters(),
    lr = lr,
)
criterion = nn.MSELoss()
for epoch in range(epochs):
    epoch_loss = 0
    model_rmse.train()
    for i,(X_batch, y_batch) in enumerate(dataloader):
        prediction = model_rmse(X_batch)
        optimizer.zero_grad()
        loss = criterion(prediction, y_batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    average_loss = epoch_loss/len(dataloader)
    print(f"Average epoch loss: {average_loss}")
    print(f"Epoch {epoch} done.")

model_rmse.eval()

Average epoch loss: 1.231078137145486
Epoch 0 done.
Average epoch loss: 0.8161134118604105
Epoch 1 done.
Average epoch loss: 0.7417036042830278
Epoch 2 done.
Average epoch loss: 0.7147588066236917
Epoch 3 done.
Average epoch loss: 0.6999981929223205
Epoch 4 done.
Average epoch loss: 0.6921028755605221
Epoch 5 done.
Average epoch loss: 0.6875725153746993
Epoch 6 done.
Average epoch loss: 0.6838822046857934
Epoch 7 done.
Average epoch loss: 0.6808387968949107
Epoch 8 done.
Average epoch loss: 0.6785526994702428
Epoch 9 done.
Average epoch loss: 0.6762664727000303
Epoch 10 done.
Average epoch loss: 0.6748057202717592
Epoch 11 done.
Average epoch loss: 0.6736215833834437
Epoch 12 done.
Average epoch loss: 0.6720810442469841
Epoch 13 done.
Average epoch loss: 0.6713149423689343
Epoch 14 done.
Average epoch loss: 0.6698903160732846
Epoch 15 done.
Average epoch loss: 0.6689823347815248
Epoch 16 done.
Average epoch loss: 0.6678554710260657
Epoch 17 done.
Average epoch loss: 0.6673728523905887


LinearModel(
  (fc1): Linear(in_features=90, out_features=1, bias=True)
)

In [52]:
test_prediction = y_scaler.inverse_transform(
    model_rmse(torch.Tensor(X_test)).detach().numpy(),
)
test_prediction

array([[  69.141525],
       [  37.860134],
       [ -19.942675],
       ...,
       [-119.748535],
       [-114.986755],
       [ -24.587585]], dtype=float32)

In [53]:
test_prediction_df = y_test_raw.with_columns(pl.Series(name="prediction_rmse_loss", values = test_prediction.squeeze()))
test_prediction_df

datetime_utc,target,prediction_rmse_loss
"datetime[μs, UTC]",f32,f32
2024-01-03 00:14:00 UTC,-50.097,69.141525
2024-01-03 00:29:00 UTC,41.205002,37.860134
2024-01-03 00:44:00 UTC,10.788,-19.942675
2024-01-03 00:59:00 UTC,28.538,87.03669
2024-01-03 01:14:00 UTC,11.015,-27.748714
…,…,…
2024-01-30 22:44:00 UTC,-140.723007,-97.002151
2024-01-30 22:59:00 UTC,-105.958,-116.967293
2024-01-30 23:14:00 UTC,-7.133,-119.748535
2024-01-30 23:29:00 UTC,2.132,-114.986755


In [56]:
root_mean_squared_error(
    y_true = test_prediction_df.select(pl.col("target")),
    y_pred = test_prediction_df.select(pl.col("prediction_rmse_loss"))
)

113.290115

In [57]:
test_prediction_df.write_parquet("/home/thomas/repos/simplify_deployment/data/potential_features/mi_variables/results/rmse_loss.parquet")