In [315]:
import pandas as pd
import numpy as np
import torch.nn as nn 
from sklearn.datasets import make_regression
from torch.utils.data import Dataset, DataLoader
import torch
from typing import Tuple
from sklearn.linear_model import LinearRegression, HuberRegressor
from sklearn.preprocessing import StandardScaler
import plotly.express as px
from pathlib import Path
from sklearn.metrics import root_mean_squared_error

In [316]:
# Fold under study and the directory holding its pre-split parquet files.
fold = 0
data_folder = Path(
    "/home/thomas/repos/simplify_deployment/data/data_science/50_gen_s1/folds_converted_with_genome",
)

# Feature matrices stay as DataFrames.
X_train = pd.read_parquet(data_folder.joinpath(f"X_train_fold_{fold}.parquet"))
X_test = pd.read_parquet(data_folder.joinpath(f"X_test_fold_{fold}.parquet"))

# Targets are squeezed after loading (later cells call .to_frame() on them,
# i.e. they are used as Series).
y_train = pd.read_parquet(data_folder.joinpath(f"y_train_fold_{fold}.parquet")).squeeze()
y_test = pd.read_parquet(data_folder.joinpath(f"y_test_fold_{fold}.parquet")).squeeze()

# Number of input columns; used to size the torch model further down.
n_features = len(X_train.columns)

s1 model for comparison

In [317]:
# Directory with the folds of the s1 reference data set; the same `fold` as above is used.
s1_folder = Path(
    "/home/thomas/repos/simplify_deployment/data/simplify_1_0/folds_converted_with_genome",
)

# Feature matrices.
X_train_s1 = pd.read_parquet(s1_folder.joinpath(f"X_train_fold_{fold}.parquet"))
X_test_s1 = pd.read_parquet(s1_folder.joinpath(f"X_test_fold_{fold}.parquet"))

# Targets, squeezed after loading.
y_train_s1 = pd.read_parquet(s1_folder.joinpath(f"y_train_fold_{fold}.parquet")).squeeze()
y_test_s1 = pd.read_parquet(s1_folder.joinpath(f"y_test_fold_{fold}.parquet")).squeeze()

In [None]:
# Ordinary least squares on the s1 features, used as the comparison baseline.
s1_model = LinearRegression().fit(X_train_s1, y_train_s1)

# True values and s1 predictions side by side, indexed like the s1 test target.
s1_prediction = pd.DataFrame(
    {"y_true": y_test_s1, "y_pred_s1": s1_model.predict(X_test_s1)},
    index=y_test_s1.index,
)
s1_prediction

## Standard linear regression
Let's start by fitting a standard linear regression and plotting the results.

In [None]:
# Ordinary least squares on the unscaled training features.
standard_model = LinearRegression().fit(X_train, y_train)

# Frame that collects the true test values plus one prediction column per model.
prediction_df = pd.DataFrame(
    {"y_true": y_test, "y_pred_standard_model": standard_model.predict(X_test)},
    index=y_test.index,
)
prediction_df

## Custom loss model
This time we will fit a linear model with our custom loss function. It should perform worse in terms of RMSE, as that is not what it optimizes for.
We should see smaller max errors, or less time above the threshold, ... depending on what we prioritize in the loss function.

For the optimizer to work well, it is best to standard-scale the data. This adds some complexity to the code, but is necessary to speed up convergence.

Let's start by setting up some classes we will need:

In [320]:
class Model(nn.Module):
    """Linear regression expressed as a single-layer torch module."""

    def __init__(self, n_features: int) -> None:
        """Build the model.

        Args:
            n_features: Number of input features per sample.
        """
        super().__init__()
        # One fully connected layer and no activation: a plain linear regression.
        self.fc1 = nn.Linear(n_features, 1)

    def forward(self, X: torch.Tensor) -> torch.Tensor:
        """Apply the linear layer to ``X`` and return the result flattened to 1-D."""
        return torch.flatten(self.fc1(X))

In [321]:
class CustomLoss(nn.Module):
    """Smooth (sigmoid-based) multi-objective regression loss.

    The loss is a weighted sum of three batch-level terms:

    * the maximum absolute residual,
    * a sigmoid approximation of the fraction of samples whose absolute
      residual exceeds ``threshold``,
    * a sigmoid approximation of the fraction of samples for which the
      prediction and the target have opposite signs.

    The three weights are normalised in ``__init__`` so that they sum to one.

    Args:
        threshold: Absolute-residual level above which a sample counts as
            "above threshold". Must be expressed in the same units as the
            tensors passed to ``forward``.
        weight_max_error: Relative weight of the maximum-error term.
        weight_percentage_above_threshold: Relative weight of the
            above-threshold term.
        weight_wrong_sign: Relative weight of the wrong-sign term.
        sigmoid_steepness: Multiplier applied inside both sigmoids; larger
            values bring the sigmoids closer to a hard step.
    """

    def __init__(
            self,
            threshold: float = 0.685,
            weight_max_error: float = 1,
            weight_percentage_above_threshold:float = 1,
            weight_wrong_sign: float = 1,
            sigmoid_steepness: float = 1,
    ) -> None:
        super().__init__()
        self.steepness = sigmoid_steepness
        self.threshold = threshold
        # Normalize weights and assign them
        sum_weights = (
            weight_max_error
            + weight_percentage_above_threshold
            + weight_wrong_sign
        )
        self.weight_max_error = (
            weight_max_error / sum_weights
        )
        self.weight_percentage_above_threshold = (
            weight_percentage_above_threshold / sum_weights
        )
        self.weight_wrong_sign = (
            weight_wrong_sign / sum_weights
        )

    def forward(
            self,
            inputs: torch.Tensor,
            targets: torch.Tensor,
        ) -> torch.Tensor:
        """Compute the weighted loss for one batch.

        Args:
            inputs: Model predictions.
            targets: Ground-truth values, broadcastable against ``inputs``.

        Returns:
            Scalar tensor holding the weighted sum of the three terms.
        """
        residuals = targets - inputs
        # Maximum abs error
        max_error = residuals.abs().max()

        # Percentage of time above threshold value: the sigmoid is ~1 when the
        # absolute residual is well above the threshold and ~0 well below it.
        percentage_of_time_above_x = torch.sigmoid(
            self.steepness * (residuals.abs() - self.threshold)
        ).mean()

        # Percentage of time wrong sign: inputs * targets is negative exactly
        # when the signs differ, so the product is negated to push the sigmoid
        # towards 1 for wrong-sign samples and towards 0 for matching signs.
        loss_percentage_of_time_wrong_sign = torch.sigmoid(
            -self.steepness * (inputs * targets)
        ).mean()

        # Total loss
        total_loss = (
            self.weight_max_error * max_error
            + self.weight_percentage_above_threshold * percentage_of_time_above_x
            + self.weight_wrong_sign * loss_percentage_of_time_wrong_sign
        )
        return total_loss

In [322]:
class CustomLossRelu(nn.Module):
    """ReLU-based multi-objective regression loss.

    Weighted sum of three batch-level terms: the largest absolute residual,
    the mean amount by which absolute residuals exceed ``threshold``, and the
    mean of ``relu(-targets * inputs)``, which is non-zero only where
    prediction and target have opposite signs. The weights are rescaled so
    that they sum to one.

    Args:
        threshold: Absolute-residual level below which the threshold term is zero.
        weight_max_error: Relative weight of the maximum-error term.
        weight_percentage_above_threshold: Relative weight of the threshold term.
        weight_wrong_sign: Relative weight of the wrong-sign term.
        sigmoid_steepness: Accepted but not used by this class.
    """

    def __init__(
            self,
            threshold: float = 0.685,
            weight_max_error: float = 1,
            weight_percentage_above_threshold:float = 1,
            weight_wrong_sign: float = 1,
            sigmoid_steepness: float = 1,
    ) -> None:
        super().__init__()
        self.threshold = threshold

        # Rescale the three weights so they add up to one.
        total_weight = weight_max_error + weight_percentage_above_threshold + weight_wrong_sign
        self.weight_max_error = weight_max_error / total_weight
        self.weight_percentage_above_threshold = weight_percentage_above_threshold / total_weight
        self.weight_wrong_sign = weight_wrong_sign / total_weight

    def forward(
            self,
            inputs: torch.Tensor,
            targets: torch.Tensor,
        ) -> torch.Tensor:
        """Return the weighted loss of predictions ``inputs`` against ``targets``."""
        abs_residuals = (targets - inputs).abs()

        # Largest absolute error in the batch.
        worst_error = abs_residuals.max()

        # Average excess of the absolute error over the threshold (zero below it).
        excess_over_threshold = torch.relu(abs_residuals - self.threshold).mean()

        # Positive only where prediction and target have opposite signs.
        sign_penalty = torch.relu(-(inputs * targets)).mean()

        return (
            self.weight_max_error * worst_error
            + self.weight_percentage_above_threshold * excess_over_threshold
            + self.weight_wrong_sign * sign_penalty
        )

In [323]:
class CustomMSELoss(nn.Module):
    """Mean squared error written out by hand as an ``nn.Module``."""

    def __init__(self) -> None:
        super().__init__()

    def forward(
            self,
            inputs: torch.Tensor,
            targets: torch.Tensor,
        ) -> torch.Tensor:
        """Return the mean of the squared differences between ``targets`` and ``inputs``."""
        return (targets - inputs).pow(2).mean()

In [324]:
class CustomDataset(Dataset):
    """Dataset pairing the rows of a feature tensor with the entries of a target tensor."""

    def __init__(self, X: torch.Tensor, y: torch.Tensor) -> None:
        """Keep references to the feature tensor ``X`` and the target tensor ``y``."""
        self.X, self.y = X, y

    def __len__(self) -> int:
        """Number of samples, taken from the first dimension of ``X``."""
        n_samples = self.X.shape[0]
        return n_samples

    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return the ``(features, target)`` pair stored at position ``idx``."""
        return self.X[idx, :], self.y[idx]

Scale X and y

In [325]:
# Both scalers are fitted on the training split only and then reused for the test split.
X_scaler = StandardScaler()
X_scaler.fit(X_train)
y_scaler = StandardScaler()
y_scaler.fit(y_train.to_frame())  # the scaler expects 2-D input, hence to_frame()

# Scaled features as float32 tensors.
X_train_scaled = torch.tensor(X_scaler.transform(X_train), dtype=torch.float32)
X_test_scaled = torch.tensor(X_scaler.transform(X_test), dtype=torch.float32)

# Scaled targets as float32 tensors; squeeze() makes the target tensor unidimensional.
y_train_scaled = torch.tensor(
    y_scaler.transform(y_train.to_frame()).squeeze(),
    dtype=torch.float32,
)
y_test_scaled = torch.tensor(
    y_scaler.transform(y_test.to_frame()).squeeze(),
    dtype=torch.float32,
)

Custom loss model

In [None]:
# Training hyperparameters.
epochs = 10
lr = 1e-4
batch_size = 96
seed = 0

# Seed torch's global RNG before the model and the shuffling DataLoader are
# used, so weight initialisation and batch order are the same on every
# Restart & Run All.
torch.manual_seed(seed)

dataloader = DataLoader(
    CustomDataset(X_train_scaled,y_train_scaled),
    batch_size=batch_size,
    shuffle=True,
)
model = Model(
    n_features=n_features,
)
optimizer = torch.optim.Adam(
    model.parameters(),
    lr = lr,
)

# The threshold is divided by the standard deviation of the training target
# because the model is trained on standard-scaled targets: residuals in scaled
# space equal the unscaled residuals divided by that standard deviation (the
# mean shift cancels out in the difference).
threshold = 117
converted_threshold = threshold / np.std(y_train)

# NOTE(review): the wrong-sign term of the loss is evaluated on mean-centred
# targets, so "sign" here is relative to the training mean rather than to zero
# in the original units — confirm this is intended.
criterion = CustomLossRelu(
    threshold=converted_threshold,
    weight_wrong_sign=1,
    weight_max_error=1,
    weight_percentage_above_threshold=1,
)

for epoch in range(epochs):
    epoch_loss = 0.0
    for X_batch, y_batch in dataloader:
        # Clear gradients left over from the previous step before the new forward/backward pass.
        optimizer.zero_grad()
        prediction = model(X_batch)
        loss = criterion(prediction, y_batch)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    # Mean of the per-batch losses over the epoch.
    average_loss = epoch_loss/len(dataloader)
    print(f"Average epoch loss: {average_loss}")
    print(f"Epoch {epoch} done.")

In [None]:
# Predict on the scaled test features without tracking gradients, then map the
# predictions back to the original target units.
with torch.no_grad():
    y_pred_custom_scaled = model(X_test_scaled).numpy().reshape(-1, 1)

y_pred_custom = pd.Series(
    y_scaler.inverse_transform(y_pred_custom_scaled).ravel(),  # (n, 1) -> (n,)
    index=y_test.index,
    name="y_pred_custom_loss",
)

# Drop the columns this cell produces before re-adding them, so the cell gives
# the same result when it is re-run (otherwise a second run would hit a length
# mismatch or create suffixed duplicates of `y_pred_s1`).
# On a re-run the custom predictions are aligned on the index, which assumes
# the test index has no duplicate labels — TODO confirm.
prediction_df = (
    prediction_df
    .drop(columns=["y_pred_custom_loss", "y_pred_s1"], errors="ignore")
    .assign(y_pred_custom_loss=y_pred_custom)
    .merge(
        s1_prediction.drop(columns = "y_true"),
        left_index = True,
        right_index=True,
        how = "inner",  # keep only rows present in both test sets
    )
)
prediction_df

In [None]:
# RMSE of the s1 baseline on the rows kept in prediction_df.
s1_rmse = root_mean_squared_error(
    y_true=prediction_df["y_true"],
    y_pred=prediction_df["y_pred_s1"],
)
print(f"Rmse of s1 is {s1_rmse}")

In [None]:
# RMSE of the standard linear regression on the rows kept in prediction_df.
standard_rmse = root_mean_squared_error(
    y_true=prediction_df["y_true"],
    y_pred=prediction_df["y_pred_standard_model"],
)
print(f"Rmse of standard model is {standard_rmse}")

In [None]:
# RMSE of the custom-loss model on the rows kept in prediction_df.
custom_rmse = root_mean_squared_error(
    y_true=prediction_df["y_true"],
    y_pred=prediction_df["y_pred_custom_loss"],
)
print(f"Rmse of custom model is {custom_rmse}")

In [None]:
# Reshape to long format (index kept) so every column of prediction_df
# becomes its own coloured line in the figure.
prediction_df_molten = prediction_df.melt(ignore_index=False)

fig = px.line(
    data_frame=prediction_df_molten,
    x=prediction_df_molten.index,
    y="value",
    color="variable",
)
fig.show()

In [None]:
# Mean of the unscaled test target — presumably a reference scale for the RMSE
# values printed above (TODO confirm intent).
y_test.mean()