In [86]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tab_transformer_pytorch import TabTransformer, FTTransformer
from preprocessing import get_features_and_target
from sklearn.preprocessing import LabelEncoder
from RMSELoss import RMSELoss
import plotly.graph_objects as go

# Getting Dataframe

In [87]:
train_df = pd.read_csv("data/train_data.csv")
dev_df = pd.read_csv("data/development_data.csv")

target_column = "PullTest (N)"  

x_train, y_train = get_features_and_target(train_df, target_column)
x_dev, y_dev = get_features_and_target(dev_df, target_column)




# Encode Categorical_Features

In [88]:
# Define the categorical features
categorical_features = ["Material"]

le = LabelEncoder()

for feature in categorical_features:
    x_train[feature] = le.fit_transform(x_train[feature])
    x_dev[feature] = le.transform(x_dev[feature])  

# Split Categorical_Features

In [89]:
# Drop categorical features to get the continuous features
x_train_numerical_features = x_train.drop(categorical_features, axis=1)
x_dev_numerical_features = x_dev.drop(categorical_features, axis=1)

# Seperate the categorical features
x_train_categorical_features = x_train[categorical_features]
x_dev_categorical_features = x_dev[categorical_features]

# Change df into Tensors

In [90]:
train_tensor = torch.tensor(x_train.to_numpy(), dtype=torch.float)
x_train_numer_tensor = torch.tensor(x_train_numerical_features.to_numpy(),dtype=torch.float)
x_dev_numer_tensor = torch.tensor(x_dev_numerical_features.to_numpy(),dtype=torch.float)
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.float)

dev_tensor = torch.tensor(x_dev.to_numpy(), dtype=torch.float)
x_train_categorical_features_tensor = torch.tensor(x_train_categorical_features.to_numpy(),dtype=torch.long)
x_dev_categorical_features_tensor = torch.tensor(x_dev_categorical_features.to_numpy(),dtype=torch.long)
y_dev_tensor = torch.tensor(y_dev.to_numpy(), dtype=torch.float)

from torch.utils.data import TensorDataset,DataLoader

train_ds = TensorDataset(
    x_train_categorical_features_tensor,
    x_train_numer_tensor,
    y_train_tensor
)
val_ds = TensorDataset(
    x_dev_categorical_features_tensor,
    x_dev_numer_tensor,
    y_dev_tensor
)
g = torch.Generator()
g.manual_seed(42)
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, generator= g)
val_loader   = DataLoader(val_ds,   batch_size=32)



# Check Tensors


In [6]:
#torch.set_printoptions(sci_mode=False, precision=3)
#print(x_train_numer_tensor)

# Define Model

In [96]:
# categories is defined as a tuple, we only have one categorical feature "Material"
# the second parameter has to be empty for the model to work correctly
# in hard numbers this is displaying (1,)
model = FTTransformer(
    categories=(x_train_categorical_features.shape[1],),
    num_continuous=x_train_numerical_features.shape[1],
    dim=8,
    dim_out=1,
    depth=4,
    heads=4,
    attn_dropout=0.3,
    ff_dropout=0.3
)

# Initiate a saved Model


In [None]:
path = 'trained_models/regular_training/model_FTTransformer_lr0.00025_8_1_4_4_.3_0.3_date_20250810_epoch10000.pt'
model.load_state_dict(torch.load(path))

<All keys matched successfully>

# Define Training Epoch

In [95]:
def train_one_epoch(train_loader):
    total_loss = 0.0

    for x_cat, x_cont, y in train_loader:
        optimizer.zero_grad()

        # bring y to shape [B,1]
        y = y.unsqueeze(-1)

        # forward + backward + step
        pred = model(x_cat, x_cont)
        loss = criterion(pred, y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        

    # return the average loss over ALL batches
    return total_loss / len(train_loader)

# Training

In [97]:
from datetime import datetime

criterion = RMSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.00001)

#Changeable Parameters
# ----------------------------------------------------#
#Model description for saving
model_description    = 'FTTransformer_lr0.0001_8_1_4_4_.1_0.1'

# Number of epochs to train
EPOCHS = 200000
#-----------------------------------------------------#

# Lists to store per‐epoch losses
train_losses = []
val_losses   = []

best_vloss   = float('inf')
last_ckpt  = None
timestamp    = datetime.now().strftime('%Y%m%d')


for epoch in range(EPOCHS):
    print(f'\nEPOCH {epoch+1}/{EPOCHS}')

    # --------------------
    # 1) TRAINING PHASE
    # --------------------
    model.train()
    avg_loss = train_one_epoch(train_loader)
    train_losses.append(avg_loss)
    print(f'train loss: {avg_loss:.4f}')

    # --------------------
    # 2) VALIDATION PHASE
    # --------------------
    model.eval()
    
    val_loss = 0.0
    with torch.no_grad():
        for x_cat, x_cont, y in val_loader:
            y = y.unsqueeze(-1)
            pred = model(x_cat, x_cont)
            val_loss += criterion(pred, y).item()

    

    avg_vloss = val_loss / len(val_loader)
    val_losses.append(avg_vloss)
    
    print(f'valid loss: {avg_vloss:.4f}')

    # --------------------
    # 3) CHECKPOINTING
    # --------------------
    if (epoch + 1) % 100 == 0 and avg_vloss < best_vloss:
        if last_ckpt is not None:
            os.remove(last_ckpt)
        best_vloss = avg_vloss
        ckpt_path = f'trained_models/regular_training/model_{model_description}_date_{timestamp}_epoch{epoch+1}.pt'
        torch.save(model.state_dict(), ckpt_path)
        last_ckpt = ckpt_path




EPOCH 1/200000
train loss: 2964.6734
valid loss: 3021.4922

EPOCH 2/200000
train loss: 2960.7201
valid loss: 3021.4868

EPOCH 3/200000
train loss: 2972.8651
valid loss: 3021.4807

EPOCH 4/200000
train loss: 2964.2566
valid loss: 3021.4748

EPOCH 5/200000
train loss: 2962.7407
valid loss: 3021.4683

EPOCH 6/200000
train loss: 2971.7833
valid loss: 3021.4622

EPOCH 7/200000
train loss: 2966.6965
valid loss: 3021.4557

EPOCH 8/200000
train loss: 2958.5004
valid loss: 3021.4491

EPOCH 9/200000
train loss: 2973.2173
valid loss: 3021.4425

EPOCH 10/200000
train loss: 2964.6628
valid loss: 3021.4358

EPOCH 11/200000
train loss: 2964.2995
valid loss: 3021.4294

EPOCH 12/200000
train loss: 2969.2907
valid loss: 3021.4228

EPOCH 13/200000
train loss: 2959.7141
valid loss: 3021.4163

EPOCH 14/200000
train loss: 2976.1394
valid loss: 3021.4097

EPOCH 15/200000
train loss: 2963.9198
valid loss: 3021.4033

EPOCH 16/200000
train loss: 2964.2085
valid loss: 3021.3968

EPOCH 17/200000
train loss: 2965

KeyboardInterrupt: 

In [43]:
import plotly.graph_objects as go

fig = go.Figure()

# Training loss trace
fig.add_trace(go.Scatter(
    y=train_losses,
    mode="lines+markers",
    name="Train Loss",
    line=dict(color="royalblue", width=2),
    marker=dict(size=4)
))

# Validation loss trace
fig.add_trace(go.Scatter(
    y=val_losses,
    mode="lines+markers",
    name="Validation Loss",
    line=dict(color="tomato", width=2),
    marker=dict(size=4)
))

# Layout
fig.update_layout(
    title="Training & Validation Loss over Epochs  - lr0.00025 8_1_4_4_.1_0.1",
    xaxis_title="Epoch",
    yaxis_title="Loss",
    xaxis=dict(
        tickmode='array',
        tickvals=list(range(0, 10500, 500)),  # Show ticks every 100 epochs
        tickfont=dict(size=10)
    ),
    template="plotly_white",
    legend=dict(x=0.05, y= -0.25, bgcolor="rgba(255,255,255,0)", borderwidth=0)
)

fig.show()


In [44]:
import plotly.graph_objects as go

epochs = list(range(1, len(train_losses) + 1))

fig = go.Figure()

# Training loss trace
fig.add_trace(go.Scatter(
    x=epochs,
    y=train_losses,
    mode="lines",
    name="Training Loss",
    line=dict(color="royalblue", width=2)
))

# Validation loss trace
fig.add_trace(go.Scatter(
    x=epochs,
    y=val_losses,
    mode="lines",
    name="Validation Loss",
    line=dict(color="firebrick", width=2, dash="dash")
))

# Layout enhancements
fig.update_layout(
    title="Training & Validation Loss over 10 000 Epochs - lr0.00025 8_1_4_4_.1_0.1",
    xaxis_title="Epoch",
    yaxis_title="Loss",
    xaxis=dict(
        tickmode='array',
        tickvals=list(range(0, 10500, 500)),  # Show ticks every 100 epochs
        tickfont=dict(size=10)
    ),
    yaxis=dict(
        tickformat=".2e" if max(train_losses + val_losses) > 1e4 else ".4f",  # Dynamic formatting
        gridcolor="lightgray"
    ),
    legend=dict(
        orientation="h",
        yanchor="bottom",
        y=1.02,
        xanchor="right",
        x=1
    ),
    template="plotly_white",
    margin=dict(t=60, b=40)
)

fig.show()


# Skip

In [341]:
from sklearn.model_selection import ParameterGrid
import torch
import numpy as np

param_grid = {
    "dim":       [2, 4, 8],
    "depth":     [2, 4],
    "heads":     [2, 4],
    "attn_dropout":[0.1],
    "ff_dropout":[0.1],
    "lr":        [0.00025, 0.0003],
    "weight_decay":[0, 0.00001]
}

def run_experiment(params, train_loader, val_loader):
    results = []
    for params in ParameterGrid(param_grid):
        # 1) Build model & optimizer
        model = FTTransformer(
            categories=(1,),
            num_continuous=8,
            dim=params["dim"],
            dim_out=1,
            depth=params["depth"],
            heads=params["heads"],
            attn_dropout=params["attn_dropout"],
            ff_dropout=params["ff_dropout"]
        )
        optimizer = torch.optim.AdamW(
            model.parameters(),
            lr=params["lr"],
            weight_decay=params["weight_decay"]
        )
        criterion = RMSELoss()

def train_one_epoch(train_loader):
    total_loss = 0.0

    for x_cat, x_cont, y in train_loader:
        optimizer.zero_grad()

        # bring y to shape [B,1]
        y = y.unsqueeze(-1) if y.dim()==1 else y

        # forward + backward + step
        pred = model(x_cat, x_cont)
        loss = criterion(pred, y)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        

    # return the average loss over ALL batches
    return total_loss / len(train_loader)

from datetime import datetime

criterion = RMSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.00025)


EPOCHS = 1

# Lists to store per‐epoch losses
train_losses = []
val_losses   = []

best_vloss   = float('inf')
timestamp    = datetime.now().strftime('%Y%m%d_%H%M%S')


for epoch in range(EPOCHS):
    print(f'\nEPOCH {epoch+1}/{EPOCHS}')

    # --------------------
    # 1) TRAINING PHASE
    # --------------------
    model.train()
    avg_loss = train_one_epoch(train_loader)
    train_losses.append(avg_loss)
    print(f'train loss: {avg_loss:.4f}')

    # --------------------
    # 2) VALIDATION PHASE
    # --------------------
    model.eval()
    
    val_loss = 0.0
    with torch.no_grad():
        for x_cat, x_cont, y in val_loader:
            y = y.unsqueeze(-1) if y.dim()==1 else y
            pred = model(x_cat, x_cont)
            val_loss += criterion(pred, y).item()

    avg_vloss = val_loss / len(val_loader)
    val_losses.append(avg_vloss)
    print(f'valid loss: {avg_vloss:.4f}')

    # --------------------
    # 3) CHECKPOINTING
    # --------------------
    if (EPOCHS + 1) % 1000 == 0 and avg_vloss < best_vloss:
        best_vloss = avg_vloss
        ckpt_path = f'model_{timestamp}_epoch{epoch+1}.pt'
        torch.save(model.state_dict(), ckpt_path)


    


EPOCH 1/1
train loss: 384.0183
valid loss: 381.4391


In [None]:
df.head()

In [None]:
X.head()

In [350]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Collect predictions and true values from the validation set
model.eval()
all_preds = []
all_targets = []

with torch.no_grad():
    for x_categ, x_numer, y_train in val_loader:
        preds = model(x_categ, x_numer).squeeze().cpu().numpy()
        all_preds.extend(preds)
        all_targets.extend(y_train.cpu().numpy())

all_preds = np.array(all_preds)
all_targets = np.array(all_targets)

# Inverse-transform predictions and targets to original units
preds_orig = y_train
targets_orig = y_train

# Calculate MAE and RMSE in original units
mae = mean_absolute_error(targets_orig, preds_orig)
rmse = np.sqrt(mean_squared_error(targets_orig, preds_orig))

print(f"MAE (original units): {mae:.2f}")
print(f"RMSE (original units): {rmse:.2f}")

MAE (original units): 0.00
RMSE (original units): 0.00


In [None]:

regressor.fit(X_train, y_train)

predictions = regressor.predict(X_dev)

mse = mean_squared_error(y_dev, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_dev, predictions)

print("Mean Squared Error (MSE):", mse)
print("Root Mean Squared Error (RMSE):", rmse)
print("R² Score:", r2)

abs_errors = np.abs(predictions - y_dev)
sorted_errors = np.sort(abs_errors)
cdf = np.arange(1, len(sorted_errors)+1) / len(sorted_errors)

plt.figure(figsize=(8, 5))
plt.plot(sorted_errors, cdf, marker='.', linestyle='none')
plt.xlabel('Absolute Error |y_pred - y_true|')
plt.ylabel('Cumulative Probability')
plt.title('Cumulative Distribution Function (CDF) of Absolute Errors')
plt.grid(True)
plt.show()

## Coefficient of Determination (R²)

The coefficient of determination, denoted as $( R^2 )$, is a commonly used metric to evaluate the performance of a regression model. It indicates how well the model explains the variance in the target variable \( y \).

### Definition

The formula for $( R^2 )$ is:

$$
R^2 = 1 - \frac{SS_{\text{res}}}{SS_{\text{tot}}}
$$

Where:

- $( SS_{\text{res}} = \sum_{i=1}^{n} (y_i - \hat{y}_i)^2)$: Residual Sum of Squares (model error)
-  $(SS_{\text{tot}} = \sum_{i=1}^{n} (y_i - \bar{y})^2)$: Total Sum of Squares (total variance in the data)

### Interpretation

- $( R^2 = 1)$: Perfect prediction – the model explains 100% of the variance in $( y)$
- $( R^2 = 0)$: The model does no better than simply predicting the mean of $( y )$
- $( R^2 < 0 )$: The model performs worse than a constant mean prediction

### Example

In this case, the model achieved an $( R^2 )$ score of **0.61**, which means it explains **61% of the total variance** in the target variable.

This is a moderate-to-good result, indicating that the model captures significant patterns in the data, but there is still room for improvement.

## Mean Squared Error (MSE)

The **Mean Squared Error (MSE)** is a standard regression metric that measures the average of the squared differences between the predicted values and the actual target values.

### Definition

$MSE = \frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2$

Where:

- $y_i$: true value  
- $\hat{y}_i$: predicted value  
- $n$: number of samples

### Interpretation

- MSE penalizes larger errors more strongly due to squaring.
- The result is in the **squared unit** of the target variable (e.g., mm², N², €²).
- A **lower MSE** indicates better prediction accuracy.
- Because of squaring, the MSE is sensitive to **outliers**.

### Example

In this case, the model yielded an MSE of **88,837**, which may seem large, but this must be interpreted in the context of the unit and range of the target variable.

## Root Mean Squared Error (RMSE)

The **Root Mean Squared Error (RMSE)** is the square root of the MSE and represents the average prediction error in the same unit as the target variable.

### Definition

$RMSE = \sqrt{ \frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2 } = \sqrt{MSE}$

### Interpretation

- RMSE is easier to interpret than MSE because it is in the **original unit** of the target variable.
- It gives a direct sense of **how far off predictions are**, on average.
- Like MSE, it is also sensitive to outliers due to the squaring.

### Example

In this case, the RMSE is approximately **298 N**.  
Given that the mean of the target variable (PullTest) is **2953 N**, this corresponds to a **relative prediction error of about 10.1 %**.

This means that, on average, the model's predictions deviate from the true pull test values by approximately 298 N.


In [None]:
results = {}

X_train_new = X_train.drop(columns=["Material", "Category", "Comments"])
X_dev_new = X_dev.drop(columns=["Material", "Category", "Comments"])

for name, model in models.items():
    model.fit(X_train_new, y_train)
    preds = model.predict(X_dev_new)
    
    mse = mean_squared_error(y_dev, preds)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_dev, preds)
    
    results[name] = {
        "MSE": mse,
        "RMSE": rmse,
        "R²": r2
    }

In [None]:
results_df = pd.DataFrame(results).T
results_df = results_df.round(2)
print(results_df)

In [None]:
cdf_data = {}  

for name, model in models.items():
    model.fit(X_train_new, y_train)
    preds = model.predict(X_dev_new)
    
    mse = mean_squared_error(y_dev, preds)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_dev, preds)
    
    results[name] = {
        "MSE": mse,
        "RMSE": rmse,
        "R²": r2
    }
    
    abs_errors = np.abs(preds - y_dev)
    sorted_errors = np.sort(abs_errors)
    cdf = np.arange(1, len(sorted_errors)+1) / len(sorted_errors)
    cdf_data[name] = (sorted_errors, cdf)

In [None]:
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))

for name, (errors, cdf) in cdf_data.items():
    plt.plot(errors, cdf, label=name)

plt.xlabel("Absolute Error |ŷ - y| [N]")
plt.ylabel("Cumulative Probability")
plt.title("CDF of Absolute Prediction Errors per Model")
plt.grid(True)
plt.legend()
plt.show()