In [None]:
import pandas as pd
import numpy as np
import transformers
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt
import pickle

from tqdm import tqdm
from copy import deepcopy
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Experimentation

In [None]:
# Pull the model input out of each split: the "text" column as a Series.
# NOTE(review): despite the `*_embeddings_text` names, these are whatever is
# stored in the "text" column (later passed through `str()` and the tokenizer),
# not precomputed embeddings. The `*_df_raw` frames must already exist.
train_embeddings_text, valid_embeddings_text, test_embeddings_text = (
    split["text"] for split in (train_df_raw, valid_df_raw, test_df_raw)
)

# Regression targets, kept as single-column DataFrames (double brackets).
target_train, target_valid, target_test = (
    split[["labels"]] for split in (train_df_raw, valid_df_raw, test_df_raw)
)

### Build dataset

In [None]:
class BertDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for BERT regression.

    Args:
        x: Indexable collection of texts; each item is passed through `str()`.
        y: NumPy array of targets; converted to a float tensor of shape (N, 1).
        tokenizer: Object exposing `encode_plus` that returns a mapping with
            "input_ids", "token_type_ids" and "attention_mask".
        max_length: Fixed sequence length every sample is padded/truncated to.
    """

    def __init__(self, x, y, tokenizer, max_length):
        super().__init__()
        self.x = x
        self.y = torch.from_numpy(y).float().view(-1, 1)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of samples (length of the text collection)."""
        return len(self.x)

    def __getitem__(self, index):
        """Tokenize sample `index` and return its tensors plus the target."""
        text = str(self.x[index])
        # encode_plus returns a dictionary with keys:
        # input_ids, token_type_ids, attention_mask.
        # truncation=True lets the tokenizer shorten long texts itself, so the
        # closing special token is kept instead of being sliced off afterwards.
        inputs = self.tokenizer.encode_plus(
            text=text,
            text_pair=None,
            padding='max_length',
            truncation=True,
            add_special_tokens=True,
            return_attention_mask=True,
            max_length=self.max_length,
        )
        # Defensive slice: a no-op once the tokenizer honours `truncation`.
        ids = inputs["input_ids"][:self.max_length]
        token_type_ids = inputs["token_type_ids"][:self.max_length]
        mask = inputs["attention_mask"][:self.max_length]

        return {
            'input_ids': torch.tensor(ids, dtype=torch.long),
            'attention_mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'target': self.y[index],
        }

In [None]:
# Tokenizer for the 'bert-base-uncased' checkpoint. The same instance is handed
# to every BertDataset so all splits are tokenized identically; the checkpoint
# name should match the one BertRegressor loads.
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Tokenization / batching settings shared by every split.
MAX_LENGTH = 100
BATCH_SIZE = 32

dataset_train = BertDataset(
    x=train_embeddings_text.values,
    y=target_train.values,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
    )
# Training batches are reshuffled every epoch.
dataloader_train = DataLoader(
    dataset=dataset_train,
    batch_size=BATCH_SIZE,
    shuffle=True,
    )

dataset_valid = BertDataset(
    x=valid_embeddings_text.values,
    y=target_valid.values,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
    )
# Evaluation loaders must NOT shuffle: predictions collected from them are
# compared row-by-row with target_valid / target_test, which are in dataset order.
dataloader_valid = DataLoader(
    dataset=dataset_valid,
    batch_size=BATCH_SIZE,
    shuffle=False,
    )

dataset_test = BertDataset(
    x=test_embeddings_text.values,
    y=target_test.values,
    tokenizer=tokenizer,
    max_length=MAX_LENGTH,
    )
dataloader_test = DataLoader(
    dataset=dataset_test,
    batch_size=BATCH_SIZE,
    shuffle=False,
    )

### Build BERT model

In [None]:
class BertRegressor(nn.Module):
    """BERT encoder followed by a small MLP head that emits one value per sample.

    The head is 768 -> 512 -> 256 -> 1, each hidden layer using
    Linear -> BatchNorm -> ReLU -> Dropout, and the final output is passed
    through a sigmoid, so predictions lie in (0, 1).
    """

    def __init__(self):
        super().__init__()
        # Pretrained encoder; its pooled output feeds the regression head.
        self.bert_model = transformers.BertModel.from_pretrained("bert-base-uncased")

        # First hidden layer (Xavier-normal weights).
        self.fc1 = nn.Linear(768, 512)
        nn.init.xavier_normal_(self.fc1.weight)
        self.batch_norm1 = nn.BatchNorm1d(512)
        self.dropout1 = nn.Dropout(0.3)

        # Second hidden layer.
        self.fc2 = nn.Linear(512, 256)
        nn.init.xavier_normal_(self.fc2.weight)
        self.batch_norm2 = nn.BatchNorm1d(256)
        self.dropout2 = nn.Dropout(0.2)

        # Single-unit output layer.
        self.out = nn.Linear(256, 1)
        nn.init.xavier_normal_(self.out.weight)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Encode the batch with BERT and map the pooled output to one score each."""
        # With return_dict=False the encoder returns a tuple; only the second
        # element (the pooled output) is used.
        _sequence_output, pooled = self.bert_model(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )

        # Hidden block 1
        hidden = self.fc1(pooled)
        hidden = self.batch_norm1(hidden)
        hidden = F.relu(hidden)
        hidden = self.dropout1(hidden)

        # Hidden block 2
        hidden = self.fc2(hidden)
        hidden = self.batch_norm2(hidden)
        hidden = F.relu(hidden)
        hidden = self.dropout2(hidden)

        # Squash the single logit into (0, 1).
        return F.sigmoid(self.out(hidden))

In [None]:
def train_regression(model, train_loader, optimizer, objective, device):
    """Run one optimisation pass over `train_loader`.

    Each batch is a mapping with "input_ids", "attention_mask",
    "token_type_ids" and "target". The model is put in training mode and the
    optimizer is stepped once per batch.

    Returns:
        The unweighted mean of the per-batch loss values (a Python float).
    """
    model.train()
    feature_keys = ("input_ids", "attention_mask", "token_type_ids")
    batch_losses = []

    for batch in train_loader:
        # Move the model inputs and the target onto the compute device.
        features = {key: batch[key].to(device) for key in feature_keys}
        targets = batch["target"].to(device)

        optimizer.zero_grad()
        batch_loss = objective(model(**features), targets)
        batch_loss.backward()
        optimizer.step()

        batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(batch_losses)


def validate_regression(model, val_loader, objective, device):
    """Compute the mean per-batch loss over `val_loader` without updating the model.

    The model is switched to eval mode and all forward passes run under
    `torch.no_grad()`. Batches are mappings with "input_ids",
    "attention_mask", "token_type_ids" and "target".

    Returns:
        The unweighted mean of the per-batch loss values (a Python float).
    """
    model.eval()
    feature_keys = ("input_ids", "attention_mask", "token_type_ids")
    batch_losses = []

    with torch.no_grad():
        for batch in val_loader:
            features = {key: batch[key].to(device) for key in feature_keys}
            targets = batch["target"].to(device)

            batch_loss = objective(model(**features), targets)
            batch_losses.append(batch_loss.item())

    return sum(batch_losses) / len(batch_losses)

def evaluate(model, data_loader, device):
    """Run the model over `data_loader` and return the raw per-batch outputs.

    Two batch layouts are accepted:
      * a mapping with "input_ids", "attention_mask" and "token_type_ids"
        (what a DataLoader over BertDataset yields) - the model is called
        with those three keyword arguments;
      * an `(inputs, targets)` pair - the model is called as `model(inputs)`.

    Returns:
        A list with one output tensor per batch, left on `device`.
    """
    from collections.abc import Mapping

    model.eval()
    predictions = []

    with torch.no_grad():
        for batch in data_loader:
            if isinstance(batch, Mapping):
                y_hat = model(
                    input_ids=batch["input_ids"].to(device),
                    attention_mask=batch["attention_mask"].to(device),
                    token_type_ids=batch["token_type_ids"].to(device),
                    )
            else:
                # Targets are not needed to produce predictions.
                x_test, _ = batch
                y_hat = model(x_test.to(device))
            predictions.append(y_hat)

    return predictions

def predict(model, data_loader, device):
    """Return the model's predictions for every sample in `data_loader`.

    Batches are mappings with "input_ids", "attention_mask" and
    "token_type_ids". Outputs are gathered on the CPU, concatenated along the
    batch axis and squeezed.

    Returns:
        A NumPy array that is always at least 1-D, including when the loader
        holds a single sample. An empty loader yields an empty float32 array.
        Rows follow the loader's iteration order, so use a non-shuffling
        loader when the result must line up with a target array.
    """
    model.eval()
    predictions = []

    with torch.no_grad():
        for batch in data_loader:
            ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            token_type_ids = batch["token_type_ids"].to(device)
            y_hat = model(
                input_ids=ids,
                attention_mask=mask,
                token_type_ids=token_type_ids,
                )
            predictions.append(y_hat.cpu().numpy())

    if not predictions:
        # np.concatenate refuses an empty list; report "no samples" explicitly.
        return np.empty(0, dtype=np.float32)

    stacked = np.concatenate(predictions, axis=0).squeeze()
    # squeeze() turns a lone (1, 1) prediction into a 0-d scalar; keep it 1-D
    # so downstream metrics and DataFrame columns still see a sequence.
    return np.atleast_1d(stacked)

class EarlyStopper:
    """Track validation loss and signal when training should stop.

    Args:
        patience: Number of consecutive non-improving calls to `early_stop`
            after which it returns True.
        min_delta: Minimum decrease of the validation loss (relative to the
            best seen so far) that counts as an improvement.

    The weights of the best model seen so far are snapshotted on every
    improvement and can be put back with `restore_best_weights`.
    """

    def __init__(self, patience=1, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        # Consecutive calls without improvement; starts at 0 so that exactly
        # `patience` bad epochs are tolerated.
        self.counter = 0
        self.best_validation_loss = float('inf')
        self.best_model_weights = None

    def early_stop(self, validation_loss, model):
        """Record one epoch's validation loss; return True when patience is exhausted."""
        if validation_loss < self.best_validation_loss - self.min_delta:
            # New best: remember it, reset the counter and snapshot the weights.
            self.best_validation_loss = validation_loss
            self.counter = 0
            self.save_best_weights(model)
            print(f"Early Stopping counter: {self.counter} out of {self.patience}")
            return False

        self.counter += 1
        print(f"Early Stopping counter: {self.counter} out of {self.patience}")
        return self.counter >= self.patience

    def save_best_weights(self, model):
        """Store an independent copy of the model's current state dict."""
        self.best_model_weights = deepcopy(model.state_dict())

    def restore_best_weights(self, model):
        """Load the best snapshot back into `model`.

        Raises:
            RuntimeError: if no improvement was ever recorded, so there is no
                snapshot to restore (e.g. every validation loss was NaN).
        """
        if self.best_model_weights is None:
            raise RuntimeError("No best weights recorded; early_stop() never saw an improvement.")
        model.load_state_dict(self.best_model_weights)

In [None]:
# per-epoch loss history (used for inspection / plotting afterwards)
train_loss = []
val_loss = []

# training parameters
n_epochs = 100
patience = 5
# prefer CUDA, then Apple MPS, then CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu')

# instantiate model, loss function and optimizer
model = BertRegressor().to(device)
objective = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.00001, weight_decay=0.01)
early_stopper = EarlyStopper(patience=patience, min_delta=0)

for epoch in range(n_epochs):
    tr_loss = train_regression(model, dataloader_train, optimizer, objective, device)
    train_loss.append(tr_loss)

    eva_loss = validate_regression(model, dataloader_valid, objective, device)
    val_loss.append(eva_loss)
    print(f'[{epoch+1}/{n_epochs}] Train loss: {tr_loss:.4f} - Val loss: {eva_loss:.4f}')

    # early_stop() snapshots the weights itself whenever the validation loss
    # improves. Do NOT save again when it triggers: at that point the model
    # holds the latest (non-improving) weights, and saving them would replace
    # the best snapshot that is restored below.
    if early_stopper.early_stop(eva_loss, model):
        print("Patience Depleated: Early Stopping triggered.")
        break

print("Restoring best weights")
early_stopper.restore_best_weights(model)
print("Training Complete")

In [None]:
def _in_dataset_order(loader):
    """Return a non-shuffling DataLoader over the same dataset and batch size.

    Predictions are compared row-by-row with the target frames, so they must
    come back in dataset order even if the given loader shuffles.
    """
    return DataLoader(dataset=loader.dataset, batch_size=loader.batch_size, shuffle=False)


y_pred_train = predict(model, _in_dataset_order(dataloader_train), device)
y_pred_valid = predict(model, _in_dataset_order(dataloader_valid), device)
y_pred_test = predict(model, _in_dataset_order(dataloader_test), device)

# `squared=True` was the default and the keyword has been removed from recent
# scikit-learn releases, so plain MSE is requested without it.
mse_train = mean_squared_error(target_train, y_pred_train)
mse_valid = mean_squared_error(target_valid, y_pred_valid)
mse_test = mean_squared_error(target_test, y_pred_test)

mae_train = mean_absolute_error(target_train, y_pred_train)
mae_valid = mean_absolute_error(target_valid, y_pred_valid)
mae_test = mean_absolute_error(target_test, y_pred_test)

# single dataframe to compare results: one column per split, one row per metric
results_df = pd.DataFrame(
    {
        "train": [mse_train, mae_train],
        "valid": [mse_valid, mae_valid],
        "test": [mse_test, mae_test],
    },
    index=["mse", "mae"],
)
results_df

In [None]:
def _results_frame(texts, targets, predictions):
    """Build the per-row comparison table for one split.

    Columns, in order: input_text, target, predicted, residual
    (target - predicted) and residual_abs (its absolute value).
    """
    frame = pd.DataFrame({
        "input_text": texts,
        "target": targets.values.flatten(),
        "predicted": predictions,
        })
    frame["residual"] = frame["target"] - frame["predicted"]
    frame["residual_abs"] = abs(frame["target"] - frame["predicted"])
    return frame


train_results_df = _results_frame(train_embeddings_text, target_train, y_pred_train)
valid_results_df = _results_frame(valid_embeddings_text, target_valid, y_pred_valid)
test_results_df = _results_frame(test_embeddings_text, target_test, y_pred_test)

In [None]:
# Plot test-set residuals against the true target values, as the title and
# x-label state (the x data used to be the DataFrame index).
fig, ax = plt.subplots()
ax.scatter(test_results_df["target"], test_results_df["residual"])
ax.set_title("Residuals vs. True Values")
ax.set_xlabel("True Values")
ax.set_ylabel("Residuals")
plt.show()

In [None]:
# Inspect the n test rows with the largest absolute residual, i.e. the
# predictions that are furthest from their target.
n = 25
sample_test = test_results_df.nlargest(n, "residual_abs")
sample_test