In [15]:
# Importing the necessary libraries
import torch
from transformers import (
    BertModel,
    BertTokenizer,
    AdamW,
    get_linear_schedule_with_warmup,
)
import pandas as pd
import numpy as np

# Load the numerical data you want to train BERT on
df = pd.read_csv("data.csv")

# Select only rows where x_e_out not missing

# Define the name of the column that you want to move to the end of the DataFrame
column_name = "x_e_out [-]"

# Select the column and drop it from the DataFrame
column_to_move = df[column_name]
col = df.drop(column_name, axis=1, inplace=True)

# Append the column back to the end of the DataFrame
df[column_name] = column_to_move

data = df[~df["x_e_out [-]"].isna()]

# start with a small data set for speed
data = data[0:100]
data = data.reset_index(drop=True)

# Convert numerical values to string format to match BERT input requirement
data = data.astype(str)

data["sequence"] = ""

# Concatenate all the values in a row into a single string using the column names
# Iterate through rows and columns
for index, row in data.iterrows():
    string = ""
    for column in data.columns:
        if column != "sequence":
            # Concatenate column name with row value
            string += column + ": " + str(row[column]) + " "
    data["sequence"][index] = string

data.describe


<bound method NDFrame.describe of      id        author geometry pressure [MPa] mass_flux [kg/m2-s] D_e [mm]  \
0     0      Thompson     tube            7.0              3770.0      nan   
1     1      Thompson     tube            nan              6049.0     10.3   
2     2      Thompson      nan          13.79              2034.0      7.7   
3     3          Beus  annulus          13.79              3679.0      5.6   
4     5           nan      nan          17.24              3648.0      nan   
..  ...           ...      ...            ...                 ...      ...   
95  130      Thompson     tube          13.79              1356.0      7.8   
96  131      Thompson      nan           3.45              3838.0     10.3   
97  132           nan     tube          18.27              2197.0      3.0   
98  133  Richenderfer    plate            0.2              5600.0      nan   
99  134      Thompson     tube          18.96              3458.0      1.9   

   D_h [mm] length [mm] chf_e

In [2]:
"""AdamW# Define the BERT classifier to be used to train the model
class BERTClassifier(torch.nn.Module):
    def __init__(self, dropout_rate=0.3):
        super(BERTClassifier, self).__init__()
        self.bert = BertModel.from_pretrained("bert-base-uncased")
        self.dropout = torch.nn.Dropout(dropout_rate)
        self.linear = torch.nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask):
        _, pooled_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        dropout_out = self.dropout(pooled_out)
        linear_out = self.linear(dropout_out)
        return linear_out
"""


In [21]:
import torch
from transformers import BertTokenizer, BertModel

# Define the BERT Classifier
class BERTSequenceImputer(torch.nn.Module):
    def __init__(self, freeze_BERT=True, dropout_rate=0.3):
        super(BERTSequenceImputer, self).__init__()

        # Load the BERT model and tokenizer
        self.bert = BertModel.from_pretrained("bert-base-uncased")

        # Freeze the weights of the BERT model
        if freeze_BERT:
            for param in self.bert.parameters():
                param.requires_grad = False

        # Define the linear layer for classification
        self.dropout = torch.nn.Dropout(dropout_rate)
        self.fc = torch.nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask):

        # Convert the attention mask and padded_inputs into device friendly tensors
        # attention_mask = attention_mask.to(torch.device("cuda:0"))
        # padded_inputs = padded_inputs.to(torch.device("cuda:0"))

        # Use the BERT model to transform the sequence into an embedded format
        embedded_seq, _ = self.bert(input_ids=input_ids, attention_mask=attention_mask)[
            :2
        ]
        embedded_seq = self.dropout(embedded_seq)

        # Predict the missing value (in our case the last token) using the linear layer
        output = self.fc(embedded_seq[:, -1])
        return output


In [22]:
# Load the pretrained BERT model and tokenizer to convert data to tokenize-able format
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# model = BertModel.from_pretrained("bert-base-uncased")

sequences = data["sequence"]

MAX_LENGTH = 128
tokenized_data = sequences.apply(
    (
        lambda row: tokenizer.encode(
            row, add_special_tokens=True, padding="max_length", max_length=MAX_LENGTH
        )
    )
)
"""
# Use the tokenizer to convert the data into tokens and then into PyTorch tensor format
# Get the maximum length of all the sentences to pad the shorter ones to match that format
tokenized_data = sequences.apply((lambda x: tokenizer.encode(x, 
                                                                    add_special_tokens=True,
                                                                    padding='longest')
                                        )
                                        )
"""
# a = len(data['sequence'])

tokenized_data = tokenized_data.reset_index(drop=True)
input_data = torch.tensor(tokenized_data)

# Define the optimizer to be used to train the model
dropout_rate = 0.3
model = BERTSequenceImputer(dropout_rate=dropout_rate)
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Train the model over a set number of epochs
epochs = 4
for epoch in range(epochs):
    epoch_loss = 0
    for i in range(len(input_data)):
        # Reset gradients
        model.zero_grad()

        # Forward pass
        input_ids = input_data[i, :]
        # input_ids = input_ids.unsqueeze(0)

        attention_mask = [int(token_id.item() > 0) for token_id in input_ids]

        # input_ids[[0]][0][0].item()

        attention_mask = torch.tensor(attention_mask).unsqueeze(0)

        y_pred = model.forward(input_ids=input_ids, attention_mask=attention_mask)

        # Compute loss
        y_true = torch.tensor([float(labels[i])])
        loss_func = torch.nn.BCEWithLogitsLoss()
        loss = loss_func(y_pred.view(-1), y_true.view(-1))
        epoch_loss += loss.item()

        # Backward pass
        loss.backward()

        # Update weights
        optimizer.step()

    # Print loss per epoch
    print(f"Epoch: {epoch+1}, Loss: {epoch_loss}")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


ValueError: not enough values to unpack (expected 2, got 1)