In [None]:
import os
import re
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, GenerationConfig, BertTokenizerFast
from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel

# Loading data

In [None]:
# Set random seed and actually apply it, so that NumPy-based shuffling and the
# PyTorch weight initialisation of the model are reproducible after a kernel
# restart (assigning SEED alone seeds nothing).
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Equation metadata: one row per equation, keeping only the data-file name and
# the target formula.
eq_df = pd.read_csv("Data/FeynmanEquations.csv")[['Filename', 'Formula']]

data_directory = 'Data/Feynman_with_units'
# Define the maximum number of lines to read from each file
N = 4000

# Create an empty list to store tuples of (filename, line)
data = []

# Iterate over files in the data directory. os.listdir returns entries in
# arbitrary order, so sort them to make the row order of `df` deterministic.
for filename in sorted(os.listdir(data_directory)):
    file_path = os.path.join(data_directory, filename)
    if not os.path.isfile(file_path):
        continue
    # Stream the file instead of reading it whole: only the first N lines are
    # kept, and a trailing newline no longer produces a spurious empty row.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_number, line in enumerate(f):
            if line_number >= N:
                break
            data.append((filename, line.rstrip('\n')))

# Convert the list of tuples to a DataFrame
df = pd.DataFrame(data, columns=['Filename', 'features'])

del data

# Display the DataFrame
print(df)

# Processing data

## Preparing Tokenizer

In [None]:
# Training corpus for the tokenizer: every formula in 'eq_df', followed by the
# ten digit characters and the "-" and "." symbols.
corpus = eq_df["Formula"].tolist()
corpus.extend(str(digit) for digit in range(10))
corpus.extend(["-", "."])

# Load the pre-trained 'bert-base-uncased' tokenizer and retrain it on the
# corpus above, requesting a vocabulary size of 1000.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased").train_new_from_iterator(
    corpus,
    vocab_size=1000,
)

# Reuse the [CLS] / [SEP] tokens as beginning- and end-of-sequence tokens
tokenizer.bos_token, tokenizer.eos_token = tokenizer.cls_token, tokenizer.sep_token

# Vocabulary size of the retrained tokenizer (used for the decoder config)
vocab_size = len(tokenizer)

# Attach the formula to every data row via the shared 'Filename' key, then
# drop the key: only 'Formula' and 'features' are needed from here on.
df = eq_df.merge(df, how='inner', on="Filename").drop(columns="Filename")

del corpus, eq_df

In [None]:
# Pre-tokenization: spread every character of a raw data line apart so the
# tokenizer is fed one character at a time.

def pre_tokenize(data):
    """
    Rewrite one raw data line into a space-separated character sequence.

    Steps, in order:
      1. every space in the line becomes ';',
      2. a space is inserted before, between and after all characters,
      3. every " ; " (a former space, now padded) becomes the tokenizer's
         separator token.

    Args:
        data (str): One raw line of a data file.

    Returns:
        str: The rewritten line.
    """
    marked = data.replace(" ", ';')
    spaced = marked.replace("", " ")
    return spaced.replace(" ; ", tokenizer.sep_token)

df['features'] = df['features'].apply(pre_tokenize)

## Creating dataset

In [None]:
def process_data_to_model_inputs(batch, max_input_length=256, max_target_length=128):
    """
    Tokenizes a batch of examples into fixed-length model inputs.

    Both the inputs and the targets are padded AND truncated to their
    respective maximum length, so every returned sequence has exactly that
    length. (Passing `max_length` with padding only would leave over-long
    sequences untouched and produce rows of differing length.)

    Args:
        batch (dict): A batch of data containing "features" and "Formula" keys.
        max_input_length (int): Token length of every encoded "features" row.
        max_target_length (int): Token length of every encoded "Formula" row.

    Returns:
        dict: The same batch with input_ids, attention_mask, decoder_input_ids,
        decoder_attention_mask, and labels added.
    """
    # Tokenize the inputs and labels
    inputs = tokenizer(
        batch["features"],
        padding="max_length",
        truncation=True,
        max_length=max_input_length,
    )
    outputs = tokenizer(
        batch["Formula"],
        padding="max_length",
        truncation=True,
        max_length=max_target_length,
    )

    # Assign tokenized inputs and labels to batch
    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["decoder_input_ids"] = outputs.input_ids
    batch["decoder_attention_mask"] = outputs.attention_mask

    # Labels are the target ids with every padding position replaced by -100
    # so that padding is masked out of the labels. The comprehension builds
    # new lists, so decoder_input_ids is left untouched.
    pad_id = tokenizer.pad_token_id
    batch["labels"] = [
        [-100 if token == pad_id else token for token in sequence]
        for sequence in outputs.input_ids
    ]

    return batch

In [None]:
def _split_in_chunks(frame, chunk_size, seed, train_frac=0.9, test_frac=0.05):
    """
    Split `frame` into train / test / validation parts, chunk by chunk.

    The frame is cut into consecutive chunks of `chunk_size` rows. Each chunk
    is shuffled with `seed` and divided into a train part (`train_frac`), a
    test part (`test_frac`) and a validation part (the remaining rows), so
    every chunk is represented in all three splits.

    The parts are collected in lists and concatenated once at the end, which
    avoids re-copying the growing frames on every iteration. All chunks present
    in `frame` are processed, however many there are.

    Args:
        frame (pd.DataFrame): Rows to split.
        chunk_size (int): Number of consecutive rows per chunk; must be > 0.
        seed (int): random_state used to shuffle every chunk.
        train_frac (float): Fraction of each chunk assigned to train.
        test_frac (float): Fraction of each chunk assigned to test.

    Returns:
        tuple: (train, test, valid) DataFrames, each with a fresh 0..n-1 index.

    Raises:
        ValueError: If `chunk_size` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")

    train_parts, test_parts, valid_parts = [], [], []
    for start in range(0, len(frame), chunk_size):
        chunk = frame.iloc[start:start + chunk_size].sample(frac=1, random_state=seed)

        train_len = int(train_frac * len(chunk))
        test_len = int(test_frac * len(chunk))

        train_parts.append(chunk.iloc[:train_len])
        test_parts.append(chunk.iloc[train_len:train_len + test_len])
        valid_parts.append(chunk.iloc[train_len + test_len:])

    def _combine(parts):
        # An empty input frame yields empty splits that keep its columns.
        if not parts:
            return frame.iloc[:0].copy()
        return pd.concat(parts, ignore_index=True)

    return _combine(train_parts), _combine(test_parts), _combine(valid_parts)


# Split the data into train, test, and validation sets (90% / 5% / remainder of
# every N-row chunk).
# NOTE(review): a chunk corresponds to a single equation only if every data
# file contributed exactly N rows to `df` - confirm against the data files.
df_train, df_test, df_valid = _split_in_chunks(df, N, SEED)

# Create datasets from the DataFrames
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)
valid_dataset = Dataset.from_pandas(df_valid)

del df_train, df_test, df_valid, df

In [None]:
def _tokenize_and_format(dataset):
    """
    Tokenize one split and expose it as torch tensors.

    Runs `process_data_to_model_inputs` over the split in batches of 512,
    drops the raw "Formula" / "features" text columns, and restricts the
    output format to the "input_ids", "attention_mask" and "labels" columns.

    Args:
        dataset (Dataset): A split holding "Formula" and "features" columns.

    Returns:
        Dataset: The tokenized split with torch formatting applied.
    """
    encoded = dataset.map(
        process_data_to_model_inputs,
        batched=True,
        batch_size=512,
        remove_columns=["Formula", 'features'],
    )
    encoded.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "labels"],
    )
    return encoded


# Same preparation for all three splits
train_dataset = _tokenize_and_format(train_dataset)
valid_dataset = _tokenize_and_format(valid_dataset)
test_dataset = _tokenize_and_format(test_dataset)

# Training 

## Initializing model with desired configuration

In [None]:
# Encoder and decoder both start from a default BertConfig
config_encoder = BertConfig()
config_decoder = BertConfig()

# Settings shared by both halves: return attention outputs (so they can be
# inspected on the trained model) and use 3 hidden layers.
for bert_config in (config_encoder, config_decoder):
    bert_config.output_attentions = True
    bert_config.num_hidden_layers = 3

# Decoder-only settings: decoder mode, cross-attention, and the vocabulary
# size of the retrained tokenizer.
# NOTE(review): the encoder keeps BertConfig's default vocab_size - confirm
# this is intended.
config_decoder.is_decoder = True
config_decoder.add_cross_attention = True
config_decoder.vocab_size = vocab_size

# Combine both configs and build the (untrained) encoder-decoder model
config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
bert2bert = EncoderDecoderModel(config=config)

# Values written onto the model config, in this order: first the special
# token ids, then the generation settings. num_beams = 1 requests a single
# beam, i.e. greedy search rather than beam search.
model_config_overrides = {
    "decoder_start_token_id": tokenizer.bos_token_id,
    "eos_token_id": tokenizer.eos_token_id,
    "pad_token_id": tokenizer.pad_token_id,
    "vocab_size": bert2bert.config.decoder.vocab_size,
    "max_length": 30,
    "min_length": 0,
    "no_repeat_ngram_size": 0,
    "early_stopping": False,
    "length_penalty": 1.0,
    "num_beams": 1,
}
for option_name, option_value in model_config_overrides.items():
    setattr(bert2bert.config, option_name, option_value)

## Function for calculating sequence accuracy

In [None]:
def compute_metrics(pred):
    """
    Computes sequence accuracy metric for model predictions.

    A prediction counts as correct only when its decoded text (special tokens
    skipped) is exactly equal to the decoded label text.

    Args:
        pred (EvalPrediction): The prediction object containing label_ids and predictions.

    Returns:
        dict: A dictionary containing the computed sequence accuracy
        (0.0 when there are no predictions).
    """
    pad_id = tokenizer.pad_token_id

    # Extract label ids and predicted ids
    pred_ids = pred.predictions
    if isinstance(pred_ids, tuple):
        # presumably the first element holds the token ids when several
        # outputs are returned - TODO confirm
        pred_ids = pred_ids[0]
    pred_ids = np.asarray(pred_ids)
    labels_ids = np.asarray(pred.label_ids)

    # -100 is a masking value, not a vocabulary id: swap it for the pad token
    # before decoding. np.where builds new arrays, so the caller's label_ids
    # are not modified in place. Predictions get the same treatment in case
    # they were padded with -100 as well.
    pred_ids = np.where(pred_ids == -100, pad_id, pred_ids)
    labels_ids = np.where(labels_ids == -100, pad_id, labels_ids)

    # Decode predicted and label sequences
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    # Compute sequence accuracy, guarding against an empty evaluation set
    total = len(pred_str)
    if total == 0:
        return {"sequence_accuracy": 0.0}

    matches = sum(
        predicted == expected for predicted, expected in zip(pred_str, label_str)
    )
    return {"sequence_accuracy": matches / total}


## Preparing trainer & execution

In [None]:
# Number of training batches in one pass over the training set. It is reused
# below as the evaluation / checkpoint / logging interval.
# NOTE(review): this equals one epoch only for single-device training without
# gradient accumulation - confirm.
train_batch_size = 16
steps = math.ceil(len(train_dataset) / train_batch_size)

# Training arguments, grouped by purpose
training_kwargs = dict(
    # Output and precision
    output_dir="./bert_min_lr_e5",
    fp16=True,  # Change to False if using CPU only
    save_safetensors=False,  # safe_tensors had some bugs with this model
    report_to="none",
    # Optimisation
    learning_rate=5e-05,
    num_train_epochs=30,  # The total number of training epochs to run
    per_device_train_batch_size=train_batch_size,  # Batch size per device during training
    per_device_eval_batch_size=32,  # Batch size for evaluation
    # Evaluation (generation-based), every `steps` steps
    predict_with_generate=True,
    do_eval=True,
    evaluation_strategy="steps",
    eval_steps=steps,
    # Checkpointing, every `steps` steps
    save_strategy="steps",
    save_steps=steps,
    save_total_limit=2,  # Save the best and most recent checkpoints
    # Logging, every `steps` steps
    logging_strategy='steps',
    logging_steps=steps,
    # Best-model selection by sequence accuracy (higher is better)
    load_best_model_at_end=True,
    metric_for_best_model="sequence_accuracy",
    greater_is_better=True,
)
trainer_args = Seq2SeqTrainingArguments(**training_kwargs)

# Initialize trainer: train on the training split, evaluate on the validation split
trainer = Seq2SeqTrainer(
    model=bert2bert,
    args=trainer_args,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
)

In [None]:
# Run training with the trainer configured in the previous cell.
trainer.train()