In [None]:
import os
import re
import math
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch.nn as nn
from torch.utils.data import DataLoader
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, GenerationConfig, BertTokenizerFast

In [None]:
# Single source of randomness for the notebook. Besides being passed to the
# DataFrame shuffles further down, it also seeds the global torch and NumPy
# generators so the randomly initialised encoder-decoder weights are the same
# after every kernel restart.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

In [None]:
# Equation catalogue: keep only the file name of each equation and its formula string.
eq_df = pd.read_csv("Data/FeynmanEquations.csv").loc[:, ["Filename", "Formula"]]

In [None]:
data_directory = 'Data/Feynman_with_units'
N = 1000  # maximum number of rows kept from each data file
# Accumulates (file name, raw text line) records; turned into a DataFrame once at the end.
data = []

# Sorted so that the row order of the resulting frame does not depend on the
# arbitrary order in which the OS lists the directory.
for filename in sorted(os.listdir(data_directory)):
    file_path = os.path.join(data_directory, filename)
    if not os.path.isfile(file_path):
        continue
    with open(file_path, 'r', encoding='utf-8') as f:
        # Stream the file and stop after N lines instead of reading it whole.
        # This also avoids the empty phantom row that `read().split('\n')`
        # produced for files shorter than N lines ending in a newline.
        for line_number, line in enumerate(f):
            if line_number >= N:
                break
            data.append((filename, line.rstrip('\n')))

# One row per sampled line, keyed by the file it came from.
df = pd.DataFrame(data, columns=['Filename', 'features'])
del data
# Display DataFrame
print(df)

In [None]:
# Tokenizer training corpus: every formula string, plus the ten digits and the
# "-" and "." characters as separate entries.
corpus = [*eq_df["Formula"].tolist(), *map(str, range(10)), "-", "."]

In [None]:
# Start from the bert-base-uncased fast tokenizer and retrain it on the formula
# corpus with a target vocabulary size of 1000.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased").train_new_from_iterator(
    corpus, vocab_size=1000
)
# Reuse [CLS] / [SEP] as the begin- and end-of-sequence markers.
tokenizer.bos_token, tokenizer.eos_token = tokenizer.cls_token, tokenizer.sep_token

In [None]:
# Size of the retrained tokenizer as reported by len().
vocab_size = len(tokenizer)

In [None]:
# Attach the formula to every sampled row via the shared file name, then drop
# the join key so only the Formula and features columns remain.
df = eq_df.merge(df, on="Filename", how="inner").drop(columns="Filename")
# The equation table and the tokenizer corpus are released here.
del corpus, eq_df

In [None]:
# 90 / 5 / 5 train / test / valid split, done separately on every consecutive
# run of N rows so each run contributes to all three splits.
# NOTE(review): this presumes the merged frame holds exactly N consecutive rows
# per equation — confirm that every data file has at least N lines.
if df.empty:
    raise ValueError("No samples to split: the merged DataFrame is empty.")

train_parts, test_parts, valid_parts = [], [], []
# The number of runs is derived from the data instead of a hard-coded 100, so
# rows beyond the first 100 * N are no longer dropped silently.
for i in range(math.ceil(len(df) / N)):
    # Every run is shuffled with the same seed.
    dat = df.iloc[i * N : (i + 1) * N].sample(frac=1, random_state=SEED)
    total_len = len(dat)
    train_len = int(0.9 * total_len)
    test_len = int(0.05 * total_len)
    train_parts.append(dat.iloc[:train_len])
    test_parts.append(dat.iloc[train_len : train_len + test_len])
    # Whatever is left after the train and test slices goes to validation.
    valid_parts.append(dat.iloc[train_len + test_len :])

del dat

# Concatenate once per split (instead of growing a frame inside the loop) and
# renumber the rows from 0.
df_train = pd.concat(train_parts, ignore_index=True)
df_test = pd.concat(test_parts, ignore_index=True)
df_valid = pd.concat(valid_parts, ignore_index=True)
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)
valid_dataset = Dataset.from_pandas(df_valid)

del train_parts, test_parts, valid_parts
del df_train, df_test, df_valid, df

In [None]:
# Show the training Dataset as a quick check of the split.
train_dataset

In [None]:
def process_data_to_model_inputs(batch):
    """Tokenize a batch of rows into encoder inputs and decoder targets.

    Args:
        batch: Mapping with a "features" list (encoder text) and a "Formula"
            list (target text), as passed by ``Dataset.map(batched=True)``.

    Returns:
        The same mapping with these keys added: ``input_ids`` and
        ``attention_mask`` (features, exactly 256 tokens per row),
        ``decoder_input_ids`` and ``decoder_attention_mask`` (formula, exactly
        128 tokens per row), and ``labels`` (the formula ids with every padding
        id replaced by -100).
    """
    # truncation=True guarantees a fixed row length: without it a row longer
    # than max_length is left over-long and the batch becomes ragged.
    inputs = tokenizer(
        batch["features"],
        padding="max_length",
        truncation=True,
        max_length=256,
    )
    outputs = tokenizer(
        batch["Formula"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )

    batch["input_ids"] = inputs.input_ids
    batch["attention_mask"] = inputs.attention_mask
    batch["decoder_input_ids"] = outputs.input_ids
    batch["decoder_attention_mask"] = outputs.attention_mask

    # Build fresh label lists (decoder_input_ids stays untouched). -100 is the
    # ignore index of the cross-entropy loss, so padded positions do not
    # contribute to training.
    pad_id = tokenizer.pad_token_id
    batch["labels"] = [
        [-100 if token == pad_id else token for token in label_ids]
        for label_ids in outputs.input_ids
    ]

    return batch

In [None]:
# Tokenize all three splits with the same function. The raw text columns are
# dropped afterwards; only the map batch size differs (16 for train, 32 for
# valid and test).
train_dataset, valid_dataset, test_dataset = (
    split.map(
        process_data_to_model_inputs,
        batched=True,
        batch_size=map_batch_size,
        remove_columns=["Formula", 'features'],
    )
    for split, map_batch_size in (
        (train_dataset, 16),
        (valid_dataset, 32),
        (test_dataset, 32),
    )
)

In [None]:
# Expose only the encoder inputs and the labels as torch tensors; the
# decoder_* columns produced during tokenization are not part of the format.
for _split in (train_dataset, valid_dataset, test_dataset):
    _split.set_format(
        type="torch",
        columns=["input_ids", "attention_mask", "labels"],
    )

In [None]:
from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel

# Encoder and decoder each start from an independent default BertConfig.
config_encoder, config_decoder = BertConfig(), BertConfig()

In [None]:
# Asymmetric depth: a 3-layer encoder feeding a 12-layer decoder.
setattr(config_encoder, "num_hidden_layers", 3)
setattr(config_decoder, "num_hidden_layers", 12)

# Combine both halves into one encoder-decoder config and build the model from
# it (constructed from the config, i.e. no pretrained weights are loaded here).
config = EncoderDecoderConfig.from_encoder_decoder_configs(
    encoder_config=config_encoder,
    decoder_config=config_decoder,
)
bert2bert = EncoderDecoderModel(config=config)

In [None]:
# from transformers import EncoderDecoderModel

# bert2bert = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")

In [None]:
# bert2bert.config.decoder.vocab_size = vocab_size

In [None]:
# Write the special-token ids and the decoding settings onto the model config.
# With num_beams = 1 generation uses a single beam; generated sequences are
# capped at 30 tokens.
for _option, _setting in (
    # special tokens, taken from the tokenizer
    ("decoder_start_token_id", tokenizer.bos_token_id),
    ("eos_token_id", tokenizer.eos_token_id),
    ("pad_token_id", tokenizer.pad_token_id),
    # top-level vocabulary size mirrors the decoder's
    ("vocab_size", bert2bert.config.decoder.vocab_size),
    # decoding parameters
    ("max_length", 30),
    ("min_length", 0),
    ("no_repeat_ngram_size", 0),
    ("early_stopping", False),
    ("length_penalty", 1.0),
    ("num_beams", 1),
):
    setattr(bert2bert.config, _option, _setting)

In [None]:
def compute_metrics(pred):
    """Compute exact-match accuracy between decoded predictions and labels.

    Args:
        pred: Evaluation output exposing ``predictions`` (generated token ids,
            or a tuple whose first element holds them) and ``label_ids``.

    Returns:
        ``{"sequence_accuracy": fraction}``, the share of sequences whose
        decoded text equals the decoded label text; 0.0 for an empty set.
    """
    pred_ids = pred.predictions
    if isinstance(pred_ids, tuple):
        pred_ids = pred_ids[0]

    # -100 marks ignored/padded positions and is not a valid token id, so it is
    # swapped for the pad id before decoding. This is done on copies, for both
    # predictions and labels, so the caller's arrays are left unmodified.
    pad_id = tokenizer.pad_token_id
    pred_ids = np.where(pred_ids == -100, pad_id, pred_ids)
    labels_ids = np.where(pred.label_ids == -100, pad_id, pred.label_ids)

    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    total = len(pred_str)
    if total == 0:
        # Nothing to score; avoid dividing by zero.
        return {"sequence_accuracy": 0.0}

    count = sum(
        predicted == expected for predicted, expected in zip(pred_str, label_str)
    )
    return {"sequence_accuracy": count / total}

In [None]:
# Number of batches of 16 needed to cover the training set once; the literal 16
# mirrors per_device_train_batch_size in the training arguments.
steps = math.ceil(len(train_dataset) / 16)

In [None]:
# Training configuration. Evaluation, checkpointing and logging all fire every
# `steps` optimizer steps, so they stay aligned with each other (required for
# load_best_model_at_end).
trainer_args = Seq2SeqTrainingArguments(
    output_dir="./bert_3_gredy",
    # Mixed precision needs a CUDA device; fall back to full precision on
    # CPU-only machines instead of failing at start-up.
    fp16=torch.cuda.is_available(),
    # Evaluate with generate() so compute_metrics receives generated token ids.
    predict_with_generate=True,
    learning_rate=0.0001,
    num_train_epochs=100,  # total number of training epochs to run
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=32,  # batch size per device for evaluation
    # gradient_accumulation_steps=2,
    report_to="none",
    evaluation_strategy="steps",  # evaluate every `eval_steps` steps
    eval_steps=steps,
    do_eval=True,
    save_strategy="steps",
    save_steps=steps,
    save_total_limit=2,  # keep at most two checkpoints on disk
    logging_strategy="steps",
    logging_steps=steps,
    load_best_model_at_end=True,  # reload the best checkpoint when training ends
    metric_for_best_model="sequence_accuracy",
    greater_is_better=True,
    save_safetensors=False,
)

In [None]:
# Wire the model, data splits, tokenizer and metric into the trainer. The
# validation split is used for the periodic evaluations.
trainer = Seq2SeqTrainer(
    model=bert2bert,
    args=trainer_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run the training loop with the arguments configured above.
trainer.train()