In [None]:
import os
import re
import math
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from tqdm import tqdm
from torch.utils.data import DataLoader
from datasets import Dataset
from transformers import  AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, BertTokenizerFast

In [None]:
%%capture
# Load tokenizer & model

# Use the GPU when torch can see one; otherwise fall back to the CPU so the
# notebook still runs (more slowly) on machines without CUDA instead of
# failing at model.to(device).
device = 'cuda' if torch.cuda.is_available() else 'cpu'
ckp_path = "./bert_ckp" # path where pretrained model & tokenizer are saved

# Tokenizer and seq2seq model are both restored from the same checkpoint folder.
tokenizer = BertTokenizerFast.from_pretrained(ckp_path)
model = AutoModelForSeq2SeqLM.from_pretrained(ckp_path)
model.to(device)

In [None]:
# Ground-truth formulas, keyed by the name of the data file they belong to.
eq_df =  pd.read_csv("Data/FeynmanEquations.csv")[['Filename','Formula']]
data_directory = 'Data/Feynman_with_units'
N = 20000        # maximum number of lines read from each data file
N_SAMPLES = 300  # size of the evaluation subset drawn after the merge
SEED = 42        # fixed seed so the evaluation subset is the same on every run

# Collect (filename, line) tuples; the DataFrame is built once at the end.
data = []

# sorted(): os.listdir returns entries in arbitrary order, and the seeded
# sample below is only reproducible when the row order is stable.
for filename in sorted(os.listdir(data_directory)):
    file_path = os.path.join(data_directory, filename)
    if not os.path.isfile(file_path):
        continue
    with open(file_path, 'r', encoding='utf-8') as f:
        # Stream the file and stop after N lines instead of reading the whole
        # file into memory just to keep its first N lines.
        for line_no, line in enumerate(f):
            if line_no >= N:
                break
            line = line.rstrip('\n')
            # Skip blank lines (e.g. the one left by a trailing newline) so
            # they do not become empty feature rows.
            if line.strip():
                data.append((filename, line))

# Convert the list of tuples to a DataFrame
df = pd.DataFrame(data, columns=['Filename', 'features'])
del data

# Attach the formula to every feature row; the join key is not needed afterwards.
df = pd.merge(eq_df,df,on="Filename",how='inner').drop(columns=['Filename'])
# Cap n at the number of available rows so a small data set does not raise.
df = df.sample(n=min(N_SAMPLES, len(df)), random_state=SEED).reset_index(drop=True)

def pre_tokenize(data):
    """Rewrite one raw feature line into the spaced form fed to the tokenizer.

    Steps, in order:
      1. every space in ``data`` is replaced by ``';'``;
      2. a single space is placed before, between and after all characters;
      3. every ``' ; '`` is replaced by ``tokenizer.sep_token``.

    Returns the resulting string.
    """
    marked = data.replace(" ", ";")
    # Joining with empty sentinels at both ends reproduces str.replace("", " "):
    # one space at the start, one between each pair of characters, one at the end
    # (and exactly one space for an empty input).
    spaced = " ".join(["", *marked, ""])
    return spaced.replace(" ; ", tokenizer.sep_token)

# Convert every raw feature line into the spaced, separator-marked form.
df['features'] = df['features'].map(pre_tokenize)

# The formula table has already been merged into df, so release it.
del eq_df

# Wrap the frame as a datasets.Dataset for batched mapping later on.
test_data = Dataset.from_pandas(df)

In [None]:
def generate_eqns(batch):
    """Generate one predicted string per entry of ``batch["features"]``.

    The features are tokenized (padded/truncated to 256 tokens), moved to
    ``device`` and passed to ``model.generate``. The decoded outputs are
    stored under ``batch["pred"]`` and the same ``batch`` object is returned.
    """
    encoded = tokenizer(
        batch["features"],
        padding="max_length",
        truncation=True,
        max_length=256,
        return_tensors="pt",
    )

    generated = model.generate(
        encoded.input_ids.to(device),
        attention_mask=encoded.attention_mask.to(device),
    )

    # skip_special_tokens=True drops all special tokens from the decoded text.
    batch["pred"] = tokenizer.batch_decode(generated, skip_special_tokens=True)
    return batch

In [None]:
# Run generation over the test set in batches of 32; the tokenizer-ready
# "features" column is dropped from the result once predictions exist.
results = test_data.map(
    generate_eqns,
    batched=True,
    batch_size=32,
    remove_columns=["features"],
)

# Predictions and reference formulas, index-aligned.
label_str = results["Formula"]
pred_str = results["pred"]

In [None]:
# Calculate sequence accuracy

def _normalize_eqn(text):
    """Return ``text`` lower-cased with all whitespace removed.

    Applied to both prediction and label so the comparison is symmetric:
    spacing and letter case never decide whether a sequence counts as correct.
    """
    return "".join(text.split()).lower()

count = 0
total = len(results)
pbar = tqdm(range(total))
pbar.set_description("Seq_Acc_Cal")
for i in pbar:
    if _normalize_eqn(pred_str[i]) == _normalize_eqn(label_str[i]):
        count += 1
    # Running accuracy over the examples seen so far.
    pbar.set_postfix(seq_accuracy=count / (i + 1))

# Keep the final figure in a variable (and print it) rather than leaving it
# only in the progress bar; guard against an empty result set.
acc = count / total if total else 0.0
print(f"Sequence accuracy: {acc:.4f} ({count}/{total})")