In [1]:
import os
import re
import math
import random
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
from tqdm import tqdm
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainer, Seq2SeqTrainingArguments, BertTokenizerFast
from bertviz import *

2024-04-10 02:44:38.948603: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
%%capture 

## Load the tokenizer and the seq2seq model from the saved checkpoint

# Single source of truth for the checkpoint location (used for both loads below).
CHECKPOINT_DIR = "./bert_final/checkpoint-675000/"

# Use the GPU when one is visible, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

tokenizer = BertTokenizerFast.from_pretrained(CHECKPOINT_DIR)
model = AutoModelForSeq2SeqLM.from_pretrained(CHECKPOINT_DIR)
model.to(device)

In [3]:
data_directory = 'Data/Feynman_with_units'
N = 200  # maximum number of leading lines taken from each file

# Collect one (filename, line) tuple per sampled line; turned into a DataFrame below.
data = []

# os.listdir returns entries in arbitrary order; sort so the row order is reproducible.
for filename in sorted(os.listdir(data_directory)):
    file_path = os.path.join(data_directory, filename)
    if not os.path.isfile(file_path):
        continue
    with open(file_path, 'r', encoding='utf-8') as f:
        # Stream the file and stop after the first N lines instead of reading
        # the whole file into memory just to slice off its head.
        for line_number, line in enumerate(f):
            if line_number >= N:
                break
            line = line.rstrip('\n')
            # Skip blank lines (e.g. the one produced by a trailing newline in a
            # short file) so they do not become empty feature rows.
            if line:
                data.append((filename, line))

# Convert the list of tuples to a DataFrame
df = pd.DataFrame(data, columns=['Filename', 'features'])
del data


# Keep only the join key and the formula column from the equations table.
eq_df = pd.read_csv("Data/FeynmanEquations.csv").loc[:, ['Filename', 'Formula']]

# Attach each feature line to its formula, then drop the join key.
# NOTE: this rebinds `df`; re-running the cell without rebuilding `df` first
# will fail because the 'Filename' column is gone after the first run.
df = eq_df.merge(df, on="Filename", how='inner').drop(columns=['Filename'])

In [8]:
def pre_tokenize(data, sep_token=None):
    """Space out every character of ``data`` and replace its blanks with a separator.

    The string is processed in three steps:

    1. every blank is replaced by ``';'``;
    2. a blank is inserted before every character and at the end of the string;
    3. every ``' ; '`` produced by the first two steps is replaced by the separator.

    Note that a ``';'`` already present in ``data`` is treated like a blank.

    Args:
        data: The raw string to pre-tokenize.
        sep_token: Separator to insert. When ``None`` (the default) the
            ``sep_token`` of the module-level ``tokenizer`` is used, which
            matches the original behaviour.

    Returns:
        The pre-tokenized string.
    """
    if sep_token is None:
        # Looked up at call time, so the global tokenizer is only required
        # when the caller does not supply a separator.
        sep_token = tokenizer.sep_token
    marked = data.replace(" ", ';')
    # str.replace("", " ") puts a blank before every character and one at the end.
    spaced = marked.replace("", " ")
    return spaced.replace(" ; ", sep_token)

# Pre-tokenize every feature string element-wise.
# NOTE: the result is written back to the same column, so running this cell
# twice applies pre_tokenize twice.
df['features'] = df['features'].map(pre_tokenize)

In [4]:
# Pick a random row position in [0, len(df)). randrange excludes the upper
# bound, whereas randint(0, len(df)) could return len(df) and make
# df.iloc[rand_sample] raise IndexError.
rand_sample = random.randrange(df.shape[0])

In [5]:
# Fetch the sampled row once, then tokenize both columns and move the ids to the model's device.
sample_row = df.iloc[rand_sample]

encoder_input_ids = tokenizer(
    sample_row['features'], return_tensors="pt", add_special_tokens=True
)["input_ids"].to(device)
decoder_input_ids = tokenizer(
    sample_row['Formula'], return_tensors="pt", add_special_tokens=True
)["input_ids"].to(device)

In [6]:
# The visualisation cells read outputs.encoder_attentions, outputs.decoder_attentions
# and outputs.cross_attentions, so request the attention weights explicitly instead of
# relying on the checkpoint's config. no_grad: this is inference only, no graph needed.
with torch.no_grad():
    outputs = model(
        input_ids=encoder_input_ids,
        decoder_input_ids=decoder_input_ids,
        output_attentions=True,
    )

In [7]:
# Map the first (only) sequence of each id tensor back to its token strings.
encoder_text, decoder_text = (
    tokenizer.convert_ids_to_tokens(ids[0])
    for ids in (encoder_input_ids, decoder_input_ids)
)

In [10]:
# Bundle the attention tensors from the forward pass with their token labels,
# then hand them to model_view in one call.
model_view_inputs = {
    "encoder_attention": outputs.encoder_attentions,
    "decoder_attention": outputs.decoder_attentions,
    "cross_attention": outputs.cross_attentions,
    "encoder_tokens": encoder_text,
    "decoder_tokens": decoder_text,
}
model_view(**model_view_inputs)

<IPython.core.display.Javascript object>

In [11]:
# Bundle the attention tensors from the forward pass with their token labels,
# then hand them to head_view in one call.
head_view_inputs = {
    "encoder_attention": outputs.encoder_attentions,
    "decoder_attention": outputs.decoder_attentions,
    "cross_attention": outputs.cross_attentions,
    "encoder_tokens": encoder_text,
    "decoder_tokens": decoder_text,
}
head_view(**head_view_inputs)

<IPython.core.display.Javascript object>