In [1]:
# Takes LLM-generated response text, generates BERT embeddings (per-token and per-response) for each response, then writes them to two pkl files
# import dependencies
import pandas as pd
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
import torch
import pickle

# The tokenizer and the encoder must come from the same pretrained checkpoint,
# so the checkpoint name is spelled out once and shared by both loaders.
# NOTE: per the Hugging Face docs, from_pretrained returns the model already
# in evaluation mode, which is why there is no explicit model.eval() here.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

# Load the prompt data and, in the same expression, discard its first column:
# the CSV was saved together with an index column that is not needed.
model_df = (
    pd.read_csv('final_data.csv')
    .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
)

# Lists that collect one embedding tensor per response, in the same order as
# the rows of model_df, ready to be dumped into pkl files.
tokens_emb = []
cls_emb = []

# Only inference is performed, so autograd is switched off once for the whole
# pass rather than re-entering the context manager on every iteration.
with torch.no_grad():
    # Walk the text column directly; there is no need to materialise a full
    # row Series per iteration just to read a single field from it.
    for text in tqdm(model_df['response_text']):
        # Tokenize one response at a time (batch of one). truncation=True
        # clips over-long responses to the tokenizer's maximum input length;
        # padding=True pads to the longest sequence in the batch, which with
        # a single sequence adds nothing.
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
        outputs = model(**inputs)

        # outputs[0] is the last-layer hidden state of every token.
        # outputs[1] is BERT's pooled output: the [CLS] hidden state after the
        # model's extra dense + tanh layer, not the raw [CLS] hidden state.
        token_embeddings = outputs[0]
        cls_embedding = outputs[1]

        # store in their respective lists
        tokens_emb.append(token_embeddings)
        cls_emb.append(cls_embedding)

# Serialise each embedding list to its own pkl file: token-level embeddings
# first, then the per-response ones.
for pkl_path, embeddings in (("tokens_emb.pkl", tokens_emb), ("cls_emb.pkl", cls_emb)):
    with open(pkl_path, "wb") as f:
        pickle.dump(embeddings, f)

100%|██████████████████████████████████████████████████████████████████████████| 14637/14637 [2:21:56<00:00,  1.72it/s]
