In [None]:
import pandas as pd
import re
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import torch
import string
from scipy.spatial.distance import cosine
import matplotlib.pyplot as plt

In [None]:

def preprocess_text(text):
    """Normalise a piece of text for word-level comparison.

    The text is lower-cased, every digit character is dropped, every
    character listed in ``string.punctuation`` is dropped, and finally
    leading/trailing whitespace is trimmed. Interior whitespace is left
    exactly as it was.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string (possibly empty).
    """
    lowered = text.lower()
    without_digits = ''.join(ch for ch in lowered if not ch.isdigit())
    # Translation table that maps each ASCII punctuation mark to "delete".
    drop_punctuation = str.maketrans('', '', string.punctuation)
    return without_digits.translate(drop_punctuation).strip()

# Load the evaluation CSV, discard every row that has a missing value, and
# expose the third column (positional index 2) under the name 'text'.
data = (
    pd.read_csv('./test.csv')
    .dropna()
    .assign(text=lambda frame: frame.iloc[:, 2])
)

In [None]:
# Shared fill-mask pipeline. It is created on first use and then reused, so the
# BERT weights are loaded once rather than once per prediction.
_BERT_FILL_MASK = None


def _get_bert_fill_mask():
    """Return the shared 'bert-base-uncased' fill-mask pipeline, building it on first use."""
    global _BERT_FILL_MASK
    if _BERT_FILL_MASK is None:
        _BERT_FILL_MASK = pipeline('fill-mask', model='bert-base-uncased')
    return _BERT_FILL_MASK


def predict_next_word_bert(sequence):
    """Predict the token that follows ``sequence`` using BERT's fill-mask head.

    A single ``[MASK]`` placeholder is appended to the prompt (separated by a
    space) and the pipeline is asked to fill it.

    Args:
        sequence: Prompt text that the prediction should continue.

    Returns:
        The ``token_str`` of the first candidate returned by the pipeline
        (the pipeline lists candidates by descending score).

    NOTE(review): the prompt is passed through as-is; nothing here truncates
    prompts that exceed the model's maximum input length.
    """
    fill_mask = _get_bert_fill_mask()
    pred = fill_mask(sequence + ' [MASK]')
    return pred[0]['token_str']

# Shared (tokenizer, model) pair for GPT-2. It is loaded on first use and then
# reused, so the weights are not re-read from disk on every prediction.
_GPT2_COMPONENTS = None


def _get_gpt2_components():
    """Return the shared GPT-2 ``(tokenizer, model)`` pair, loading it on first use."""
    global _GPT2_COMPONENTS
    if _GPT2_COMPONENTS is None:
        tokenizer = AutoTokenizer.from_pretrained("gpt2")
        model = AutoModelForCausalLM.from_pretrained("gpt2")
        _GPT2_COMPONENTS = (tokenizer, model)
    return _GPT2_COMPONENTS


def predict_next_word_gpt2(sequence):
    """Predict the token that follows ``sequence`` using GPT-2 (greedy choice).

    The prompt is tokenised, run through the model without gradient tracking,
    and the highest-scoring vocabulary entry at the last sequence position is
    decoded back to text.

    Args:
        sequence: Prompt text that the prediction should continue. Expected to
            be a single non-empty string (one sequence, not a batch).

    Returns:
        The decoded text of the single predicted token. The string is returned
        exactly as the tokenizer decodes it; callers that compare it with
        cleaned words should normalise it first.
    """
    tokenizer, model = _get_gpt2_components()
    inputs = tokenizer(sequence, return_tensors="pt")
    with torch.no_grad():
        # Keep only the scores for the final position: shape (batch, vocab).
        logits = model(**inputs).logits[:, -1, :]
    # One prompt per call, so the argmax over the vocabulary axis is a single id.
    pred_id = torch.argmax(logits, dim=-1).item()
    return tokenizer.decode(pred_id)

In [None]:
# Without-period run: for each document, take its longest sentence (by word
# count), clean it, hold out the final word as the target, and ask both models
# to predict that word from the preceding words.

from tqdm import tqdm

actual_words, predictions_bert, predictions_gpt2 = [], [], []

for doc in tqdm(data['text'], total=len(data['text']), desc="Processing Text"):
    # Sentences are approximated by splitting on '.'; pick the one with most words.
    longest_sentence = max(doc.split('.'), key=lambda sentence: len(sentence.split()))
    words = preprocess_text(longest_sentence).split()

    # At least one prompt word plus one target word is required.
    if len(words) <= 1:
        continue

    *prompt_words, actual_word = words
    prompt = ' '.join(prompt_words)
    actual_words.append(actual_word)
    predictions_bert.append(predict_next_word_bert(prompt))
    predictions_gpt2.append(predict_next_word_gpt2(prompt))

In [None]:
import pandas as pd

# Column layout of the comparison table. It is built under its own name so that
# `data` keeps referring to the loaded DataFrame and the prediction loop can be
# re-run after this cell.
comparison = {
    "Actual Words": actual_words,
    "BERT Predictions": predictions_bert,
    "GPT-2 Predictions": predictions_gpt2,
}

df = pd.DataFrame(comparison)

# Persist the side-by-side comparison (row index omitted from the file).
output_path = "./predictions_comparison_without_punc.csv"
df.to_csv(output_path, index=False)

# Last expression: show where the file was written.
output_path


In [None]:
def calculate_accuracy(predictions, actual_words):
    """Return the fraction of target words that were predicted exactly.

    Predictions and targets are paired positionally and compared with ``==``.
    The denominator is always ``len(actual_words)``; if ``predictions`` is
    shorter, the unpaired targets count as misses, and if it is longer, the
    extra predictions are ignored.

    Args:
        predictions: Sequence of predicted words.
        actual_words: Sequence of target words.

    Returns:
        Accuracy as a float in ``[0, 1]``; ``0.0`` when ``actual_words`` is
        empty (nothing to score).
    """
    total = len(actual_words)
    if total == 0:
        # Avoid ZeroDivisionError when no sentence qualified for evaluation.
        return 0.0
    correct = sum(pred == actual for pred, actual in zip(predictions, actual_words))
    return correct / total

In [None]:
# Exact-match accuracy of the raw (un-normalised) model outputs against the
# held-out words from the without-period run.
accuracy_bert = calculate_accuracy(predictions=predictions_bert, actual_words=actual_words)
accuracy_gpt2 = calculate_accuracy(predictions=predictions_gpt2, actual_words=actual_words)
print(
    f"BERT Accuracy: {accuracy_bert}",
    f"GPT-2 Accuracy: {accuracy_gpt2}",
    sep="\n",
)

In [None]:
# Run every raw prediction through the same cleaning that was applied to the
# target words. Both lists are sized by the number of BERT predictions.
process_bert = [preprocess_text(word) for word in predictions_bert]
process_gpt = [
    preprocess_text(predictions_gpt2[pos])
    for pos in range(len(predictions_bert))
]

In [None]:
# Exact-match accuracy again, this time on the cleaned predictions.
accuracy_bert, accuracy_gpt2 = (
    calculate_accuracy(cleaned, actual_words)
    for cleaned in (process_bert, process_gpt)
)
print(f"BERT Accuracy: {accuracy_bert}")
print(f"GPT-2 Accuracy: {accuracy_gpt2}")

## With-period version: a trailing "." is appended to each cleaned sentence

In [None]:
def preprocess_text(text):
    """Lower-case ``text`` and strip digits, ASCII punctuation and outer whitespace.

    Steps, in order: lower-case; remove every character for which
    ``str.isdigit`` is true; remove every character in
    ``string.punctuation``; trim leading and trailing whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string (may be empty).
    """
    no_digits = ''.join(filter(lambda ch: not ch.isdigit(), text.lower()))
    # Mapping each punctuation character to None makes translate() delete it.
    punctuation_table = str.maketrans(dict.fromkeys(string.punctuation))
    cleaned = no_digits.translate(punctuation_table)
    return cleaned.strip()

# Read the evaluation CSV from disk again for the with-period run: drop rows
# containing missing values and expose the third column (index 2) as 'text'.
data = pd.read_csv('./test.csv').dropna()
data = data.assign(text=data.iloc[:, 2])

In [None]:
from tqdm import tqdm

# With-period run: same procedure as the without-period run, except that a "."
# is appended to the cleaned sentence before it is split into words. The
# held-out target is therefore the final word with the period attached, while
# the prompt (all preceding words) contains no period.
actual_wordsv2, predictions_bertv2, predictions_gpt2v2 = [], [], []

for doc in tqdm(data['text'], total=len(data['text']), desc="Processing Text"):
    # Sentences are approximated by splitting on '.'; pick the one with most words.
    longest_sentence = max(doc.split('.'), key=lambda sentence: len(sentence.split()))
    words = (preprocess_text(longest_sentence) + ".").split()

    # At least one prompt word plus one target word is required.
    if len(words) <= 1:
        continue

    *prompt_words, actual_word = words
    prompt = ' '.join(prompt_words)
    actual_wordsv2.append(actual_word)
    predictions_bertv2.append(predict_next_word_bert(prompt))
    predictions_gpt2v2.append(predict_next_word_gpt2(prompt))

In [None]:
import pandas as pd

# Column layout of the with-period comparison table. It is built under its own
# name so that `data` keeps referring to the loaded DataFrame and the
# prediction loop can be re-run after this cell.
comparison_v2 = {
    "Actual Words": actual_wordsv2,
    "BERT Predictions": predictions_bertv2,
    "GPT-2 Predictions": predictions_gpt2v2,
}

df = pd.DataFrame(comparison_v2)

# Persist the side-by-side comparison (row index omitted from the file).
output_path = "./predictions_comparison_with_punc.csv"
df.to_csv(output_path, index=False)

# Last expression: show where the file was written.
output_path


In [None]:
# Exact-match accuracy of the raw model outputs against the held-out words
# from the with-period run.
accuracy_bert2, accuracy_gpt22 = [
    calculate_accuracy(raw_predictions, actual_wordsv2)
    for raw_predictions in (predictions_bertv2, predictions_gpt2v2)
]
print(f"BERT Accuracy: {accuracy_bert2}")
print(f"GPT-2 Accuracy: {accuracy_gpt22}")

In [None]:
# Collapse whitespace in the with-period predictions: split() drops leading,
# trailing and repeated whitespace, and the pieces are re-joined with single
# spaces. Each list is derived from its own v2 prediction list, not from the
# without-period `predictions_bert`, so the lengths always match the v2 run.
process_bertv2 = [' '.join(pred.split()) for pred in predictions_bertv2]
process_gptv2 = [' '.join(pred.split()) for pred in predictions_gpt2v2]

# Not populated or read anywhere in this notebook; kept so the name stays defined.
actual_wordsv22 = []

In [None]:
# Exact-match accuracy for the with-period run, using the
# whitespace-normalised predictions.
accuracy_bert2 = calculate_accuracy(predictions=process_bertv2, actual_words=actual_wordsv2)
accuracy_gpt22 = calculate_accuracy(predictions=process_gptv2, actual_words=actual_wordsv2)
print(
    f"BERT Accuracy: {accuracy_bert2}",
    f"GPT-2 Accuracy: {accuracy_gpt22}",
    sep="\n",
)