In [None]:
# Transformers installation
! pip install transformers datasets


In [None]:
# load SQuAD dataset
from datasets import load_dataset

# Load the first 5,000 SQuAD training examples, then hold out 20% of them as a
# test set. A fixed seed makes the split (and every result derived from it)
# reproducible across kernel restarts.
squad = load_dataset("squad", split="train[:5000]")
squad = squad.train_test_split(test_size=0.2, seed=42)

In [None]:
# Load the tokenizer used to process the question and context fields.
# The checkpoint name is the same one assigned to `model_name` in the model cell.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# Alternative checkpoint (disabled):
# tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

In [None]:
def preprocess_function(examples):
    """Tokenize a batch of QA examples and attach answer-span token labels.

    Args:
        examples: A batch (dict of lists) with "question", "context" and
            "answers" columns. Each answer holds parallel "text" and
            "answer_start" lists; only their first entry is used.

    Returns:
        The tokenizer output (with "offset_mapping" removed) extended with
        "start_positions" and "end_positions". An example whose answer is
        missing, or is not fully contained in the (possibly truncated)
        context, is labelled (0, 0).
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]

        # No reference answer at all: nothing to locate
        if not answer["answer_start"] or not answer["text"]:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the first and last token of the context (sequence id 1).
        # Both scans are bounded so a missing context cannot run off the end.
        num_tokens = len(sequence_ids)
        idx = 0
        while idx < num_tokens and sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while idx < num_tokens and sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # Label (0, 0) unless the WHOLE answer lies inside the kept context.
        # The previous check only caught answers entirely outside it, so an
        # answer cut in half by truncation received a wrong partial span.
        if (
            context_end < context_start
            or offset[context_start][0] > start_char
            or offset[context_end][1] < end_char
        ):
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Last context token that starts at or before the answer start
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # First context token that ends at or after the answer end
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
# Run the preprocessing over every split in batches. The raw SQuAD columns are
# removed so that only the tokenizer outputs and the span labels remain.
tokenized_squad = squad.map(
    preprocess_function,
    batched=True,
    remove_columns=squad["train"].column_names,
)

In [None]:
# Collator used to assemble examples into batches; it returns TensorFlow
# tensors (return_tensors="tf"). The examples are already padded to a fixed
# length by preprocess_function (padding="max_length").
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator(return_tensors="tf")

In [None]:
# setting up an optimizer function, learning rate schedule, and some training hyperparameters
from transformers import create_optimizer

batch_size = 16
num_epochs = 3
# Number of full batches per epoch, repeated for every epoch
total_train_steps = num_epochs * (len(tokenized_squad["train"]) // batch_size)
optimizer, schedule = create_optimizer(
    num_train_steps=total_train_steps,
    num_warmup_steps=0,
    init_lr=5e-5,
)



In [None]:
# Pick the model whose fine-tuning CO2 emissions will be measured.
# `model_name` is the same checkpoint the tokenizer was loaded from.
from transformers import TFAutoModelForQuestionAnswering

model_name = "distilbert-base-uncased"
# Alternative checkpoint (disabled):
# model_name = 'bert-base-uncased'
model = TFAutoModelForQuestionAnswering.from_pretrained(model_name)


In [None]:
# Convert datasets to the tf.data.Dataset format with prepare_tf_dataset().
# `batch_size` is the same variable that sized the learning-rate schedule, so
# changing it in one place keeps the schedule and the datasets in agreement.
tf_train_set = model.prepare_tf_dataset(
    tokenized_squad["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_squad["test"],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)

# Configure the model for training; only the optimizer is passed to compile()
import tensorflow as tf

model.compile(optimizer=optimizer)

In [None]:
# Name of the first GPU TensorFlow can see, or "" when there is none
device_name = tf.test.gpu_device_name()

# Train on the detected GPU, falling back to the CPU when no GPU is present.
# The epoch count is `num_epochs`, the value the learning-rate schedule was
# sized for, instead of a separate literal.
with tf.device(device_name or '/device:CPU:0'):
    model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_epochs)

In [None]:
# use codecarbon package to measure the carbon emissions of the model
! pip install codecarbon
from codecarbon import EmissionsTracker

with EmissionsTracker(project_name="bert-base-uncased") as tracker:
    with tf.device('/device:GPU:0'):
        history = model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=2)

print(tracker.final_emissions)

In [None]:
# Report the tracked CO2 emissions and the validation loss of the last epoch
final_val_loss = history.history['val_loss'][-1]
print(f"Total emission of kilograms of CO2 is: {tracker.final_emissions}")
print(f"Final error of the model is: {final_val_loss}")

In [None]:
!pip install openai
import os
import openai
# os.getenv() expects the NAME of an environment variable, not the key itself;
# the placeholder text was being looked up as a variable name and always gave
# None. Export OPENAI_API_KEY before starting the kernel.
openai.api_key = os.getenv("OPENAI_API_KEY")

import numpy as np
from scipy.spatial.distance import cosine

!pip install openai

def answer_question(prompt, question):
    """Extract the answer to `question` from the context passage `prompt`.

    Args:
        prompt (str): Context passage in which to look for the answer.
        question (str): Question to answer.

    Returns:
        str: The decoded tokens between the most likely start and end
        positions (inclusive). Empty when the predicted end precedes the
        predicted start or the span holds only special tokens.
    """
    # Encode as (question, context) with the same length limit and truncation
    # used by preprocess_function, so inference inputs look like the training
    # inputs. The previous call passed the two texts in the opposite order and
    # did not truncate long contexts.
    inputs = tokenizer(
        question.strip(),
        prompt,
        max_length=384,
        truncation="only_second",
        add_special_tokens=True,
        return_tensors="tf",
    )
    input_ids = inputs["input_ids"].numpy()[0]

    outputs = model(inputs)
    answer_start = int(tf.argmax(outputs.start_logits, axis=1).numpy()[0])
    answer_end = int(tf.argmax(outputs.end_logits, axis=1).numpy()[0])

    return tokenizer.decode(input_ids[answer_start:answer_end + 1], skip_special_tokens=True)

In [None]:
def calculate_cosine_similarity(sentence1: str, sentence2: str) -> float:
    """
    Calculate the cosine similarity between the word-count vectors of two sentences.

    Sentences are lower-cased and split on whitespace; no other normalisation
    (punctuation stripping, stemming) is applied.

    Args:
        sentence1 (str): The first sentence.
        sentence2 (str): The second sentence.

    Returns:
        float: The cosine similarity, between 0 and 1. Returns 0.0 when either
        sentence contains no words, where the cosine is undefined (this case
        previously produced NaN).
    """
    from collections import Counter

    # Word -> occurrence count, computed in one pass per sentence
    counts1 = Counter(sentence1.lower().split())
    counts2 = Counter(sentence2.lower().split())

    # A zero vector has no direction, so the similarity is defined as 0.0
    if not counts1 or not counts2:
        return 0.0

    # Shared vocabulary gives both vectors the same axes
    vocabulary = sorted(set(counts1) | set(counts2))
    freq_vector1 = np.array([counts1[word] for word in vocabulary])
    freq_vector2 = np.array([counts2[word] for word in vocabulary])

    # scipy's `cosine` is a distance; similarity is its complement
    return float(1 - cosine(freq_vector1, freq_vector2))

def levenshtein_distance(s1: str, s2: str) -> float:
    """
    Compute the negated Levenshtein (edit) distance between two strings.

    Args:
        s1 (str): The first string.
        s2 (str): The second string.

    Returns:
        float: The edit distance multiplied by -1, so identical strings give 0
        and more different strings give more negative values.
    """
    # Distances from the empty prefix of s1 to every prefix of s2
    previous_row = list(range(len(s2) + 1))

    for row_number, char1 in enumerate(s1, start=1):
        # Distance from this prefix of s1 to the empty prefix of s2
        current_row = [row_number]
        for col, char2 in enumerate(s2, start=1):
            if char1 == char2:
                # Matching characters cost nothing extra
                cell = previous_row[col - 1]
            else:
                # Cheapest of deletion, insertion and substitution, plus one edit
                cell = 1 + min(previous_row[col], current_row[col - 1], previous_row[col - 1])
            current_row.append(cell)
        previous_row = current_row

    # The last cell is the distance between the full strings; negate it
    return -previous_row[-1]

In [None]:
# Draw 100 random row indices (sampled with replacement) into squad["train"].
# Note: the SQuAD evaluation cell assigns this variable again before using it.
random_index_integers=np.random.randint(len(squad["train"]), size=100)

!pip install google-generativeai openai sentence-transformers
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer

import google.generativeai as palm

In [None]:
# evaluate the models using cosine similarity and STS scores and testing on samples from the SQuAD dataset

# One independent accumulator list per similarity metric
scores_1, scores_2, scores_3, scores_4 = [], [], [], []
def openai_text_embedding(prompt: str, key: str) -> list:
    """Embed `prompt` with OpenAI's "text-embedding-ada-002" model.

    Works with both the legacy (< 1.0) and the current (>= 1.0) `openai`
    package, because the notebook installs `openai` without a version pin and
    the module-level `openai.Embedding` API no longer exists in 1.0+.

    Args:
        prompt (str): Text to embed.
        key (str): OpenAI API key.

    Returns:
        list: The embedding vector as returned by the API.
    """
    embedding_model = "text-embedding-ada-002"

    # Kept for backward compatibility: the module-level key is set on every call
    openai.api_key = key

    if hasattr(openai, "OpenAI"):
        # openai >= 1.0: requests go through a client object
        client = openai.OpenAI(api_key=key)
        response = client.embeddings.create(input=prompt, model=embedding_model)
        return response.data[0].embedding

    # openai < 1.0: legacy module-level API
    return openai.Embedding.create(
        input=prompt, model=embedding_model
    )["data"][0]["embedding"]

def palm_text_embedding(prompt: str, key: str) -> list:
    """Embed `prompt` with the PaLM "models/embedding-gecko-001" model.

    Args:
        prompt (str): Text to embed.
        key (str): API key passed to `palm.configure`.

    Returns:
        list: The value stored under the 'embedding' key of the API response
        (presumably a list of floats; it is converted with `np.asarray` by
        the caller).
    """
    # Configures the library-wide API key on every call
    palm.configure(api_key=key)

    # Named so it cannot be confused with the notebook-level QA `model`
    embedding_model = "models/embedding-gecko-001"

    return palm.generate_embeddings(model=embedding_model, text=prompt)['embedding']


def calculate_cosine_similarity(sentence1: str, sentence2: str) -> float:
    """
    Calculate the cosine similarity between the word-count vectors of two sentences.

    Sentences are lower-cased and split on whitespace; no other normalisation
    (punctuation stripping, stemming) is applied.

    Note:
        A function with this name is also defined in an earlier cell; on a
        top-to-bottom run this definition overrides it.

    Args:
        sentence1 (str): The first sentence.
        sentence2 (str): The second sentence.

    Returns:
        float: The cosine similarity, between 0 and 1. Returns 0.0 when either
        sentence contains no words, where the cosine is undefined (this case
        previously produced NaN).
    """
    from collections import Counter

    # Word -> occurrence count, computed in one pass per sentence
    counts1 = Counter(sentence1.lower().split())
    counts2 = Counter(sentence2.lower().split())

    # A zero vector has no direction, so the similarity is defined as 0.0
    if not counts1 or not counts2:
        return 0.0

    # Shared vocabulary gives both vectors the same axes
    vocabulary = sorted(set(counts1) | set(counts2))
    freq_vector1 = np.array([counts1[word] for word in vocabulary])
    freq_vector2 = np.array([counts2[word] for word in vocabulary])

    # scipy's `cosine` is a distance; similarity is its complement
    return float(1 - cosine(freq_vector1, freq_vector2))

def calculate_sts_score(sentence1: str, sentence2: str) -> float:
    """Semantic similarity of two sentences from "paraphrase-MiniLM-L6-v2" embeddings.

    The SentenceTransformer is loaded on the first call and reused afterwards;
    previously it was re-loaded on every call, i.e. once per evaluated sample.

    Args:
        sentence1 (str): The first sentence.
        sentence2 (str): The second sentence.

    Returns:
        float: 1 minus the cosine distance between the two sentence embeddings.
    """
    # Cache the encoder on the function object so repeated calls share it
    encoder = getattr(calculate_sts_score, "_encoder", None)
    if encoder is None:
        encoder = SentenceTransformer("paraphrase-MiniLM-L6-v2")
        calculate_sts_score._encoder = encoder

    # encode() receives a one-element list; [0] takes that single embedding
    embedding1 = encoder.encode([sentence1])[0]
    embedding2 = encoder.encode([sentence2])[0]

    # scipy's `cosine` is a distance; similarity is its complement
    return float(1 - cosine(embedding1, embedding2))


def calculate_sts_openai_score(sentence1: str, sentence2: str, key: str) -> float:
    """Cosine similarity of the OpenAI embeddings of two sentences.

    Args:
        sentence1 (str): The first sentence.
        sentence2 (str): The second sentence.
        key (str): API key forwarded to `openai_text_embedding`.

    Returns:
        float: 1 minus the cosine distance between the two embeddings.
    """
    # Embed each sentence in turn and convert the result to a numpy array
    vectors = [
        np.asarray(openai_text_embedding(sentence, key))
        for sentence in (sentence1, sentence2)
    ]

    # scipy's `cosine` is a distance; similarity is its complement
    return 1 - cosine(vectors[0], vectors[1])

def calculate_sts_palm_score(sentence1: str, sentence2: str, key: str) -> float:
    """Cosine similarity of the PaLM embeddings of two sentences.

    Args:
        sentence1 (str): The first sentence.
        sentence2 (str): The second sentence.
        key (str): API key forwarded to `palm_text_embedding`.

    Returns:
        float: 1 minus the cosine distance between the two embeddings.
    """
    # Embed each sentence in turn and convert the result to a numpy array
    vectors = [
        np.asarray(palm_text_embedding(sentence, key))
        for sentence in (sentence1, sentence2)
    ]

    # scipy's `cosine` is a distance; similarity is its complement
    return 1 - cosine(vectors[0], vectors[1])

# Sample 100 random examples (with replacement) from squad["train"].
# NOTE(review): squad["train"] is the split the model was fine-tuned on;
# squad["test"] is the held-out split. Confirm which one should be evaluated.
random_index_integers = np.random.randint(len(squad["train"]), size=100)

# Read the API keys from the environment instead of pasting them into the
# notebook. The placeholder strings remain as the fallback, so behaviour is
# unchanged when the variables are not set.
openai_api_key = os.getenv("OPENAI_API_KEY", "enter your API key here")
palm_api_key = os.getenv("PALM_API_KEY", "enter your API key here")

for i in random_index_integers:
    current_sample = squad['train'][int(i)]

    # Collect context + question
    prompt = current_sample['context']
    question = current_sample['question']

    # Only query the model when both texts are non-trivial. Previously the
    # "N/A" fallback was always overwritten by the model call that followed.
    if len(prompt) < 2 or len(question) < 2:
        prediction = "N/A"
    else:
        prediction = answer_question(prompt, question)
        if len(prediction) < 2:
            prediction = "no output found"

    # save ground truth (first reference answer)
    ground_truth = current_sample['answers']['text'][0]

    # compare answers
    score1 = calculate_cosine_similarity(prediction, ground_truth)
    score2 = calculate_sts_score(prediction, ground_truth)
    score3 = calculate_sts_openai_score(prediction, ground_truth, openai_api_key)
    score4 = calculate_sts_palm_score(prediction, ground_truth, palm_api_key)

    # collect scores
    scores_1.append(score1)
    scores_2.append(score2)
    scores_3.append(score3)
    scores_4.append(score4)

In [None]:
import numpy as np

# Print the mean and standard deviation of each similarity score for the SQuAD dataset
for metric_scores in (scores_1, scores_2, scores_3, scores_4):
    print(np.mean(metric_scores), np.std(metric_scores))

In [None]:
! pip install datasets
from datasets import load_dataset

# Load the AdversarialQA dataset: the "adversarialQA" configuration of the
# "adversarial_qa" dataset.
dataset = load_dataset("adversarial_qa", "adversarialQA")

In [None]:
import pandas as pd
from tqdm import tqdm

# Fetch the first 20 validation examples (with a progress bar)
validation_samples = [dataset['validation'][i] for i in tqdm(range(20))]

# Split them into parallel columns; only the first reference answer is kept
contexts = [sample['context'] for sample in validation_samples]
questions = [sample['question'] for sample in validation_samples]
answers = [sample['answers']['text'][0] for sample in validation_samples]

# Assemble the columns into a DataFrame, in the same column order
df1 = pd.DataFrame({'context': contexts, 'question': questions, 'answer': answers})

In [None]:
# evaluate the models using cosine similarity and STS scores and testing on samples from the AdversarialQA dataset

# Reset the per-metric accumulators (one independent list per similarity metric)
scores_1, scores_2, scores_3, scores_4 = [], [], [], []
def openai_text_embedding(prompt: str, key: str) -> list:
    """Embed `prompt` with OpenAI's "text-embedding-ada-002" model.

    Works with both the legacy (< 1.0) and the current (>= 1.0) `openai`
    package, because the notebook installs `openai` without a version pin and
    the module-level `openai.Embedding` API no longer exists in 1.0+.

    Note:
        A function with this name is also defined in an earlier cell; on a
        top-to-bottom run this definition overrides it.

    Args:
        prompt (str): Text to embed.
        key (str): OpenAI API key.

    Returns:
        list: The embedding vector as returned by the API.
    """
    embedding_model = "text-embedding-ada-002"

    # Kept for backward compatibility: the module-level key is set on every call
    openai.api_key = key

    if hasattr(openai, "OpenAI"):
        # openai >= 1.0: requests go through a client object
        client = openai.OpenAI(api_key=key)
        response = client.embeddings.create(input=prompt, model=embedding_model)
        return response.data[0].embedding

    # openai < 1.0: legacy module-level API
    return openai.Embedding.create(
        input=prompt, model=embedding_model
    )["data"][0]["embedding"]

def palm_text_embedding(prompt: str, key: str) -> list:
    """Embed `prompt` with the PaLM "models/embedding-gecko-001" model.

    Note:
        A function with this name is also defined in an earlier cell; on a
        top-to-bottom run this definition overrides it.

    Args:
        prompt (str): Text to embed.
        key (str): API key passed to `palm.configure`.

    Returns:
        list: The value stored under the 'embedding' key of the API response
        (presumably a list of floats; it is converted with `np.asarray` by
        the caller).
    """
    # Configures the library-wide API key on every call
    palm.configure(api_key=key)

    # Named so it cannot be confused with the notebook-level QA `model`
    embedding_model = "models/embedding-gecko-001"

    return palm.generate_embeddings(model=embedding_model, text=prompt)['embedding']


def calculate_cosine_similarity(sentence1: str, sentence2: str) -> float:
    """
    Calculate the cosine similarity between the word-count vectors of two sentences.

    Sentences are lower-cased and split on whitespace; no other normalisation
    (punctuation stripping, stemming) is applied.

    Note:
        A function with this name is also defined in earlier cells; on a
        top-to-bottom run this definition overrides them.

    Args:
        sentence1 (str): The first sentence.
        sentence2 (str): The second sentence.

    Returns:
        float: The cosine similarity, between 0 and 1. Returns 0.0 when either
        sentence contains no words, where the cosine is undefined (this case
        previously produced NaN).
    """
    from collections import Counter

    # Word -> occurrence count, computed in one pass per sentence
    counts1 = Counter(sentence1.lower().split())
    counts2 = Counter(sentence2.lower().split())

    # A zero vector has no direction, so the similarity is defined as 0.0
    if not counts1 or not counts2:
        return 0.0

    # Shared vocabulary gives both vectors the same axes
    vocabulary = sorted(set(counts1) | set(counts2))
    freq_vector1 = np.array([counts1[word] for word in vocabulary])
    freq_vector2 = np.array([counts2[word] for word in vocabulary])

    # scipy's `cosine` is a distance; similarity is its complement
    return float(1 - cosine(freq_vector1, freq_vector2))

def calculate_sts_score(sentence1: str, sentence2: str) -> float:
    """Semantic similarity of two sentences from "paraphrase-MiniLM-L6-v2" embeddings.

    The SentenceTransformer is loaded on the first call and reused afterwards;
    previously it was re-loaded on every call, i.e. once per evaluated sample.

    Note:
        A function with this name is also defined in an earlier cell; on a
        top-to-bottom run this definition overrides it.

    Args:
        sentence1 (str): The first sentence.
        sentence2 (str): The second sentence.

    Returns:
        float: 1 minus the cosine distance between the two sentence embeddings.
    """
    # Cache the encoder on the function object so repeated calls share it
    encoder = getattr(calculate_sts_score, "_encoder", None)
    if encoder is None:
        encoder = SentenceTransformer("paraphrase-MiniLM-L6-v2")
        calculate_sts_score._encoder = encoder

    # encode() receives a one-element list; [0] takes that single embedding
    embedding1 = encoder.encode([sentence1])[0]
    embedding2 = encoder.encode([sentence2])[0]

    # scipy's `cosine` is a distance; similarity is its complement
    return float(1 - cosine(embedding1, embedding2))


def calculate_sts_openai_score(sentence1: str, sentence2: str, key: str) -> float:
    """Cosine similarity of the OpenAI embeddings of two sentences.

    Args:
        sentence1 (str): The first sentence.
        sentence2 (str): The second sentence.
        key (str): API key forwarded to `openai_text_embedding`.

    Returns:
        float: 1 minus the cosine distance between the two embeddings.
    """
    # Embed each sentence in turn and convert the result to a numpy array
    vectors = [
        np.asarray(openai_text_embedding(sentence, key))
        for sentence in (sentence1, sentence2)
    ]

    # scipy's `cosine` is a distance; similarity is its complement
    return 1 - cosine(vectors[0], vectors[1])

def calculate_sts_palm_score(sentence1: str, sentence2: str, key: str) -> float:
    """Cosine similarity of the PaLM embeddings of two sentences.

    Args:
        sentence1 (str): The first sentence.
        sentence2 (str): The second sentence.
        key (str): API key forwarded to `palm_text_embedding`.

    Returns:
        float: 1 minus the cosine distance between the two embeddings.
    """
    # Embed each sentence in turn and convert the result to a numpy array
    vectors = [
        np.asarray(palm_text_embedding(sentence, key))
        for sentence in (sentence1, sentence2)
    ]

    # scipy's `cosine` is a distance; similarity is its complement
    return 1 - cosine(vectors[0], vectors[1])

# Sample 100 random examples (with replacement) from the AdversarialQA train split.
# NOTE(review): `df1`, built from the first 20 validation examples, is not used
# by this loop. Confirm whether the validation examples were meant to be
# evaluated instead.
random_index_integers = np.random.randint(len(dataset["train"]), size=100)

# Read the API keys from the environment instead of pasting them into the
# notebook. The placeholder strings remain as the fallback, so behaviour is
# unchanged when the variables are not set.
openai_api_key = os.getenv("OPENAI_API_KEY", "enter your API key here")
palm_api_key = os.getenv("PALM_API_KEY", "enter your API key here")

for i in random_index_integers:
    current_sample = dataset['train'][int(i)]

    # Collect context + question
    prompt = current_sample['context']
    question = current_sample['question']

    # Only query the model when both texts are non-trivial. Previously the
    # "N/A" fallback was always overwritten by the model call that followed.
    if len(prompt) < 2 or len(question) < 2:
        prediction = "N/A"
    else:
        prediction = answer_question(prompt, question)
        if len(prediction) < 2:
            prediction = "no output found"

    # save ground truth (first reference answer)
    ground_truth = current_sample['answers']['text'][0]

    # compare answers
    score1 = calculate_cosine_similarity(prediction, ground_truth)
    score2 = calculate_sts_score(prediction, ground_truth)
    score3 = calculate_sts_openai_score(prediction, ground_truth, openai_api_key)
    score4 = calculate_sts_palm_score(prediction, ground_truth, palm_api_key)

    # collect scores
    scores_1.append(score1)
    scores_2.append(score2)
    scores_3.append(score3)
    scores_4.append(score4)

In [None]:
# Print the mean and standard deviation of each similarity score for the AdversarialQA dataset
for metric_scores in (scores_1, scores_2, scores_3, scores_4):
    print(np.mean(metric_scores), np.std(metric_scores))