# **Section 1: Dataset and Pre-Processing**

## **Step 1: Load the Data**

In [None]:
import pandas as pd
import json

# Load the three dataset splits. The encoding is given explicitly so that
# decoding does not depend on the platform's default locale encoding.
with open('/content/sample_data/train.json', encoding='utf-8') as f:
    train_data = json.load(f)
train_df = pd.DataFrame(train_data)

with open('/content/sample_data/val.json', encoding='utf-8') as f:
    valid_data = json.load(f)
valid_df = pd.DataFrame(valid_data)

with open('/content/sample_data/test.json', encoding='utf-8') as f:
    test_data = json.load(f)
test_df = pd.DataFrame(test_data)


## **Step 2: Tokenization Function Using spaCy**

In [None]:
import spacy

# Load the small English pipeline with NER disabled, then remove the other
# components that the tokenization helper below does not rely on.
nlp = spacy.load('en_core_web_sm', disable=['ner'])
for unused_component in ('tagger', 'parser', 'lemmatizer'):
    nlp.remove_pipe(unused_component)


def text_pipeline_spacy_special(text):
    """Tokenize ``text`` with the module-level spaCy pipeline ``nlp``.

    Tokens flagged as punctuation or whitespace are dropped; the text of
    every remaining token is lower-cased.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lower-cased token strings in document order.
    """
    return [
        tok.text.lower()
        for tok in nlp(text)
        if not (tok.is_punct or tok.is_space)
    ]


# Add token columns to every split: 'Question_Tokens' holds one token list per
# question, 'Options_Tokens' holds one token list per answer option.
for split_df in (train_df, valid_df, test_df):
    split_df['Question_Tokens'] = split_df['question'].apply(text_pipeline_spacy_special)
    split_df['Options_Tokens'] = split_df['options'].apply(
        lambda option_texts: [text_pipeline_spacy_special(option_text) for option_text in option_texts]
    )


## **Step 3: Pre-process and Analyze the Data**

In [None]:
# Q1.1: Number of questions and answer options in each split.
# Options are counted from the raw JSON records, questions from the frames.
train_options_count = sum(len(record["options"]) for record in train_data)
valid_options_count = sum(len(record["options"]) for record in valid_data)
test_options_count = sum(len(record["options"]) for record in test_data)
print(f"Training set has {len(train_df)} questions.")
print(f"Validation set has {len(valid_df)} questions.")
print(f"Test set has {len(test_df)} questions.")
print(f"Training set has {train_options_count} options")
print(f"Validation set has {valid_options_count} options.")
print(f"Test set has {test_options_count} options.")

# Q1.2: Mean token count per question in the training set.
avg_tokens_question_train = train_df['Question_Tokens'].map(len).mean()
print(f"Average number of tokens per question in the training set: {avg_tokens_question_train}")

# Q1.3: Mean token count per option in the training set.
# Each entry of 'Options_Tokens' is a list of token lists (one per option),
# so the inner length is the number of tokens in a single option.
options_lengths_train = [
    len(option_tokens)
    for question_options in train_df['Options_Tokens']
    for option_tokens in question_options
]
avg_tokens_option_train = sum(options_lengths_train) / len(options_lengths_train)
print(f"Average number of tokens per option in the training set: {avg_tokens_option_train}")


#Q1.4: Average number of tokens per correct choice in the training set
def correct_option_tokens(row):
    """Return the number of tokens in the correct option of one row.

    Args:
        row: A mapping with an 'options' sequence and a 'correct_index'
            position into it.

    Returns:
        Token count of ``row['options'][row['correct_index']]`` as produced
        by ``text_pipeline_spacy_special``.
    """
    answer_text = row['options'][row['correct_index']]
    answer_tokens = text_pipeline_spacy_special(answer_text)
    return len(answer_tokens)

# Q1.4: apply the per-row helper across the training frame (axis=1 passes one
# row at a time) and average the resulting token counts.
avg_tokens_correct_option_train = train_df.apply(correct_option_tokens, axis=1).mean()
print(f"Average number of tokens per correct option in the training set: {avg_tokens_correct_option_train}")


# Perform any additional exploration of the data that you feel would be helpful for this multiple-choice question-answering task. Briefly describe what you found.
def lexical_overlap(data):
    """Average number of distinct tokens an option shares with its question.

    Args:
        data: Iterable of records with 'question', 'options' and
            'correct_index' keys.

    Returns:
        A tuple ``(avg_overlap_correct, avg_overlap_incorrect)``: the mean
        shared-token count over correct options and over incorrect options.
    """
    # Shared-token counts bucketed by whether the option is the correct one.
    shared_counts = {True: [], False: []}

    for record in data:
        question_vocab = set(text_pipeline_spacy_special(record['question']))

        for position, option_text in enumerate(record['options']):
            option_vocab = set(text_pipeline_spacy_special(option_text))
            is_correct = bool(position == record['correct_index'])
            shared_counts[is_correct].append(len(question_vocab & option_vocab))

    correct_counts = shared_counts[True]
    incorrect_counts = shared_counts[False]
    return (
        sum(correct_counts) / len(correct_counts),
        sum(incorrect_counts) / len(incorrect_counts),
    )

# Lexical overlap on the raw training records: correct vs. incorrect options.
avg_overlap_correct_train, avg_overlap_incorrect_train = lexical_overlap(train_data)
print("Average lexical overlap with question in correct options:", avg_overlap_correct_train)
print("Average lexical overlap with question in incorrect options:", avg_overlap_incorrect_train)

def option_length_comparison(data):
    """Compare the average token length of correct and incorrect options.

    Args:
        data: Iterable of records with 'options' and 'correct_index' keys.

    Returns:
        A tuple ``(avg_length_correct, avg_length_incorrect)`` of mean token
        counts, tokenized with ``text_pipeline_spacy_special``.
    """
    correct_lengths, incorrect_lengths = [], []

    for record in data:
        for position, option_text in enumerate(record['options']):
            # Route each option's length to the bucket matching its correctness.
            bucket = correct_lengths if position == record['correct_index'] else incorrect_lengths
            bucket.append(len(text_pipeline_spacy_special(option_text)))

    mean_correct = sum(correct_lengths) / len(correct_lengths)
    mean_incorrect = sum(incorrect_lengths) / len(incorrect_lengths)
    return mean_correct, mean_incorrect

# Option length on the raw training records: correct vs. incorrect options.
avg_length_correct_train, avg_length_incorrect_train = option_length_comparison(train_data)
print("Average length of correct options:", avg_length_correct_train)
print("Average length of incorrect options:", avg_length_incorrect_train)

# Semantic similarity: measures how related two pieces of text are in meaning, not just in shared tokens. spaCy computes it from the vectors attached to each Doc; the small model loaded above ships without static word vectors, so spaCy warns that the scores may be unreliable (see the warning in the output below).
def calculate_semantic_similarity(data):
    """Mean spaCy similarity between each question and its correct option.

    Both texts are run through the module-level ``nlp`` pipeline and compared
    with ``Doc.similarity``.

    NOTE(review): ``nlp`` is built from 'en_core_web_sm' with several
    components removed; the saved output shows a warning raised on the
    ``similarity`` call, so treat the absolute values with caution.

    Args:
        data: Iterable of records with 'question', 'options' and
            'correct_index' keys.

    Returns:
        The arithmetic mean of the per-question similarity scores.
    """
    scores = [
        nlp(record['question']).similarity(
            nlp(record['options'][record['correct_index']])
        )
        for record in data
    ]
    return sum(scores) / len(scores)

# Only correct options are scored here; incorrect options are not compared.
avg_similarity_train = calculate_semantic_similarity(train_data)
print("Average semantic similarity between questions and correct options:", avg_similarity_train)


Training set has 741 questions.
Validation set has 103 questions.
Test set has 202 questions.
Training set has 2964 options
Validation set has 412 options.
Test set has 808 options.
Average number of tokens per question in the training set: 6.272604588394062
Average number of tokens per option in the training set: 22.338056680161944
Average number of tokens per correct option in the training set: 26.032388663967613
Average lexical overlap with question in correct options: 2.6531713900134952
Average lexical overlap with question in incorrect options: 1.605038236617184
Average length of correct options: 26.032388663967613
Average length of incorrect options: 21.106612685560055


  similarity = question.similarity(correct_option)


Average semantic similarity between questions and correct options: 0.3151678621389988


Expected Insights
Similarity Between Question and Options: If the average lexical overlap is higher for correct options, this might indicate that correct answers share more vocabulary with the question, potentially guiding the development of features for machine learning models or rules for rule-based approaches.

Option Length Comparison: If correct options are consistently longer or shorter than incorrect ones, this characteristic could be a useful signal in the future. Here we can clearly see that correct options are longer on average than incorrect options (about 26.0 vs. 21.1 tokens).

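Semantic Similarity: A higher average similarity score between questions and correct options might suggest that leveraging semantic similarity could improve answer selection, but in our case it does not look helpful, as the score is low (about 0.31). Note that this score was computed only for correct options and was not compared against incorrect options, so it is a rough indication at best.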

# **Section 2: Set Similarity Measures**

## **Step 1: Define the Similarity Measures**

In [None]:
def overlap_coefficient(set1, set2):
    """Overlap coefficient: |A ∩ B| / min(|A|, |B|).

    Args:
        set1: First set of tokens.
        set2: Second set of tokens.

    Returns:
        The overlap coefficient as a float, or 0.0 when either set is empty
        (the ratio is undefined there; an empty text shares no tokens).
    """
    smaller_size = min(len(set1), len(set2))
    if smaller_size == 0:
        # Avoid ZeroDivisionError, e.g. for an option made only of punctuation.
        return 0.0
    return len(set1.intersection(set2)) / smaller_size

def sorensen_dice_coefficient(set1, set2):
    """Sørensen–Dice coefficient: 2 * |A ∩ B| / (|A| + |B|).

    Args:
        set1: First set of tokens.
        set2: Second set of tokens.

    Returns:
        The Dice coefficient as a float, or 0.0 when both sets are empty
        (the ratio is undefined there).
    """
    total_size = len(set1) + len(set2)
    if total_size == 0:
        # Both sets empty: avoid ZeroDivisionError.
        return 0.0
    return 2 * len(set1.intersection(set2)) / total_size

def jaccard_similarity(set1, set2):
    """Jaccard similarity: |A ∩ B| / |A ∪ B|.

    Args:
        set1: First set of tokens.
        set2: Second set of tokens.

    Returns:
        The Jaccard similarity as a float, or 0.0 when both sets are empty
        (the ratio is undefined there).
    """
    union_size = len(set1.union(set2))
    if union_size == 0:
        # Both sets empty: avoid ZeroDivisionError.
        return 0.0
    return len(set1.intersection(set2)) / union_size


## **Step 2: Calculate Similarities for Each Question-Option Pair**

In [None]:
def calculate_accuracy_and_ties(data, similarity_function):
    """Score every option against its question and measure prediction accuracy.

    For each record the question and each option are tokenized into sets, the
    options are scored with ``similarity_function(question_tokens,
    option_tokens)``, and the highest-scoring option is the prediction.

    Args:
        data: Sequence of records with 'question', 'options' and
            'correct_index' keys.
        similarity_function: Callable taking two token sets and returning a
            comparable score.

    Returns:
        A tuple ``(accuracy, tie_counts)``: the fraction of records whose
        predicted index equals 'correct_index', and the number of records
        where more than one option reached the maximum score. Returns
        ``(0.0, 0)`` for empty ``data``.
    """
    if len(data) == 0:
        # Nothing to evaluate; avoid dividing by zero below.
        return 0.0, 0

    correct_predictions = 0
    tie_counts = 0

    for item in data:
        question_tokens = set(text_pipeline_spacy_special(item['question']))
        scores = [
            similarity_function(question_tokens, set(text_pipeline_spacy_special(option)))
            for option in item['options']
        ]

        max_score = max(scores)
        if scores.count(max_score) > 1:
            tie_counts += 1

        # list.index returns the first occurrence, so ties are resolved in
        # favour of the earliest option with the maximum score.
        predicted_index = scores.index(max_score)

        if predicted_index == item['correct_index']:
            correct_predictions += 1

    accuracy = correct_predictions / len(data)
    return accuracy, tie_counts

# Evaluate each set-similarity measure on the raw training and validation
# records and report accuracy together with the number of tied questions.
for measure in [overlap_coefficient, sorensen_dice_coefficient, jaccard_similarity]:
    train_accuracy, train_ties = calculate_accuracy_and_ties(train_data, measure)
    valid_accuracy, valid_ties = calculate_accuracy_and_ties(valid_data, measure)
    print(f"Measure: {measure.__name__}")
    print(f"Training Accuracy: {train_accuracy}, Ties: {train_ties}")
    print(f"Validation Accuracy: {valid_accuracy}, Ties: {valid_ties}")
    print("-" * 50)


Measure: overlap_coefficient
Training Accuracy: 0.5236167341430499, Ties: 246
Validation Accuracy: 0.46601941747572817, Ties: 29
--------------------------------------------------
Measure: sorensen_dice_coefficient
Training Accuracy: 0.4291497975708502, Ties: 20
Validation Accuracy: 0.3592233009708738, Ties: 4
--------------------------------------------------
Measure: jaccard_similarity
Training Accuracy: 0.4291497975708502, Ties: 20
Validation Accuracy: 0.3592233009708738, Ties: 4
--------------------------------------------------


In the provided calculate_accuracy_and_ties function, ties are resolved by selecting the first option with the highest score. This approach is arbitrary but ensures consistent and reproducible behavior. Depending on your specific needs or insights into the dataset, you might choose a different strategy to break ties, such as randomly selecting among tied options or using additional heuristics to make an informed choice.

# **Section 3: Cosine similarity of TF vectors**

## **Step 1: Create TF Vectors**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


def custom_tokenizer(text):
    """Tokenizer callable for CountVectorizer; delegates to the spaCy helper.

    Args:
        text: The string to tokenize.

    Returns:
        The token list produced by ``text_pipeline_spacy_special(text)``.
    """
    tokens = text_pipeline_spacy_special(text)
    return tokens

# Module-level term-frequency vectorizer that tokenizes with the spaCy helper.
# This name is rebound later in the notebook by the bigram section, so the
# functions reading it depend on which cell ran last.
vectorizer = CountVectorizer(tokenizer=custom_tokenizer, analyzer='word')

def generate_tf_vectors(dataframe):
    """Build term-frequency vectors for every question and option in a frame.

    The module-level ``vectorizer`` is refit on the questions and options of
    ``dataframe`` on every call, so each call produces its own vocabulary.

    Args:
        dataframe: Frame with a 'question' column of strings and an 'options'
            column holding a list of option strings per row.

    Returns:
        A tuple ``(q_tf_matrix, o_tf_matrix_list)``: a dense array with one
        row per question, and a list with one dense array per question whose
        rows are that question's options.
    """
    questions = dataframe['question'].tolist()
    options_list = dataframe['options'].tolist()

    # Questions first, then all options flattened in row order.
    all_texts = questions + [option for options in options_list for option in options]
    tf_matrix = vectorizer.fit_transform(all_texts).toarray()

    q_tf_matrix = tf_matrix[:len(questions)]
    o_tf_matrix = tf_matrix[len(questions):]

    # Split at the cumulative option counts rather than into equal parts, so
    # questions with differing numbers of options stay aligned. With a uniform
    # option count this yields the same chunks as an equal split.
    split_points = np.cumsum([len(options) for options in options_list])[:-1]
    return q_tf_matrix, np.split(o_tf_matrix, split_points)


## **Step 2: Calculating Cosine Similarity and Selecting Best Answer**

In [None]:
def select_best_answer(q_tf_matrix, o_tf_matrix_list):
    """Pick, for each question, the option with the highest cosine similarity.

    Args:
        q_tf_matrix: Iterable of question vectors, one per question.
        o_tf_matrix_list: Iterable of option matrices aligned with
            ``q_tf_matrix``; each matrix has one row per option.

    Returns:
        A list with the index of the best-scoring option for each question.
    """
    picks = []
    for question_row, option_rows in zip(q_tf_matrix, o_tf_matrix_list):
        # Wrap the question in a list so it is treated as a single-row matrix.
        similarities = cosine_similarity([question_row], option_rows)
        picks.append(np.argmax(similarities[0]))
    return picks

def calculate_accuracy(selected_answers, correct_answers):
    """Fraction of predictions that match the reference answers.

    Args:
        selected_answers: Predicted option indices.
        correct_answers: Reference option indices, aligned by position.

    Returns:
        Number of matching positions divided by ``len(correct_answers)``.
    """
    hits = 0
    for chosen, expected in zip(selected_answers, correct_answers):
        if chosen == expected:
            hits += 1
    return hits / len(correct_answers)


## **Step 3: Evaluate the Model**

In [None]:
# Generate TF vectors for questions and options.
# generate_tf_vectors refits the shared vectorizer on each call, so the
# training and validation matrices are built from separate vocabularies.
q_tf_matrix_train, o_tf_matrix_list_train = generate_tf_vectors(train_df)
q_tf_matrix_valid, o_tf_matrix_list_valid = generate_tf_vectors(valid_df)

# Select the best answer based on cosine similarity
selected_answers_train = select_best_answer(q_tf_matrix_train, o_tf_matrix_list_train)
selected_answers_valid = select_best_answer(q_tf_matrix_valid, o_tf_matrix_list_valid)

# Calculate accuracy against the reference indices of each split
accuracy_train = calculate_accuracy(selected_answers_train, train_df['correct_index'].tolist())
accuracy_valid = calculate_accuracy(selected_answers_valid, valid_df['correct_index'].tolist())

print(f"Training Accuracy: {accuracy_train}")
print(f"Validation Accuracy: {accuracy_valid}")




Training Accuracy: 0.446693657219973
Validation Accuracy: 0.4563106796116505


## **Modifying the CountVectorizer for Bigrams**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

def custom_tokenizer(text):
    """Tokenizer callable for CountVectorizer; delegates to the spaCy helper.

    This redefines the function of the same name from the unigram section
    with identical behavior.

    Args:
        text: The string to tokenize.

    Returns:
        The token list produced by ``text_pipeline_spacy_special(text)``.
    """
    tokens = text_pipeline_spacy_special(text)
    return tokens

# Initialize CountVectorizer with both unigrams and bigrams.
# This rebinds the module-level `vectorizer` created in the unigram section,
# so generate_tf_vectors would also use bigrams if it were called after this.
vectorizer = CountVectorizer(tokenizer=custom_tokenizer, analyzer='word', ngram_range=(1, 2))

def generate_tf_vectors_with_bigrams(dataframe):
    """Build term-frequency vectors for every question and option in a frame.

    Uses the module-level ``vectorizer`` (configured with unigrams and bigrams
    in this section), refitting it on the texts of ``dataframe`` on every call.

    Args:
        dataframe: Frame with a 'question' column of strings and an 'options'
            column holding a list of option strings per row.

    Returns:
        A tuple ``(q_tf_matrix, o_tf_matrix_list)``: a dense array with one
        row per question, and a list with one dense array per question whose
        rows are that question's options.
    """
    questions = dataframe['question'].tolist()
    options_list = dataframe['options'].tolist()

    # Questions first, then all options flattened in row order.
    all_texts = questions + [option for options in options_list for option in options]
    tf_matrix = vectorizer.fit_transform(all_texts).toarray()

    q_tf_matrix = tf_matrix[:len(questions)]
    o_tf_matrix = tf_matrix[len(questions):]

    # Split at the cumulative option counts rather than into equal parts, so
    # questions with differing numbers of options stay aligned. With a uniform
    # option count this yields the same chunks as an equal split.
    split_points = np.cumsum([len(options) for options in options_list])[:-1]
    return q_tf_matrix, np.split(o_tf_matrix, split_points)

def select_best_answer(q_tf_matrix, o_tf_matrix_list):
    """Pick, for each question, the option with the highest cosine similarity.

    Redefines the earlier function of the same name; the question vector is
    reshaped to a single-row matrix and the options are coerced to an array
    before the similarity is computed.

    Args:
        q_tf_matrix: Iterable of question vectors supporting ``reshape``.
        o_tf_matrix_list: Iterable of option matrices aligned with
            ``q_tf_matrix``; each matrix has one row per option.

    Returns:
        A list with the index of the best-scoring option for each question.
    """
    return [
        np.argmax(
            cosine_similarity(q_vector.reshape(1, -1), np.array(o_matrix))[0]
        )
        for q_vector, o_matrix in zip(q_tf_matrix, o_tf_matrix_list)
    ]


# Rebuild the TF matrices with the unigram+bigram vectorizer. These reuse
# the q_tf_matrix_* / o_tf_matrix_list_* names from the unigram evaluation.
q_tf_matrix_train, o_tf_matrix_list_train = generate_tf_vectors_with_bigrams(train_df)
q_tf_matrix_valid, o_tf_matrix_list_valid = generate_tf_vectors_with_bigrams(valid_df)

# Predict and score the training split
selected_answers_train_bigrams = select_best_answer(q_tf_matrix_train, o_tf_matrix_list_train)
accuracy_train_bigrams = calculate_accuracy(selected_answers_train_bigrams, train_df['correct_index'].tolist())

# Predict and score the validation split
selected_answers_valid_bigrams = select_best_answer(q_tf_matrix_valid, o_tf_matrix_list_valid)
accuracy_valid_bigrams = calculate_accuracy(selected_answers_valid_bigrams, valid_df['correct_index'].tolist())

print(f"Training Accuracy with Bigrams: {accuracy_train_bigrams}")
print(f"Validation Accuracy with Bigrams: {accuracy_valid_bigrams}")




Training Accuracy with Bigrams: 0.4534412955465587
Validation Accuracy with Bigrams: 0.44660194174757284


# **Section 4: Cosine similarity of vectors from bert-base-uncased**

## **Step 1: Load BERT Model and Tokenize**

In [None]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Initialize the BERT tokenizer and model from the 'bert-base-uncased'
# checkpoint. Both names are module-level and are read by encode_with_bert.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to encode texts to BERT embeddings
def encode_with_bert(texts):
    """Encode a batch of texts with the module-level BERT tokenizer and model.

    The texts are tokenized together with padding and truncation, and the
    model runs with gradient tracking disabled.

    Args:
        texts: A list of strings to encode as one batch.

    Returns:
        The model's ``pooler_output`` for the batch, used here as a summary
        representation of each text.
    """
    batch = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')
    with torch.no_grad():
        pooled = model(**batch).pooler_output
    return pooled

# Function to calculate cosine similarity between a question and its options
def calculate_cosine_similarity(question_embedding, options_embeddings):
    """Return the position of the option most similar to the question.

    Despite its name, this returns an index rather than a similarity value.

    Args:
        question_embedding: 2-D array-like holding the question vector(s).
        options_embeddings: 2-D array-like with one row per option.

    Returns:
        ``np.argmax`` over the whole similarity matrix. With a single
        question row this is the index of the best-scoring option.
    """
    similarity_matrix = cosine_similarity(question_embedding, options_embeddings)
    best_index = np.argmax(similarity_matrix)
    return best_index


## **Step 2: Getting Accuracy**

In [None]:
def select_best_answer(question_embeddings, options_embeddings_list):
    """Pick the most similar option per question from wrapped embeddings.

    Each question embedding and each option embedding is expected to be
    wrapped in an outer sequence; element ``[0]`` is taken as the vector.
    This redefines the earlier functions of the same name. It is not called
    in the evaluation loop that follows it in this section.

    Args:
        question_embeddings: Iterable of wrapped question embeddings.
        options_embeddings_list: Iterable, aligned with
            ``question_embeddings``, of sequences of wrapped option embeddings.

    Returns:
        A list with the index of the best-scoring option for each question.
    """
    chosen = []
    for wrapped_question, wrapped_options in zip(question_embeddings, options_embeddings_list):
        # Unwrap the vectors before comparing them.
        question_vector = np.array(wrapped_question[0])
        options_matrix = np.vstack([wrapped[0] for wrapped in wrapped_options])

        similarities = cosine_similarity([question_vector], options_matrix)
        chosen.append(np.argmax(similarities[0]))
    return chosen

# Running counts of correct predictions for each split.
correct_predictions_train = 0
correct_predictions_valid = 0

# Score both splits with the same loop; `is_train` selects which counter to bump.
for dataset, is_train in [(train_df, True), (valid_df, False)]:
    for _, item in dataset.iterrows():
        question = item['question']
        options = item['options']
        correct_index = item['correct_index']
        # Encode the question and its options together as one batch:
        # row 0 is the question, the remaining rows are the options.
        embeddings = encode_with_bert([question] + options)
        question_embedding = embeddings[0].reshape(1, -1)
        options_embeddings = embeddings[1:]

        # Despite its name, this helper returns the index of the best option.
        selected_option_index = calculate_cosine_similarity(question_embedding, options_embeddings)

        if selected_option_index == correct_index:
            if is_train:
                correct_predictions_train += 1
            else:
                correct_predictions_valid += 1

# After the loop, calculate and print the accuracy.
# Note: this rebinds accuracy_train / accuracy_valid from the TF section.
accuracy_train = correct_predictions_train / len(train_df)
accuracy_valid = correct_predictions_valid / len(valid_df)

print(f"Training Accuracy: {accuracy_train}")
print(f"Validation Accuracy: {accuracy_valid}")



Training Accuracy: 0.14709851551956815
Validation Accuracy: 0.14563106796116504


# **Section 5: Fine-tuning a transformer model**

In [None]:
import pandas as pd
from transformers import TrainingArguments
# Define a function to create question-option pairs
def create_question_option_pairs(data):
    """Expand each question into one labelled (text, label) pair per option.

    Args:
        data: DataFrame with 'question', 'options' (list of strings) and
            'correct_index' columns.

    Returns:
        A list of ``(text, label)`` tuples in row order, then option order.
        ``text`` is ``"<question> [SEP] <option>"`` and ``label`` is 1 for the
        option at 'correct_index' and 0 for every other option.
    """
    pairs = []
    for _, row in data.iterrows():
        question = row['question']
        correct_index = row['correct_index']
        for option_index, option in enumerate(row['options']):
            # Compare positions, not text: two options with identical wording
            # must not both be labelled as correct.
            label = 1 if option_index == correct_index else 0
            pairs.append((f"{question} [SEP] {option}", label))
    return pairs

# Create question-option pairs for training and validation sets
# (one pair per option, in row order and then option order).
train_pairs = create_question_option_pairs(train_df)
valid_pairs = create_question_option_pairs(valid_df)

# Convert to DataFrame with the pair text and its binary label
train_pairs_df = pd.DataFrame(train_pairs, columns=['text', 'label'])
valid_pairs_df = pd.DataFrame(valid_pairs, columns=['text', 'label'])
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load tokenizer and model for binary sequence classification.
# Note: this rebinds the module-level `tokenizer` and `model` that
# encode_with_bert reads in the BERT similarity section.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Tokenize the pair texts of each split with truncation and padding
train_encodings = tokenizer(train_pairs_df['text'].tolist(), truncation=True, padding=True)
valid_encodings = tokenizer(valid_pairs_df['text'].tolist(), truncation=True, padding=True)

# Create Dataset objects
class Dataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with integer labels.

    ``encodings`` is a mapping from field name to a per-example sequence;
    ``labels`` is a sequence aligned with those examples.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Convert every encoded field of example `idx` to a tensor and add
        # its label under the 'labels' key.
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the encodings and their binary pair labels for the Trainer.
train_dataset = Dataset(train_encodings, train_pairs_df['label'].tolist())
valid_dataset = Dataset(valid_encodings, valid_pairs_df['label'].tolist())

# Define the training arguments with output_dir.
# Batch size, learning rate, epoch count and weight decay are set explicitly;
# every other option is left at the library's default.
training_args = TrainingArguments(
    output_dir='./output',  # Specify where to save the trained model and training logs
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=1e-5,
    num_train_epochs=4,
    weight_decay=0,
    logging_dir='./logs',
)


# Define the Trainer.
# No compute_metrics callback is passed; metrics are computed manually
# from trainer.predict() after training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

# Train the model
trainer.train()

# Evaluate on the validation set: one prediction per question-option pair.
predictions = trainer.predict(valid_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)
true_labels = valid_pairs_df['label'].tolist()

# Pair-level metrics. Every question contributes one positive pair and several
# negatives, so pair-level accuracy is dominated by the negative class and is
# not comparable with the per-question accuracies of the earlier sections.
accuracy = accuracy_score(true_labels, pred_labels)
precision = precision_score(true_labels, pred_labels)
recall = recall_score(true_labels, pred_labels)
f1 = f1_score(true_labels, pred_labels)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# Question-level accuracy: for each question choose the option whose pair has
# the largest margin of the "correct" logit over the "incorrect" logit.
# The pairs were generated in row order and then option order, so splitting
# the scores at the cumulative option counts regroups them per question.
option_counts = [len(options) for options in valid_df['options']]
positive_margins = predictions.predictions[:, 1] - predictions.predictions[:, 0]
margins_per_question = np.split(positive_margins, np.cumsum(option_counts)[:-1])
selected_options = [int(np.argmax(margins)) for margins in margins_per_question]
question_accuracy = accuracy_score(valid_df['correct_index'].tolist(), selected_options)

print(f"Question-level Accuracy: {question_accuracy}")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss
500,0.5021
1000,0.3944


Accuracy: 0.7888349514563107
Precision: 0.5784313725490197
Recall: 0.5728155339805825
F1 Score: 0.575609756097561
