In [None]:
import json
import os
import random
import string

import fasttext
import gensim.downloader
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy.spatial.distance import cosine
from sklearn.metrics import accuracy_score


In [None]:
# Each line of the JSONL file is one JSON record. The encoding is given
# explicitly so the result does not depend on the platform default, and
# blank lines are skipped because json.loads would fail on them.
with open('train_rand_split.jsonl', 'r', encoding='utf-8') as file:
    data = [json.loads(line) for line in file if line.strip()]

In [None]:
# Preview only the first few records instead of dumping the whole list.
data[:3]

In [None]:
# Build a DataFrame with one row per loaded JSON record.
df = pd.DataFrame(data)

In [None]:
# Display the DataFrame built from the loaded records.
df

In [None]:
# Inspect the nested question record of the first row.
df["question"].iloc[0]

In [None]:
# Flatten the nested question record: keep its stem text, map every choice
# label to its choice text, and retain only the four columns listed below.
df = df.assign(
    stem=df['question'].apply(lambda question: question['stem']),
    choices=df['question'].apply(
        lambda question: {option['label']: option['text'] for option in question['choices']}
    ),
)[['id', 'stem', 'choices', 'answerKey']]



In [None]:
# Display the DataFrame after the question fields were flattened.
df

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Distinct labels that occur in the answer-key column.
df["answerKey"].unique()

# Baseline Model: Random Guessing

In [None]:
# Seed Python's RNG so the random baseline (and the accuracy computed from it)
# is reproducible across kernel restarts.
random.seed(42)

answer_choices = df["choices"]

# For every question, pick one of its choice labels at random.
random_predictions = [random.choice(list(choices)) for choices in answer_choices]

In [None]:
# Preview the first few random predictions instead of dumping the whole list.
random_predictions[:10]

# Compute the accuracy for this model  

In [None]:
# Score the random guesses against the answer keys.
correct_answers = df["answerKey"]
accuracy_of_the_baseline_model = accuracy_score(
    y_true=correct_answers,
    y_pred=random_predictions,
)
print("Random Guessing Baseline Accuracy:", accuracy_of_the_baseline_model)

# Preprocessing Phase

In [None]:
# Tokenize each question stem into a list of tokens (stored in a temporary column).
df['tokens'] = df['stem'].map(word_tokenize)

In [None]:
# Overwrite the stem column with the contents of the tokens column.
df["stem"] = df['tokens']

In [None]:
# The temporary tokens column is no longer needed once stem holds the tokens.
df = df.drop('tokens', axis=1)

In [None]:
# Convert all words to lowercase (currently disabled)
#df["stem"] = df["stem"].apply(lambda x: [word.lower() for word in x])

In [None]:
# Display the tokenized question stems.
df["stem"]

# Remove Stop Words

In [None]:
# English stop words from NLTK, stored as a set for fast membership tests.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded beforehand — confirm.
stop_words = set(stopwords.words('english'))

In [None]:
# Display the stop-word set.
stop_words

In [None]:
# Remove stop words from the tokens.
# The tokens are not lowercased anywhere above, while the NLTK stop-word list is
# lowercase, so the comparison is done on the lowercased token. Otherwise
# capitalized stop words such as "The" or "What" would be kept. The surviving
# tokens keep their original casing.
df["stem"] = df["stem"].apply(
    lambda x: [word for word in x if word.lower() not in stop_words]
)

In [None]:
# Remove punctuation from the tokens.
# `word in string.punctuation` is a substring test against one long string, so
# multi-character punctuation tokens such as "``", "''", "..." or "--" were
# never removed. A token is now dropped when every character in it is a
# punctuation character (an empty token is dropped too, as before).
punctuation_chars = set(string.punctuation)
df["stem"] = df["stem"].apply(
    lambda x: [word for word in x if not all(ch in punctuation_chars for ch in word)]
)

In [None]:
# Inspect the filtered tokens of the first question stem.
df["stem"][0]

In [None]:
def tokenize_answers(choices):
    """Tokenize the text of every answer choice.

    Parameters
    ----------
    choices : dict
        Mapping of choice label to the choice text.

    Returns
    -------
    dict
        Mapping of the same labels, in the same order, to the token list
        produced by ``word_tokenize`` for that choice text.
    """
    return {label: word_tokenize(text) for label, text in choices.items()}

In [None]:
# Replace each label -> text mapping with a label -> token-list mapping.
df["choices"] = df["choices"].map(tokenize_answers)

In [None]:
# Display the tokenized answer choices.
df["choices"] 

# Load the Models

In [None]:
# Directory that holds the pretrained embedding files. It can be overridden with
# the EMBEDDINGS_DIR environment variable, so the notebook is not tied to one
# user's machine; the default keeps the location that was hard-coded before.
EMBEDDINGS_DIR = os.environ.get('EMBEDDINGS_DIR', 'C:\\Users\\IMOE001\\Desktop')

word2vec_model_path = os.path.join(EMBEDDINGS_DIR, 'GoogleNews-vectors-negative300.bin')
fasttext_model_path = os.path.join(EMBEDDINGS_DIR, 'wiki-news-300d-1M.vec')

In [None]:
# GloVe vectors are fetched by name through gensim's downloader.
glove_model = gensim.downloader.load('glove-wiki-gigaword-300')
# Word2Vec vectors are read from the local file in binary word2vec format.
word2vec_model = KeyedVectors.load_word2vec_format(word2vec_model_path, binary=True)
# FastText vectors are read from the local .vec file with the loader's default settings.
fasttext_model = KeyedVectors.load_word2vec_format(fasttext_model_path)

In [None]:
# Probe the Word2Vec vocabulary with two differently-cased spellings of one word.
words_to_test = ['king','King']

for word in words_to_test:
    # Report missing words first, otherwise print the stored vector.
    if word not in word2vec_model:
        print(f"'{word}' is not present in the vocabulary.")
        continue
    print(f"Vector representation of '{word}': {word2vec_model[word]}")


In [None]:
def number_of_words_without_embedding(the_column,the_column2 ,  model):
    """Count the tokens that have no entry in ``model``.

    Parameters
    ----------
    the_column : iterable of token lists
        One token list per row (the question stems).
    the_column2 : iterable of dict
        One mapping per row whose values are token lists (the answer choices).
    model : container supporting ``in``
        Embedding model, or any container, used for the membership test.

    Returns
    -------
    int
        Total number of token occurrences, across both inputs, for which
        ``token not in model``. Repeated tokens are counted every time.
    """
    missing_in_stems = sum(
        1
        for tokens in the_column
        for token in tokens
        if token not in model
    )
    missing_in_choices = sum(
        1
        for choice_map in the_column2
        for tokens in choice_map.values()
        for token in tokens
        if token not in model
    )
    return missing_in_stems + missing_in_choices

In [None]:
# Count the tokens (question stems plus answer choices) that have no embedding
# in each of the three models.
number_of_words_without_embedding_using_word2vec = number_of_words_without_embedding(df["stem"] , df["choices"] ,  word2vec_model )
number_of_words_w1ithout_embedding_using_fasttext = number_of_words_without_embedding(df["stem"] ,df["choices"], fasttext_model )
number_of_words_w1ithout_embedding_using_Glove = number_of_words_without_embedding(df["stem"] , df["choices"] , glove_model )

In [None]:
# Bar chart comparing how many tokens lack an embedding in each model.
model_names_2 = ['GloVe', 'FastText', 'Word2Vec']
no_of_words_without_embedding_in_each_model = [
    number_of_words_w1ithout_embedding_using_Glove,
    number_of_words_w1ithout_embedding_using_fasttext,
    number_of_words_without_embedding_using_word2vec,
]

fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(
    model_names_2,
    no_of_words_without_embedding_in_each_model,
    color=['#8B4513', '#A0522D', '#CD853F', '#D2691E'],
)
ax.set_xlabel('Model')
ax.set_ylabel('Number of words without embedding')
ax.set_title('Number of words without embedding in each Model')

plt.show()


In [None]:
def convert_question_to_vector(words, model):
    """Average the embeddings of the tokens that ``model`` knows.

    Parameters
    ----------
    words : iterable of str
        Tokens of one question.
    model : embedding model
        Must support ``token in model``, ``model[token]`` and ``model.vector_size``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all tokens found in ``model``, or a
        zero vector of length ``model.vector_size`` when none is found.
    """
    known_vectors = [model[token] for token in words if token in model]
    if not known_vectors:
        # No token has an embedding: fall back to the zero vector.
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

In [None]:
# One averaged question vector per row, stored in a separate column per embedding model.
for question_column, embedding_model in (
    ("question_vectors_word2vec", word2vec_model),
    ("question_vectors_fasttext", fasttext_model),
    ("question_vectors_glove", glove_model),
):
    df[question_column] = df["stem"].apply(
        convert_question_to_vector, args=(embedding_model,)
    )

In [None]:
def convert_answer_to_vector(choices, model):
    """Turn every tokenized answer choice into one averaged embedding vector.

    Parameters
    ----------
    choices : dict
        Mapping of choice label to the token list of that choice.
    model : embedding model
        Must support ``token in model``, ``model[token]`` and ``model.vector_size``.

    Returns
    -------
    dict
        Mapping of the same labels, in the same order, to the element-wise mean
        of the vectors of the tokens found in ``model``. A choice with no known
        token maps to a zero vector of length ``model.vector_size``.
    """

    def _mean_embedding(tokens):
        # Average only the tokens the model has a vector for.
        known_vectors = [model[token] for token in tokens if token in model]
        if known_vectors:
            return np.mean(known_vectors, axis=0)
        return np.zeros(model.vector_size)

    return {label: _mean_embedding(tokens) for label, tokens in choices.items()}

In [None]:
# One label -> averaged-vector mapping per row, stored in a separate column per embedding model.
for answer_column, embedding_model in (
    ("answer_vectors_word2vec", word2vec_model),
    ("answer_vectors_fastext", fasttext_model),
    ("answer_vectors_glove", glove_model),
):
    df[answer_column] = df["choices"].apply(
        convert_answer_to_vector, args=(embedding_model,)
    )

# Making Predictions Using the Models

In [None]:
def cosine_similarity(x, y):
    """Cosine similarity between two vectors.

    Parameters
    ----------
    x, y : numpy.ndarray
        One-dimensional vectors of equal length.

    Returns
    -------
    float
        ``dot(x, y) / (|x| * |y|)``. When either vector has zero magnitude the
        cosine is undefined; ``0.0`` is returned in that case instead of the
        NaN (and RuntimeWarning) that the 0/0 division would produce.
    """
    magnitude_x = np.sqrt(np.sum(x**2))
    magnitude_y = np.sqrt(np.sum(y**2))

    # A zero vector has no direction: treat it as "not similar" instead of
    # dividing by zero.
    if magnitude_x == 0 or magnitude_y == 0:
        return 0.0

    return np.dot(x, y) / (magnitude_x * magnitude_y)

In [None]:
# Pick the answer whose vector has the maximum cosine similarity with the question vector.
def get_predicted_answer(question_vector, answer_vectors):
    """Return the label of the answer most similar to the question.

    Parameters
    ----------
    question_vector : numpy.ndarray
        Vector of the question.
    answer_vectors : dict
        Mapping of answer label to the vector of that answer.

    Returns
    -------
    The label with the highest cosine similarity; on ties the first such label
    in iteration order wins. When no similarity is comparable (for example
    every score is NaN because a vector is all zeros) the first label is
    returned, so a prediction is produced for every non-empty
    ``answer_vectors``. ``None`` is returned only for an empty mapping.
    """
    # Deterministic fallback: NaN scores never satisfy `>` below, which
    # previously left the result as None.
    max_label = next(iter(answer_vectors), None)
    max_similarity = -2  # strictly below the smallest possible cosine similarity (-1)
    for label, answer_vector in answer_vectors.items():
        similarity = cosine_similarity(question_vector, answer_vector)
        if similarity > max_similarity:
            max_similarity = similarity
            max_label = label
    return max_label

In [None]:
def _predict_from_columns(question_column, answer_column):
    """Row-wise prediction using the named question-vector and answer-vector columns of df."""
    return df.apply(
        lambda row: get_predicted_answer(row[question_column], row[answer_column]),
        axis=1,
    )


predicted_answers_use_word2vec = _predict_from_columns("question_vectors_word2vec", "answer_vectors_word2vec")
predicted_answers_use_fasttex = _predict_from_columns("question_vectors_fasttext", "answer_vectors_fastext")
predicted_answers_use_glove = _predict_from_columns("question_vectors_glove", "answer_vectors_glove")

In [None]:
# Fraction of questions for which the Word2Vec-based prediction matches the answer key.
accuracy_of_word2vec = accuracy_score(
    y_true=df["answerKey"],
    y_pred=predicted_answers_use_word2vec,
)
print("Accuracy of word2vec:", accuracy_of_word2vec)

In [None]:
# Fraction of questions for which the FastText-based prediction matches the answer key.
accuracy_of_fastext = accuracy_score(
    y_true=df["answerKey"],
    y_pred=predicted_answers_use_fasttex,
)
print("Accuracy of fasttex:", accuracy_of_fastext)

In [None]:
# Rows without a GloVe prediction default to label 'A'.
predicted_answers_use_glove = predicted_answers_use_glove.fillna('A')

In [None]:
# Fraction of questions for which the GloVe-based prediction matches the answer key.
accuracy_of_glove = accuracy_score(
    y_true=df["answerKey"],
    y_pred=predicted_answers_use_glove,
)
print("Accuracy of glove:", accuracy_of_glove)

# Plot the Results 

In [None]:
# Bar chart comparing the accuracy of the three embedding models and the random baseline.
model_names = ['GloVe', 'FastText', 'Word2Vec', 'Baseline']
the_accuracy_score_for_models = [
    accuracy_of_glove,
    accuracy_of_fastext,
    accuracy_of_word2vec,
    accuracy_of_the_baseline_model,
]

fig, ax = plt.subplots(figsize=(10, 8))
ax.bar(
    model_names,
    the_accuracy_score_for_models,
    color=['#FFC0CB', '#FF69B4', '#FF1493', '#DB7093'],
)
ax.set_xlabel('Model')
ax.set_ylabel('Accuracy')
ax.set_title('Comparison of Model Accuracies')
ax.set_ylim(0, 1)  # accuracy is a fraction, so fix the y-axis to [0, 1]

plt.show()
