In [12]:
import gensim.downloader as api
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import json
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
# Read a JSON Lines file: one JSON document per line.
def load_data(file_path):
    """Load a JSON Lines file and return its records as a new list.

    Each non-blank line of the file is parsed with ``json.loads``. A fresh
    list is built on every call, so loading the same (or another) file twice
    never accumulates records from an earlier call.

    Parameters
    ----------
    file_path : str or path-like
        Path of the file to read. The file is opened as UTF-8 text.

    Returns
    -------
    list
        The parsed JSON values, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # Blank lines (e.g. a trailing newline at EOF) carry no record.
            if not line:
                continue
            records.append(json.loads(line))
    return records

In [17]:
# Lazily-built resources shared by every preprocess_text call.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the shared ``(stemmer, stop_words)`` pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        # Build both before storing either, so a failure while reading the
        # stop-word list does not leave a half-filled cache behind.
        stemmer = PorterStemmer()
        stop_words = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES.update(stemmer=stemmer, stop_words=stop_words)
    return _PREPROCESS_RESOURCES['stemmer'], _PREPROCESS_RESOURCES['stop_words']


def preprocess_text(text):
    """Tokenize, filter and stem a piece of text.

    The text is lower-cased and tokenized with ``word_tokenize``. Tokens that
    are English stop words or are not purely alphanumeric are dropped, and the
    remaining tokens are stemmed with ``PorterStemmer``.

    Filtering happens *before* stemming: the stop-word list holds surface
    forms, so comparing stemmed tokens against it lets stemmed stop words
    slip through the filter.

    Parameters
    ----------
    text : object
        The text to process. Anything that is not a ``str`` yields ``[]``.

    Returns
    -------
    list of str
        The stemmed tokens, in their original order.
    """
    if not isinstance(text, str):
        return []

    stemmer, stop_words = _get_preprocess_resources()
    kept = [
        token for token in word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    ]
    return [stemmer.stem(token) for token in kept]


In [18]:
def get_average_word_vectors(text_samples, word_embedding_model):
    """Embed each text as the mean of its in-vocabulary token vectors.

    Every text is passed through ``preprocess_text``; tokens found in
    ``word_embedding_model`` are looked up and averaged element-wise.

    Parameters
    ----------
    text_samples : sized iterable
        The texts to embed; ``len()`` is taken to size the output.
    word_embedding_model
        Object exposing ``vector_size``, ``in`` membership tests and
        ``model[token]`` lookups.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(text_samples), vector_size)``. A row stays all
        zeros when none of the text's tokens are in the model.
    """
    dim = word_embedding_model.vector_size
    result = np.zeros((len(text_samples), dim))

    for row, sample in enumerate(text_samples):
        found = []
        for token in preprocess_text(sample):
            if token in word_embedding_model:
                found.append(word_embedding_model[token])
        if not found:
            # No known token: leave the zero row in place.
            continue
        result[row] = np.mean(found, axis=0)

    return result

In [19]:
# Short embedding name -> dataset identifier passed to ``api.load``.
_EMBEDDING_DATASETS = {
    'word2vec': 'word2vec-google-news-300',
    'fasttext': 'fasttext-wiki-news-subwords-300',
    'glove': 'glove-wiki-gigaword-300',
}

# Single-slot cache holding at most one entry: {embedding_type: model}.
_LAST_LOADED_EMBEDDING = {}


def load_word_embeddings(embedding_type):
    """Return the pretrained embedding model for ``embedding_type``.

    The most recently loaded model is kept in a single-slot cache, so repeated
    calls with the same ``embedding_type`` (for example from inside a loop)
    return the same object instead of calling ``api.load`` again. Asking for a
    different type drops the cached model before the new one is loaded, so
    this function never keeps more than one model alive itself.

    Parameters
    ----------
    embedding_type : str
        One of ``'word2vec'``, ``'fasttext'`` or ``'glove'``.

    Returns
    -------
    object
        Whatever ``api.load`` returns for the matching dataset.

    Raises
    ------
    ValueError
        If ``embedding_type`` is not one of the supported names.
    """
    try:
        dataset_name = _EMBEDDING_DATASETS[embedding_type]
    except KeyError:
        supported = ', '.join(sorted(_EMBEDDING_DATASETS))
        raise ValueError(
            f"Unknown embedding type {embedding_type!r}; expected one of: {supported}"
        ) from None

    if embedding_type not in _LAST_LOADED_EMBEDDING:
        # Drop our reference to the previous model before loading the next one.
        _LAST_LOADED_EMBEDDING.clear()
        _LAST_LOADED_EMBEDDING[embedding_type] = api.load(dataset_name)
    return _LAST_LOADED_EMBEDDING[embedding_type]

In [20]:
def calculate_similarity(question_vec, choice_vec):
    """Return the cosine similarity between two vectors.

    Each vector is wrapped as a one-row batch for ``cosine_similarity``; the
    single entry of the resulting matrix is returned.
    """
    question_batch = [question_vec]
    choice_batch = [choice_vec]
    similarity_matrix = cosine_similarity(question_batch, choice_batch)
    first_row = similarity_matrix[0]
    return first_row[0]

In [21]:
def logistic_regression(X_train, X_test, y_train, y_test):
    """Fit a default ``LogisticRegression`` and predict labels for ``X_test``.

    Parameters
    ----------
    X_train, y_train
        Training features and labels passed to ``fit``.
    X_test
        Features passed to ``predict``.
    y_test
        Accepted for signature symmetry; not used by this function.

    Returns
    -------
    The value returned by ``predict(X_test)``.
    """
    model = LogisticRegression()
    model.fit(X_train, y_train)
    return model.predict(X_test)


In [22]:
def evaluate_accuracy(true_answers, predicted_answers):
    """Score predictions against the true labels with ``accuracy_score``.

    Parameters
    ----------
    true_answers
        Ground-truth labels.
    predicted_answers
        Predicted labels, aligned with ``true_answers``.

    Returns
    -------
    The value returned by ``accuracy_score(true_answers, predicted_answers)``.
    """
    score = accuracy_score(true_answers, predicted_answers)
    return score

In [24]:
data = load_data('./train_rand_split.json')

for embedding_type in ['word2vec', 'fasttext', 'glove']:
    # Load the pretrained model once per embedding type. It does not change
    # between questions or choices, so it must not be loaded inside the
    # loops below.
    embedding_model = load_word_embeddings(embedding_type)

    # One row per (question, choice) pair. The only feature is the cosine
    # similarity between the averaged question and choice vectors; the label
    # is 1 when the choice is the question's answer key, else 0.
    X = []
    y = []

    for question in data:
        question_text = question['question']['stem']
        question_vector = get_average_word_vectors([question_text], embedding_model)[0]

        for choice in question['question']['choices']:
            choice_text = choice['text']
            choice_vector = get_average_word_vectors([choice_text], embedding_model)[0]
            similarity = calculate_similarity(question_vector, choice_vector)
            X.append([similarity])
            y.append(1 if choice['label'] == question['answerKey'] else 0)

    # NOTE: rows are individual (question, choice) pairs, so both the split
    # and the reported accuracy are per pair, not per question.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    predicted_answers = logistic_regression(X_train, X_test, y_train, y_test)
    accuracy = evaluate_accuracy(y_test, predicted_answers)
    print(f"Accuracy for {embedding_type}: {accuracy}")

KeyboardInterrupt: 