In [89]:
import pandas as pd
import gensim
import gensim.downloader as api
from nltk.tokenize import word_tokenize, sent_tokenize


 Task 1: Evaluation of the word2vec-google-news-300 Pre-trained Model

In [82]:
# Pre-trained embeddings, fetched by name through gensim's downloader (`api`).
# `model` is the Task 1 baseline; `model2`..`model5` are the Task 2 comparison models.
# The trailing URLs are the tutorials/documentation consulted while writing this cell.
model = api.load('word2vec-google-news-300') # https://www.geeksforgeeks.org/nlp-gensim-tutorial-complete-guide-for-beginners/?ref=header_search

# Two GloVe models that differ only in the embedding size given in their names (300 vs 200).
model2 = api.load('glove-wiki-gigaword-300') # https://radimrehurek.com/gensim/models/word2vec.html
model3 = api.load('glove-wiki-gigaword-200') # https://radimrehurek.com/gensim/models/word2vec.html

# Two 100-dimensional GloVe models that differ only in corpus (Twitter vs Wiki+Gigaword).
model4 = api.load('glove-twitter-100')
model5 = api.load('glove-wiki-gigaword-100')




In [131]:
# Function debugged with ChatGPT (OpenAI, 2023; ChatGPT (Model 4.0)) https://chat.openai.com/chat
def evaluate_synonym(model, model_name):
    """Score an embedding model on the multiple-choice synonym test in 'synonym.csv'.

    Each row of 'synonym.csv' must provide a 'question' column, an 'answer'
    column and four option words in column positions 2-5. For every row the
    in-vocabulary option with the highest ``similarity(question, option)`` is
    taken as the model's guess and labelled 'correct' or 'wrong'. A row is
    labelled 'guess' (with 'NULL' as the guess word) when the question word,
    or every option word, is missing from the model's vocabulary.

    Parameters
    ----------
    model : object
        Either exposes ``key_to_index`` and ``similarity`` directly, or holds
        them on a ``wv`` attribute (e.g. a trained Word2Vec model).
    model_name : str
        First element of the returned summary and prefix of the details file.

    Returns
    -------
    list
        ``[model_name, '<vocabulary size>,', '<correct>,', '<answered>,', accuracy]``.
        The trailing commas inside the strings are part of the existing output
        format and are kept as-is. ``accuracy`` is 0 when no question could be
        answered without guessing.

    Side effects
    ------------
    Reads 'synonym.csv', writes '<model_name>-details.csv' and prints the
    summary list.
    """
    data = pd.read_csv('synonym.csv')

    # Resolve vocabulary and similarity once, before the loop: they do not
    # depend on the row, and the summary needs the vocabulary size even when
    # the dataset has no rows.
    vectors = model.wv if hasattr(model, 'wv') else model
    vocabularies = vectors.key_to_index
    similarity = vectors.similarity

    result = []
    for _, row in data.iterrows():
        question = row['question']
        answer = row['answer']
        options = [row.iloc[2], row.iloc[3], row.iloc[4], row.iloc[5]]

        # Only options known to the model can be scored.
        known_options = [option for option in options if option in vocabularies]

        if question not in vocabularies or not known_options:
            result.append([question+',', answer+',', 'NULL', 'guess'])
            continue

        # Pick the best option for THIS row only. The winner is chosen among
        # the row's own candidates regardless of sign, so a row whose scores
        # are all <= 0 still gets its own guess instead of inheriting the
        # previous row's result. Ties resolve to the earliest option.
        guess = max(known_options, key=lambda option: similarity(question, option))
        label = 'correct' if guess == answer else 'wrong'
        result.append([question+',', answer+',', guess+',', label])

    # Per-question details, one row per question in dataset order.
    output = pd.DataFrame(result, columns=['question', 'answer', 'guess', 'label'])
    output.to_csv(model_name+'-details.csv', index=False)

    # Accuracy is measured over the questions answered without guessing.
    label_counts = output['label'].value_counts()
    correct_labels = label_counts.get('correct', 0)
    questions_answered = label_counts.sum() - label_counts.get('guess', 0)
    accuracy = correct_labels / questions_answered if questions_answered > 0 else 0

    summary = [model_name, str(len(vocabularies))+',', str(correct_labels)+',', str(questions_answered)+',', accuracy]
    print(summary)
    return summary


In [132]:
# Task 1: score the word2vec-google-news-300 embeddings on the synonym test.
# The returned summary row is kept in `list1` for the analysis table built later.
list1 = evaluate_synonym(model, 'word2vec-google-news-300')

['word2vec-google-news-300', '3000000,', '70,', '79,', 0.8860759493670886]


Task 2: Comparison with Other Pre-trained Models

In [133]:
# Task 2: run the same synonym evaluation on the four GloVe models.
# Each call also writes '<model name>-details.csv' and prints its summary row.
list2 = evaluate_synonym(model2, 'glove-wiki-gigaword-300')
list3 = evaluate_synonym(model3, 'glove-wiki-gigaword-200')
list4 = evaluate_synonym(model4, 'glove-twitter-100')
list5 = evaluate_synonym(model5, 'glove-wiki-gigaword-100')

['glove-wiki-gigaword-300', '400000,', '71,', '80,', 0.8875]
['glove-wiki-gigaword-200', '400000,', '68,', '80,', 0.85]
['glove-twitter-100', '1193514,', '39,', '78,', 0.5]
['glove-wiki-gigaword-100', '400000,', '65,', '80,', 0.8125]


In [134]:
def analyze(list, list2, list3, list4, list5):
    """Write five model summary rows to 'analysis.csv'.

    Each argument is one summary row with five values, in the order model,
    vocabulary_size, correct_labels, questions_answered, accuracy. The file is
    written without the DataFrame index and any existing 'analysis.csv' is
    overwritten. Nothing is returned.
    """
    header = ['model', 'vocabulary_size', 'correct_labels', 'questions_answered', 'accuracy']
    summary_rows = [list, list2, list3, list4, list5]
    pd.DataFrame(summary_rows, columns = header).to_csv('analysis.csv', index = False)
# Persist the summary rows of the five pre-trained models to 'analysis.csv'.
analyze(list1, list2, list3, list4, list5)

Task 3: Train your Own Models

In [135]:
def preprocessed_text(book):
    """Read a UTF-8 text file and return it as a list of tokenized sentences.

    The text is split into sentences with ``sent_tokenize``; each sentence is
    lowercased, split into tokens with ``word_tokenize`` and reduced to the
    tokens that are purely alphabetic.

    Parameters
    ----------
    book : path of the text file to read.

    Returns
    -------
    list of list of str
        One inner list per detected sentence, in document order. A sentence
        with no alphabetic token yields an empty inner list.
    """
    with open(book, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()

    # Build the result in one pass: outer level is sentences, inner level is
    # the lowercase tokens that survive the isalpha() filter.
    return [
        [word for word in word_tokenize(sentence.lower()) if word.isalpha()]
        for sentence in sent_tokenize(raw_text)
    ]

# Sanity check: display the return type of the preprocessing on one book (result is not stored).
# NOTE(review): this reads 'book1.txt', while the training cell reads 'five_books.txt' - confirm both files are expected.
type(preprocessed_text('book1.txt'))


list

In [136]:
def train(text:list, window:int, embedding:int):
    """Train a gensim Word2Vec model and return it together with a descriptive name.

    Parameters
    ----------
    text : list
        Training sentences, passed to Word2Vec as ``sentences``.
    window : int
        Passed to Word2Vec as ``window``.
    embedding : int
        Passed to Word2Vec as ``vector_size``.

    Returns
    -------
    tuple
        ``(name, trained_model)`` where name is ``"Word2Vec_e<embedding>_w<window>"``.
        All other Word2Vec settings are left at the library defaults.
    """
    trained = gensim.models.Word2Vec(sentences=text, vector_size=embedding, window=window)
    return f"Word2Vec_e{embedding}_w{window}", trained

In [137]:
# Task 3: tokenize the training corpus once and reuse it for all four models.
text = preprocessed_text('five_books.txt')

# Embedding size 300, with two different window sizes.
my_model_name_1, my_model_1= train(text, 5, 300) # window 5 embedding 300
my_model_name_2, my_model_2= train(text, 10, 300) # window 10 embedding 300

# Embedding size 200, with the same two window sizes.
my_model_name_3, my_model_3= train(text, 5, 200) # window 5 embedding 200
my_model_name_4, my_model_4= train(text, 10, 200) # window 10 embedding 200


In [138]:
# Evaluate the four self-trained models on the same synonym test.
# Each call writes '<model name>-details.csv'; the summary rows are merged into 'analysis.csv' later.
result1 = evaluate_synonym(my_model_1, my_model_name_1)
result2 = evaluate_synonym(my_model_2, my_model_name_2)
result3 = evaluate_synonym(my_model_3, my_model_name_3)
result4 = evaluate_synonym(my_model_4, my_model_name_4)

['Word2Vec_e300_w5', '3026,', '5,', '14,', 0.35714285714285715]


['Word2Vec_e300_w5', '3026,', '5,', '14,', 0.35714285714285715]

In [None]:
def merge_dataframe(result1, result2, result3, result4):
    """Add four summary rows to 'analysis.csv' and return the merged table.

    The existing 'analysis.csv' is read, the four result rows are added using
    the same column layout, and the file is rewritten without the index.

    Rows are keyed by the first column (the model name): if a model name is
    already present, the most recent row replaces the older one. This keeps
    the cell idempotent - re-running it does not pile up duplicate rows in
    'analysis.csv'.

    Parameters
    ----------
    result1, result2, result3, result4 : list
        Summary rows with one value per column of 'analysis.csv'.

    Returns
    -------
    pandas.DataFrame
        The merged table, with a fresh 0..n-1 index.
    """
    current_df = pd.read_csv('analysis.csv')
    new_df = pd.DataFrame([result1, result2, result3, result4], columns=current_df.columns)

    # ignore_index avoids repeated index labels (0..k from each input frame)
    # in the returned table.
    merged_df = pd.concat([current_df, new_df], ignore_index=True)

    # Keep only the latest row per model name so repeated runs do not
    # duplicate entries.
    key_column = merged_df.columns[0]
    merged_df = merged_df.drop_duplicates(subset=key_column, keep='last').reset_index(drop=True)

    merged_df.to_csv('analysis.csv', index = False)
    return merged_df
    

In [None]:
# Add the four self-trained model summaries to 'analysis.csv' and display the merged table.
merge_dataframe(result1, result2, result3, result4)