In [89]:
import pandas as pd
import gensim
import gensim.downloader as api
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize


 Task 1: Evaluation of the word2vec-google-news-300 Pre-trained Model

In [82]:
# Load the pre-trained embeddings evaluated in Tasks 1 and 2 through gensim's downloader API.
# `model` is the word2vec baseline for Task 1; `model2`-`model5` are the GloVe models compared in Task 2.
model = api.load('word2vec-google-news-300') # https://www.geeksforgeeks.org/nlp-gensim-tutorial-complete-guide-for-beginners/?ref=header_search

model2 = api.load('glove-wiki-gigaword-300') # https://radimrehurek.com/gensim/models/word2vec.html
model3 = api.load('glove-wiki-gigaword-200') # https://radimrehurek.com/gensim/models/word2vec.html

model4 = api.load('glove-twitter-100')
model5 = api.load('glove-wiki-gigaword-100')




In [131]:
def evaluate_synonym(model, model_name, dataset_path='synonym.csv'):
    """Score an embedding model on a multiple-choice synonym test.

    For every row of the dataset, the in-vocabulary option with the highest
    similarity to the question word is taken as the model's guess. Rows whose
    question word, or all four options, are missing from the vocabulary are
    labelled ``'guess'`` and are excluded from the accuracy.

    Side effects: writes ``<model_name>-details.csv`` (one row per question)
    and prints the summary list.

    Args:
        model: Object exposing ``key_to_index`` and ``similarity(w1, w2)``,
            either directly or through a ``wv`` attribute.
        model_name: Label used in the summary and as the prefix of the
            details file name.
        dataset_path: CSV file with ``question`` and ``answer`` columns and the
            four candidate words in positional columns 2-5 (zero-based).
            Defaults to ``'synonym.csv'``.

    Returns:
        ``[model_name, vocabulary_size, correct, answered, accuracy]``. The
        three counts are strings with a trailing comma; accuracy is a float
        and is NaN when no question could be answered.
    """
    # Resolve the vector store once: it does not change between rows, and it
    # must be available for the summary even when the dataset has no rows.
    vectors = model.wv if hasattr(model, 'wv') else model
    vocabularies = vectors.key_to_index
    similarity = vectors.similarity

    data = pd.read_csv(dataset_path)
    result = []
    for _, row in data.iterrows():
        question = row['question']
        answer = row['answer']
        options = [row.iloc[2], row.iloc[3], row.iloc[4], row.iloc[5]]
        known_options = [option for option in options if option in vocabularies]

        # NOTE: the trailing commas on the text fields are kept on purpose so
        # the details file and summary keep the format already produced.
        if question not in vocabularies or not known_options:
            result.append([question+',', answer+',', 'NULL', 'guess'])
            continue

        # max() keeps the first option on ties and, unlike a running best that
        # starts at 0, still selects an option when every similarity is <= 0.
        best_option = max(known_options, key=lambda option: similarity(question, option))
        label = 'correct' if best_option == answer else 'wrong'
        result.append([question+',', answer+',', best_option+',', label])

    output = pd.DataFrame(result, columns=['question', 'answer', 'guess', 'label'])
    output.to_csv(model_name+'-details.csv', index=False)

    labels = [entry[3] for entry in result]
    correct_labels = labels.count('correct')
    questions_answered = len(labels) - labels.count('guess')
    # Accuracy is undefined when nothing was answered; report NaN explicitly
    # instead of relying on a zero division.
    accuracy = correct_labels / questions_answered if questions_answered else float('nan')

    summary = [model_name, str(len(vocabularies))+',', str(correct_labels)+',', str(questions_answered)+',', accuracy]
    print(summary)
    return summary

In [132]:
# Task 1: evaluate the word2vec baseline; the returned summary row is kept for the comparison table.
list1 = evaluate_synonym(model, 'word2vec-google-news-300')

['word2vec-google-news-300', '3000000,', '70,', '79,', 0.8860759493670886]


Task 2: Comparison with Other Pre-trained Models

In [133]:
# Task 2: run the same synonym evaluation on each GloVe model; each call returns one summary row.
list2 = evaluate_synonym(model2, 'glove-wiki-gigaword-300')
list3 = evaluate_synonym(model3, 'glove-wiki-gigaword-200')
list4 = evaluate_synonym(model4, 'glove-twitter-100')
list5 = evaluate_synonym(model5, 'glove-wiki-gigaword-100')

['glove-wiki-gigaword-300', '400000,', '71,', '80,', 0.8875]
['glove-wiki-gigaword-200', '400000,', '68,', '80,', 0.85]
['glove-twitter-100', '1193514,', '39,', '78,', 0.5]
['glove-wiki-gigaword-100', '400000,', '65,', '80,', 0.8125]


In [134]:
def analyze(list, list2, list3, list4, list5):
    """Combine five per-model summary rows into one table saved as 'analysis.csv'.

    Each argument is one row holding, in order: model, vocabulary_size,
    correct_labels, questions_answered and accuracy. The file is written to
    the current working directory without the index column. Returns None.
    """
    header = ['model', 'vocabulary_size', 'correct_labels', 'questions_answered', 'accuracy']
    rows = [list, list2, list3, list4, list5]
    summary_table = pd.DataFrame(rows, columns=header)
    summary_table.to_csv('analysis.csv', index=False)
# Write the five per-model summary rows to 'analysis.csv'.
analyze(list1, list2, list3, list4, list5)

Task 3: Train Your Own Models

In [135]:
def preprocessed_text(book):
    """Read a UTF-8 text file and return it as a list of tokenized sentences.

    The text is split into sentences with ``sent_tokenize``; each sentence is
    lower-cased, split with ``word_tokenize``, and reduced to its purely
    alphabetic tokens.

    Args:
        book: Path of the text file to read.

    Returns:
        A list with one list of tokens per sentence, in document order.
    """
    with open(book, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()
    return [
        [word for word in word_tokenize(sentence.lower()) if word.isalpha()]
        for sentence in sent_tokenize(raw_text)
    ]
# Sanity check: the preprocessing result for the book is a list.
type(preprocessed_text('book1.txt'))

list

In [136]:
def train(text:list, window:int, embedding:int):
    """Train a gensim Word2Vec model on tokenized sentences.

    Args:
        text: Tokenized sentences passed to Word2Vec as ``sentences``.
        window: Value passed to Word2Vec as ``window``.
        embedding: Value passed to Word2Vec as ``vector_size``.

    Returns:
        A ``(name, model)`` tuple, where name is ``Word2Vec_e<embedding>_w<window>``.
    """
    trained = gensim.models.Word2Vec(sentences=text, vector_size=embedding, window=window)
    label = f"Word2Vec_e{embedding}_w{window}"
    return label, trained

In [137]:
# Tokenize the book and train a Word2Vec model on it with window=5 and 300-dimensional vectors.
text = preprocessed_text('book1.txt')
mymodel_name, my_model= train(text, 5, 300)

In [138]:
# Evaluate the self-trained model on the same synonym test used for the pre-trained models.
evaluate_synonym(my_model, mymodel_name)

['Word2Vec_e300_w5', '3026,', '5,', '14,', 0.35714285714285715]


['Word2Vec_e300_w5', '3026,', '5,', '14,', 0.35714285714285715]