## Pretrained embeddings and fine-tuning

In the first cell we look at the pretrained embeddings available.

In [None]:
import gensim.downloader
import numpy as np
# List the names of all models registered in gensim-data
# (the cell output is the key view of the 'models' entry of the catalogue).
gensim.downloader.info()['models'].keys()

In [None]:

# Download one of the four pretrained models used in this notebook:
# 'word2vec-ruscorpora-300', 'word2vec-google-news-300',
# 'glove-twitter-200', 'glove-wiki-gigaword-300'.
# Change the name below to fetch a different one.
google_model = gensim.downloader.load('word2vec-google-news-300')


In the cell below we define the paths of the downloaded pretrained models. These paths point to where the embeddings are stored on the machine where this notebook was originally executed; replace them with the correct locations of the embeddings on the machine where you are running the notebook.

In [3]:
from gensim.models import KeyedVectors
import os
from pathlib import Path

# Root folder that holds the downloaded embeddings. It defaults to
# "<home>/gensim-data" and can be redirected with the GENSIM_DATA_DIR
# environment variable, so the notebook is not tied to one user's home folder.
GENSIM_DATA_DIR = Path(
    os.environ.get("GENSIM_DATA_DIR", Path.home() / "gensim-data")
).expanduser()


def _embedding_path(model_name, file_name):
    """Return the forward-slash path of `file_name` inside `<root>/<model_name>/<model_name>/`."""
    return (GENSIM_DATA_DIR / model_name / model_name / file_name).as_posix()


# Locations of the four pretrained embedding files (plain strings).
EMBEDDING_FILE_GIGA = _embedding_path('glove-wiki-gigaword-300', 'glove-wiki-gigaword-300.txt')
EMBEDDING_FILE_GOOGLE = _embedding_path('word2vec-google-news-300', 'GoogleNews-vectors-negative300.bin')
EMBEDDING_FILE_TW = _embedding_path('glove-twitter-200', 'glove-twitter-200.txt')
EMBEDDING_FILE_RUSS = _embedding_path('word2vec-ruscorpora-300', 'word2vec-ruscorpora-300')



In [1]:
from sklearn.linear_model import LogisticRegression
from gensim.utils import simple_preprocess
from gensim.models import fasttext
from datautils import documents_vector
from datautils import documents_vector_pre
import datautils
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import f1_score
import pandas as pd
from sklearn.linear_model import LogisticRegression
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
from datautils import documents_vector
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer
import nltk
from gensim.models import Word2Vec
from gensim.models import FastText
from nltk.tokenize import word_tokenize
import nltk
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
# Training data file (.tsv); passed to datautils.split_holdout_dataset.
PATH = './Dataset/power-gb-train.tsv'
# Prefix prepended to the names of the CSV result files written by the grid-search cells.
RES_DIR = './Results/'


In [6]:
# Read the pretrained Google News vectors from the local binary word2vec file.
google_model = KeyedVectors.load_word2vec_format(EMBEDDING_FILE_GOOGLE, binary=True)

# Hold-out split of the dataset; the last two returned values are not needed here.
X_train, y_train, X_val, y_val, _, _ = datautils.split_holdout_dataset(PATH)

# Tokenise each document with gensim's simple_preprocess.
tr_fold = [simple_preprocess(document) for document in X_train]
val_fold = [simple_preprocess(document) for document in X_val]

# Pooling step: turn the tokenised documents into the feature matrices
# used by the classifier (see datautils.documents_vector_pre).
X_train1 = documents_vector_pre(tr_fold, google_model)
X_va1l = documents_vector_pre(val_fold, google_model)

We download the word-analogy test file and compute the word-analogy scores for the pretrained model.

In [None]:
import os

import requests

url = "https://raw.githubusercontent.com/nicholas-leonard/word2vec/master/questions-words.txt"
test_file = 'resources/questions-words.txt'

# Fail loudly on a network or HTTP error instead of silently saving an error
# page as the analogy file, and never hang forever on a stalled connection.
response = requests.get(url, timeout=60)
response.raise_for_status()
questions = response.content.decode()

# The target folder may not exist on a fresh checkout.
os.makedirs(os.path.dirname(test_file), exist_ok=True)
with open(test_file,mode='w',encoding='utf-8') as outputfile:
    outputfile.write(questions)
print(questions[:1000])

# Word analogy with dummy4unknown=True
w2v_large_analogy_dummy = google_model.evaluate_word_analogies(test_file,dummy4unknown=True)
# Word analogy with the default settings
w2v_large_analogy = google_model.evaluate_word_analogies(test_file)


In [None]:
# Display the word-analogy result computed with the default settings.
w2v_large_analogy

In [None]:
# Display the word-analogy result computed with dummy4unknown=True.
w2v_large_analogy_dummy

In the cell below we apply a grid search over the hyperparameters of the logistic regression

In [None]:


from sklearn.metrics import recall_score
from sklearn.metrics import precision_score

import os

# Hyperparameter grid explored for the logistic regression.
hyperparameters1 = {
        "penalty": ["l2"],
        "C": [0.1, 1.0, 10.0, 100.0, 1000.0,1500.0],
        "solver": ["lbfgs"],
        "max_iter": [100, 200, 500,700],
    }

param_grid1 = list(ParameterGrid(hyperparameters1))

# One dict per parameter combination; the DataFrame is built once at the end
# instead of being re-concatenated (and fully copied) on every iteration.
result_rows = []

for par1 in param_grid1:
    model = LogisticRegression(**par1)
    model.fit(X_train1, y_train)

    # Compute F1 score, precision and recall on the validation set.
    # NOTE(review): F1 is macro-averaged while precision/recall use
    # scikit-learn's default averaging - confirm this asymmetry is intended.
    y_val_pred = model.predict(X_va1l)
    f1_macro = f1_score(y_val, y_val_pred, average="macro")
    precision = precision_score(y_val, y_val_pred)
    recall = recall_score(y_val, y_val_pred)

    print(f"Parameters: {par1}")
    print(f"\tF1 score: {f1_macro}")
    result_rows.append(
        {
            "penalty": par1["penalty"],
            "C": par1["C"],
            "solver": par1["solver"],
            "max_iter": par1["max_iter"],
            "F1 Score": f1_macro,
            "Precision": precision,
            "Recall": recall,
        }
    )

results_df = pd.DataFrame(
    result_rows,
    columns=["penalty", "C", "solver", "max_iter", "F1 Score","Precision","Recall"],
)

# The results folder may not exist on a fresh checkout.
os.makedirs(RES_DIR, exist_ok=True)
# Change the file name when the pretrained model is changed.
results_df.to_csv(RES_DIR + "results-Logistic-google.csv", index=False)

Here we define a function that fine-tunes a word2vec model using the NLTK punkt tokenizer.

In [27]:
from nltk.tokenize import word_tokenize
import nltk
from gensim.models import Word2Vec

def w2v_tok(nome,dimensione,google_model):
    """Fine-tune a Word2Vec model on the training split, seeded with pretrained vectors.

    The training sentences are lower-cased and tokenized with NLTK's
    ``word_tokenize``. Every vocabulary word that also exists in
    ``google_model`` is initialised with its pretrained vector, then the model
    is trained for 10 epochs and its word vectors are saved to
    ``./Embeddings/w2v-pretrained-tok-{nome}``.

    Args:
        nome: Label of the pretrained embedding; used in the output file name.
        dimensione: Vector size of the new model; must equal the vector size
            of ``google_model``.
        google_model: Pretrained gensim keyed vectors used to seed the model.

    Raises:
        ValueError: If ``dimensione`` differs from ``google_model.vector_size``.
    """
    import os

    # Fail early with a clear message instead of a shape error halfway through.
    if google_model.vector_size != dimensione:
        raise ValueError(
            f"dimensione={dimensione} does not match the pretrained vector size "
            f"{google_model.vector_size}"
        )

    # Make sure the output folder exists before the training work starts.
    os.makedirs("./Embeddings", exist_ok=True)

    # Tokenizer data required by word_tokenize.
    nltk.download('punkt')

    # New Word2Vec model with the same dimensionality as the pretrained embedding.
    finetuned_model = Word2Vec(vector_size=dimensione, min_count=1)

    # Only the training documents are needed for fine-tuning.
    X_train, *_ = datautils.split_holdout_dataset(PATH)

    tokenized_corpus = [word_tokenize(sentence.lower()) for sentence in X_train]
    finetuned_model.build_vocab(tokenized_corpus)

    # Words shared by our vocabulary and the pretrained one. Scanning our own
    # vocabulary avoids copying the whole pretrained vocabulary into a set.
    pretrained_vocab = google_model.key_to_index
    intersecting_words = [
        word for word in finetuned_model.wv.key_to_index if word in pretrained_vocab
    ]

    # Start the shared words from their pretrained vectors.
    for word in intersecting_words:
        finetuned_model.wv[word] = google_model[word]

    # Fine-tune on our corpus and persist the resulting word vectors.
    finetuned_model.train(tokenized_corpus, total_examples=len(tokenized_corpus), epochs=10)
    finetuned_model.wv.save(f"./Embeddings/w2v-pretrained-tok-{nome}")
    

Here we define a function that fine-tunes a fastText model using the NLTK punkt tokenizer.

In [28]:
from nltk.tokenize import word_tokenize
import nltk
from gensim.models import FastText
from gensim.models import Word2Vec

def ftx_tok(nome,dimensione,google_model):
    """Fine-tune a FastText model on the training split, seeded with pretrained vectors.

    The training sentences are lower-cased and tokenized with NLTK's
    ``word_tokenize``. Every vocabulary word that also exists in
    ``google_model`` is assigned its pretrained vector, then the model is
    trained for 10 epochs and its word vectors are saved to
    ``./Embeddings/ftx-pretrained-tok-{nome}``.

    Args:
        nome: Label of the pretrained embedding; used in the output file name.
        dimensione: Vector size of the new model; must equal the vector size
            of ``google_model``.
        google_model: Pretrained gensim keyed vectors used to seed the model.

    Raises:
        ValueError: If ``dimensione`` differs from ``google_model.vector_size``.
    """
    import os

    # Fail early with a clear message instead of a shape error halfway through.
    if google_model.vector_size != dimensione:
        raise ValueError(
            f"dimensione={dimensione} does not match the pretrained vector size "
            f"{google_model.vector_size}"
        )

    # Make sure the output folder exists before the training work starts.
    os.makedirs("./Embeddings", exist_ok=True)

    # Tokenizer data required by word_tokenize.
    nltk.download('punkt')

    # New FastText model with the same dimensionality as the pretrained embedding.
    finetuned_model = FastText(vector_size=dimensione, min_count=1)

    # Only the training documents are needed for fine-tuning.
    X_train, *_ = datautils.split_holdout_dataset(PATH)

    tokenized_corpus = [word_tokenize(sentence.lower()) for sentence in X_train]
    finetuned_model.build_vocab(tokenized_corpus)

    # Words shared by our vocabulary and the pretrained one. Scanning our own
    # vocabulary avoids copying the whole pretrained vocabulary into a set.
    pretrained_vocab = google_model.key_to_index
    intersecting_words = [
        word for word in finetuned_model.wv.key_to_index if word in pretrained_vocab
    ]

    # Assign the pretrained vectors to the shared words.
    # NOTE(review): FastText appears to rebuild `wv.vectors` from
    # `wv.vectors_vocab` and the n-gram buckets while training, so this seeding
    # may be overwritten - confirm, and consider seeding `wv.vectors_vocab`.
    for word in intersecting_words:
        finetuned_model.wv[word] = google_model[word]

    # Fine-tune the FastText model on our corpus and persist its word vectors.
    finetuned_model.train(tokenized_corpus, total_examples=len(tokenized_corpus), epochs=10)
    finetuned_model.wv.save(f"./Embeddings/ftx-pretrained-tok-{nome}")


The cell below defines the function that performs the fine-tuning with the BERT-based tokenizer. Here we fine-tune a word2vec model.

In [29]:
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer
import nltk
from gensim.models import Word2Vec
from gensim.models import FastText


def w2v_huggin(nome,dimensione,google_model):
    """Fine-tune a Word2Vec model on the training split using the BERT tokenizer.

    The training sentences are lower-cased and tokenized with the
    ``bert-base-uncased`` tokenizer. Every vocabulary token that also exists in
    ``google_model`` is initialised with its pretrained vector, then the model
    is trained for 10 epochs and its word vectors are saved to
    ``./Embeddings/w2v-pretrained-huggin-{nome}``.

    Args:
        nome: Label of the pretrained embedding; used in the output file name.
        dimensione: Vector size of the new model; must equal the vector size
            of ``google_model``.
        google_model: Pretrained gensim keyed vectors used to seed the model.

    Raises:
        ValueError: If ``dimensione`` differs from ``google_model.vector_size``.
    """
    import os

    # Fail early with a clear message instead of a shape error halfway through.
    if google_model.vector_size != dimensione:
        raise ValueError(
            f"dimensione={dimensione} does not match the pretrained vector size "
            f"{google_model.vector_size}"
        )

    # Make sure the output folder exists before the training work starts.
    os.makedirs("./Embeddings", exist_ok=True)

    # Tokenizer taken from BERT (no NLTK resources are needed here).
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    # New Word2Vec model with the same dimensionality as the pretrained embedding.
    finetuned_model = Word2Vec(vector_size=dimensione, min_count=1)

    # Only the training documents are needed for fine-tuning.
    X_train, *_ = datautils.split_holdout_dataset(PATH)

    tokenized_corpus = [tokenizer.tokenize(sentence.lower()) for sentence in X_train]
    finetuned_model.build_vocab(tokenized_corpus)

    # Tokens shared by our vocabulary and the pretrained one. Scanning our own
    # vocabulary avoids copying the whole pretrained vocabulary into a set.
    pretrained_vocab = google_model.key_to_index
    intersecting_words = [
        word for word in finetuned_model.wv.key_to_index if word in pretrained_vocab
    ]

    # Start the shared tokens from their pretrained vectors.
    for word in intersecting_words:
        finetuned_model.wv[word] = google_model[word]

    # Fine-tune on our corpus and persist the resulting word vectors.
    finetuned_model.train(tokenized_corpus, total_examples=len(tokenized_corpus), epochs=10)
    finetuned_model.wv.save(f"./Embeddings/w2v-pretrained-huggin-{nome}")
   

The cell below defines the function that performs the fine-tuning with the BERT-based tokenizer. Here we fine-tune a fastText model.

In [30]:
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer
import nltk
from gensim.models import Word2Vec
from gensim.models import FastText

def ftx_hug(nome,dimensione,google_model):
    """Fine-tune a FastText model on the training split using the BERT tokenizer.

    The training sentences are lower-cased and tokenized with the
    ``bert-base-uncased`` tokenizer. Every vocabulary token that also exists in
    ``google_model`` is assigned its pretrained vector, then the model is
    trained for 10 epochs and its word vectors are saved to
    ``./Embeddings/ftx-pretrained-huggin-{nome}``.

    Args:
        nome: Label of the pretrained embedding; used in the output file name.
        dimensione: Vector size of the new model; must equal the vector size
            of ``google_model``.
        google_model: Pretrained gensim keyed vectors used to seed the model.

    Raises:
        ValueError: If ``dimensione`` differs from ``google_model.vector_size``.
    """
    import os

    # Fail early with a clear message instead of a shape error halfway through.
    if google_model.vector_size != dimensione:
        raise ValueError(
            f"dimensione={dimensione} does not match the pretrained vector size "
            f"{google_model.vector_size}"
        )

    # Make sure the output folder exists before the training work starts.
    os.makedirs("./Embeddings", exist_ok=True)

    # Tokenizer taken from BERT (no NLTK resources are needed here).
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    # New FastText model with the same dimensionality as the pretrained embedding.
    finetuned_model = FastText(vector_size=dimensione, min_count=1)

    # Only the training documents are needed for fine-tuning.
    X_train, *_ = datautils.split_holdout_dataset(PATH)

    tokenized_corpus = [tokenizer.tokenize(sentence.lower()) for sentence in X_train]
    finetuned_model.build_vocab(tokenized_corpus)

    # Tokens shared by our vocabulary and the pretrained one. Scanning our own
    # vocabulary avoids copying the whole pretrained vocabulary into a set.
    pretrained_vocab = google_model.key_to_index
    intersecting_words = [
        word for word in finetuned_model.wv.key_to_index if word in pretrained_vocab
    ]

    # Assign the pretrained vectors to the shared tokens.
    # NOTE(review): FastText appears to rebuild `wv.vectors` from
    # `wv.vectors_vocab` and the n-gram buckets while training, so this seeding
    # may be overwritten - confirm, and consider seeding `wv.vectors_vocab`.
    for word in intersecting_words:
        finetuned_model.wv[word] = google_model[word]

    # Fine-tune the FastText model on our corpus and persist its word vectors.
    finetuned_model.train(tokenized_corpus, total_examples=len(tokenized_corpus), epochs=10)
    finetuned_model.wv.save(f"./Embeddings/ftx-pretrained-huggin-{nome}")

    

For each pretrained embedding we fine-tune both of our models (word2vec and fastText), using two different tokenizers.

In [None]:
vet=[EMBEDDING_FILE_GOOGLE,EMBEDDING_FILE_GIGA,EMBEDDING_FILE_TW,EMBEDDING_FILE_RUSS]

# Settings for each entry of `vet`, in the same order:
# (label used in the saved file names, vector size, stored as binary word2vec?).
# Google News and RusCorpora are loaded as binary files, the two GloVe
# embeddings (gigaword, twitter) as text files.
embedding_settings = [
    ('google', 300, True),
    ('giga', 300, False),
    ('twitter', 200, False),
    ('russ', 300, True),
]

for embedding_file, (label, vector_size, is_binary) in zip(vet, embedding_settings):
    google_model = KeyedVectors.load_word2vec_format(embedding_file, binary=is_binary)
    # Every combination of model (word2vec / fastText) and tokenizer is
    # fine-tuned from the embedding that was just loaded.
    for fine_tune in (w2v_tok, w2v_huggin, ftx_hug, ftx_tok):
        fine_tune(label, vector_size, google_model)
    

In the cell below we evaluate the fine-tuned models on the validation set. We run a grid search over the hyperparameters of the logistic regression. Only the fastText fine-tuned models are considered here.

In [None]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from gensim.models import KeyedVectors

import os

# Fine-tuned fastText embeddings; each one is evaluated with a logistic regression.
vet=['Embeddings/ftx-pretrained-tok-giga','Embeddings/ftx-pretrained-huggin-giga',
     'Embeddings/ftx-pretrained-tok-twitter','Embeddings/ftx-pretrained-huggin-twitter',
     'Embeddings/ftx-pretrained-tok-google','Embeddings/ftx-pretrained-huggin-google',
     'Embeddings/ftx-pretrained-tok-russ','Embeddings/ftx-pretrained-huggin-russ']

# Hyperparameter grid of the logistic regression (the same for every embedding).
hyperparameters1 = {
        "penalty": ["l2"],
        "C": [0.1, 1.0, 10.0, 100.0, 1000.0,1500.0],
        "solver": ["lbfgs"],
        "max_iter": [100, 200, 500,700],
    }
param_grid1 = list(ParameterGrid(hyperparameters1))

# The split and the tokenisation take no embedding as input, so they are done
# once instead of once per embedding.
X_train, y_train, X_val, y_val, _, _ = datautils.split_holdout_dataset(PATH)
tr_fold = list(map(simple_preprocess, X_train))
val_fold = list(map(simple_preprocess, X_val))

# The results folder may not exist on a fresh checkout.
os.makedirs(RES_DIR, exist_ok=True)

for embedding_path in vet:
    # Load the fine-tuned vectors and pool the documents with them.
    finetuned_model = KeyedVectors.load(embedding_path)
    X_train1 = documents_vector_pre(tr_fold, finetuned_model)
    X_va1l = documents_vector_pre(val_fold, finetuned_model)

    # One dict per parameter combination; the DataFrame is built once per
    # embedding instead of being re-concatenated on every iteration.
    result_rows = []
    for par1 in param_grid1:
        model = LogisticRegression(**par1)
        model.fit(X_train1, y_train)

        # Compute F1 score, precision and recall on the validation set.
        # NOTE(review): F1 is macro-averaged while precision/recall use
        # scikit-learn's default averaging - confirm this asymmetry is intended.
        y_val_pred = model.predict(X_va1l)
        f1_macro = f1_score(y_val, y_val_pred, average="macro")
        precision = precision_score(y_val, y_val_pred)
        recall = recall_score(y_val, y_val_pred)

        print(f"Parameters: {par1}")
        print(f"\tF1 score: {f1_macro}")
        result_rows.append(
            {
                "penalty": par1["penalty"],
                "C": par1["C"],
                "solver": par1["solver"],
                "max_iter": par1["max_iter"],
                "F1 Score": f1_macro,
                "Precision": precision,
                "Recall": recall,
            }
        )

    results_df = pd.DataFrame(
        result_rows,
        columns=["penalty", "C", "solver", "max_iter", "F1 Score","Precision","Recall"],
    )

    # The CSV is named after the embedding file, e.g.
    # 'Embeddings/ftx-pretrained-tok-giga' -> 'results-Logistic-ftx-pretrained-tok-giga.csv'.
    embedding_name = embedding_path.rsplit("/", 1)[-1]
    results_df.to_csv(RES_DIR + f"results-Logistic-{embedding_name}.csv", index=False)

In the cell below we evaluate the fine-tuned models on the validation set. We run a grid search over the hyperparameters of the logistic regression. Only the word2vec fine-tuned models are considered here.

In [None]:
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from gensim.models import KeyedVectors

import os

# Fine-tuned word2vec embeddings; each one is evaluated with a logistic regression.
vet=['Embeddings/w2v-pretrained-tok-giga','Embeddings/w2v-pretrained-huggin-giga',
     'Embeddings/w2v-pretrained-tok-twitter','Embeddings/w2v-pretrained-huggin-twitter',
     'Embeddings/w2v-pretrained-tok-google','Embeddings/w2v-pretrained-huggin-google',
     'Embeddings/w2v-pretrained-tok-russ','Embeddings/w2v-pretrained-huggin-russ']

# Hyperparameter grid of the logistic regression (the same for every embedding).
hyperparameters1 = {
        "penalty": ["l2"],
        "C": [0.1, 1.0, 10.0, 100.0, 1000.0,1500.0],
        "solver": ["lbfgs"],
        "max_iter": [100, 200, 500,700],
    }
param_grid1 = list(ParameterGrid(hyperparameters1))

# The split and the tokenisation take no embedding as input, so they are done
# once instead of once per embedding.
X_train, y_train, X_val, y_val, _, _ = datautils.split_holdout_dataset(PATH)
tr_fold = list(map(simple_preprocess, X_train))
val_fold = list(map(simple_preprocess, X_val))

# The results folder may not exist on a fresh checkout.
os.makedirs(RES_DIR, exist_ok=True)

for embedding_path in vet:
    # Load the fine-tuned vectors and pool the documents with them.
    finetuned_model = KeyedVectors.load(embedding_path)
    X_train1 = documents_vector_pre(tr_fold, finetuned_model)
    X_va1l = documents_vector_pre(val_fold, finetuned_model)

    # One dict per parameter combination; the DataFrame is built once per
    # embedding instead of being re-concatenated on every iteration.
    result_rows = []
    for par1 in param_grid1:
        model = LogisticRegression(**par1)
        model.fit(X_train1, y_train)

        # Compute F1 score, precision and recall on the validation set.
        # NOTE(review): F1 is macro-averaged while precision/recall use
        # scikit-learn's default averaging - confirm this asymmetry is intended.
        y_val_pred = model.predict(X_va1l)
        f1_macro = f1_score(y_val, y_val_pred, average="macro")
        precision = precision_score(y_val, y_val_pred)
        recall = recall_score(y_val, y_val_pred)

        print(f"Parameters: {par1}")
        print(f"\tF1 score: {f1_macro}")
        result_rows.append(
            {
                "penalty": par1["penalty"],
                "C": par1["C"],
                "solver": par1["solver"],
                "max_iter": par1["max_iter"],
                "F1 Score": f1_macro,
                "Precision": precision,
                "Recall": recall,
            }
        )

    results_df = pd.DataFrame(
        result_rows,
        columns=["penalty", "C", "solver", "max_iter", "F1 Score","Precision","Recall"],
    )

    # The CSV is named after the embedding file, e.g.
    # 'Embeddings/w2v-pretrained-tok-giga' -> 'results-Logistic-w2v-pretrained-tok-giga.csv'.
    embedding_name = embedding_path.rsplit("/", 1)[-1]
    results_df.to_csv(RES_DIR + f"results-Logistic-{embedding_name}.csv", index=False)

Here we compute the word analogy for every fasttext fine tuned model

In [28]:

# Fine-tuned fastText embeddings for which the word-analogy accuracy is computed.
vet=['Embeddings/ftx-pretrained-tok-giga','Embeddings/ftx-pretrained-huggin-giga',
     'Embeddings/ftx-pretrained-tok-twitter','Embeddings/ftx-pretrained-huggin-twitter',
     'Embeddings/ftx-pretrained-tok-google','Embeddings/ftx-pretrained-huggin-google',
     'Embeddings/ftx-pretrained-tok-russ','Embeddings/ftx-pretrained-huggin-russ']

# One row per (embedding, dummy4unknown) pair; the DataFrame is built once at
# the end instead of being re-concatenated after every evaluation.
analogy_rows = []

for embedding_path in vet:
    # 'Embeddings/ftx-pretrained-tok-giga' -> family='ftx', tokenizer_tag='tok', corpus='giga'
    family, _, tokenizer_tag, corpus = embedding_path.rsplit("/", 1)[-1].split("-")
    # The result labels abbreviate the Hugging Face tokenizer as 'hug'.
    if tokenizer_tag == "huggin":
        tokenizer_tag = "hug"

    # Load the fine-tuned vectors.
    google_model = KeyedVectors.load(embedding_path)

    # Evaluate first with the default setting, then with dummy4unknown=True;
    # the label suffix ('false' / 'true') records which one was used.
    for use_dummy in (False, True):
        w2v_large_analogy = google_model.evaluate_word_analogies(test_file, dummy4unknown=use_dummy)
        analogy_rows.append(
            {
                "Model": f"{family}-{corpus}-{tokenizer_tag}-{str(use_dummy).lower()}",
                "Accuracy": w2v_large_analogy[0],
            }
        )

analogie = pd.DataFrame(analogy_rows, columns=["Model", "Accuracy"])

analogie.to_csv("word_analogies_ftx.csv", index=False)

Here we compute the word analogy for every word2vec fine-tuned model

In [None]:
# Evaluate every fine-tuned word2vec embedding on the word-analogy benchmark
# stored in `test_file` and save the scores to "word_analogies_w2v.csv".
#
# `vet` lists the paths of the saved KeyedVectors files; `model_labels` holds,
# in the same order, the prefix written to the "Model" column for that file.
vet=['Embeddings/w2v-pretrained-tok-giga','Embeddings/w2v-pretrained-huggin-giga',
     'Embeddings/w2v-pretrained-tok-twitter','Embeddings/w2v-pretrained-huggin-twitter',
     'Embeddings/w2v-pretrained-tok-google','Embeddings/w2v-pretrained-huggin-google',
     'Embeddings/w2v-pretrained-tok-russ','Embeddings/w2v-pretrained-huggin-russ']

model_labels = ['w2v-giga-tok', 'w2v-giga-hug',
                'w2v-twitter-tok', 'w2v-twitter-hug',
                'w2v-google-tok', 'w2v-google-hug',
                'w2v-russ-tok', 'w2v-russ-hug']

# Guard against the two parallel lists drifting apart when a model is added.
assert len(vet) == len(model_labels), "every model path needs a matching label"

# One record per (model, dummy4unknown) pair. The DataFrame is built once at
# the end instead of being grown with pd.concat on every iteration.
analogy_records = []

for model_path, model_label in zip(vet, model_labels):
    # NOTE(review): the names `google_model` and `w2v_large_analogy` are kept
    # because other cells of the notebook may read them after this one; here
    # `google_model` holds the fine-tuned model loaded in the current
    # iteration, not the Google News vectors.
    google_model = KeyedVectors.load(model_path)

    # Two evaluations per model, in this order: the "-false" row uses the
    # default behaviour (dummy4unknown=False) and the "-true" row uses
    # dummy4unknown=True. See the gensim documentation for how analogies
    # containing out-of-vocabulary words are treated in each case.
    for suffix, dummy4unknown in (("false", False), ("true", True)):
        w2v_large_analogy = google_model.evaluate_word_analogies(
            test_file, dummy4unknown=dummy4unknown
        )
        analogy_records.append(
            {
                "Model": f"{model_label}-{suffix}",
                # first element of the returned tuple is stored as the score
                "Accuracy": w2v_large_analogy[0],
            }
        )

analogie = pd.DataFrame(analogy_records, columns=["Model", "Accuracy"])

analogie.to_csv("word_analogies_w2v.csv", index=False)