## Test word2vec embeddings

In this notebook we create and train different Word2vec models. We try different combinations of the hyperparameters, in particular the values of the context window and the vector size.

In [2]:
from sklearn.linear_model import LogisticRegression
from gensim.utils import simple_preprocess
from gensim.models import fasttext
from datautils import documents_vector
import datautils
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
import pandas as pd
from sklearn.linear_model import LogisticRegression
from gensim.utils import simple_preprocess
from gensim.models import Word2Vec
from datautils import documents_vector
from datautils import documents_vector_pre
# Dataset file handed to datautils.split_holdout_dataset in the cells below
PATH = './Dataset/power-gb-train.tsv'
# Prefix (directory) prepended to the name of every results CSV written below
RES_DIR = './Results/'

In the cell below we look for the best combination of the hyperparameters for the logistic regression

In [6]:

def file_logic(X_train, X_val, y_train, y_val, w, s):
    """Grid-search logistic-regression hyperparameters for one embedding setup.

    Every combination of the grid defined below is fitted on the training
    data and scored on the validation data. The scores of all combinations
    are written to one CSV file under ``RES_DIR`` whose name contains ``w``
    and ``s``.

    Args:
        X_train: Training features passed to ``LogisticRegression.fit``.
        X_val: Validation features passed to ``LogisticRegression.predict``.
        y_train: Training labels.
        y_val: Validation labels used for scoring.
        w: Context window of the embedding model; it is only recorded in the
            ``Window`` column and in the output file name.
        s: Vector size of the embedding model; it is only recorded in the
            ``Vector_size`` column and in the output file name.

    Returns:
        None. The results are written to disk.
    """
    #definition of the possible hyperparameters of the logistic regression
    hyperparameters1 = {
        "penalty": ["l2"],
        "C": [0.1, 1.0, 10.0, 100.0, 1000.0, 1500],
        "solver": ["lbfgs"],
        "max_iter": [100, 200, 500, 700],
    }

    columns = [
        "penalty", "C", "solver", "max_iter",
        "F1 Score", "Recall", "Precision", "Vector_size", "Window",
    ]

    # Rows are collected in a plain list and turned into a DataFrame once at
    # the end; growing a DataFrame with pd.concat on every iteration copies
    # the whole frame each time.
    rows = []

    for par1 in ParameterGrid(hyperparameters1):
        model = LogisticRegression(**par1)
        model.fit(X_train, y_train)

        # Score the fitted model on the validation set
        y_val_pred = model.predict(X_val)
        f1_macro = f1_score(y_val, y_val_pred, average="macro")
        # Recall and precision are called without `average`, so scikit-learn's
        # default averaging applies to them (unlike the macro-averaged F1).
        recall = recall_score(y_val, y_val_pred)
        precision = precision_score(y_val, y_val_pred)

        print(f"Parameters: {par1}")
        print(f"\tF1 score: {f1_macro}")

        rows.append(
            {
                "penalty": par1["penalty"],
                "C": par1["C"],
                "solver": par1["solver"],
                "max_iter": par1["max_iter"],
                "F1 Score": f1_macro,
                "Recall": recall,
                "Precision": precision,
                "Vector_size": s,
                "Window": w,
            }
        )

    # `columns` fixes the column order of the CSV (and keeps the header even
    # if no row was produced).
    results_df = pd.DataFrame(rows, columns=columns)
    results_df.to_csv(RES_DIR+f"results-Logistic-w2v-batch1-w{w}-s{s}.csv", index=False)

In the cell below we apply a grid search over the values of the context window and vector size of the word2vec model

In [None]:
from gensim.models import KeyedVectors

#definition of the possible hyperparameters for the context window and vector size
hyperparameters2 = {
    "vec": [150, 300, 600],
    "window": [10, 20, 30, 40, 50],
}

param_grid2 = list(ParameterGrid(hyperparameters2))

# The split and the tokenisation depend only on PATH, not on the grid point,
# so they are computed once before the loop. This also means every
# (vec, window) combination is trained and scored on the same split.
X_train, y_train, X_val, y_val, _, _ = datautils.split_holdout_dataset(PATH)

tr_fold = [simple_preprocess(doc) for doc in X_train]
val_fold = [simple_preprocess(doc) for doc in X_val]

for par in param_grid2:
    # Train a Word2Vec model on the tokenised training documents only
    modelw2v = Word2Vec(
        tr_fold,
        vector_size=par["vec"],
        window=par["window"],
        min_count=2,
        workers=10,
    )

    # Build the document features with this model, then run the
    # logistic-regression grid search, which writes one CSV per grid point.
    X_train = documents_vector(tr_fold, modelw2v)
    X_val = documents_vector(val_fold, modelw2v)
    file_logic(X_train, X_val, y_train, y_val, par["window"], par["vec"])

In the cell below we create and train our word2vec model. Here the values of the context window and vector size are already set to the best ones among those we evaluated

In [None]:
# Hold-out split of the dataset: training, validation and test partitions
(X_train, y_train,
 X_val, y_val,
 X_test, y_test) = datautils.split_holdout_dataset(PATH)

# Tokenise every document of each partition with gensim's simple_preprocess
tr_fold = [simple_preprocess(document) for document in X_train]
val_fold = [simple_preprocess(document) for document in X_val]
test_fold = [simple_preprocess(document) for document in X_test]

# Train Word2Vec on the training tokens with the selected hyperparameters
modelw2v = Word2Vec(tr_fold, vector_size=2000, window=1000, min_count=2, workers=10)

# Persist only the word vectors of the trained model
# NOTE(review): the file name says "skip", but `sg` is not passed to Word2Vec
# above, so gensim's default training algorithm is used - confirm the name.
word_vectors = modelw2v.wv
word_vectors.save("./Embeddings/w2v-w1000-v2000-skip.kv")

# Pooling: documents_vector turns each tokenised partition into model features
X_train, X_val, X_test = (
    documents_vector(fold, modelw2v) for fold in (tr_fold, val_fold, test_fold)
)

In this cell we download a test file for the word analogy in order to evaluate our Word2vec model

In [None]:
import os

import requests

#this is the url of test file
url = "https://raw.githubusercontent.com/nicholas-leonard/word2vec/master/questions-words.txt"
test_file = 'resources/questions-words.txt'

# A timeout keeps the cell from hanging forever, and raise_for_status stops
# an HTTP error page from being saved and evaluated as if it were the test set.
response = requests.get(url, timeout=30)
response.raise_for_status()
questions = response.content.decode()

# Make sure the target directory exists before writing the file
os.makedirs(os.path.dirname(test_file), exist_ok=True)
with open(test_file, mode='w', encoding='utf-8') as outputfile:
    outputfile.write(questions)
print(questions[:1000])

# In gensim 4 (the version implied by the `vector_size=` keyword used to train
# the model) the analogy evaluation lives on the KeyedVectors object
# (`model.wv`), not on the Word2Vec model itself.
#we compute the word analogy without dummy4unknown
w2v_large_analogy = modelw2v.wv.evaluate_word_analogies(test_file)
#we compute the word analogy with dummy4unknown
w2v_large_analogy_dummy = modelw2v.wv.evaluate_word_analogies(test_file, dummy4unknown=True)

In [None]:
# Display the analogy evaluation result computed without dummy4unknown
w2v_large_analogy

In [None]:
# Display the analogy evaluation result computed with dummy4unknown=True
w2v_large_analogy_dummy

In the cell below we use the best hyperparameter values found for the logistic regression

In [None]:
# Best logistic-regression hyperparameters. They are defined once and reused
# for both the model and the saved results row so the two cannot drift apart.
best_params = {"penalty": "l2", "C": 10, "solver": "lbfgs", "max_iter": 500}

# Settings of the Word2Vec model the features were built with. They are only
# recorded in the results row and in the output file name.
vector_size = 2000
window = 1000

#we create and train the logistic regression with the best hyperparameters
model = LogisticRegression(**best_params)
model.fit(X_train, y_train)

# Compute F1 score, Recall and Precision on the test set
y_test_pred = model.predict(X_test)
f1_macro = f1_score(y_test, y_test_pred, average="macro")
# Recall and precision are called without `average`, so scikit-learn's default
# averaging applies to them (unlike the macro-averaged F1).
recall = recall_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)

#print of the three scores that we computed
print(f"\tF1 score: {f1_macro}")
print(f"\tRecall {recall}")
print(f"\tPrecision: {precision}")

#we create the dataframe to save the results
# The single result row is built directly; `columns` fixes the column order.
results_df = pd.DataFrame(
    [
        {
            **best_params,
            "F1 Score": f1_macro,
            "Recall": recall,
            "Precision": precision,
            "Vector_size": vector_size,
            "Window": window,
        }
    ],
    columns=[
        "penalty", "C", "solver", "max_iter",
        "F1 Score", "Recall", "Precision", "Vector_size", "Window",
    ],
)

results_df.to_csv(RES_DIR+f"results-Logistic-w2v-test-w{window}-s{vector_size}.csv", index=False)
