## Fast Text Embeddings

In [1]:
from sklearn.linear_model import LogisticRegression
from gensim.utils import simple_preprocess
from gensim.models import fasttext
from datautils import documents_vector
import datautils
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
import pandas as pd
# Tab-separated training dataset passed to datautils.split_holdout_dataset.
PATH = './Dataset/power-gb-train.tsv'
# Directory prefix under which the grid-search result CSV files are written.
RES_DIR = './Results/'

In the cell below we run two nested grid searches: one over the hyperparameters of the FastText model (context window and vector size), and one over the hyperparameters of the logistic regression. We are looking for the best combination of these hyperparameters.

In [None]:

# Hold-out split of the dataset; the test portion is deliberately ignored
# during model selection.
X_train, y_train, X_val, y_val, _, _ = datautils.split_holdout_dataset(PATH)

# Tokenise every document with gensim's simple_preprocess.
tr_fold = list(map(simple_preprocess, X_train))
val_fold = list(map(simple_preprocess, X_val))

# Outer grid: FastText vector size and context window.
hyperparameters = {
    "vector_size": [150, 300, 600],
    "window": [10, 20, 30, 40, 50],
}
param_grid = list(ParameterGrid(hyperparameters))

# Inner grid: logistic-regression hyperparameters. It does not depend on the
# embedding configuration, so it is built once instead of on every iteration.
hyperparameters1 = {
    "penalty": ["l2"],
    "C": [0.1, 1.0, 10.0, 100.0, 1000.0, 1500],
    "solver": ["lbfgs"],
    "max_iter": [100, 200, 500, 700],
}
param_grid1 = list(ParameterGrid(hyperparameters1))

# Column layout of every result CSV. "Recall" and "F1 Score" are two separate
# columns (a missing comma previously fused them into "RecallF1 Score").
RESULT_COLUMNS = [
    "penalty",
    "C",
    "solver",
    "max_iter",
    "Precision",
    "Recall",
    "F1 Score",
    "Vector_size",
    "Window",
]

for par in param_grid:
    # Train a FastText model on the training documents only.
    model_ftx = fasttext.FastText(
        sentences=tr_fold,
        vector_size=par['vector_size'],
        window=par['window'],
        min_count=2,
        workers=10,
    )

    # Save word vectors
    word_vectors = model_ftx.wv
    word_vectors.save(f"./Embeddings/ftx-w{par['window']}-s{par['vector_size']}.kv")

    # Pooling: one vector per document. Stored under new names so the raw
    # X_train / X_val documents are not clobbered inside the loop.
    X_train_vec = documents_vector(tr_fold, model_ftx)
    X_val_vec = documents_vector(val_fold, model_ftx)

    # One record per logistic-regression configuration; the DataFrame is
    # built once at the end rather than grown with pd.concat on each step.
    records = []

    for par1 in param_grid1:
        # Fit a logistic regression with the current hyperparameters.
        model = LogisticRegression(**par1)
        model.fit(X_train_vec, y_train)

        # Score on the validation set. F1 is macro-averaged; recall and
        # precision are called without `average`, so scikit-learn's default
        # applies to them.
        y_val_pred = model.predict(X_val_vec)
        f1_macro = f1_score(y_val, y_val_pred, average="macro")
        recall = recall_score(y_val, y_val_pred)
        precision = precision_score(y_val, y_val_pred)

        print(f"Parameters: {par1}")
        print(f"\tF1 score: {f1_macro}")

        # Keep the hyperparameter combination together with its scores.
        records.append(
            {
                "penalty": par1["penalty"],
                "C": par1["C"],
                "solver": par1["solver"],
                "max_iter": par1["max_iter"],
                "Precision": precision,
                "Recall": recall,
                "F1 Score": f1_macro,
                "Vector_size": par['vector_size'],
                "Window": par['window'],
            }
        )

    # One CSV per FastText configuration.
    results_df = pd.DataFrame(records, columns=RESULT_COLUMNS)
    results_df.to_csv(RES_DIR+f"results-Logistic-ftx-w{par['window']}-s{par['vector_size']}.csv", index=False)

Here we retrain the model with the best combination of hyperparameters found above (logistic regression settings, vector size and context window) and evaluate it on the test set.

In [None]:
# Best FastText hyperparameters (context window and vector size). They are
# named here so that the model and the saved file name always agree and the
# cell does not depend on the `par` loop variable of the grid-search cell.
BEST_VECTOR_SIZE = 600
BEST_WINDOW = 50

# Hold-out split; only the training and test portions are needed here.
X_train, y_train, _, _, X_test, y_test = datautils.split_holdout_dataset(PATH)

# Tokenise the documents with gensim's simple_preprocess.
tr_fold = list(map(simple_preprocess, X_train))
test_fold = list(map(simple_preprocess, X_test))

# Train FastText on the training documents with the best settings.
model_ftx = fasttext.FastText(
    sentences=tr_fold,
    vector_size=BEST_VECTOR_SIZE,
    window=BEST_WINDOW,
    min_count=2,
    workers=10,
)

# Save word vectors
word_vectors = model_ftx.wv
word_vectors.save(f"./Embeddings/ftx-w{BEST_WINDOW}-s{BEST_VECTOR_SIZE}.kv")

# Pooling: one vector per document.
X_train = documents_vector(tr_fold, model_ftx)
X_test = documents_vector(test_fold, model_ftx)

# Logistic regression with the best combination of hyperparameters.
model = LogisticRegression(solver='lbfgs', penalty='l2', C=10, max_iter=500)
model.fit(X_train, y_train)

# Compute F1 score (macro-averaged), recall and precision on the test set.
y_test_pred = model.predict(X_test)
f1_macro = f1_score(y_test, y_test_pred, average="macro")
recall = recall_score(y_test, y_test_pred)
precision = precision_score(y_test, y_test_pred)

print(f"\tF1 score: {f1_macro}")
print(f"\tRecall: {recall}")
print(f"\tPrecision: {precision}")

In the cell below we download a word-analogy test file in order to evaluate our best FastText model.

In [None]:
import requests
from pathlib import Path

# Only the training documents are needed to refit the FastText model.
X_train, _, _, _, _, _ = datautils.split_holdout_dataset(PATH)
tr_fold = list(map(simple_preprocess, X_train))

# Refit FastText with the best context window and vector size.
model_ftx = fasttext.FastText(
    sentences=tr_fold,
    vector_size=600,
    window=50,
    min_count=2,
    workers=10,
)

# Download the word-analogy test file.
url = "https://raw.githubusercontent.com/nicholas-leonard/word2vec/master/questions-words.txt"
test_file = 'resources/questions-words.txt'
# A timeout keeps the cell from hanging; raise_for_status stops an HTTP error
# page from being written to disk as if it were the test file.
response = requests.get(url, timeout=30)
response.raise_for_status()
questions = response.content.decode()

# Make sure the target directory exists before writing.
Path(test_file).parent.mkdir(parents=True, exist_ok=True)
with open(test_file, mode='w', encoding='utf-8') as outputfile:
    outputfile.write(questions)
print(questions[:1000])

# evaluate_word_analogies is a KeyedVectors method, so it is called on
# model_ftx.wv rather than on the FastText model itself.
# Word analogy without dummy4unknown.
ftx_large_analogy = model_ftx.wv.evaluate_word_analogies(test_file)
# Word analogy with dummy4unknown.
ftx_large_analogy_dummy = model_ftx.wv.evaluate_word_analogies(test_file, dummy4unknown=True)

In [None]:
# Display the word-analogy evaluation result computed without dummy4unknown.
ftx_large_analogy

In [None]:
# Display the word-analogy evaluation result computed with dummy4unknown=True.
ftx_large_analogy_dummy