<a href="https://colab.research.google.com/github/Taedriel/ZSL-v2/blob/wordEmbedding/WordsEmbeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preparation part

## import stuff


In [273]:
!yes | pip install transformers wget "wikipedia>=1.4.0" unzip mxnet gluonnlp "scipy>=1.7" scikit-bio wikipedia2vec --quiet --upgrade
!mkdir -p temp article

In [274]:
import numpy as  np
import torch
from transformers import BertTokenizer, BertModel, RobertaModel, RobertaTokenizer
import tensorflow as tf
import gluonnlp as nlp 
import nltk
nltk.download('omw-1.4')
nltk.download('wordnet')
from nltk.corpus import wordnet

import matplotlib as mpl
import matplotlib.pyplot as plt

from wikipedia2vec import Wikipedia2Vec
import wikipedia
wikipedia.set_rate_limiting(True)

import gc
import traceback
import pickle
import json
import math

from tqdm import tqdm
from typing import List, Tuple, Dict, Callable
from os.path import exists, join, abspath
from os import system
from enum import Enum
from time import perf_counter
from scipy.spatial.distance import cityblock

from scipy.stats import SpearmanRConstantInputWarning
from scipy.stats import spearmanr
from skbio import DistanceMatrix
from skbio.tree import nj

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [275]:
import logging
import warnings
from transformers import logging as transformer_logging

FORMAT = '%(levelname)-10s %(message)s'
logging.basicConfig(format=FORMAT, level = logging.INFO, filename = "WordsEmbeddings.log" )

transformer_logging.set_verbosity_error()

warnings.filterwarnings("ignore", category=UserWarning, module='wikipedia')

## Utils classes

### Usefull functions ✅

In [276]:
def dict2csv(filename : str, embeddings : Dict[str, List[float]]) -> None:
    """ write a dict of embeddings under a .CSV file

    the .CSV file is construct with a header looking like this :
    \tembeddings\t | 0 | 1 | 2 | 3 | ...
    where each line contain an embeddings for the word in the first row
    Args:
        filename (str) : a path to the file where the .csv is to be written
        embeddings (Dict[str, List[float]]): a dictionnary of embeddings

    """
    logging.info(f"writing dict to {filename} file")
    try:
        f = open(filename, "w")
    except OSError:
        raise OSError("Could not open file")

    dimension_number = len(next(iter(embeddings.values())))
    with f:
        print("embeddings", *[str(i) for i in range(dimension_number)], sep=",", file=f)
        for tag, embedding in embeddings.items():
            print(tag, *list(map(lambda x: str(float(x)), embedding)), sep=",", file=f)
    logging.info("done")

def sim2dist(mat : List[List[float]], func : Callable[[float], float] \
             = lambda x: 1 - x, hollow : bool = True) -> List[List[float]]:
    """ map the function func to each elements in the matrix

    apply the lambda function func to each element of the matrix. if hollow is set 
    to True, set the diagonal of the matrix to 0.
    Args:
        mat (List[List[float]]) : a matrix of number
        func (Callable[[float], float]) : a simple function to apply to each elem of the matrix
        hollow (bool) : whether to consider the diagonal of the matrix or not
    
    """
    logging.info(f"converting similarity matrix to distance matrix")
    inv_data = [[0 for i in range(len(mat[0]))] for j in range(len(mat))]

    for i, elem in enumerate(mat):
        for j, case in enumerate(elem):
            if i == j and hollow: 
                inv_data[i][j] = 0
            else:
                inv_data[i][j] = func(case)
                
    logging.info("done")
    return inv_data

def print_mat(mat : List[List[float]], format_function : Callable[[float], str]=lambda x: x) -> None:
    """ print a matrice on stdout

    format each number in the matrice using the format_function
    Args:
        mat (List[List[float]]) : a matrix of number
        format_function (Callable[[float], str]) : a simple format function to display numbers froms the matrix
    """
    for line in mat:
        for case in line:
            print(f"{format_function(case):8}", end="")
        print()

### Strategy ✅



In [277]:
class BERTMergeStrategy:
    """ strategy to extract BERT embeddings
    
    different approach exist, 
    see https://raw.githubusercontent.com/lbourdois/blog/master/assets/images/BERT/bert-feature-extraction-contextualized-embeddings.png
    for more possible strategy
    """

    def merge(self, vector : List[List[float]]) -> torch.Tensor:
        raise NotImplementedError

class Sum4LastLayers(BERTMergeStrategy):

    def merge(self, vector : List[List[float]]) -> torch.Tensor:
        return torch.sum(vector[-4:], dim = 0)

class Concat4LastLayer(BERTMergeStrategy):

    def merge(self, vector : List[List[float]]) -> torch.Tensor:
        return torch.concat(vector[-4:], dim = 0)

class SimilarityStrategy:
    """strategy to compute similarity between embeddings. Cosine similarity should 
    be the only valid one in word embeddings, other aren't relevant
    """

    def sim(self, embed1 : List[float], embed2 : List[float]) -> float:
        raise NotImplementedError

class CosineSim(SimilarityStrategy):

    def sim(self, embed1 : List[float], embed2 : List[float]) -> float:
        cos = torch.nn.CosineSimilarity(dim=0)
        return cos(embed1, embed2)

class EuclidianDistSim(SimilarityStrategy):

    def sim(self, embed1 : List[float], embed2 : List[float]) -> float:
        return np.linalg.norm(embed1-embed2)

class ManhattanDistSim(SimilarityStrategy):

    def sim(self, embed1 : List[float], embed2 : List[float]) -> float:
        return cityblock(embed1, embed2)

### Article

In [278]:
class customArticle:
    """ store a wikipedia article for further processing by models"""

    def __init__(self, index : int, title : str, realtitle : str, summary : str, ambiguous : bool):
        self.index : int = index
        self.title : str = title
        self.realtitle : str = realtitle
        self.summary :str = summary
        self.ambiguous :bool = ambiguous

class ArticleRetriever:

    """ Class in charge of retrieveing article from different sources and store them in orer
    to not re retrieve them. 
    
    Act as a proxy between wikipedia and the model. This class save all the article 
    retrieve in a dict using the name given. Further call to this retriever will 
    then load the previously saved file if it hasn't been deleted.
    """

    article_dir = "./article"

    def __init__(self, name : str = None, list_title : List[str] = []):

        self.name : str = name
        if self.name is None:
            self.name = "temp"

        self.list_title : List[str] = list_title
        self.modified : bool = False
        self._load()

    def _load(self):
        if not exists(self.get_filename()):
            self.articles_map = {}
            logging.info(f"creating file {self.get_filename()}")
        else:
            with open(self.get_filename(), "rb") as mapfile:
                self.articles_map = pickle.load(mapfile)
                assert(type(self.articles_map) == type(dict()))
            logging.info(f"loading file {self.get_filename()} with {len(self.articles_map)} articles")
    
    def set_list_vocab(self, new_name : str, list_title : List[str]):
        logging.info("changing vocab, reloading file...")
        self.list_title : List[str] = list_title
        self.name = new_name
        self._load()

    def get_filename(self) -> str:
        """ return the filename of the file where article are saved"""
        return join(WikipediaArticleRetriever.article_dir, self.name)

    def load_article(self, title : str, force_reload : bool = False) -> customArticle:
        """ retrieve an article from wikipedia. If forcce reload is specified, re check the article 
        is summary isn't present or if article not alread in the dict""" 

        if title not in self.articles_map:
            self.modified = True
            realtitle, summary, ambiguous = self._retrieve_article(title)
            self.articles_map[title] = customArticle(len(self.articles_map), title, realtitle, summary, ambiguous)

        if title in self.articles_map and self.articles_map[title].summary == None and force_reload:
            self.modified = True
            realtitle, summary, ambiguous = self._retrieve_article(title)
            self.articles_map[title].summary = summary

        return self.articles_map[title]

    def load_all_articles(self, force_reload : bool = False) -> None:
        """retrieve all article from the vocab from sources"""
        
        logging.info(f"Starting loading articles... [Force reload : {force_reload}]")
        nb_success = 0

        nb_article = len(self.list_title)
        for i, title in tqdm(enumerate(self.list_title), total=nb_article, desc=f"{'loading articles':30}"):
            self.load_article(title, force_reload)

            if self.articles_map[title].summary is not None: 
                nb_success += 1

        logging.info(f"Finished loading {nb_success} article(s) / {nb_article} ({round(nb_success / nb_article * 100, 1)}%)!")
        return self.modified

    def __call__(self, force_reload : bool = False) -> None:
        return self.load_all_articles(force_reload)

    def _retrieve_article(self, title : str, closed_list : List[str]) -> Tuple[str, str, bool]:
        raise NotImplementedError

    def get_article(self, title) -> customArticle:
        """return the article if it's present, else, try to retrieve it"""

        if title not in self.articles_map:
            self.load_article(title)

        return self.articles_map[title]
        
    def save(self):
        """save the articles in a binary format using pickle"""
        logging.info(f"saving the file {self.get_filename()}")
        with open(self.get_filename(), "wb") as mapfile:
            pickle.dump(self.articles_map, mapfile)

class WikipediaArticleRetriever(ArticleRetriever):

    def __init__(self, name: str = None, list_title: List[str] = []):
        super().__init__(name, list_title)

    def get_filename(self) -> str:
        """ return the filename of the file where article are saved"""
        return join(WikipediaArticleRetriever.article_dir, "Wiki-" + self.name)

    def _retrieve_article(self, title : str, closed_list : List = []) -> Tuple[str, str, bool]: 
        closed_list.append(title)
        try:
            article = wikipedia.page(title, auto_suggest=False, redirect=True)
            return (article.title, article.summary, False)

        except wikipedia.PageError as e:
            search_result = wikipedia.search(title, suggestion = False)

            logging.warning(f"{title} misspelled or article missing. Best find is {search_result[0]}")
            if search_result[0] is not None and search_result[0] not in closed_list:            
                return self._retrieve_article(search_result[0], closed_list)  
            else: return (None, None, None)

        except wikipedia.DisambiguationError as e:
            logging.warning(f"{title} is ambiguous, fallback on {e.options[0]}")
            return (None, None, None)
            # if e.options[0] is not None and e.options[0] not in closed_list:
            #     res = self._retrieve_article(e.options[0], closed_list)
            #     return (res[0], res[1], True)
        return (None, None, None)

class WordNetArticleRetriever(ArticleRetriever):

    def __init__(self, name: str = None, list_title: List[str] = []):
        super().__init__(name, list_title)

    def get_filename(self) -> str:
        """ return the filename of the file where article are saved"""
        return join(WikipediaArticleRetriever.article_dir, "Word-" + self.name)

    def _retrieve_article(self, title: str, closed_list : List = []) -> Tuple[str, str, bool]:
        result = wordnet.synsets(title)
        if len(result) > 0:
            return (title, result[0].definition(), True)

        return (None, None, None)

class ArticleViewer():

    def __init__(self, filename):
        self.name = filename

        if not exists(self.name):
            raise FileNotFoundError()
        else:
            with open(self.name, "rb") as mapfile:
                self.articles_map = pickle.load(mapfile)

    def get(self, title):
        return self.articles_map[title]

    def get_all_articles(self):
        return self.articles_map.keys()


### Embeddings operations 

In [279]:
class EmbeddingsLoader:

    """class that load an embeddings file to perform operation on it. Base class
     for multiple operations such as matrix similarity operations.
     """

    def __init__(self, filename : str):

        self.file = filename
        self.embeddings = {}

        self._load_file()

    def _load_file(self):
        try:
            with open(self.file, "r") as f:
                lines = f.readlines()
                
            for line in lines[1:]:
                data = line.split(",")
                self.embeddings[data[0]] = torch.FloatTensor(list(map(float, data[1:])))

        except IOError as e:
            raise IOError(f"No file {self.file}")

class SimilarityMatrix(EmbeddingsLoader):

    def __init__(self, embeddings : Dict[str, List[float]], strategy : SimilarityStrategy):
        super(SimilarityMatrix, self).__init__(embeddings)
        self.strategy = strategy
        self._create_matrix()
        self.computed : bool = False

    def _create_matrix(self) -> None:
        n_tokens = len(self.embeddings)
        self.cosine_sim_matrix : Dict[Dict[float]] = {}
        for tag in self.embeddings.keys():
            self.cosine_sim_matrix[tag] = {}

    def compute_sim(self) -> None:
        """ compute cosine similarity between all vectors """

        closed_list = []

        logging.info("Computing cosine similarity, this could take some time...")
        for tag, vector in tqdm(self.embeddings.items(), total = len(self.embeddings), desc=f"{'computing sim matrix':30}"):

            for otag, other_vector in self.embeddings.items():

                if otag == tag: continue
                # if (tag, otag) in closed_list or (otag, tag) in closed_list: continue

                similarity = self.strategy.sim(vector, other_vector)

                self.cosine_sim_matrix[otag][tag] = similarity
                self.cosine_sim_matrix[tag][otag] = similarity

                # closed_list.append((tag, otag))
                # closed_list.append((otag, tag))

        self.computed = True

    def export_sim_matrix(self, filename):
        if not self.computed:
            self.compute_sim()
        
        try:
            f = open(filename, "w")
        except OSError:
            raise OSError("Could not open file")

        with f:
            print("/", *[tag for tag in self.embeddings.keys()], sep = ",", file = f)

            for tag in self.embeddings.keys():
                print(tag, *[str(round(float(self.cosine_sim_matrix[tag][otag]), 3)) for otag in self.embeddings.keys()], sep = ",", file = f)

    def get_sim_matrix(self) -> Tuple[List[str], List[List[float]]]:
        """return the similarity matrix of the embeddings
        """
        if not self.computed:
            self.compute_sim()

        X = len(self.embeddings)
        matrix = [[0 for j in range(X)] for i in range(X)]
        ids = []
        
        for i, tag in enumerate(self.embeddings.keys()):
            ids.append(tag)
            for j, otag in enumerate(self.embeddings.keys()):
                if i == j:
                    continue

                matrix[i][j] = self.cosine_sim_matrix[tag][otag]
                matrix[j][i] = self.cosine_sim_matrix[tag][otag]

        return ids, matrix

    def sim_between(self, token1 : str, token2 : str) -> float:
        v1 = self.embeddings[token1]
        v2 = self.embeddings[token2]

        if token2 not in self.cosine_sim_matrix[token1] or token1 not in self.cosine_sim_matrix[token2]:
            similarity = self.strategy.sim(v1, v2)
            self.computed = True

            self.cosine_sim_matrix[token1][token2] = similarity
            self.cosine_sim_matrix[token1][token2] = similarity

        return self.cosine_sim_matrix[token1][token2]

### Solver

In [280]:
class Solver(EmbeddingsLoader):

    DEFAULT_MIN_LIST_RESULT = 10

    def __init__(self, embeddings):
        super(Solver, self).__init__(embeddings)

    def get_nearest_embedding_of(self, embedding, nb = 10):

        if nb > len(self.embeddings):
            raise Exception("nb too high, not enough token")

        nearest = []
        for tag, e in self.embeddings.items():

            cos = torch.nn.CosineSimilarity(dim=0)
            similarity = cos(embedding, e)

            nearest.append((tag, similarity))
        
        nearest.sort(key = lambda tup : tup[1])
        return nearest[-1:-nb-1:-1]

    def __call__(self, embeddeding, tag=None):
        result = self.get_nearest_embedding_of(embeddeding, min(Solver.DEFAULT_MIN_LIST_RESULT, len(self.embeddings)))
        if tag is not None:
            print(f"Nearest Word for {tag}:")
        for i in result:
            print(f"\t{i[0]:12}: {round(float(i[1]) * 100, 3)}%")
    
    def score(self, embedding, target):
        target_embeddings = self.embeddings[target]

        cos = torch.nn.CosineSimilarity(dim=0)
        return float(cos(embedding, target_embeddings))

    def least_squared_score(self, embedding, target):
        target_embeddings = self.embeddings[target]
        return float(np.linalg.norm(target_embeddings - embedding))

    def mean_squared_score(self, embedding, target):
        target_embeddings = self.embeddings[target]
        return float(np.square(np.subtract(embedding, target_embeddings)).mean())
        

## Models

In [281]:
class WordToVector:

    def __init__(self, list_tags : List[str] = []):
        self.list_tags = list_tags
        self.embeddings = {}

    def set_list_class(self, list_class : List[str]):
        self.list_tags = list_class
        self.reset_embeddings()

    def check_embeddings_exist(self, filename : str, article_ret : WikipediaArticleRetriever):
        temp_tags_list = self.list_tags
        first_tag = self.list_tags[0]
        self.list_tags = [first_tag]

        self.convert(article_ret)
        try:
            loader = EmbeddingsLoader(filename)
        except OSError:
            self.list_tags = temp_tags_list
            return False

        first_embedding = self.embeddings[first_tag]
        to_compare = loader.embeddings[first_tag]

        intersect = set(temp_tags_list) & set(loader.embeddings.keys())

        self.list_tags = temp_tags_list
        if torch.equal(first_embedding, to_compare) and len(intersect) == len(temp_tags_list):
            return True

        return False

    def convert(self, article_ret : WikipediaArticleRetriever):
        return NotImplementedError

    def reset_embeddings(self):
        self.embeddings.clear()

    def get_embedding_of(self, token):
        if token not in self.embeddings:
            raise Exception(f"no such token {token}")
        
        return self.embeddings[token]

    def get_class_list(self):
        return self.embeddings.keys()

    def export(self, filename):
        """export all the embeddings in filename under a .csv format.
           Raise exception if embeddings hasn't been calculed yet."""

        if len(self.embeddings) == 0:
            raise Exception("Tags not converted yet !")
        
        dict2csv(filename, self.embeddings)

### BERT model

In [282]:
class BERTModel(WordToVector):

    temp_dir = "./temp"

    def __init__(self, list_tag : List[str], big: bool = False, window : int = 100):
        super(BERTModel, self).__init__(list_tag)
        self.window_size = window

        self.model_size = "bert-large-uncased" if big else "bert-base-uncased"

        self.tokenizer = BertTokenizer.from_pretrained(self.model_size, padding=True, truncation=True)
        self.model = BertModel.from_pretrained(self.model_size, output_hidden_states = True)

        self.merging_strategy = Sum4LastLayers()

        self.model.eval()

    def _one_pass(self, inputs):
        with torch.no_grad():
            outputs = self.model(input_ids = inputs["input_ids"], attention_mask = inputs["attention_mask"])

        hidden_states = outputs[2]

        # [# layers, # batches, # tokens, # features] ==> [# tokens, # layers, # features]
        token_embeddings = torch.stack(hidden_states, dim=0)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        token_embeddings = token_embeddings.permute(1,0,2)

        return self.merging_strategy.merge(token_embeddings[0])

    def convert(self, article_ret : WikipediaArticleRetriever):
        """ convert all word in their embeddings"""

        if len(self.list_tags) == 0:
            raise Exception("no tags yet !")

        logging.info("Starting converting tokens...")
        nb_token = len(self.list_tags)
        for i, tag in tqdm(enumerate(self.list_tags), total = nb_token, desc=f"{'converting to embedding':30}"):
            
            if tag in self.embeddings: continue

            article = article_ret.get_article(tag)
            if article.summary is None or self.window_size == 0:
                self.embeddings[tag] = self._one_pass(self.tokenizer(tag, return_tensors = "pt"))
                continue

            sub_ids = self.tokenizer.encode(tag + ". " + article.summary)[0:self.window_size]
            subinputs = {   "input_ids": torch.IntTensor(sub_ids).unsqueeze(0), \
                            "token_type_ids": torch.IntTensor([0 for k in range(len(sub_ids))]).unsqueeze(0), \
                            "attention_mask": torch.IntTensor([1 for k in range(len(sub_ids))]).unsqueeze(0)  }

            self.embeddings[tag] = self._one_pass(subinputs)



### RoBERTa model

In [283]:
class ROBERTAModel(BERTModel):

    def __init__(self, list_tag : List[str], big: bool = False, window : int = 100):
        WordToVector.__init__(self, list_tag)
        self.window_size = window

        self.model_size = "roberta-large" if big else "roberta-base"

        self.tokenizer = RobertaTokenizer.from_pretrained(self.model_size, padding=True, truncation=True)
        self.model = RobertaModel.from_pretrained(self.model_size, output_hidden_states = True)

        self.merging_strategy = Sum4LastLayers()

        self.model.eval()

### DocBERTModel model

In [284]:
class DocBERTModel(BERTModel):

    def __init__(self, list_tag : List[str], big : bool = False):
        WordToVector.__init__(self, list_tag)
        self.window_size = "document"

        self.model_size = "bert-large-uncased" if big else "bert-base-uncased"

        self.tokenizer = BertTokenizer.from_pretrained(self.model_size, padding=True, truncation=True)
        self.model = BertModel.from_pretrained(self.model_size, output_hidden_states = True)

        self.max_size = self.tokenizer.model_max_length
        self.merging_strategy = Sum4LastLayers()

        self.model.eval()

    def _one_pass(self, subinputs):
        with torch.no_grad():
            outputs = self.model(input_ids = subinputs["input_ids"], attention_mask = subinputs["attention_mask"])

        hidden_states = outputs[2]

        # [# layers, # batches, # tokens, # features] ==> [# tokens, # layers, # features]
        token_embeddings = torch.stack(hidden_states, dim=0)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        token_embeddings = token_embeddings.permute(1,0,2)

        return self.merging_strategy.merge(token_embeddings[0])


    def convert(self, article_ret):
        """ convert all word in their embeddings"""

        if len(self.list_tags) == 0:
            raise Exception("no tags yet !")

        logging.info("Starting converting tokens...")
        nb_token = len(self.list_tags)
        for i, tag in tqdm(enumerate(self.list_tags), total = nb_token, desc=f"{'converting to embedding':30}"):
            
            if tag in self.embeddings: continue

            article = article_ret.get_article(tag)

            if article.summary is None:
                logging.warning(f"no article for {tag}")
                self.embeddings[tag] = self._one_pass(self.tokenizer(tag, return_tensors = "pt"))
                continue

            torch_cls = []

            ids = self.tokenizer.encode(article.summary)
            nb_token = len(ids)

            if nb_token < self.max_size:
                self.embeddings[tag] = self._one_pass(self.tokenizer(tag, return_tensors = "pt"))
                continue

            nb_pass = math.ceil(nb_token / self.max_size)
            logging.info(f"{tag} is {nb_pass} pass")

            stop = 50
            for j in range(nb_pass):
                start = stop - 50
                stop = min(nb_token, start + self.max_size)
                
                sub_ids = ids[start:stop]

                subinputs = { "input_ids": torch.IntTensor(sub_ids).unsqueeze(0), \
                            "token_type_ids": torch.IntTensor([0 for k in range(len(sub_ids))]).unsqueeze(0), \
                            "attention_mask": torch.IntTensor([1 for k in range(len(sub_ids))]).unsqueeze(0)  }
                torch_cls.append(self._one_pass(subinputs))
                if stop == nb_token: break

            self.embeddings[tag] = torch.mean(torch.stack(tuple(t for t in torch_cls)), axis=0)

### DocBERTAModel model

In [285]:
class DocBERTAModel(DocBERTModel):

    def __init__(self, list_tag : List[str], big : bool = False):
        WordToVector.__init__(self, list_tag)
        self.window_size = "document"

        self.model_size = "roberta-large" if big else "roberta-base"

        self.tokenizer = RobertaTokenizer.from_pretrained(self.model_size, padding=True, truncation=True)
        self.model = RobertaModel.from_pretrained(self.model_size, output_hidden_states = True)

        self.max_size = self.tokenizer.model_max_length

        self.merging_strategy = Sum4LastLayers()

        self.model.eval()

### Wikipedia2Vec

In [286]:
class Wiki2VecModel(WordToVector):

    file = []

    def __init__(self, list_tag : List[str], size : int = 300):
        WordToVector.__init__(self, list_tag)
        self.window_size = size

        self.model_size = "wikipedia2vec"
        assert size in [100, 300, 500], f"size should be one of this value (100, 300, 500)"
        self.zip_filename = f"enwiki_20180420_{self.window_size}d.pkl.bz2"
        self.unzip_filename = self.zip_filename[:-4]
        
        try:
            self._load()
        except:
            return
        
        self.model = Wikipedia2Vec.load(self.path)

    def _load(self):

        self.path = abspath(self.unzip_filename)

        if exists(self.path):
            return

        try:
            system(f"wget http://wikipedia2vec.s3.amazonaws.com/models/en/2018-04-20/{self.zip_filename}")
        except:
            raise SystemError(f"can't retrieve the file {self.zip_filename}")

        try:
            system(f"bunzip2 ./{self.zip_filename}")
        except:
            raise SystemError(f"can't unzip the file {self.zip_filename}")

    def _retrieve(self, word):
        try: return self.model.get_word_vector(word)
        except: pass

        try: return self.model.get_word_vector(word.capitalize())
        except: pass

        try: return self.model.get_word_vector(word.lower())
        except: pass

        try: return self.model.get_entity_vector(word)
        except: pass

        try: return self.model.get_entity_vector(word.capitalize())
        except: pass
        
        try: return self.model.get_entity_vector(word.lower())
        except: pass

        return None
    
    def _one_turn(self, resolve_dict = {}):
        unk = []

        for word in self.list_tags:
            w = word.replace("_", " ")
            if w in self.embeddings: continue

            if w in resolve_dict:
                embed = self._retrieve(resolve_dict[w])
            else:
                embed = self._retrieve(w)

            if embed is None:
                logging.warning(f"{w} cannot be retrieved.")
                unk.append(word)
            else:
                self.embeddings[w] = torch.from_numpy(embed)
        
        return unk

    def convert(self, ar):
        resolve_filename = f"./temp/{ar.name[:-4]}_resolve.json"
        resolve = {}
        while True: 
            unk_list = self._one_turn(resolve)
        
            if unk_list is None or len(unk_list) == 0:
                break

            print(len(unk_list), "items haven't been found, resolve mode.")

            with open(resolve_filename, 'w') as f:
                resolve_dict = {word: "" for word in unk_list}
                json.dump(resolve_dict, f)

            input("press enter to resume resolve")

            resolve = {}
            with open(resolve_filename, 'r') as f:
                resolve = json.load(f)
                assert(type(resolve) == type(dict()))

In [287]:
gc.collect()

142

# Practical part

## Embeddings to word proba

In [None]:
solver = Solver("/content/animal10-roberta-base-0.csv")

totest = solver.embeddings["cat"]

solver(totest, "cat")
print(solver.score(totest, "dog"))
print(solver.mean_squared_score(totest, "dog"))
print(solver.least_squared_score(totest, "dog"))

##Word to embeddings

In [22]:
#@title List of vocab :
#@markdown * animal10
#@markdown * cifar 10 / 100 
#@markdown * imagenet / subimagenet (200 first)
#@markdown * king / queen 

labels_path = tf.keras.utils.get_file('ImageNetLabels.txt','https://storage.googleapis.com/download.tensorflow.org/data/ImageNetLabels.txt')
imagenet = list(np.array(open(labels_path).read().splitlines()))
subimagenet = imagenet[:200]

animal10 = ["dog", "cat", "horse", "spider", "butterfly", "chicken", "sheep", "cow", "squirrel", "elephant"]

cifar10  = ["airplane", "automobile", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck"]

cifar100 = ["apple", "aquarium_fish", "baby", "bear", "beaver", "bed", "bee", "beetle", "bicycle", "bottle", "bowl", "boy", "bridge", "bus", "butterfly", "camel", "can", \
            "castle", "caterpillar", "cattle", "chair", "chimpanzee", "clock", "cloud", "cockroach", "computer_keyboard", "couch", "crab", "crocodile", "cup", \
            "dinosaur", 'dolphin', 'elephant', 'flatfish', 'forest', 'fox', 'girl', 'hamster', 'house', 'kangaroo', 'lamp', 'lawn_mower', 'leopard', 'lion', \
            'lizard', 'lobster', 'man', 'maple_tree', 'motorcycle', 'mountain', 'mouse', 'mushroom', 'oak_tree', 'orange', 'orchid', 'otter', 'palm_tree', 'pear', \
            'pickup_truck', 'pine_tree', 'plain', 'plate', 'poppy', 'porcupine', 'possum', 'rabbit', 'raccoon', 'ray', 'road', 'rocket', 'rose', 'sea', 'seal', 'shark',\
            'shrew', 'skunk', 'skyscraper', 'snail', 'snake', 'spider', 'squirrel', 'streetcar', 'sunflower', 'sweet_pepper', 'table', 'tank', 'telephone', \
            'television', 'tiger', 'tractor', 'train', 'trout', 'tulip', 'turtle', 'wardrobe', 'whale', 'willow_tree', 'wolf', 'woman', 'worm']

king = ["king", "woman", "man", "queen", "boy", "girl", "male", "female"]

custom = []

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/ImageNetLabels.txt


In [23]:
#@title Choose a dataset
save_name = "cifar100" #@param ["animal10", "cifar10", "cifar100", "king", "imagenet", "subimagenet", "custom"]
mapping_save_list = {
    "animal10": animal10,
    "cifar10" : cifar10,
    "cifar100" : cifar100,
    "king" : king,
    "imagenet" : imagenet,
    "subimagenet": subimagenet,
    "custom" : custom
}

vocab = mapping_save_list[save_name]

In [172]:
#@title Choose a model and params
window_size  = 300 #@param ["0", "100", "200", "300", "400", "500"] {type:"raw"}
is_big       = True #@param {type:"boolean"}
model_choice = "BERT" #@param ["ROBERTA", "BERT", "Wikipedia2Vec", "DocBERT", "DocBERTA"]

if model_choice == "ROBERT":
    model = ROBERTAModel(vocab, big = is_big, window = window_size)
elif model_choice == "BERT":
    model = BERTModel(vocab, big = is_big, window = window_size)
elif model_choice == "Wikipedia2Vec":
    model = Wiki2VecModel(vocab, size = window_size)
elif model_choice == "DocBERT":
    model = DocBERTModel(vocab, big = is_big)
elif model_choice == "DocBERTA":
    model = DocBERTAModel(vocab, big = is_big)
    
articlesRetriever = WikipediaArticleRetriever(save_name + ".art", vocab)

In [173]:
csv_file = f"{save_name}-{model.model_size}-{model.window_size}.csv"

if articlesRetriever():
    articlesRetriever.save()

model.reset_embeddings()

if not model.check_embeddings_exist(csv_file, articlesRetriever):
    model.convert(articlesRetriever)
    model.export(csv_file)

print("\n", len(model.get_class_list()))

loading articles              : 100%|██████████| 100/100 [00:00<00:00, 73700.65it/s]
converting to embedding       : 100%|██████████| 1/1 [00:12<00:00, 12.17s/it]


 1





In [None]:
# wiki2vec_dict = model.model.dictionary

# with open("./temp/wiki_article.txt", "w") as f:
#     for word in wiki2vec_dict.words():
#         print(word.text, file=f)

#     for ent in wiki2vec_dict.entities():
#         print(ent.title, file=f)

## Neighboor joining Tree

In [None]:
neighboor_sim = SimilarityMatrix(csv_file, CosineSim())

ids, data = neighboor_sim.get_sim_matrix()

ids = [tids.replace(" ", "_") for tids in ids]
inv_data  = sim2dist(data, lambda x: 1 - x) 

# print()
# print_mat(data_inv, format_function = lambda x: round(float(x), 2))

In [None]:
dm = DistanceMatrix(inv_data, ids)
tree = nj(dm)

with open(f"{save_name}-{model_choice}-.tree", "w") as f:
    print(tree, file = f)

print("\n\n", tree.ascii_art())

## wikipedia debug

In [None]:
#@title Article to search for { run: "auto", vertical-output: true, display-mode: "both" }
totest = "buck" #@param {type:"string"}

result = wikipedia.search(totest, suggestion = False)
print(result)
print(f"first result is: {result[0]}")

try:
    print(wikipedia.page(totest, auto_suggest=False, redirect=True))
    print(wikipedia.page(result[0], auto_suggest=False, redirect=True))
except Exception as e:
    print(f"best option envisaged: {e.options[0]}")
    print(e)

In [None]:
articleviewer = ArticleViewer("/content/article/cifar100.art")
print(articleviewer.get_all_articles())
articleviewer.get(articleviewer.get_all_articles()[0]).summary

# Test part

## Tests 

### Test superclass

In [296]:
class Test:

    def __init__(self, name):
        self.name = name
        self.vocab = []
    
    def _start(self, model : WordToVector, articlesRetriever : ArticleRetriever):
        articlesRetriever.set_list_vocab(f"{self.name}.art", self.vocab)

        if articlesRetriever(force_reload = False):
            articlesRetriever.save()

        self.save_file = f"test-{self.name}-{model.model_size}-{model.window_size}.csv"
        model.set_list_class(self.vocab)

        if not model.check_embeddings_exist(self.save_file, articlesRetriever):
            model.convert(articlesRetriever)
            model.export(self.save_file)


    def _end(self):
        raise NotImplementedError

    def __call__(self, model, articlesRetriever):

        logging.info(f"Start test {self.name}")

        tic = perf_counter()
        self._start(model, articlesRetriever)
        result = self._end(model)
        toc = perf_counter()

        logging.info(f"End test {self.name}")

        return result, toc - tic

### Embedding distance Test

In [297]:
class EmbeddingDistanceTest(Test):

    def __init__(self, vocab, thresold, name):
        Test.__init__(self, name)

        self.thresold = thresold
        self.vocab = vocab

    def _end(self, model):
        
        sim_matrix = SimilarityMatrix(self.save_file, CosineSim())
        ids, cosine_mat = sim_matrix.get_sim_matrix()

        sim_list = []

        for i, idsa in tqdm(enumerate(ids), total = len(ids), desc=f"{'listing sim matrix':30}"):
            for j, idsb in enumerate(ids):

                if i == j: continue 

                cos_val = cosine_mat[i][j]

                if cos_val >= self.thresold:
                    sim_list.append((idsa, idsb, cos_val))

        sim_list.sort(key=lambda x: x[2], reverse = True)
        return len(sim_list) // 2


### Syntactic test

In [298]:
class SyntacticTest(Test):

    def __init__(self, quadruple_set : Tuple[str, str, str, str], name : str):
        Test.__init__(self, name)

        self.relations : Tuple[str, str, str, str] = quadruple_set

        for relation in self.relations:
            for item in relation:
                if item not in self.vocab:
                    self.vocab.append(item)

    def _end(self, model):

        solver = Solver(self.save_file)
        top1, top3, top5, top10 = 0, 0, 0, 0

        for w1, w2, w3, w4 in tqdm(self.relations, total=len(self.relations), desc=f"{'calculating relations':30}"):

            w1_emb = model.get_embedding_of(w1).numpy()
            w2_emb = model.get_embedding_of(w2).numpy()
            w3_emb = model.get_embedding_of(w3).numpy()

            totest = w1_emb - w2_emb + w3_emb
            result = solver.get_nearest_embedding_of(torch.from_numpy(totest), 13)
            filtered_result = list(filter(lambda x: x != None, map(lambda x: x[0] if x[0] not in [w1, w2, w3] else None, result)))

            if filtered_result[0] == w4:
                top1 += 1

            if w4 in filtered_result[:3]:
                top3 += 1

            if w4 in filtered_result[:5]:
                top5 += 1

            if w4 in filtered_result[:10]:
                top10 += 1

        
        return list(map(lambda x : x / len(self.relations), (top1, top3, top5, top10)))

### Similarity Test

In [299]:
class SimilarityTest(Test):

    def __init__(self, pair_set : Tuple[str, str, float], pair_name):
        Test.__init__(self, pair_name)

        self.pair : Tuple[str, str, float] = pair_set

        for w1, w2, i in self.pair:
            if w1 not in self.vocab:
                self.vocab.append(w1)
            if w2 not in self.vocab:
                self.vocab.append(w2)
    
    def _end(self, model):
        sim_list = []
        i_list = []

        sim_computer = SimilarityMatrix(self.save_file, CosineSim())

        for w1, w2, i in tqdm(self.pair, total=len(self.pair), desc=f"{'calculating pair':30}"):
            sim = sim_computer.sim_between(w1, w2)

            sim_list.append(sim)
            i_list.append(i)

        result = spearmanr(sim_list, i_list)
        return (result.correlation, result.pvalue)

In [300]:
def print_list(listed):
    print(f"{'word':20}{'target':20}{'similarity':10}")
    print("="*50)
    for ida, idb, sim in listed:
        print(f"{ida:20}{idb:20}{round(float(sim), 6):6}")


## Pipeline

In [301]:
class TestPipeline():

    list_test = [ SimilarityTest(nlp.data.WordSim353('all'), "Wordsim353"), 
                  SimilarityTest(nlp.data.SimLex999('all') , "SimLex999") ,
                  SyntacticTest(nlp.data.GoogleAnalogyTestSet(), "GoogleAnalogy"),
                  EmbeddingDistanceTest(cifar100, 0.80, "Imagenet")
                ]

    def __init__(self, model, articleRetriever, list_test = None):
        self.model = model
        self.articleRetriver = articleRetriever

        if list_test == None:
            self.list_test = TestPipeline.list_test
        else:
            self.list_test = list_test 

    def execute(self):

        for i, test in enumerate(self.list_test):

            print(f"Test {i} : {test.name}".center(80, "="))
            res, time_elapsed = test(self.model, self.articleRetriver)
            print(f"\n{res}")
            print(f"{round(time_elapsed, 2)} sec.".center(80, "="))

# main

In [302]:
gc.collect()

96733

In [303]:
def all_models_gen():
    yield BERTModel([], big = False, window = 0)    
    yield BERTModel([], big = False, window = 100)
    yield BERTModel([], big = False, window = 300)
    yield BERTModel([], big = False, window = 512)
    yield BERTModel([], big = True, window = 0)
    yield BERTModel([], big = True, window = 100)
    yield BERTModel([], big = True, window = 300)
    yield BERTModel([], big = True, window = 512)
    yield ROBERTAModel([], big = False, window = 0)
    yield ROBERTAModel([], big = False, window = 100)
    yield ROBERTAModel([], big = False, window = 300)
    yield ROBERTAModel([], big = False, window = 512)
    yield ROBERTAModel([], big = True, window = 0)
    yield ROBERTAModel([], big = True, window = 100)
    yield ROBERTAModel([], big = True, window = 300)
    yield ROBERTAModel([], big = True, window = 512)
    yield DocBERTModel([], big = False)
    yield DocBERTModel([], big = True)
    yield DocBERTAModel([], big = False)
    yield DocBERTAModel([], big = True)
    yield Wiki2VecModel([])

def split_test(name):
    print()
    print("#" * 80)
    print("#", name.center(78, " "), "#", sep="")
    print("#" * 80)
    print()

def one_test_all_model(test_pipeline, articleRetriever):

    for model in all_models_gen():
        split_test(f"{model.model_size} {model.window_size}")
        TestPipeline(model, articleRetriever, test_pipeline).execute()
        gc.collect()


def all_test_all_model(articleRetriever):

    for model in all_models_gen():
        split_test(f"{model.model_size} {model.window_size}")
        TestPipeline(model, articleRetriever).execute()
        gc.collect()

gc.collect()
logging.info('Starting Main'.center(40, "="))

wikiRetriever = WikipediaArticleRetriever()
wordRetriever = WordNetArticleRetriever()

all_test_all_model(wikiRetriever)



################################################################################
#                             bert-base-uncased 0                              #
################################################################################



loading articles              : 100%|██████████| 437/437 [00:00<00:00, 470822.21it/s]
converting to embedding       : 100%|██████████| 1/1 [00:00<00:00, 12.54it/s]
calculating pair              : 100%|██████████| 352/352 [00:00<00:00, 12996.32it/s]



(0.3446979447459543, 2.9514816821822846e-11)


loading articles              : 100%|██████████| 1028/1028 [00:00<00:00, 562377.01it/s]
converting to embedding       : 100%|██████████| 1/1 [00:00<00:00, 13.43it/s]
converting to embedding       : 100%|██████████| 1028/1028 [01:18<00:00, 13.04it/s]
calculating pair              : 100%|██████████| 999/999 [00:00<00:00, 14884.88it/s]



(0.2261919666095845, 4.681122512807033e-13)


loading articles              : 100%|██████████| 905/905 [06:21<00:00,  2.37it/s]
converting to embedding       : 100%|██████████| 1/1 [00:00<00:00,  9.96it/s]
converting to embedding       : 100%|██████████| 905/905 [01:08<00:00, 13.18it/s]
calculating relations         : 100%|██████████| 19544/19544 [24:22<00:00, 13.36it/s]



[0.014275480966025378, 0.029778960294719607, 0.03919361440851412, 0.056743757674989764]


loading articles              : 100%|██████████| 100/100 [00:31<00:00,  3.13it/s]
converting to embedding       : 100%|██████████| 1/1 [00:00<00:00,  6.95it/s]
converting to embedding       : 100%|██████████| 100/100 [00:07<00:00, 13.04it/s]
computing sim matrix          : 100%|██████████| 100/100 [00:00<00:00, 178.59it/s]
listing sim matrix            : 100%|██████████| 100/100 [00:00<00:00, 1312.75it/s]



2527

################################################################################
#                            bert-base-uncased 100                             #
################################################################################



loading articles              : 100%|██████████| 437/437 [00:00<00:00, 433825.05it/s]
converting to embedding       : 100%|██████████| 1/1 [00:00<00:00, 12.57it/s]
converting to embedding       : 100%|██████████| 437/437 [02:13<00:00,  3.26it/s]
calculating pair              : 100%|██████████| 352/352 [00:00<00:00, 13915.52it/s]



(0.20950876558366513, 7.470047738702162e-05)


loading articles              : 100%|██████████| 1028/1028 [00:00<00:00, 297854.69it/s]
converting to embedding       : 100%|██████████| 1/1 [00:00<00:00, 13.98it/s]
converting to embedding       : 100%|██████████| 1028/1028 [05:01<00:00,  3.41it/s]
calculating pair              : 100%|██████████| 999/999 [00:00<00:00, 16137.65it/s]



(-0.0024591372425814316, 0.9381234419396443)


loading articles              : 100%|██████████| 905/905 [00:00<00:00, 350170.21it/s]
converting to embedding       : 100%|██████████| 1/1 [00:00<00:00,  2.21it/s]
converting to embedding       : 100%|██████████| 905/905 [04:40<00:00,  3.22it/s]
calculating relations         : 100%|██████████| 19544/19544 [24:42<00:00, 13.18it/s]



[0.10238436348751535, 0.12520466639377814, 0.1313958248055669, 0.14357347523536634]


loading articles              : 100%|██████████| 100/100 [00:00<00:00, 262307.94it/s]
converting to embedding       : 100%|██████████| 1/1 [00:00<00:00,  2.17it/s]
converting to embedding       : 100%|██████████| 100/100 [00:40<00:00,  2.50it/s]
computing sim matrix          : 100%|██████████| 100/100 [00:00<00:00, 171.50it/s]
listing sim matrix            : 100%|██████████| 100/100 [00:00<00:00, 1388.01it/s]



2912

################################################################################
#                            bert-base-uncased 300                             #
################################################################################



loading articles              : 100%|██████████| 437/437 [00:00<00:00, 361307.09it/s]
converting to embedding       : 100%|██████████| 1/1 [00:00<00:00, 11.75it/s]
converting to embedding       : 100%|██████████| 437/437 [04:49<00:00,  1.51it/s]
calculating pair              : 100%|██████████| 352/352 [00:00<00:00, 15585.79it/s]



(0.17678406122053927, 0.0008645988276033717)


loading articles              : 100%|██████████| 1028/1028 [00:00<00:00, 285451.47it/s]
converting to embedding       : 100%|██████████| 1/1 [00:00<00:00, 13.07it/s]
converting to embedding       : 100%|██████████| 1028/1028 [10:05<00:00,  1.70it/s]
calculating pair              : 100%|██████████| 999/999 [00:00<00:00, 15654.19it/s]



(0.0023118130635488244, 0.9418236221979635)


loading articles              : 100%|██████████| 905/905 [00:00<00:00, 516722.72it/s]
converting to embedding       : 100%|██████████| 1/1 [00:01<00:00,  1.19s/it]
converting to embedding       : 100%|██████████| 905/905 [10:18<00:00,  1.46it/s]
calculating relations         : 100%|██████████| 19544/19544 [24:40<00:00, 13.20it/s]



[0.10269136307818257, 0.12474416700777732, 0.1305259926320098, 0.14316414244781006]


loading articles              : 100%|██████████| 100/100 [00:00<00:00, 156328.89it/s]
converting to embedding       : 100%|██████████| 1/1 [00:01<00:00,  1.22s/it]
converting to embedding       : 100%|██████████| 100/100 [01:34<00:00,  1.06it/s]
computing sim matrix          : 100%|██████████| 100/100 [00:01<00:00, 96.92it/s]
listing sim matrix            : 100%|██████████| 100/100 [00:00<00:00, 1372.02it/s]



1906

################################################################################
#                            bert-base-uncased 512                             #
################################################################################



loading articles              : 100%|██████████| 437/437 [00:00<00:00, 341324.18it/s]
converting to embedding       : 100%|██████████| 1/1 [00:00<00:00, 14.11it/s]
converting to embedding       : 100%|██████████| 437/437 [06:01<00:00,  1.21it/s]
calculating pair              : 100%|██████████| 352/352 [00:00<00:00, 13591.54it/s]



(0.13832144862321397, 0.009365620925345585)


loading articles              : 100%|██████████| 1028/1028 [00:00<00:00, 297546.37it/s]
converting to embedding       : 100%|██████████| 1/1 [00:00<00:00, 13.81it/s]
converting to embedding       : 100%|██████████| 1028/1028 [11:39<00:00,  1.47it/s]
calculating pair              : 100%|██████████| 999/999 [00:00<00:00, 15968.77it/s]



(-0.0026134003423013185, 0.9342504150361196)


loading articles              : 100%|██████████| 905/905 [00:00<00:00, 271383.79it/s]
converting to embedding       : 100%|██████████| 1/1 [00:02<00:00,  2.00s/it]
converting to embedding       : 100%|██████████| 905/905 [13:57<00:00,  1.08it/s]
calculating relations         : 100%|██████████| 19544/19544 [25:04<00:00, 12.99it/s]



[0.10243553008595989, 0.12336266884977487, 0.12975849365534178, 0.14388047482603356]


loading articles              : 100%|██████████| 100/100 [00:00<00:00, 182838.01it/s]
converting to embedding       : 100%|██████████| 1/1 [00:01<00:00,  1.30s/it]
converting to embedding       : 100%|██████████| 100/100 [01:54<00:00,  1.14s/it]
computing sim matrix          : 100%|██████████| 100/100 [00:00<00:00, 171.27it/s]
listing sim matrix            : 100%|██████████| 100/100 [00:00<00:00, 1184.49it/s]



2524

################################################################################
#                             bert-large-uncased 0                             #
################################################################################



loading articles              : 100%|██████████| 437/437 [00:00<00:00, 102586.38it/s]
converting to embedding       : 100%|██████████| 1/1 [00:00<00:00,  2.28it/s]
converting to embedding       : 100%|██████████| 437/437 [01:48<00:00,  4.04it/s]
calculating pair              : 100%|██████████| 352/352 [00:00<00:00, 15633.32it/s]



(0.2719318363091515, 2.198554282663682e-07)


loading articles              : 100%|██████████| 1028/1028 [00:00<00:00, 530937.63it/s]
converting to embedding       : 100%|██████████| 1/1 [00:00<00:00,  3.99it/s]
converting to embedding       : 100%|██████████| 1028/1028 [04:16<00:00,  4.00it/s]
calculating pair              : 100%|██████████| 999/999 [00:00<00:00, 15242.14it/s]



(0.13419228968636487, 2.0871221793264175e-05)


loading articles              : 100%|██████████| 905/905 [00:00<00:00, 564018.59it/s]
converting to embedding       : 100%|██████████| 1/1 [00:00<00:00,  4.16it/s]
converting to embedding       : 100%|██████████| 905/905 [03:42<00:00,  4.06it/s]
calculating relations         : 100%|██████████| 19544/19544 [25:15<00:00, 12.90it/s]



[0.013456815390912813, 0.02849979533360622, 0.03919361440851412, 0.06022308636921817]


loading articles              : 100%|██████████| 100/100 [00:00<00:00, 233926.60it/s]
converting to embedding       : 100%|██████████| 1/1 [00:00<00:00,  3.08it/s]
converting to embedding       : 100%|██████████| 100/100 [00:24<00:00,  4.06it/s]
computing sim matrix          : 100%|██████████| 100/100 [00:00<00:00, 164.08it/s]
listing sim matrix            : 100%|██████████| 100/100 [00:00<00:00, 1278.26it/s]



1318

################################################################################
#                            bert-large-uncased 100                            #
################################################################################



loading articles              : 100%|██████████| 437/437 [00:00<00:00, 142119.16it/s]
converting to embedding       : 100%|██████████| 1/1 [00:00<00:00,  2.35it/s]
converting to embedding       : 100%|██████████| 437/437 [08:20<00:00,  1.15s/it]
calculating pair              : 100%|██████████| 352/352 [00:00<00:00, 13813.58it/s]



(0.1398764399143609, 0.00859109079908093)


loading articles              : 100%|██████████| 1028/1028 [00:00<00:00, 313467.43it/s]
converting to embedding       : 100%|██████████| 1/1 [00:00<00:00,  3.94it/s]
converting to embedding       : 100%|██████████| 1028/1028 [18:35<00:00,  1.08s/it]
calculating pair              : 100%|██████████| 999/999 [00:00<00:00, 15816.57it/s]



(-0.01045945401479797, 0.7412588971581128)


loading articles              : 100%|██████████| 905/905 [00:00<00:00, 302265.10it/s]
converting to embedding       : 100%|██████████| 1/1 [00:01<00:00,  1.61s/it]
converting to embedding       : 100%|██████████| 905/905 [17:16<00:00,  1.15s/it]
calculating relations         : 100%|██████████| 19544/19544 [24:46<00:00, 13.15it/s]



[0.10304952926729431, 0.12387433483422022, 0.12940032746623004, 0.1395824805566926]


loading articles              : 100%|██████████| 100/100 [00:00<00:00, 233016.89it/s]
converting to embedding       : 100%|██████████| 1/1 [00:01<00:00,  1.70s/it]
converting to embedding       : 100%|██████████| 100/100 [02:26<00:00,  1.47s/it]
computing sim matrix          : 100%|██████████| 100/100 [00:00<00:00, 164.58it/s]
listing sim matrix            : 100%|██████████| 100/100 [00:00<00:00, 1175.56it/s]



63

################################################################################
#                            bert-large-uncased 300                            #
################################################################################



loading articles              : 100%|██████████| 437/437 [00:00<00:00, 115393.53it/s]
converting to embedding       : 100%|██████████| 1/1 [00:00<00:00,  1.31it/s]
converting to embedding       : 100%|██████████| 437/437 [16:46<00:00,  2.30s/it]
calculating pair              : 100%|██████████| 352/352 [00:00<00:00, 14697.81it/s]



(0.15935654757107354, 0.002714527306469719)


loading articles              : 100%|██████████| 1028/1028 [00:00<00:00, 542835.77it/s]
converting to embedding       : 100%|██████████| 1/1 [00:00<00:00,  3.94it/s]
converting to embedding       : 100%|██████████| 1028/1028 [35:11<00:00,  2.05s/it]
calculating pair              : 100%|██████████| 999/999 [00:00<00:00, 15023.48it/s]



(0.0016512976245968325, 0.9584273463110788)


loading articles              : 100%|██████████| 905/905 [00:00<00:00, 140197.42it/s]
converting to embedding       : 100%|██████████| 1/1 [00:04<00:00,  4.08s/it]
converting to embedding       :  10%|█         | 93/905 [06:07<53:25,  3.95s/it]


KeyboardInterrupt: ignored