<a href="https://colab.research.google.com/github/Taedriel/ZSL-v2/blob/wordEmbedding/WordsEmbeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preparation part

## import stuff


In [34]:
!yes | pip install transformers wget "wikipedia>=1.4.0" unzip mxnet gluonnlp "scipy>=1.7" scikit-bio wikipedia2vec orange3 python-louvain networkx --quiet --upgrade
!mkdir -p temp article

In [35]:
!yes | pip uninstall community



In [36]:
import numpy as  np
import torch
from transformers import BertTokenizer, BertModel, RobertaModel, RobertaTokenizer
import tensorflow as tf
import gluonnlp as nlp 
import nltk
nltk.download('omw-1.4')
nltk.download('wordnet')
from nltk.corpus import wordnet

import matplotlib as mpl
import matplotlib.pyplot as plt

from wikipedia2vec import Wikipedia2Vec
import wikipedia
wikipedia.set_rate_limiting(True)

import gc
import traceback
import pickle
import json
import math
import Orange

from tqdm import tqdm
from typing import List, Tuple, Dict, Callable
from os.path import exists, join, abspath
from os import system
from enum import Enum
from time import perf_counter
from scipy.spatial.distance import cityblock

from scipy.stats import SpearmanRConstantInputWarning
from scipy.stats import spearmanr
from scipy.cluster.hierarchy import linkage
from skbio import DistanceMatrix
from skbio.tree import nj

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [37]:
import logging
import warnings
from transformers import logging as transformer_logging

FORMAT = '%(levelname)-10s %(message)s'
logging.basicConfig(format=FORMAT, level = logging.INFO, filename = "WordsEmbeddings.log" )

transformer_logging.set_verbosity_error()

warnings.filterwarnings("ignore", category=UserWarning, module='wikipedia')

## Utils classes

### Usefull functions ✅

In [None]:
def dict2csv(filename : str, embeddings : Dict[str, List[float]]) -> None:
    """ write a dict of embeddings under a .CSV file

    the .CSV file is construct with a header looking like this :
    \tembeddings\t | 0 | 1 | 2 | 3 | ...
    where each line contain an embeddings for the word in the first row
    Args:
        filename (str) : a path to the file where the .csv is to be written
        embeddings (Dict[str, List[float]]): a dictionnary of embeddings

    """
    logging.info(f"writing dict to {filename} file")
    try:
        f = open(filename, "w")
    except OSError:
        raise OSError("Could not open file")

    dimension_number = len(next(iter(embeddings.values())))
    with f:
        print("embeddings", *[str(i) for i in range(dimension_number)], sep=",", file=f)
        for tag, embedding in embeddings.items():
            print(tag, *list(map(lambda x: str(float(x)), embedding)), sep=",", file=f)
    logging.info("done")

def sim2dist(mat : List[List[float]], func : Callable[[float], float] \
             = lambda x: 1 - x, hollow : bool = True) -> List[List[float]]:
    """ map the function func to each elements in the matrix

    apply the lambda function func to each element of the matrix. if hollow is set 
    to True, set the diagonal of the matrix to 0.
    Args:
        mat (List[List[float]]) : a matrix of number
        func (Callable[[float], float]) : a simple function to apply to each elem of the matrix
        hollow (bool) : whether to consider the diagonal of the matrix or not
    
    """
    logging.info(f"converting similarity matrix to distance matrix")
    inv_data = [[0 for i in range(len(mat[0]))] for j in range(len(mat))]

    for i, elem in enumerate(mat):
        for j, case in enumerate(elem):
            if i == j and hollow: 
                inv_data[i][j] = 0
            else:
                inv_data[i][j] = func(case)
                
    logging.info("done")
    return inv_data

def print_mat(mat : List[List[float]], format_function : Callable[[float], str]=lambda x: x) -> None:
    """ print a matrice on stdout

    format each number in the matrice using the format_function
    Args:
        mat (List[List[float]]) : a matrix of number
        format_function (Callable[[float], str]) : a simple format function to display numbers froms the matrix
    """
    for line in mat:
        for case in line:
            print(f"{format_function(case):8}", end="")
        print()

### Strategy ✅



In [None]:
class BERTMergeStrategy:
    """ strategy to extract BERT embeddings
    
    different approach exist, 
    see https://raw.githubusercontent.com/lbourdois/blog/master/assets/images/BERT/bert-feature-extraction-contextualized-embeddings.png
    for more possible strategy
    """

    def merge(self, vector : List[List[float]]) -> torch.Tensor:
        raise NotImplementedError

class Sum4LastLayers(BERTMergeStrategy):

    def merge(self, vector : List[List[float]]) -> torch.Tensor:
        return torch.sum(vector[-4:], dim = 0)

class Concat4LastLayer(BERTMergeStrategy):

    def merge(self, vector : List[List[float]]) -> torch.Tensor:
        return torch.concat(vector[-4:], dim = 0)

class SimilarityStrategy:
    """strategy to compute similarity between embeddings. Cosine similarity should 
    be the only valid one in word embeddings, other aren't relevant
    """

    def sim(self, embed1 : List[float], embed2 : List[float]) -> float:
        raise NotImplementedError

class CosineSim(SimilarityStrategy):

    def sim(self, embed1 : List[float], embed2 : List[float]) -> float:
        cos = torch.nn.CosineSimilarity(dim=0)
        return cos(embed1, embed2)

class EuclidianDistSim(SimilarityStrategy):

    def sim(self, embed1 : List[float], embed2 : List[float]) -> float:
        return np.linalg.norm(embed1-embed2)

class ManhattanDistSim(SimilarityStrategy):

    def sim(self, embed1 : List[float], embed2 : List[float]) -> float:
        return cityblock(embed1, embed2)

### Article

In [None]:
class customArticle:
    """ store a wikipedia article for further processing by models"""

    def __init__(self, index : int, title : str, realtitle : str, summary : str, ambiguous : bool):
        self.index : int = index
        self.title : str = title
        self.realtitle : str = realtitle
        self.summary :str = summary
        self.ambiguous :bool = ambiguous

class ArticleRetriever:

    """ Class in charge of retrieveing article from different sources and store them in orer
    to not re retrieve them. 
    
    Act as a proxy between wikipedia and the model. This class save all the article 
    retrieve in a dict using the name given. Further call to this retriever will 
    then load the previously saved file if it hasn't been deleted.
    """

    article_dir = "./article"

    def __init__(self, name : str = None, list_title : List[str] = []):

        self.name : str = name
        if self.name is None:
            self.name = "temp"

        self.list_title : List[str] = list_title
        self.modified : bool = False
        self._load()

    def _load(self):
        if not exists(self.get_filename()):
            self.articles_map = {}
            logging.info(f"creating file {self.get_filename()}")
        else:
            with open(self.get_filename(), "rb") as mapfile:
                self.articles_map = pickle.load(mapfile)
                assert(type(self.articles_map) == type(dict()))
            logging.info(f"loading file {self.get_filename()} with {len(self.articles_map)} articles")
    
    def set_list_vocab(self, new_name : str, list_title : List[str]):
        logging.info("changing vocab, reloading file...")
        self.list_title : List[str] = list_title
        self.name = new_name
        self._load()

    def get_filename(self) -> str:
        """ return the filename of the file where article are saved"""
        return join(WikipediaArticleRetriever.article_dir, self.name)

    def load_article(self, title : str, force_reload : bool = False) -> customArticle:
        """ retrieve an article from wikipedia. If forcce reload is specified, re check the article 
        is summary isn't present or if article not alread in the dict""" 

        if title not in self.articles_map:
            self.modified = True
            realtitle, summary, ambiguous = self._retrieve_article(title)
            self.articles_map[title] = customArticle(len(self.articles_map), title, realtitle, summary, ambiguous)

        if title in self.articles_map and self.articles_map[title].summary == None and force_reload:
            self.modified = True
            realtitle, summary, ambiguous = self._retrieve_article(title)
            self.articles_map[title].summary = summary

        return self.articles_map[title]

    def load_all_articles(self, force_reload : bool = False) -> None:
        """retrieve all article from the vocab from sources"""
        
        logging.info(f"Starting loading articles... [Force reload : {force_reload}]")
        nb_success = 0

        nb_article = len(self.list_title)
        for i, title in tqdm(enumerate(self.list_title), total=nb_article, desc=f"{'loading articles':30}"):
            self.load_article(title, force_reload)

            if self.articles_map[title].summary is not None: 
                nb_success += 1

        logging.info(f"Finished loading {nb_success} article(s) / {nb_article} ({round(nb_success / nb_article * 100, 1)}%)!")
        return self.modified

    def __call__(self, force_reload : bool = False) -> None:
        return self.load_all_articles(force_reload)

    def _retrieve_article(self, title : str, closed_list : List[str]) -> Tuple[str, str, bool]:
        raise NotImplementedError

    def get_article(self, title) -> customArticle:
        """return the article if it's present, else, try to retrieve it"""

        if title not in self.articles_map:
            self.load_article(title)

        return self.articles_map[title]
        
    def save(self):
        """save the articles in a binary format using pickle"""
        logging.info(f"saving the file {self.get_filename()}")
        with open(self.get_filename(), "wb") as mapfile:
            pickle.dump(self.articles_map, mapfile)

class WikipediaArticleRetriever(ArticleRetriever):

    def __init__(self, name: str = None, list_title: List[str] = []):
        ArticleRetriever.__init__(self, name, list_title)

    def get_filename(self) -> str:
        """ return the filename of the file where article are saved"""
        return join(WikipediaArticleRetriever.article_dir, "Wiki-" + self.name)

    def _retrieve_article(self, title : str, closed_list : List = []) -> Tuple[str, str, bool]: 
        closed_list.append(title)
        try:
            article = wikipedia.page(title, auto_suggest=False, redirect=True)
            return (article.title, article.summary, False)

        except wikipedia.PageError as e:
            search_result = wikipedia.search(title, suggestion = False)

            logging.warning(f"{title} misspelled or article missing. Best find is {search_result[0]}")
            if search_result[0] is not None and search_result[0] not in closed_list:            
                return self._retrieve_article(search_result[0], closed_list)  
            else: return (None, None, None)

        except wikipedia.DisambiguationError as e:
            logging.warning(f"{title} is ambiguous, fallback on {e.options[0]}")
            return (None, None, None)
            # if e.options[0] is not None and e.options[0] not in closed_list:
            #     res = self._retrieve_article(e.options[0], closed_list)
            #     return (res[0], res[1], True)
        return (None, None, None)

class WordNetArticleRetriever(ArticleRetriever):

    def __init__(self, name: str = None, list_title: List[str] = []):
        super().__init__(name, list_title)

    def get_filename(self) -> str:
        """ return the filename of the file where article are saved"""
        return join(WikipediaArticleRetriever.article_dir, "Word-" + self.name)

    def _retrieve_article(self, title: str, closed_list : List = []) -> Tuple[str, str, bool]:
        result = wordnet.synsets(title)
        if len(result) > 0:
            return (title, result[0].definition(), True)

        return (None, None, None)

class ArticleViewer():

    def __init__(self, filename):
        self.name = filename

        if not exists(self.name):
            raise FileNotFoundError()
        else:
            with open(self.name, "rb") as mapfile:
                self.articles_map = pickle.load(mapfile)

    def get(self, title):
        return self.articles_map[title]

    def get_all_articles(self):
        return self.articles_map.keys()


### Embeddings operations 

In [None]:
class EmbeddingsLoader:

    """class that load an embeddings file to perform operation on it. Base class
     for multiple operations such as matrix similarity operations.
     """

    def __init__(self, filename : str):

        self.file = filename
        self.embeddings = {}

        self._load_file()

    def _load_file(self):
        try:
            with open(self.file, "r") as f:
                lines = f.readlines()
                
            for line in lines[1:]:
                data = line.split(",")
                self.embeddings[data[0]] = torch.FloatTensor(list(map(float, data[1:])))

        except IOError as e:
            raise IOError(f"No file {self.file}")

class SimilarityMatrix(EmbeddingsLoader):

    def __init__(self, embeddings : Dict[str, List[float]], strategy : SimilarityStrategy):
        EmbeddingsLoader.__init__(self, embeddings)
        self.strategy = strategy
        self._create_matrix()
        self.computed : bool = False

    def _create_matrix(self) -> None:
        n_tokens = len(self.embeddings)
        self.cosine_sim_matrix : Dict[Dict[float]] = {}
        for tag in self.embeddings.keys():
            self.cosine_sim_matrix[tag] = {}

    def compute_sim(self) -> None:
        """ compute cosine similarity between all vectors """

        closed_list = []

        logging.info("Computing cosine similarity, this could take some time...")
        for tag, vector in tqdm(self.embeddings.items(), total = len(self.embeddings), desc=f"{'computing sim matrix':30}"):

            for otag, other_vector in self.embeddings.items():

                if otag == tag: continue
                # if (tag, otag) in closed_list or (otag, tag) in closed_list: continue

                similarity = self.strategy.sim(vector, other_vector)

                self.cosine_sim_matrix[otag][tag] = similarity
                self.cosine_sim_matrix[tag][otag] = similarity

                # closed_list.append((tag, otag))
                # closed_list.append((otag, tag))

        self.computed = True

    def export_sim_matrix(self, filename):
        if not self.computed:
            self.compute_sim()
        
        try:
            f = open(filename, "w")
        except OSError:
            raise OSError("Could not open file")

        with f:
            print("/", *[tag for tag in self.embeddings.keys()], sep = ",", file = f)

            for tag in self.embeddings.keys():
                print(tag, *[str(round(float(self.cosine_sim_matrix[tag][otag]), 3)) for otag in self.embeddings.keys()], sep = ",", file = f)

    def get_sim_matrix(self) -> Tuple[List[str], List[List[float]]]:
        """return the similarity matrix of the embeddings
        """
        if not self.computed:
            self.compute_sim()

        X = len(self.embeddings)
        matrix = [[0 for j in range(X)] for i in range(X)]
        ids = []
        
        for i, tag in enumerate(self.embeddings.keys()):
            ids.append(tag)
            for j, otag in enumerate(self.embeddings.keys()):
                if i == j:
                    continue

                matrix[i][j] = self.cosine_sim_matrix[tag][otag]
                matrix[j][i] = self.cosine_sim_matrix[tag][otag]

        return ids, matrix

    def sim_between(self, token1 : str, token2 : str) -> float:
        v1 = self.embeddings[token1]
        v2 = self.embeddings[token2]

        if token2 not in self.cosine_sim_matrix[token1] or token1 not in self.cosine_sim_matrix[token2]:
            similarity = self.strategy.sim(v1, v2)
            self.computed = True

            self.cosine_sim_matrix[token1][token2] = similarity
            self.cosine_sim_matrix[token1][token2] = similarity

        return self.cosine_sim_matrix[token1][token2]

### Downloader

In [None]:
class Downloader:

    def __init__(self, base_addr : str, file_zipname : str):
        self.__address = base_addr
        self.__zip_filename = file_zipname
        self.__unzip_filename = "".join(file_zipname.split(".")[:-1])

    def download(self) -> str:
        """ download the embedding file and return a path to the file.
         If the file is already downloaded, only return the path
         """
        self.path = abspath(self.__unzip_filename)
        logging.info(f"Checking the presence of {self.path}...")
        if exists(self.path):
            logging.info("File found ! No download needed")
            return self.path

        logging.info("File Not present, need to download")
        logging.info(f"Starting to download {self.__address}{self.__unzip_filename}")
        try:
            ret = system(f"wget {self.__address}{self.__zip_filename}")
            if ret != 0:
                raise SystemError
        except:
            logging.error(f"can't retrieve the file {self.__zip_filename}")
            raise SystemError(f"can't retrieve the file {self.__zip_filename}")

        logging.info(f"Download finished")
        logging.info(f"Starting unziping the file")

        ext = self.__zip_filename[-4:]
        if ext == ".zip":
            unzip_command = "unzip -p"
        elif ext == ".bz2":
            unzip_command = "bunzip2 -c"

        try:
            ret = system(f"{unzip_command} ./{self.__zip_filename} > {self.__unzip_filename}")
            if ret != 0:
                raise SystemError
        except:
            logging.error(f"can't unzip the file {self.__zip_filename}")
            raise SystemError(f"can't unzip the file {self.__zip_filename}")

        logging.info(f"Unzipping finished ! ({self.path})")
        return self.path


### Solver

In [None]:
class Solver(EmbeddingsLoader):

    DEFAULT_MIN_LIST_RESULT = 10

    def __init__(self, embeddings):
        super(Solver, self).__init__(embeddings)

    def get_nearest_embedding_of(self, embedding, nb = 10):

        if nb > len(self.embeddings):
            raise Exception("nb too high, not enough token")

        nearest = []
        for tag, e in self.embeddings.items():

            cos = torch.nn.CosineSimilarity(dim=0)
            similarity = cos(embedding, e)

            nearest.append((tag, similarity))
        
        nearest.sort(key = lambda tup : tup[1])
        return nearest[-1:-nb-1:-1]

    def __call__(self, embeddeding, tag=None):
        result = self.get_nearest_embedding_of(embeddeding, min(Solver.DEFAULT_MIN_LIST_RESULT, len(self.embeddings)))
        if tag is not None:
            print(f"Nearest Word for {tag}:")
        for i in result:
            print(f"\t{i[0]:12}: {round(float(i[1]) * 100, 3)}%")
    
    def score(self, embedding, target):
        target_embeddings = self.embeddings[target]

        cos = torch.nn.CosineSimilarity(dim=0)
        return float(cos(embedding, target_embeddings))

    def least_squared_score(self, embedding, target):
        target_embeddings = self.embeddings[target]
        return float(np.linalg.norm(target_embeddings - embedding))

    def mean_squared_score(self, embedding, target):
        target_embeddings = self.embeddings[target]
        return float(np.square(np.subtract(embedding, target_embeddings)).mean())

class OutOfVocabSolver(Downloader):

    DEFAULT_MIN_LIST_RESULT = 10

    def __init__(self):
        address = "http://wikipedia2vec.s3.amazonaws.com/models/en/2018-04-20/"
        filename = "enwiki_20180420_300d.pkl.bz2"

        super(OutOfVocabSolver, self).__init__(address, filename)

        self.download()
        self.model = Wikipedia2Vec.load(self.path)

    def get_nearest_embedding_of(self, embedding, nb):
        embedding = np.array(embedding)
        return self.model.most_similar_by_vector(embedding, count=nb, min_count=nb)

    def __call__(self, embedding, tag = None):
        result = self.get_nearest_embedding_of(embedding,OutOfVocabSolver.DEFAULT_MIN_LIST_RESULT)
        if tag is not None:
            print(f"Nearest Word for {tag}:")
        for i in result:
            print(f"\t{repr(i[0]):12}: {round(float(i[1]) * 100, 3)}%")

    


## Models

In [None]:
class WordToVector:

    def __init__(self, list_tags : List[str] = []):
        self.list_tags = list_tags
        self.embeddings = {}

    def set_list_class(self, list_class : List[str]):
        self.list_tags = list_class
        self.reset_embeddings()

    def check_embeddings_exist(self, filename : str, article_ret : WikipediaArticleRetriever):
        temp_tags_list = self.list_tags
        first_tag = self.list_tags[0]
        self.list_tags = [first_tag]

        self.convert(article_ret)
        try:
            loader = EmbeddingsLoader(filename)
        except OSError:
            self.list_tags = temp_tags_list
            return False

        first_embedding = self.embeddings[first_tag]
        to_compare = loader.embeddings[first_tag]

        intersect = set(temp_tags_list) & set(loader.embeddings.keys())

        self.list_tags = temp_tags_list
        if torch.equal(first_embedding, to_compare) and len(intersect) == len(temp_tags_list):
            return True

        return False

    def convert(self, article_ret : WikipediaArticleRetriever):
        raise NotImplementedError

    def reset_embeddings(self):
        self.embeddings.clear()

    def get_embedding_of(self, token):
        if token not in self.embeddings:
            raise Exception(f"no such token {token}")
        
        return self.embeddings[token]

    def get_class_list(self):
        return self.embeddings.keys()

    def export(self, filename):
        """export all the embeddings in filename under a .csv format.
           Raise exception if embeddings hasn't been calculed yet."""

        if len(self.embeddings) == 0:
            raise Exception("Tags not converted yet !")
        
        dict2csv(filename, self.embeddings)

class FixedEmbedding(WordToVector):

    def __init__(self, base_addr : str, file_zipname : str):
        self.downloader = Downloader(base_addr, file_zipname)
        self.downloader.download()

    def check_embeddings_exist(self, filename : str, article_ret : WikipediaArticleRetriever):
        return False

    def _one_turn(self, resolve_dict = {}):
        print("here")
        raise NotImplementedError

    def convert(self, ar):
        resolve_filename = f"./temp/{ar.name[:-4]}_resolve.json"
        resolve = {}

        while True: 
            unk_list = self._one_turn(resolve)
        
            if unk_list is None or len(unk_list) == 0:
                break

            print(len(unk_list), "items haven't been found, resolve mode.")

            with open(resolve_filename, 'w') as f:
                resolve_dict = {word: "" for word in unk_list}
                json.dump(resolve_dict, f, indent = 4)

            input("press enter to resume resolve")

            resolve = {}
            with open(resolve_filename, 'r') as f:
                resolve = json.load(f)
                assert(type(resolve) == type(dict()))


### BERT model

In [None]:
class BERTModel(WordToVector):

    temp_dir = "./temp"

    def __init__(self, list_tag : List[str], big: bool = False, window : int = 100):
        super(BERTModel, self).__init__(list_tag)
        self.window_size = window

        self.model_size = "bert-large-uncased" if big else "bert-base-uncased"

        self.tokenizer = BertTokenizer.from_pretrained(self.model_size, padding=True, truncation=True)
        self.model = BertModel.from_pretrained(self.model_size, output_hidden_states = True)

        self.merging_strategy = Sum4LastLayers()

        self.model.eval()

    def _one_pass(self, inputs):
        with torch.no_grad():
            outputs = self.model(input_ids = inputs["input_ids"], attention_mask = inputs["attention_mask"])

        hidden_states = outputs[2]

        # [# layers, # batches, # tokens, # features] ==> [# tokens, # layers, # features]
        token_embeddings = torch.stack(hidden_states, dim=0)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        token_embeddings = token_embeddings.permute(1,0,2)

        return self.merging_strategy.merge(token_embeddings[0])

    def convert(self, article_ret : WikipediaArticleRetriever):
        """ convert all word in their embeddings"""

        if len(self.list_tags) == 0:
            raise Exception("no tags yet !")

        logging.info("Starting converting tokens...")
        nb_token = len(self.list_tags)
        for i, tag in tqdm(enumerate(self.list_tags), total = nb_token, desc=f"{'converting to embedding':30}"):
            
            if tag in self.embeddings: continue

            article = article_ret.get_article(tag)
            if article.summary is None or self.window_size == 0:
                self.embeddings[tag] = self._one_pass(self.tokenizer(tag, return_tensors = "pt"))
                continue

            sub_ids = self.tokenizer.encode(tag + ". " + article.summary)[0:self.window_size]
            subinputs = {   "input_ids": torch.IntTensor(sub_ids).unsqueeze(0), \
                            "token_type_ids": torch.IntTensor([0 for k in range(len(sub_ids))]).unsqueeze(0), \
                            "attention_mask": torch.IntTensor([1 for k in range(len(sub_ids))]).unsqueeze(0)  }

            self.embeddings[tag] = self._one_pass(subinputs)



### RoBERTa model

In [None]:
class ROBERTAModel(BERTModel):

    def __init__(self, list_tag : List[str], big: bool = False, window : int = 100):
        WordToVector.__init__(self, list_tag)
        self.window_size = window

        self.model_size = "roberta-large" if big else "roberta-base"

        self.tokenizer = RobertaTokenizer.from_pretrained(self.model_size, padding=True, truncation=True)
        self.model = RobertaModel.from_pretrained(self.model_size, output_hidden_states = True)

        self.merging_strategy = Sum4LastLayers()

        self.model.eval()

### DocBERTModel model

In [None]:
class DocBERTModel(BERTModel):

    def __init__(self, list_tag : List[str], big : bool = False):
        WordToVector.__init__(self, list_tag)
        self.window_size = "document"

        self.model_size = "bert-large-uncased" if big else "bert-base-uncased"

        self.tokenizer = BertTokenizer.from_pretrained(self.model_size, padding=True, truncation=True)
        self.model = BertModel.from_pretrained(self.model_size, output_hidden_states = True)

        self.max_size = self.tokenizer.model_max_length
        self.merging_strategy = Sum4LastLayers()

        self.model.eval()

    def _one_pass(self, subinputs):
        with torch.no_grad():
            outputs = self.model(input_ids = subinputs["input_ids"], attention_mask = subinputs["attention_mask"])

        hidden_states = outputs[2]

        # [# layers, # batches, # tokens, # features] ==> [# tokens, # layers, # features]
        token_embeddings = torch.stack(hidden_states, dim=0)
        token_embeddings = torch.squeeze(token_embeddings, dim=1)
        token_embeddings = token_embeddings.permute(1,0,2)

        return self.merging_strategy.merge(token_embeddings[0])


    def convert(self, article_ret):
        """ convert all word in their embeddings"""

        if len(self.list_tags) == 0:
            raise Exception("no tags yet !")

        logging.info("Starting converting tokens...")
        nb_token = len(self.list_tags)
        for i, tag in tqdm(enumerate(self.list_tags), total = nb_token, desc=f"{'converting to embedding':30}"):
            
            if tag in self.embeddings: continue

            article = article_ret.get_article(tag)

            if article.summary is None:
                logging.warning(f"no article for {tag}")
                self.embeddings[tag] = self._one_pass(self.tokenizer(tag, return_tensors = "pt"))
                continue

            torch_cls = []

            ids = self.tokenizer.encode(article.summary)
            nb_token = len(ids)

            if nb_token < self.max_size:
                self.embeddings[tag] = self._one_pass(self.tokenizer(tag, return_tensors = "pt"))
                continue

            nb_pass = math.ceil(nb_token / self.max_size)
            logging.info(f"{tag} is {nb_pass} pass")

            stop = 50
            for j in range(nb_pass):
                start = stop - 50
                stop = min(nb_token, start + self.max_size)
                
                sub_ids = ids[start:stop]

                subinputs = { "input_ids": torch.IntTensor(sub_ids).unsqueeze(0), \
                            "token_type_ids": torch.IntTensor([0 for k in range(len(sub_ids))]).unsqueeze(0), \
                            "attention_mask": torch.IntTensor([1 for k in range(len(sub_ids))]).unsqueeze(0)  }
                torch_cls.append(self._one_pass(subinputs))
                if stop == nb_token: break

            self.embeddings[tag] = torch.mean(torch.stack(tuple(t for t in torch_cls)), axis=0)

### DocBERTAModel model

In [None]:
class DocBERTAModel(DocBERTModel):

    def __init__(self, list_tag : List[str], big : bool = False):
        WordToVector.__init__(self, list_tag)
        self.window_size = "document"

        self.model_size = "roberta-large" if big else "roberta-base"

        self.tokenizer = RobertaTokenizer.from_pretrained(self.model_size, padding=True, truncation=True)
        self.model = RobertaModel.from_pretrained(self.model_size, output_hidden_states = True)

        self.max_size = self.tokenizer.model_max_length

        self.merging_strategy = Sum4LastLayers()

        self.model.eval()

### Wikipedia2Vec

In [None]:
class Wiki2VecModel(FixedEmbedding):

    address = "http://wikipedia2vec.s3.amazonaws.com/models/en/2018-04-20/"
    
    def __init__(self, list_tag : List[str], size : int = 300):
        WordToVector.__init__(self, list_tag)
        self.window_size = size
        self.model_size = "wikipedia2vec"

        assert size in [100, 300, 500], f"size should be one of this value (100, 300, 500)"
        filename = f"enwiki_20180420_{self.window_size}d.pkl.bz2"
        FixedEmbedding.__init__(self, Wiki2VecModel.address, filename)
        
        self.model = Wikipedia2Vec.load(self.downloader.path)

    def _retrieve(self, word):
        try: return self.model.get_word_vector(word)
        except: pass

        try: return self.model.get_word_vector(word.capitalize())
        except: pass

        try: return self.model.get_word_vector(word.lower())
        except: pass

        try: return self.model.get_entity_vector(word)
        except: pass

        try: return self.model.get_entity_vector(word.capitalize())
        except: pass
        
        try: return self.model.get_entity_vector(word.lower())
        except: pass

        return None
    
    def _one_turn(self, resolve_dict = {}):
        unk = []

        for word in self.list_tags:
            w = word.replace("_", " ")
            if w in self.embeddings: continue

            if w in resolve_dict:
                embed = self._retrieve(resolve_dict[w])
            else:
                embed = self._retrieve(w)

            if embed is None:
                logging.warning(f"{w} cannot be retrieved.")
                unk.append(word)
            else:
                self.embeddings[w] = torch.from_numpy(embed)
        
        return unk

In [None]:
gc.collect()

150

### GloVE

In [None]:
class GloVEModel(FixedEmbedding):

    address = "https://nlp.stanford.edu/data/"
    all_dict = "glove_all"
    
    def __init__(self, list_tag : List[str]):
        WordToVector.__init__(self, list_tag)

        self.window_size = 300
        self.model_size = "GloVe"

        filename = "glove.840B.300d.zip"
        FixedEmbedding.__init__(self, GloVEModel.address, filename)

        self.download()

    def _in_list(self, word, list_tag):
        if word in list_tag:
            return word
        
        if word.capitalize() in list_tag:
            return word.capitalize()

        if word.lower() in list_tag:
            return word.lower()

        return False

    def _one_turn(self, resolve_dict = {}):
        uncaped_list_tag = [x.replace("_", " ") for x in self.list_tags]
        unk = [item if item not in resolve_dict.keys() else resolve_dict[item] for item in uncaped_list_tag]

        with open(self.path, "r") as file:
            for line in file:
                tag, coefs = line.split(maxsplit=1)
                coefs = np.fromstring(coefs, "f", sep=" ")

                r_tag = self._in_list(tag, unk)
                if r_tag is not False:
                    self.embeddings[r_tag] = torch.from_numpy(coefs)
                    unk.remove(r_tag)

        return unk

# Practical part

## Embeddings to word proba

In [None]:
# solver = Solver("/content/animal10-roberta-base-0.csv")

# totest = solver.embeddings["cat"]

# solver(totest, "cat")
# print(solver.score(totest, "dog"))
# print(solver.mean_squared_score(totest, "dog"))
# print(solver.least_squared_score(totest, "dog"))

solver_advanced = OutOfVocabSolver()

totest = [ 0.7862, -1.3992, -4.2574, -2.4260, -1.8522, -4.9722,  1.5762,  4.2181, 2.7479,  0.4882, -3.8462,  0.3663,  1.6023,  1.0386, -2.1188, -3.6747, -3.3854, -1.9940, -0.9328,  0.2879, -2.9643,  4.0447,  2.3717, -1.2791, 2.0610,  3.1366, -3.9570, -1.6283,  0.8152, -2.0822,  1.6625, -0.3196,
        -2.6504, -4.9068,  2.3788,  0.9988,  4.3755,  0.7723, -0.3931, -1.9057, -5.7305,  0.1909, -2.5958, -2.7752, -2.6477,  2.0469, -0.4043,  1.5702, -1.8779,  1.7021,  0.8994,  0.5958, -7.5035,  1.0676,  2.3697, -1.0680, -3.2171,  1.1602, -2.5989, -1.4238,  0.7296,  2.6037, -2.8898,  2.3770,
         2.5836,  2.9701, -3.8032,  0.5565,  9.3480,  0.8646, -0.9671,  0.5712, -2.0034, -2.5540,  1.9856, -1.7896, -0.7399, -4.1039,  1.8379,  0.3972, -3.5441, -3.4940,  1.7804, -1.5334,  2.1030,  3.3775,  3.8066, -2.0847, -0.7945, -0.5985,  2.5008, -1.0824,  0.3379, -1.9175, -2.3626,  2.9385,
         2.5060,  0.5920, -2.1384,  1.0008, -1.5074, -1.7252,  1.9592, -1.6477, -2.2149,  0.5712, -5.1326, -3.3159, -0.0771,  5.4355, -0.7996,  2.2194, 2.8152,  3.1241,  0.2127, -0.4711, -1.0102,  2.6980,  3.4281,  3.9176, 3.4158,  4.7137,  0.0745, -1.0222, -0.5676, -3.1130, -1.5231, -2.9252,
        -0.8840,  1.0718,  5.9833, -0.0269, -1.9574, -0.8195,  1.0675,  2.6848, 1.4984,  1.1614,  1.4133, -3.3854, -3.5907,  0.9117,  3.2912,  2.6879,  0.7066,  0.4763,  2.1558, -2.6881,  1.3685, -2.8319, -1.3616,  2.9175, 2.3130,  3.4277,  0.4531, -1.3417,  0.6093, -0.9372, -0.7949, -4.0459,
        -0.3607,  2.7486,  3.3538,  3.0184,  1.3375, -2.2510,  2.5175, -2.4213, 2.3837, -0.0391,  0.2196,  4.3168,  2.0774, -1.7432,  2.5663,  3.0488, 4.4666,  0.4470, -0.7640, -3.6402,  3.4627, -0.0654, -2.9533, -3.1311, 5.8393,  4.9384, -3.4591,  0.9942,  4.2474, -3.5547,  3.4043,  2.3078,
        -0.2588,  0.9352,  2.2546, -1.2557,  0.5715,  1.5651, -0.8553,  4.0117, -1.8648,  4.4550, -1.7419,  2.7950, -0.4952,  1.2926, -0.4914, -1.1312, -0.9716, -0.3429,  1.6704,  2.2321,  3.8714,  3.3676,  1.7274, -1.3730, 3.8956, -0.5492,  1.2591, -2.2064,  4.4911, -4.2279, -0.5980,  1.6788,
        -0.6136, -2.7168,  3.7387,  3.8866,  3.1262,  2.0629, -2.6571, -1.4073, 3.9042,  0.0150,  1.5226, -0.8405,  0.8074,  1.5322, -1.4147, -2.0555, -0.4014, -0.8865,  1.5190, -0.6994, -3.5390, -2.2263,  0.8470,  3.6711, 2.4342, -0.9778, -1.2095,  2.3892,  2.9762,  1.0258, -1.6725,  1.0209,
         0.4480,  1.1777, -2.1264, -1.7206, -3.9378, -2.3451,  2.7490,  0.8951, 0.7263,  2.8524,  1.7494,  0.9067,  0.1311,  0.4994,  0.3548, -2.7868, -0.1992,  5.1012,  0.3275,  1.9652,  1.1535, -3.4700,  0.4942, -0.6859, 0.6918, -2.8642, -2.0151,  1.3709,  1.3846,  0.1103,  0.7877, -0.6042,
        -1.1805,  0.2325,  2.1466,  2.7526,  0.7050,  0.8945,  4.1594,  0.3798, -1.0938,  0.7618, -1.3813, -1.3719]

solver_advanced(totest, "chimpanzee")

NameError: ignored

##Word to embeddings

In [None]:
#@title List of vocab :
#@markdown * animal10
#@markdown * cifar 10 / 100 
#@markdown * imagenet / subimagenet (200 first)
#@markdown * king / queen 

labels_path = tf.keras.utils.get_file('ImageNetLabels.txt','https://storage.googleapis.com/download.tensorflow.org/data/ImageNetLabels.txt')
imagenet = list(np.array(open(labels_path).read().splitlines()))
subimagenet = imagenet[:200]

animal10 = ["dog", "cat", "horse", "spider", "butterfly", "chicken", "sheep", "cow", "squirrel", "elephant"]

cifar10  = ["airplane", "automobile", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck"]

cifar100 = ["apple", "aquarium_fish", "baby", "bear", "beaver", "bed", "bee", "beetle", "bicycle", "bottle", "bowl", "boy", "bridge", "bus", "butterfly", "camel", "can", \
            "castle", "caterpillar", "cattle", "chair", "chimpanzee", "clock", "cloud", "cockroach", "computer_keyboard", "couch", "crab", "crocodile", "cup", \
            "dinosaur", 'dolphin', 'elephant', 'flatfish', 'forest', 'fox', 'girl', 'hamster', 'house', 'kangaroo', 'lamp', 'lawn_mower', 'leopard', 'lion', \
            'lizard', 'lobster', 'man', 'maple_tree', 'motorcycle', 'mountain', 'mouse', 'mushroom', 'oak_tree', 'orange', 'orchid', 'otter', 'palm_tree', 'pear', \
            'pickup_truck', 'pine_tree', 'plain', 'plate', 'poppy', 'porcupine', 'possum', 'rabbit', 'raccoon', 'ray', 'road', 'rocket', 'rose', 'sea', 'seal', 'shark',\
            'shrew', 'skunk', 'skyscraper', 'snail', 'snake', 'spider', 'squirrel', 'streetcar', 'sunflower', 'sweet_pepper', 'table', 'tank', 'telephone', \
            'television', 'tiger', 'tractor', 'train', 'trout', 'tulip', 'turtle', 'wardrobe', 'whale', 'willow_tree', 'wolf', 'woman', 'worm']

king = ["king", "woman", "man", "queen", "boy", "girl", "male", "female"]

custom = ["outdoor", "food", "clothes", "objects", "small animal", "aquatic animal", "animal", "plant", "insect", "transport"]

In [None]:
#@title Choose a dataset
save_name = "custom" #@param ["animal10", "cifar10", "cifar100", "king", "imagenet", "subimagenet", "custom"]
mapping_save_list = {
    "animal10": animal10,
    "cifar10" : cifar10,
    "cifar100" : cifar100,
    "king" : king,
    "imagenet" : imagenet,
    "subimagenet": subimagenet,
    "custom" : custom
}

vocab = mapping_save_list[save_name]

In [None]:
#@title Choose a model and params
window_size  = 300 #@param ["0", "100", "200", "300", "400", "512"] {type:"raw"}
is_big       = False #@param {type:"boolean"}
model_choice = "Wikipedia2Vec" #@param ["ROBERTA", "BERT", "Wikipedia2Vec", "DocBERT", "DocBERTA", "GloVe"]

if model_choice == "ROBERTA":
    model = ROBERTAModel(vocab, big = is_big, window = window_size)
elif model_choice == "BERT":
    model = BERTModel(vocab, big = is_big, window = window_size)
elif model_choice == "Wikipedia2Vec":
    model = Wiki2VecModel(vocab, size = window_size)
elif model_choice == "DocBERT":
    model = DocBERTModel(vocab, big = is_big)
elif model_choice == "DocBERTA":
    model = DocBERTAModel(vocab, big = is_big)
elif model_choice == "GloVe":
    model = GloVEModel(vocab)
    
articlesRetriever = WikipediaArticleRetriever(save_name + ".art", vocab)

In [None]:
csv_file = f"{save_name}-{model.model_size}-{model.window_size}.csv"

if articlesRetriever():
    articlesRetriever.save()

model.reset_embeddings()
for i, super in enumerate(model.__class__.__bases__):
    print(i, super.__name__)

if not model.check_embeddings_exist(csv_file, articlesRetriever):
    model.convert(articlesRetriever)
    model.export(csv_file)

print("\n", len(model.get_class_list()))

loading articles              : 100%|██████████| 10/10 [00:12<00:00,  1.20s/it]


0 FixedEmbedding
['outdoor', 'food', 'clothes', 'objects', 'small animal', 'aquatic animal', 'animal', 'plant', 'insect', 'transport']
1 items haven't been found, resolve mode.
press enter to resume resolve
['outdoor', 'food', 'clothes', 'objects', 'small animal', 'aquatic animal', 'animal', 'plant', 'insect', 'transport']

 10


In [None]:
# wiki2vec_dict = model.model.dictionary

# with open("./temp/wiki_article.txt", "w") as f:
#     for word in wiki2vec_dict.words():
#         print(word.text, file=f)

#     for ent in wiki2vec_dict.entities():
#         print(ent.title, file=f)
# !unzip -p /content/glove.840B.300d.zip > test

with open("glove_keys.txt", "w") as gf:

    with open("/content/glove.840B.300d", "r") as file:
        for line in file:
            tag, coefs = line.split(maxsplit=1)
            print(tag, file = gf)



## Neighboor joining Tree

In [None]:
neighboor_sim = SimilarityMatrix(csv_file, CosineSim())

ids, data = neighboor_sim.get_sim_matrix()

ids = [tids.replace(" ", "_") for tids in ids]
inv_data  = sim2dist(data, lambda x: 1 - x) 

# print()
# print_mat(data_inv, format_function = lambda x: round(float(x), 2))

computing sim matrix          : 100%|██████████| 68/68 [00:00<00:00, 282.01it/s]


In [None]:
dm = DistanceMatrix(inv_data, ids)
tree = nj(dm)

with open(f"{save_name}-{model_choice}-.tree", "w") as f:
    print(tree, file = f)

print("\n\n", tree.ascii_art())

## wikipedia debug

In [None]:
#@title Article to search for { run: "auto", vertical-output: true, display-mode: "both" }
totest = "buck" #@param {type:"string"}

result = wikipedia.search(totest, suggestion = False)
print(result)
print(f"first result is: {result[0]}")

try:
    print(wikipedia.page(totest, auto_suggest=False, redirect=True))
    print(wikipedia.page(result[0], auto_suggest=False, redirect=True))
except Exception as e:
    print(f"best option envisaged: {e.options[0]}")
    print(e)

In [None]:
articleviewer = ArticleViewer("/content/article/cifar100.art")
print(articleviewer.get_all_articles())
articleviewer.get(articleviewer.get_all_articles()[0]).summary

# New Solver

In [120]:
# toguess 2
totest = [-1.122,2.5493,-6.4934,5.1244,1.8322,-2.4903,1.7885,0.28059,-0.6473,2.7094,-1.7041,0.12659,-2.1201,1.7909,-3.4064,-1.5044,-2.3428,-0.21746,-0.51735,-0.87518,-1.8803,4.1981,-0.57839,-0.75715,-3.2789,0.094055,-2.5323,3.5364,-1.1887,-0.32185,0.83655,-2.5367,-1.5664,-1.6144,0.96499,1.101,5.7738,-1.4313,1.6498,-1.4691,-1.2193,4.6139,1.1892,0.95094,0.28716,5.3983,-2.1434,1.9556,-2.588,3.8573,1.706,-1.4814,-3.8649,3.3019,1.1959,2.1542,-0.88526,-2.5411,-0.2668,-2.5881,1.697,5.5959,-4.8047,-1.3409,0.88701,2.4658,-2.609,1.9621,3.1595,2.8576,-0.74223,-0.89093,-4.2198,3.4845,-1.988,1.8015,-1.7093,-3.737,-2.9994,0.74645,-5.3243,1.5403,-0.60917,-4.2911,-2.9025,-1.7992,-0.8596,0.21249,-3.7441,0.28825,-0.88344,-2.254,-1.3114,-2.6239,0.14179,-1.3166,2.6839,-1.4652,-0.83398,2.9815,-4.0996,-2.8034,3.9212,-2.2096,-0.87033,1.7632,-2.8293,-2.144,2.2181,2.7939,-3.6677,5.0149,-0.79786,2.0267,-0.1555,0.12539,-2.2854,1.7482,0.88181,4.9972,3.4045,6.2988,-2.4562,-0.41591,-1.8707,-2.8566,-3.9587,0.08361,-2.1874,-2.5128,3.8934,-0.45989,-0.082235,0.67877,-0.16172,1.5006,3.0004,2.9907,3.9425,3.0808,-2.2809,-1.0965,-0.55863,1.0862,-2.4945,-0.0015425,5.6565,-2.087,0.11066,-1.2479,4.0543,-0.60213,-0.53165,3.5236,-1.9206,1.6709,1.2394,0.50941,3.7742,-2.5508,-0.50954,-2.0542,1.1629,1.821,2.4256,-3.0941,2.8343,-0.2542,0.6906,0.59896,1.784,0.9476,2.8348,-0.27728,0.64667,4.4519,4.7255,-0.99693,-5.1451,-2.0515,2.8238,2.2864,-0.86283,0.97159,-0.4185,8.5407,-1.5291,1.0881,2.1247,-4.0368,3.1599,1.4262,0.41397,-1.8739,2.0356,2.9822,1.5995,-0.69884,1.3198,0.92781,4.842,2.0718,-5.3633,-0.47037,-2.0025,0.71227,1.6302,-1.3304,-1.6301,0.92339,1.2812,-2.1767,3.3847,2.6614,-0.89659,-0.3342,1.2194,-2.9878,3.2775,-1.5613,0.30171,-3.9471,1.941,0.83132,0.21692,0.93135,7.3327,-1.6191,5.4914,1.9914,-3.0784,1.3881,-1.5109,2.6814,0.041593,-3.7304,2.3987,-0.13929,-4.45,-1.5211,-5.2136,0.40236,-0.50133,-0.096171,-3.9962,0.26384,0.29978,3.2868,2.6498,0.94902,-2.5184,1.526,2.8469,-0.35854,-3.2803,2.3363,-0.56372,-1.8674,1.3576,-1.5291,-1.4955,-4.8288,1.019,-1.4444,3.4142,2.5302,1.64,-1.2125,0.034562,-0.96021,-5.5353,-3.1208,0.59937,3.9008,1.6573,-0.10828,-1.2077,0.13008,-1.8282,-2.2147,-0.50857,-1.5127,-0.23297,0.44002,1.3732,1.5556,0.93295,-1.888,0.46196,-1.9693,2.7558,1.6251,4.6096,-1.1334,-0.95527,0.79511,2.7323,-0.16591,-1.4546,2.6422]

replace __community__ by __community.community_louvain__ in module file

In [121]:
from Orange.clustering.hierarchical import dist_matrix_linkage, tree_from_linkage, data_clustering, leaves, WEIGHTED
from Orange.data import Table, Domain
from Orange.distance.distance import Cosine
from Orange.widgets.unsupervised.owhierarchicalclustering import clusters_at_height
from scipy.cluster.hierarchy import dendrogram
from itertools import chain
from Orange.data.variable import StringVariable

In [126]:
generic_table = Table("/content/imagenet-wikipedia2vec-300.csv")
supp_info_table = Table("/content/class_map_imagenet.csv")

superclass_embeddings = Table("/content/custom-wikipedia2vec-300_superclass.csv")

CLUSTER_THRESOLD = 0.75
GROUP_BY = "first superclass"
SIM_THRESOLD = 0.3


def left_join(generic_table, supp_info_table, key: str = "embeddings") -> Table:
    """add all <b> metas </b> column from supp_info_table to generic_table using key as joint
    """
    assert key in list(map(lambda x : x.name, supp_info_table.domain.metas)), "embeddings name not present in additional data"
    assert len(generic_table) == len(supp_info_table), "table don't contain the same number of line"

    name_supp_data = [i.name for i in chain(supp_info_table.domain.metas, 
                                            supp_info_table.domain.variables, 
                                            supp_info_table.attributes) if i.name != key]
                                            
    supp_list_list = [[] for i in range(len(name_supp_data))]

    for s in generic_table:
        for d in supp_info_table:
            if s[key] == d[key]:
                for i, name in enumerate(name_supp_data):
                    supp_list_list[i].append(d[name])
                break

    for i, name in enumerate(name_supp_data):
        # print(f"adding {name}")
        generic_table = generic_table.add_column(StringVariable(name), supp_list_list[i])

    return generic_table

def add_to_list(cluster, list_to_add_to):
    """ decompose a cluster tree by adding the index of all children in the list
    """
    if cluster.is_leaf:
        list_to_add_to.append(cluster.value.index)

    for branch in cluster.branches:
        add_to_list(branch, list_to_add_to)

def clusterize(table : Table, thresold) -> Table:
    """clusterize a Oranga Table based on the height of THRESOLD
    """
    cluster_tree = clusters_at_height(data_clustering(table, distance=Cosine, linkage=WEIGHTED), thresold)

    list_cluster = {}
    for i, cluster in enumerate(cluster_tree):
        cluster_name     = 'C' + str(i) 

        current = []
        add_to_list(cluster, current)

        for item_index in current:
            list_cluster[item_index] = cluster_name
        # print(cluster_name, list(map(lambda x: table[x]["embeddings"].value, current)))

    table = table.add_column(StringVariable("Cluster"), [list_cluster[i] for i in range(len(table))])
    return table

def compute(lst):
    return max(lst,key=lst.count)

def one_pass(table, keep_cluster_line : bool = False, cluster_thresold : float = CLUSTER_THRESOLD, sim_thresold : float = SIM_THRESOLD):
    assert GROUP_BY in list(map(lambda x: x.name, chain(generic_table.domain.metas, generic_table.domain.variables, generic_table.domain.attributes))), "Group by not in the Table !"

    table = clusterize(table, cluster_thresold)

    #===========================================================================
    # Cluster split
    toguess_cluster = [d["Cluster"] for d in table if d["embeddings"] == "TOGUESS"][0]

    in_cluster_table  = Table.from_list(table.domain, [d for d in table if d["Cluster"].value == toguess_cluster])
    out_cluster_table = Table.from_list(table.domain, [d for d in table if d["Cluster"].value != toguess_cluster])
    
    #===========================================================================
    # Group by computation

    filter_list = []
    for row in in_cluster_table:
        filter_list.append(row[GROUP_BY].value)

    main_superclass = compute(filter_list)
    main_superclass_table = Table.from_list(superclass_embeddings.domain, [i for i in superclass_embeddings if i["embeddings"] == main_superclass])
    main_superclass_table = Table.concatenate([in_cluster_table, Table.from_table(out_cluster_table.domain, main_superclass_table)])
    #===========================================================================
    # thresold computation

    to_copy_row_instance = [d for d in main_superclass_table if d["embeddings"] == "TOGUESS"][0]
    to_copy = list(to_copy_row_instance.attributes())

    to_compare_row_instance = [d for d in main_superclass_table if d["Cluster"] == "?"][0]
    to_compare = list(to_compare_row_instance.attributes())

    dead_row = [k for k, (i, j) in enumerate(zip(to_copy, to_compare)) if abs(i - j) < sim_thresold]

    #===========================================================================
    # reconstruct the table filtering dead row and cluster. Remove used cluster row if 
    # keep_cluster_line is set to False
    new_domain = Domain(attributes = [i for i in out_cluster_table.domain.attributes if int(i.name) not in dead_row], 
                        metas      = [i for i in out_cluster_table.domain.metas if i.name != "Cluster"])

    # do the same on the data
    data_attr = []
    data_meta = []
    whole_data = list(out_cluster_table) + list(toguess_table)
    if keep_cluster_line: whole_data += list(in_cluster_table)

    for rowinstance in whole_data:
        data_attr.append([rowinstance[k] for k, i in enumerate(out_cluster_table.domain.attributes) if int(i.name) not in dead_row])
        data_meta.append([rowinstance.metas[k] for k, i in enumerate(out_cluster_table.domain.metas) if i.name != "Cluster"])

    return Table.from_numpy(new_domain, X = data_attr, metas = data_meta), \
        { "cluster" : {
                "name" : toguess_cluster,
                "size" : len(in_cluster_table),
                "thresold": cluster_thresold
            },
           "keep_cluster_line"  : keep_cluster_line,
           "sim_thresold"       : sim_thresold,
           "removed_col"        : len(dead_row) 
        }

In [127]:
generic_table = left_join(generic_table, supp_info_table)

toguess_table = Table.from_numpy(generic_table.domain, [np.array(totest)], Y = None, metas = np.char.asarray([["TOGUESS", "?", "?"]]))
table = Table.concatenate([generic_table, toguess_table])

In [128]:
data_list = []

for i in tqdm(range(5), total=5, desc="hierarchical clustering pass"):
    table, data = one_pass(table)
    data_list.append(data)

hierarchical clustering pass:  40%|████      | 2/5 [00:08<00:12,  4.28s/it]


IndexError: ignored

In [118]:
print("\n".join(data_list))

[-1.122, 2.5493, -6.4934, 5.1244, 1.8322, -2.4903, 1.7885, -0.6473, 2.7094, -1.7041, -2.1201, 1.7909, -3.4064, -1.5044, -2.3428, -0.51735, -0.87518, -1.8803, 4.1981, -0.57839, -0.75715, -3.2789, -2.5323, 3.5364, -1.1887, -0.32185, 0.83655, -2.5367, -1.5664, -1.6144, 0.96499, 1.101, 5.7738, -1.4313, 1.6498, -1.4691, -1.2193, 4.6139, 1.1892, 0.95094, 0.28716, 5.3983, -2.1434, 1.9556, -2.588, 3.8573, 1.706, -1.4814, -3.8649, 3.3019, 1.1959, 2.1542, -0.88526, -2.5411, -0.2668, -2.5881, 1.697, 5.5959, -4.8047, -1.3409, 0.88701, 2.4658, -2.609, 1.9621, 3.1595, 2.8576, -0.74223, -0.89093, -4.2198, 3.4845, -1.988, 1.8015, -1.7093, -3.737, -2.9994, 0.74645, -5.3243, 1.5403, -0.60917, -4.2911, -2.9025, -1.7992, -0.8596, 0.21249, -3.7441, 0.28825, -0.88344, -2.254, -1.3114, -2.6239, -1.3166, 2.6839, -1.4652, -0.83398, 2.9815, -4.0996, -2.8034, 3.9212, -2.2096, -0.87033, 1.7632, -2.8293, -2.144, 2.2181, 2.7939, -3.6677, 5.0149, -0.79786, 2.0267, -0.1555, -2.2854, 1.7482, 0.88181, 4.9972, 3.4045, 6

# Test part

## Tests 

### Test superclass

In [None]:
class Test:

    def __init__(self, name):
        self.name = name
        self.vocab = []
    
    def _start(self, model : WordToVector, articlesRetriever : ArticleRetriever):
        articlesRetriever.set_list_vocab(f"{self.name}.art", self.vocab)

        if articlesRetriever(force_reload = False):
            articlesRetriever.save()

        self.save_file = f"test-{self.name}-{model.model_size}-{model.window_size}.csv"
        model.set_list_class(self.vocab)

        if not model.check_embeddings_exist(self.save_file, articlesRetriever):
            model.convert(articlesRetriever)
            model.export(self.save_file)


    def _end(self):
        raise NotImplementedError

    def __call__(self, model, articlesRetriever):

        logging.info(f"Start test {self.name}")

        tic = perf_counter()
        self._start(model, articlesRetriever)
        result = self._end(model)
        toc = perf_counter()

        logging.info(f"End test {self.name}")

        return result, toc - tic

### Embedding distance Test

In [None]:
class EmbeddingDistanceTest(Test):

    def __init__(self, vocab, thresold, name):
        Test.__init__(self, name)

        self.thresold = thresold
        self.vocab = vocab

    def _end(self, model):
        
        sim_matrix = SimilarityMatrix(self.save_file, CosineSim())
        ids, cosine_mat = sim_matrix.get_sim_matrix()

        sim_list = []

        for i, idsa in tqdm(enumerate(ids), total = len(ids), desc=f"{'listing sim matrix':30}"):
            for j, idsb in enumerate(ids):

                if i == j: continue 

                cos_val = cosine_mat[i][j]

                if cos_val >= self.thresold:
                    sim_list.append((idsa, idsb, cos_val))

        sim_list.sort(key=lambda x: x[2], reverse = True)
        return len(sim_list) // 2


### Syntactic test

In [None]:
class SyntacticTest(Test):

    def __init__(self, quadruple_set : Tuple[str, str, str, str], name : str):
        Test.__init__(self, name)

        self.relations : Tuple[str, str, str, str] = quadruple_set

        for relation in self.relations:
            for item in relation:
                if item not in self.vocab:
                    self.vocab.append(item)

    def _end(self, model):

        solver = Solver(self.save_file)
        top1, top3, top5, top10 = 0, 0, 0, 0

        for w1, w2, w3, w4 in tqdm(self.relations, total=len(self.relations), desc=f"{'calculating relations':30}"):

            w1_emb = model.get_embedding_of(w1).numpy()
            w2_emb = model.get_embedding_of(w2).numpy()
            w3_emb = model.get_embedding_of(w3).numpy()

            totest = w1_emb - w2_emb + w3_emb
            result = solver.get_nearest_embedding_of(torch.from_numpy(totest), 13)
            filtered_result = list(filter(lambda x: x != None, map(lambda x: x[0] if x[0] not in [w1, w2, w3] else None, result)))

            if filtered_result[0] == w4:
                top1 += 1

            if w4 in filtered_result[:3]:
                top3 += 1

            if w4 in filtered_result[:5]:
                top5 += 1

            if w4 in filtered_result[:10]:
                top10 += 1

        
        return list(map(lambda x : x / len(self.relations), (top1, top3, top5, top10)))

### Similarity Test

In [None]:
class SimilarityTest(Test):

    def __init__(self, pair_set : Tuple[str, str, float], pair_name):
        Test.__init__(self, pair_name)

        self.pair : Tuple[str, str, float] = pair_set

        for w1, w2, i in self.pair:
            if w1 not in self.vocab:
                self.vocab.append(w1)
            if w2 not in self.vocab:
                self.vocab.append(w2)
    
    def _end(self, model):
        sim_list = []
        i_list = []

        sim_computer = SimilarityMatrix(self.save_file, CosineSim())

        for w1, w2, i in tqdm(self.pair, total=len(self.pair), desc=f"{'calculating pair':30}"):
            sim = sim_computer.sim_between(w1, w2)

            sim_list.append(sim)
            i_list.append(i)

        result = spearmanr(sim_list, i_list)
        return (result.correlation, result.pvalue)

In [None]:
def print_list(listed):
    print(f"{'word':20}{'target':20}{'similarity':10}")
    print("="*50)
    for ida, idb, sim in listed:
        print(f"{ida:20}{idb:20}{round(float(sim), 6):6}")


## Pipeline

In [None]:
class TestPipeline():

    list_test = [ SimilarityTest(nlp.data.WordSim353('all'), "Wordsim353"), 
                  SimilarityTest(nlp.data.SimLex999('all') , "SimLex999") ,
                  SyntacticTest(nlp.data.GoogleAnalogyTestSet(), "GoogleAnalogy"),
                  EmbeddingDistanceTest(cifar100, 0.80, "Imagenet")
                ]

    def __init__(self, model, articleRetriever, list_test = None):
        self.model = model
        self.articleRetriver = articleRetriever

        if list_test == None:
            self.list_test = TestPipeline.list_test
        else:
            self.list_test = list_test 

    def execute(self):

        for i, test in enumerate(self.list_test):

            print(f"Test {i} : {test.name}".center(80, "="))
            res, time_elapsed = test(self.model, self.articleRetriver)
            print(f"\n{res}")
            print(f"{round(time_elapsed, 2)} sec.".center(80, "="))

# main

In [None]:
gc.collect()

22132

In [None]:
def all_models_gen():
    # yield BERTModel([], big = False, window = 0)    
    # yield BERTModel([], big = False, window = 100)
    # yield BERTModel([], big = False, window = 300)
    # yield BERTModel([], big = False, window = 512)
    # yield BERTModel([], big = True, window = 0)
    # yield BERTModel([], big = True, window = 100)
    # yield BERTModel([], big = True, window = 300)
    # yield BERTModel([], big = True, window = 512)
    # yield ROBERTAModel([], big = False, window = 0)
    # yield ROBERTAModel([], big = False, window = 100)
    # yield ROBERTAModel([], big = False, window = 300)
    # yield ROBERTAModel([], big = False, window = 512)
    # yield ROBERTAModel([], big = True, window = 0)
    # yield ROBERTAModel([], big = True, window = 100)
    # yield ROBERTAModel([], big = True, window = 300)
    # yield ROBERTAModel([], big = True, window = 512)
    # yield DocBERTModel([], big = False)
    # yield DocBERTModel([], big = True)
    # yield DocBERTAModel([], big = False)
    # yield DocBERTAModel([], big = True)
    # yield Wiki2VecModel([])
    yield GloVEModel([])

def split_test(name):
    print()
    print("#" * 80)
    print("#", name.center(78, " "), "#", sep="")
    print("#" * 80)
    print()

def one_test_all_model(test_pipeline, articleRetriever):

    for model in all_models_gen():
        split_test(f"{model.model_size} {model.window_size}")
        TestPipeline(model, articleRetriever, test_pipeline).execute()
        gc.collect()


def all_test_all_model(articleRetriever):

    for model in all_models_gen():
        split_test(f"{model.model_size} {model.window_size}")
        TestPipeline(model, articleRetriever).execute()
        gc.collect()

gc.collect()
logging.info('Starting Main'.center(40, "="))

wikiRetriever = WikipediaArticleRetriever()
wordRetriever = WordNetArticleRetriever()

all_test_all_model(wordRetriever)



################################################################################
#                                  GloVe 300                                   #
################################################################################



loading articles              : 100%|██████████| 437/437 [00:00<00:00, 538457.95it/s]
calculating pair              : 100%|██████████| 352/352 [00:00<00:00, 9956.07it/s]



(0.5755996374954984, 1.9089867180240961e-32)


loading articles              : 100%|██████████| 1028/1028 [00:00<00:00, 249305.84it/s]
calculating pair              : 100%|██████████| 999/999 [00:00<00:00, 13526.78it/s]



(0.3764003606135919, 5.611502131461014e-35)


loading articles              : 100%|██████████| 905/905 [00:00<00:00, 350623.05it/s]
calculating relations         : 100%|██████████| 19544/19544 [23:03<00:00, 14.12it/s]



[0.08089439214081048, 0.16537044617273844, 0.21014121981170691, 0.2770671305771592]


loading articles              : 100%|██████████| 100/100 [00:00<00:00, 3997.58it/s]


10 items haven't been found, resolve mode.
press enter to resume resolve


computing sim matrix          : 100%|██████████| 100/100 [00:00<00:00, 183.69it/s]
listing sim matrix            : 100%|██████████| 100/100 [00:00<00:00, 589.35it/s]



2
