<a href="https://colab.research.google.com/github/Taedriel/ZSL-v2/blob/wordEmbedding/WordsEmbeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preparation part

## import stuff


In [61]:
!yes | pip install transformers wget wikipedia unzip mxnet gluonnlp "scipy>=1.7" scikit-bio wikipedia2vec --quiet --upgrade
!mkdir -p temp article
%xmode Verbose

Exception reporting mode: Verbose


In [62]:
import numpy as  np
import torch
from transformers import BertTokenizer, BertModel, RobertaModel, RobertaTokenizer
import tensorflow as tf
import gluonnlp as nlp

import matplotlib as mpl
import matplotlib.pyplot as plt

from wikipedia2vec import Wikipedia2Vec
import wikipedia
wikipedia.set_rate_limiting(True)

import gc
import traceback
import pickle
import json
import logging
logging.basicConfig(level = logging.INFO, filename = "BERT.log" )
# logging.basicConfig(level = logging.INFO)

from tqdm import tqdm
from typing import List, Tuple, Dict, Callable
from os.path import exists, join, abspath
from os import system
from enum import Enum
from scipy.spatial.distance import cityblock

from scipy.stats import SpearmanRConstantInputWarning
from skbio import DistanceMatrix
from skbio.tree import nj


## Utils classes

### Usefull functions

In [63]:
def dict2csv(filename : str, embeddings : Dict[str, List[float]]) -> None:
    """ write a dict of embeddings under a .CSV file

    the .CSV file is construct with a header looking like this :
    \tembeddings\t | 0 | 1 | 2 | 3 | ...
    where each line contain an embeddings for the word in the first row
    Args:
        filename (str) : a path to the file where the .csv is to be written
        embeddings (Dict[str, List[float]]): a dictionnary of embeddings

    """
    try:
        f = open(filename, "w")
    except OSError:
        raise OSError("Could not open file")

    dimension_number = len(next(iter(embeddings.values())))
    with f:
        print("embeddings", *[str(i) for i in range(dimension_number)], sep=",", file=f)
        for tag, embedding in embeddings.items():
            print(tag, *list(map(lambda x: str(float(x)), embedding)), sep=",", file=f)

def sim2dist(mat : List[List[float]], func : Callable[[float], float] \
             = lambda x: 1 - x, hollow : bool = True) -> List[List[float]]:
    """ map the function func to each elements in the matrix

    apply the lambda function func to each element of the matrix. if hollow is set 
    to True, set the diagonal of the matrix to 0.
    Args:
        mat (List[List[float]]) : a matrix of number
        func (Callable[[float], float]) : a simple function to apply to each elem of the matrix
        hollow (bool) : whether to consider the diagonal of the matrix or not
    
    """
    inv_data = [[0 for i in range(len(mat[0]))] for j in range(len(mat))]

    for i, elem in enumerate(mat):
        for j, case in enumerate(elem):
            if i == j and hollow: 
                inv_data[i][j] = 0
            else:
                inv_data[i][j] = func(case)
    
    return inv_data

def print_mat(mat : List[List[float]], format_function : Callable[[float], str]=lambda x: x) -> None:
    """ print a matrice on stdout

    format each number in the matrice using the format_function
    Args:
        mat (List[List[float]]) : a matrix of number
        format_function (Callable[[float], str]) : a simple format function to display numbers froms the matrix
    """
    for line in mat:
        for case in line:
            print(f"{format_function(case):8}", end="")
        print()

### Strategy

In [64]:
class BERTMergeStrategy:
    """ strategy to extract BERT embeddings
    
    different approach exist, 
    see https://raw.githubusercontent.com/lbourdois/blog/master/assets/images/BERT/bert-feature-extraction-contextualized-embeddings.png
    for more possible strategy
    """

    def merge(self, vector : List[List[float]]) -> List[float]:
        raise NotImplementedError

class Sum4LastLayers(BERTMergeStrategy):

    def merge(self, vector : List[List[float]]) -> List[float]:
        return torch.sum(vector[-4:], dim = 0)

class Concat4LastLayer(BERTMergeStrategy):

    def merge(self, vector : List[List[float]]) -> List[float]:
        return torch.concat(vector[-4:], dim = 0)

class Similarity:
    """strategy to compute similarity between embeddings. Cosine similarity should 
    be the only valid one in word embeddings, other aren't relevant
    """

    def sim(self, embed1 : List[float], embed2 : List[float]) -> float:
        raise NotImplementedError

class CosineSim(Similarity):

    def sim(self, embed1 : List[float], embed2 : List[float]) -> float:
        cos = torch.nn.CosineSimilarity(dim=0)
        return cos(embed1, embed2)

class EuclidianDistSim(Similarity):

    def sim(self, embed1 : List[float], embed2 : List[float]) -> float:
        return np.linalg.norm(embed1-embed2)

class ManhattanDistSim(Similarity):

    def sim(self, embed1 : List[float], embed2 : List[float]) -> float:
        return cityblock(embed1, embed2)

### Article

In [65]:
class customWikiArticle:

    def __init__(self, index : int, title : str, realtitle : str, summary : str, ambiguous : bool):
        self.index = index
        self.title = title
        self.realtitle = realtitle
        self.summary = summary
        self.ambiguous = ambiguous

class ArticleRetriever:

    article_dir = "./article"

    def __init__(self, name : str, list_title : List[str]):

        self.name = name
        self.list_title = list_title
        self.modified = False

        if not exists(self.get_filename()):
            self.articles_map = {}
        else:
            with open(self.get_filename(), "rb") as mapfile:
                self.articles_map = pickle.load(mapfile)

    def get_filename(self):
        return join(ArticleRetriever.article_dir,self.name)

    def load_article(self, title, force_reload : bool = False) -> customWikiArticle:
        if title not in self.articles_map:
            self.modified = True
            realtitle, summary, ambiguous = self._retrieve_article(title)
            self.articles_map[title] = customWikiArticle(len(self.articles_map), title, realtitle, summary, ambiguous)

        if title in self.articles_map and self.articles_map[title].summary == None and force_reload:
            self.modified = True
            realtitle, summary, ambiguous = self._retrieve_article(title)
            self.articles_map[title].summary = summary

        return self.articles_map[title]

    def _retrieve_article(self, title, closed_list : List = []) -> Tuple[str, str, bool]: 
        closed_list.append(title)
        try:
            article = wikipedia.page(title, auto_suggest=False, redirect=True)
            return (article.title, article.summary, False)

        except wikipedia.PageError as e:
            search_result = wikipedia.search(title, suggestion = False)

            logging.warning(f"{title} misspelled or article missing. Best find is {search_result[0]}")
            if search_result[0] is not None and search_result[0] not in closed_list:            
                return self._retrieve_article(search_result[0], closed_list)  
            else: return (title, None, False)

        except wikipedia.DisambiguationError as e:
            logging.warning(f"{title} is ambiguous, trying first {e.options[0]}")
            if e.options[0] is not None and e.options[0] not in closed_list:
                res = self._retrieve_article(e.options[0], closed_list)
                return (res[0], res[1], True)
        
            return (None, None, None)


    def load_all_articles(self, force_reload : bool = False) -> None:
        logging.info(f"Starting loading articles...")
        nb_success = 0

        nb_article = len(self.list_title)
        for i, title in tqdm(enumerate(self.list_title), total=nb_article):
            self.load_article(title, force_reload)

            if self.articles_map[title].summary is not None: 
                nb_success += 1
        logging.info(f"Finished loading {nb_success} article(s) !")
        return self.modified

    def __call__(self, force_reload : bool = True) -> None:
        return self.load_all_articles(force_reload)

    def get_article(self, title):
        if title not in self.articles_map:
            self.load_article(title)

        return self.articles_map[title]
        
    def save(self):
        with open(self.get_filename(), "wb") as mapfile:
            pickle.dump(self.articles_map, mapfile)

class ArticleViewer():

    def __init__(self, filename):
        self.name = filename

        if not exists(self.name):
            raise FileNotFoundError()
        else:
            with open(self.name, "rb") as mapfile:
                self.articles_map = pickle.load(mapfile)

    def get(self, title):
        return self.articles_map[title]

    def get_all_articles(self):
        return self.articles_map.keys()


### Embeddings operations 

In [66]:
class WordToVector:

    def __init__(self, list_tags : List[str] = []):
        self.list_tags = list_tags
        self.embeddings = {}

    def convert(self):
        return NotImplementedError

    def reset_embeddings(self):
        self.embeddings.clear()

    def get_embedding_of(self, token):
        if token not in self.embeddings:
            raise Exception(f"no such token {token}")
        
        return self.embeddings[token]

    def get_class_list(self):
        return self.embeddings.keys()

    def ask_alt(self, word):
        print(f"Unable to find {word} in the dict, choose a replacement : ")
        alt = str(input())
        return alt

    def export(self, filename):
        """export all the embeddings in filename under a .csv format.
           Raise exception if embeddings hasn't been calculed yet."""

        if len(self.embeddings) == 0:
            raise Exception("Tags not converted yet !")
        
        dict2csv(filename, self.embeddings)

class EmbeddingsLoader:

    def __init__(self, filename : str):

        self.file = filename
        self.embeddings = {}

        self._load_file()

    def _load_file(self):
        try:
            with open(self.file, "r") as f:
                lines = f.readlines()
                
            for line in lines[1:]:
                data = line.split(",")
                self.embeddings[data[0]] = torch.FloatTensor(list(map(float, data[1:])))

        except IOError as e:
            raise IOError(f"No file {self.file}")

class SimilarityMatrix(EmbeddingsLoader):

    def __init__(self, embeddings : Dict[str, List[float]], strategy : Similarity):
        super(SimilarityMatrix, self).__init__(embeddings)
        self.strategy = strategy
        self._create_matrix()
        self.computed : bool = False

    def _create_matrix(self) -> None:
        n_tokens = len(self.embeddings)
        self.cosine_sim_matrix : Dict[Dict[float]] = {}
        for tag in self.embeddings.keys():
            self.cosine_sim_matrix[tag] = {}

    def compute_sim(self) -> None:
        """ compute cosine similarity between all vectors """

        logging.info("Computing cosine similarity, this could take some time...")
        for tag, vector in tqdm(self.embeddings.items(), total = len(self.embeddings)):

            for otag, other_vector in self.embeddings.items():

                if otag == tag:
                    continue

                similarity = self.strategy.sim(vector, other_vector)

                self.cosine_sim_matrix[otag][tag] = similarity
                self.cosine_sim_matrix[tag][otag] = similarity

        self.computed = True

    def export_sim_matrix(self, filename):
        if not self.computed:
            self.compute_sim()
        
        try:
            f = open(filename, "w")
        except OSError:
            raise OSError("Could not open file")

        with f:
            print("/", *[tag for tag in self.embeddings.keys()], sep = ",", file = f)

            for tag in self.embeddings.keys():
                print(tag, *[str(round(float(self.cosine_sim_matrix[tag][otag]), 3)) for otag in self.embeddings.keys()], sep = ",", file = f)

    def get_sim_matrix(self) -> Tuple[List[str], List[List[float]]]:
        """return the similarity matrix of the embeddings
        """
        if not self.computed:
            self.compute_sim()

        X = len(self.embeddings)
        matrix = [[0 for j in range(X)] for i in range(X)]
        ids = []
        
        for i, tag in enumerate(self.embeddings.keys()):
            ids.append(tag)
            for j, otag in enumerate(self.embeddings.keys()):
                if i == j:
                    continue

                matrix[i][j] = self.cosine_sim_matrix[tag][otag]
                matrix[j][i] = self.cosine_sim_matrix[tag][otag]

        return ids, matrix

    def sim_between(self, token1 : str, token2 : str) -> float:
        v1 = self.embeddings[token1]
        v2 = self.embeddings[token2]

        if token2 not in self.cosine_sim_matrix[token1] or token1 not in self.cosine_sim_matrix[token2]:
            similarity = self.strategy.sim(v1, v2)
            self.computed = True

            self.cosine_sim_matrix[token1][token2] = similarity
            self.cosine_sim_matrix[token1][token2] = similarity

        return self.cosine_sim_matrix[token1][token2]

class SimilarityMatrixFromDict(SimilarityMatrix):

    def __init__(self, embeddings : Dict[str, List[float]], strategy : Similarity):
        self.embeddings = embeddings
        self._create_matrix()
        self.strategy = strategy
        self.computed : bool = False

### Solver

In [67]:
class Solver(EmbeddingsLoader):

    DEFAULT_MIN_LIST_RESULT = 10

    def __init__(self, embeddings):
        super(Solver, self).__init__(embeddings)

    def get_nearest_embedding_of(self, embedding, nb = 10):

        if nb > len(self.embeddings):
            raise Exception("nb too high, not enough token")

        nearest = []
        for tag, e in self.embeddings.items():

            cos = torch.nn.CosineSimilarity(dim=0)
            similarity = cos(embedding, e)

            nearest.append((tag, similarity))
        
        nearest.sort(key = lambda tup : tup[1])
        return nearest[-1:-nb-1:-1]

    def __call__(self, embeddeding, tag=None):
        result = self.get_nearest_embedding_of(embeddeding, min(Solver.DEFAULT_MIN_LIST_RESULT, len(self.embeddings)))
        if tag is not None:
            print(f"Nearest Word for {tag}:")
        for i in result:
            print(f"\t{i[0]:12}: {round(float(i[1]) * 100, 3)}%")
    
    def score(self, embedding, target):
        target_embeddings = self.embeddings[target]

        cos = torch.nn.CosineSimilarity(dim=0)
        return float(cos(embedding, target_embeddings))

    def least_squared_score(self, embedding, target):
        target_embeddings = self.embeddings[target]
        return float(np.linalg.norm(target_embeddings - embedding))

    def mean_squared_score(self, embedding, target):
        target_embeddings = self.embeddings[target]
        return float(np.square(np.subtract(embedding, target_embeddings)).mean())
        

## Models

### BERT model

In [68]:
class BERTModel(WordToVector):

    temp_dir = "./temp"

    def __init__(self, list_tag : List[str], big: bool = False, window : int = 100):
        super(BERTModel, self).__init__(list_tag)
        self.window_size = window

        self.model_size = "bert-large-uncased" if big else "bert-base-uncased"

        self.tokenizer = BertTokenizer.from_pretrained(self.model_size, padding=True, truncation=True)
        self.model = BertModel.from_pretrained(self.model_size, output_hidden_states = True)

        self.merging_strategy = Sum4LastLayers()

        self.model.eval()

    def convert(self, article_ret : ArticleRetriever):
        """ convert all word in their embeddings"""

        if len(self.list_tags) == 0:
            raise Exception("not tags yet !")

        logging.info("Starting converting tokens...")
        nb_token = len(self.list_tags)
        for i, tag in tqdm(enumerate(self.list_tags), total = nb_token):
            
            if tag in self.embeddings: continue

            article = article_ret.get_article(tag)
            tag_plus_context = tag
            if article.summary is not None:
                max_size_article = min(len(article.summary), self.window_size)
                tag_plus_context = tag + ". " + article.summary[:max_size_article]

            inputs = self.tokenizer(tag_plus_context, return_tensors = "pt")

            with torch.no_grad():
                outputs = self.model(**inputs)

            hidden_states = outputs[2]

            # [# layers, # batches, # tokens, # features] ==> [# tokens, # layers, # features]
            token_embeddings = torch.stack(hidden_states, dim=0)
            token_embeddings = torch.squeeze(token_embeddings, dim=1)
            token_embeddings = token_embeddings.permute(1,0,2)

            self.embeddings[tag] = self.merging_strategy.merge(token_embeddings[0])

### RoBERTa model

In [79]:
class ROBERTAModel(BERTModel):

    def __init__(self, list_tag : List[str], big: bool = False, window : int = 100):
        WordToVector.__init__(self, list_tag)
        self.window_size = window

        self.model_size = "roberta-large" if big else "roberta-base"

        self.tokenizer = RobertaTokenizer.from_pretrained(self.model_size, padding=True, truncation=True)
        self.model = RobertaModel.from_pretrained(self.model_size, output_hidden_states = True)

        self.merging_strategy = Sum4LastLayers()

        self.model.eval()

In [None]:
class DocBERT(BERTModel):

    def __init__(self, list_tag : List[str], big : bool = False, window :int = 100):
        WordToVector.__init__(self, list_tag)
        self.window_size = "document"

        self.model_size = "docbert-large-uncased" if big else "docbert-base-uncased"

        self.tokenizer = BertTokenizer.from_pretrained(padding=True, truncation=True)
        self.model = BertModel.from_pretrained(self.model_size, output_hidden_states = True)

        self.merging_strategy = Sum4LastLayers()

        self.model.eval()

    def convert(self, ar):
        

### Wikipedia2Vec

In [93]:
class Wiki2VecModel(WordToVector):

    file = []

    def __init__(self, list_tag : List[str], size : int = 300):
        WordToVector.__init__(self, list_tag)
        self.window_size = size

        self.model_size = "wikipedia2vec"
        assert size in [100, 300, 500], f"size should be one of this value (100, 300, 500)"
        self.zip_filename = f"enwiki_20180420_{self.window_size}d.pkl.bz2"
        self.unzip_filename = self.zip_filename[:-4]
        
        try:
            self._load()
        except:
            return
        
        self.model = Wikipedia2Vec.load(self.path)

    def _load(self):

        self.path = abspath(self.unzip_filename)

        if exists(self.path):
            return

        try:
            system(f"wget http://wikipedia2vec.s3.amazonaws.com/models/en/2018-04-20/{self.zip_filename}")
        except:
            raise SystemError(f"can't retrieve the file {self.zip_filename}")

        try:
            system(f"bunzip2 ./{self.zip_filename}")
        except:
            raise SystemError(f"can't unzip the file {self.zip_filename}")

    def _retrieve(self, word):
        try:
            return self.model.get_word_vector(word)
        except:
            pass

        try:
            return self.model.get_word_vector(word.capitalize())
        except:
            pass

        try:
            return self.model.get_entity_vector(word)
        except:
            pass

        try:
            return self.model.get_entity_vector(word.capitalize())
        except:
            pass
        
        return None
    
    def _one_turn(self, resolve_dict = {}):
        unk = []

        for word in vocab:
            w = word.replace("_", " ")
            if w in self.embeddings: continue

            if w in resolve_dict:
                embed = self._retrieve(resolve_dict[w])
            else:
                embed = self._retrieve(w)

            if embed is None:
                logging.warning(f"{w} cannot be retrieved.")
                unk.append(word)
            else:
                self.embeddings[w] = torch.from_numpy(embed)
        
        return unk

    def convert(self, ar):
        resolve_filename = f"./temp/{ar.name[:-4]}_resolve.json"
        resolve = {}
        while True: 
            unk_list = self._one_turn(resolve)
        
            if unk_list is None or len(unk_list) == 0:
                break

            print(len(unk_list), "items haven't been found, resolve mode.")

            with open(resolve_filename, 'w') as f:
                resolve_dict = {word: "" for word in unk_list}
                json.dump(resolve_dict, f)

            input("press enter to resume resolve")

            resolve = {}
            with open(resolve_filename, 'r') as f:
                resolve = json.load(f)
                assert(type(resolve) == type(dict()))

# Practical part

## Embeddings to word proba

In [None]:
solver = Solver("/content/animal10-roberta-base-0.csv")

totest = solver.embeddings["cat"]

solver(totest, "cat")
print(solver.score(totest, "dog"))
print(solver.mean_squared_score(totest, "dog"))
print(solver.least_squared_score(totest, "dog"))

##Word to embeddings

In [71]:
labels_path = tf.keras.utils.get_file('ImageNetLabels.txt','https://storage.googleapis.com/download.tensorflow.org/data/ImageNetLabels.txt')
imagenet = list(np.array(open(labels_path).read().splitlines()))
subimagenet = imagenet[:200]

In [87]:
animal10 = ["dog", "cat", "horse", "spider", "butterfly", "chicken", "sheep", "cow", "squirrel", "elephant"]
cifar10  = ["airplane", "automobile", "bird", "cat", "deer", "dog", "frog", "horse", "ship", "truck"]
cifar100 = ["apple", "aquarium_fish", "baby", "bear", "beaver", "bed", "bee", "beetle", "bicycle", "bottle", "bowl", "boy", "bridge", "bus", "butterfly", "camel", "can", \
            "castle", "caterpillar", "cattle", "chair", "chimpanzee", "clock", "cloud", "cockroach", "computer_keyboard", "couch", "crab", "crocodile", "cup", \
            "dinosaur", 'dolphin', 'elephant', 'flatfish', 'forest', 'fox', 'girl', 'hamster', 'house', 'kangaroo', 'lamp', 'lawn_mower', 'leopard', 'lion', \
            'lizard', 'lobster', 'man', 'maple_tree', 'motorcycle', 'mountain', 'mouse', 'mushroom', 'oak_tree', 'orange', 'orchid', 'otter', 'palm_tree', 'pear', \
            'pickup_truck', 'pine_tree', 'plain', 'plate', 'poppy', 'porcupine', 'possum', 'rabbit', 'raccoon', 'ray', 'road', 'rocket', 'rose', 'sea', 'seal', 'shark',\
            'shrew', 'skunk', 'skyscraper', 'snail', 'snake', 'spider', 'squirrel', 'streetcar', 'sunflower', 'sweet_pepper', 'table', 'tank', 'telephone', \
            'television', 'tiger', 'tractor', 'train', 'trout', 'tulip', 'turtle', 'wardrobe', 'whale', 'willow_tree', 'wolf', 'woman', 'worm']

king = ["king", "woman", "man", "queen", "boy", "girl", "male", "female"]


#@title Choose a dataset
save_name = "imagenet" #@param ["animal10", "cifar10", "cifar100", "king", "imagenet", "subimagenet"]
mapping_save_list = {
    "animal10": animal10,
    "cifar10" : cifar10,
    "cifar100" : cifar100,
    "king" : king,
    "imagenet" : imagenet,
    "subimagenet": subimagenet
}

vocab = mapping_save_list[save_name]

In [94]:
window_size  = 100 #@param ["0", "100", "200", "300", "400", "500"] {type:"raw"}
is_big       = False #@param {type:"boolean"}
model_choice = "Wikipedia2Vec" #@param ["ROBERTA", "BERT", "Wikipedia2Vec"]

if model_choice == "ROBERT":
    model = ROBERTAModel(vocab, big = is_big, window = window_size)
elif model_choice == "BERT":
    model = BERTModel(vocab, big = is_big, window = window_size)
elif model_choice == "Wikipedia2Vec":
    model = Wiki2VecModel(vocab, size = window_size)

articlesRetriever = ArticleRetriever(save_name + ".art", vocab)

In [95]:
if articlesRetriever():
    articlesRetriever.save()

model.reset_embeddings()

model.convert(articlesRetriever)

csv_file = f"{save_name}-{model.model_size}-{model.window_size}.csv"

model.export(csv_file)

print("\n", len(model.get_class_list()))

100%|██████████| 1001/1001 [00:00<00:00, 207486.94it/s]


62 items haven't been found, resolve mode.
press enter to resume resolve

 999


In [None]:
# wiki2vec_dict = model.model.dictionary

# with open("./temp/wiki_article.txt", "w") as f:
#     for word in wiki2vec_dict.words():
#         print(word.text, file=f)

#     for ent in wiki2vec_dict.entities():
#         print(ent.title, file=f)

## Neighboor joining Tree

In [36]:
neighboor_sim = SimilarityMatrix(csv_file, CosineSim())

ids, data = neighboor_sim.get_sim_matrix()

ids = [tids.replace(" ", "_") for tids in ids]
inv_data  = sim2dist(data, lambda x: 1 - x) 

# print()
# print_mat(data_inv, format_function = lambda x: round(float(x), 2))

100%|██████████| 999/999 [01:02<00:00, 16.03it/s]


In [37]:
dm = DistanceMatrix(inv_data, ids)
tree = nj(dm, result_constructor=str)

with open(f"{save_name}-.tree", "w") as f:
    print(tree, file = f)

## similarity test

In [None]:
filename = "/content/cifar100-roberta-base-0.csv"


cosine_sim = SimilarityMatrix(filename, CosineSim())
euclidian_dist_sim = SimilarityMatrix(filename, EuclidianDistSim())
manhattan_dist_sim = SimilarityMatrix(filename, ManhattanDistSim())

ids, cosine_mat = cosine_sim.get_sim_matrix()
_  , euclid_mat = euclidian_dist_sim.get_sim_matrix()
_  , manhat_mat = manhattan_dist_sim.get_sim_matrix()

In [None]:
print(max(list(map(max, euclid_mat))))
print(min(list(map(min, euclid_mat))))

print(max(list(map(max, manhat_mat))))
print(min(list(map(min, manhat_mat))))

In [None]:
cosine_sim_list = []
euclid_sim_list = []
manhat_sim_list = []

closed_list = []

# cos_thresold = 0.5
# euc_thresold = 5
# man_thresold = 60

for i, idsa in tqdm(enumerate(ids)):
    for j, idsb in enumerate(ids):

        if i == j: continue 
        if (i, j) in closed_list or (j, i) in closed_list: continue

        cos_val = cosine_mat[i][j]
        euc_val = euclid_mat[i][j]
        man_val = manhat_mat[i][j]

        # if cos_val >= cos_thresold:
        cosine_sim_list.append((idsa, idsb, cos_val))
        
        # if euc_val <= euc_thresold:
        euclid_sim_list.append((idsa, idsb, euc_val))

        # if man_val <= man_thresold:
        manhat_sim_list.append((idsa, idsb, man_val))

        closed_list.append((i, j))
        closed_list.append((j, i))

cosine_sim_list.sort(key=lambda x: x[2], reverse = True)
euclid_sim_list.sort(key=lambda x: x[2], reverse = False)
manhat_sim_list.sort(key=lambda x: x[2], reverse = False)

def print_list(listed):
    print(f"{'word':20}{'target':20}{'similarity':10}")
    print("="*50)
    for ida, idb, sim in listed:
        print(f"{ida:20}{idb:20}{round(float(sim), 6):6}")

print(" Cosine similarity ")
print_list(cosine_sim_list[0:10])
print()

print(" Euclidian distance ")
print_list(euclid_sim_list[0:10])
print()

print(" Manhattan distance ")
print_list(manhat_sim_list[0:10])
print()


## wikipedia debug

In [None]:
#@title Article to search for { run: "auto", vertical-output: true, display-mode: "both" }
totest = "great white shark" #@param {type:"string"}

result = wikipedia.search(totest, suggestion = False)
print(result)
print(f"first result is: {result[0]}")

try:
    print(wikipedia.page(totest, auto_suggest=False, redirect=True))
    print(wikipedia.page(result[0], auto_suggest=False, redirect=True))
except Exception as e:
    print(f"best option envisaged: {e.options[0]}")
    print(e)

In [None]:
articleviewer = ArticleViewer("/content/article/cifar100.art")
print(articleviewer.get_all_articles())
articleviewer.get("king").summary

# Test part

### Different result for King - men + women equation with different context window size

In [None]:
gc.collect()

save_name = "king-test"
vocab = ["king", "queen", "men", "woman"]

# king_test_model = ROBERTAModel(vocab, big = False, window = 0)
try:
    king_test_model = Wiki2VecModel(vocab)
except:
    print("haha")

king_test_articlesRetriever = ArticleRetriever(save_name + ".art", vocab)
if king_test_articlesRetriever():
    king_test_articlesRetriever.save()

result = king_test_model.convert(king_test_articlesRetriever)

if result is not None and len(result) > 0:
    print(result)
king_test_model.export(save_name + ".csv")
king_test_solver = Solver(save_name + ".csv")

men = king_test_model.get_embedding_of("men")
woman = king_test_model.get_embedding_of("woman")
king = king_test_model.get_embedding_of("king")

print(type(men), type(woman), type(king))
#totest = king - men + woman
input("debug")
print()
king_test_solver(totest, "King - man + woman")

100%|██████████| 4/4 [00:00<00:00, 970.51it/s]


trying to retrieve king
trying to retrieve queen
trying to retrieve men
trying to retrieve woman
<class 'torch.Tensor'> <class 'torch.Tensor'> <class 'torch.Tensor'>
debug



NameError: ignored

| model | window size | rank of Queen | distance with first |
|-------|-------------|---------------|-----------|
| bert-large | 0  | 2 |  .0693 |
| bert-large | 10 | 3 |  .1191 | 
| bert-large | 50 | 2 |  .1672 |
| bert-large | 100 | 2 | .1275 |
| bert-large | 150 | 3 | .1134 |
| bert-large | 200 | 3 | .1923 |
| bert-large | 300 | 3 | .0939 |
| bert-large | 400 | 3 | .1455 |
| roberta-large | 0 | 2 | .0001 |
| roberta-large | 10 | 3 | .0011 |
| roberta-large | 50 | 2 | .0029 |
| roberta-large | 100 | 4 | .0061 |
| roberta-large | 150 | 3 | .0023 |
| roberta-large | 200 | 4 | .0055 |
| roberta-large | 300 | 4 | .0127 |
| roberta-large | 400 | 4 | .0045 |








### Pearson correlation rank with different context window size

In [None]:
wordsim353 = nlp.data.WordSim353('all')

Downloading /root/.mxnet/datasets/wordsim353/ws353simrel.tar.gz from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/ws353/ws353simrel.tar.gz...


In [None]:
gc.collect()

save_name = "wordsim353"
vocab = []

for w1, w2, i in wordsim353:
    if w1 not in vocab:
        vocab.append(w1)
    if w2 not in vocab:
        vocab.append(w2)

print(len(vocab))

# wordsim353_model = ROBERTAModel(vocab, big = True, window = 75)
# wordsim353_model = BERTModel(vocab, big = True, window = 75)
wordsim353_model = Wiki2VecModel(vocab, size = 500)

articlesRetriever = ArticleRetriever(save_name + ".art", vocab)

if articlesRetriever(force_reload = True):
    articlesRetriever.save()

resolve = {}
while True: 

    unk_list = wordsim353_model.convert(articlesRetriever, resolve_dict = resolve)

    if unk_list is None or len(unk_list) == 0:
        break

    print(len(unk_list), "items haven't been found, resolve mode.")

    with open("./temp/resolve_test.txt", 'w') as f:
        for word in unk_list:
            print(word, "=>", file=f)

    input("press enter to resume resolve")

    resolve = {}
    with open("./temp/resolve_test.txt", 'r') as f:
        data = f.readlines()
        for line in data:
            line_split = line.split("=>")
            resolve[line_split[0].strip()] = line_split[1].strip()

wordsim353_model.export(save_name + ".csv")

In [None]:
from scipy.stats import spearmanr

total_comparison = 0
sim_list = []
i_list = []

sim_computer = SimilarityMatrix(save_name + ".csv", CosineSim())

for w1, w2, i in wordsim353:
    # print(f"how much {w1} is similar to {w2}:")
    sim = sim_computer.sim_between(w1, w2)

    sim_list.append(sim)
    i_list.append(i)

    # print(f"\t{sim} & {i}")
    total_comparison += 1

result = spearmanr(sim_list, i_list)
print(f"{round(result.correlation, 4)} ({result.pvalue:.4E})")

0.7104 (2.4653E-55)



| model |window size | pearson rank correlation |
|-------|------------|--------------------------|
| bert large | 100 | 0.2158 (5.8353e-05)|
| bert base  | 100 | 0.2284 (6.8215e-05)|
| bert-base  | 300 | 0.1326 (1.27e-02) |
| bert-large  | 300 | 0.2117 (6.2153e-05) |
| bert-large  | 0 | 0.2638 (5.1232-07) |
| bert-base   | 0 | 0.3721 (5.2306-13) |
| wiki2vec  | 300 | 0.6913 (2.4595e-51) |
| roberta-base | 0 | 0.1656 (1.8187E-03) |
| roberta-large | 0 | 0.2416 (4.5395E-06) |
| roberta-large | 100 | 0.0419 (4.3274E-01) |
| roberta-base | 100 | 0.0672 (2.0850E-01) |
| roberta-base | 200 | 0.1926 (2.7871E-04) | 
| roberta-large | 200 | 0.1712 (1.2653E-03) |
| bert-base | 200 | 0.1587 (2.8283E-03) |
| bert-large | 200 | 0.1216 (2.2481E-02) |
| bert-large | 400 | 0.1947 (2.3855E-04)|
| bert-base | 400 | 0.1709 (1.2842E-03) |
| roberta-basae | 300 | 0.1389 (9.0722E-03) |
| roberta-large | 300 | 0.1335 (1.2189E-02) |
| roberta-large | 400 | 0.2301 (1.2986E-05) |
| roberta-base | 400 | 0.1728 (1.1315E-03) |


