# Menu <a class="anchor" id="menu"></a>
   
* [1. Préparatifs](#init)
* [2. Recherche du `plongement` le plus adapté](#EMBEDDING)
    * [3.1 Pretrained Word2Vec `word2vec-google-news-300`](#EMBEDDING_Word2Vec)
    * [3.2 Pretrained FastText `fasttext-wiki-news-subwords-300`](#EMBEDDING_FastText)
    * [3.3 Pretrained Glove `Stanford's GloVe 100d`](#EMBEDDING_glove6B100d)
    * [3.4 Pretrained Glove `glove-twitter-25`](#EMBEDDING_glove25)
    * [3.5 Pretrained Glove `glove-twitter-100`](#EMBEDDING_glove25)
    * [3.6 Word2Vec `local training`](#EMBEDDING_Word2Vec_local)
    * [3.7 FastText `local training`](#EMBEDDING_FastText_local)
    * [3.8 Comparaison des scores](#EMBEDDING_scores)

In [18]:
import os
import time
import pathlib
import gzip

import joblib
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
try:
    from keras.utils import pad_sequences
except ImportError:
    from keras.preprocessing.sequence import pad_sequences

import joblib

# Seed value applied to NumPy and to Keras/TensorFlow below.
random_seed = 0
np.random.seed(random_seed)

# Set the random seed for Keras reproducibility; if the Keras helper is
# missing or raises, fall back to seeding TensorFlow directly.
try:
    keras.utils.set_random_seed(random_seed)
except Exception:
    tf.random.set_seed(random_seed)

#### Définissons des fonctions permettant de charger les embeddings pré-calculés

In [67]:
from gensim.models import KeyedVectors
import gensim.downloader as api

def load_genim_embedding(embedding_name, binary=False):
    """Load a pre-trained gensim embedding identified by ``embedding_name``.

    A local copy at ``<cwd>/data/embedding_models/<embedding_name>.gz`` is
    preferred and read with ``KeyedVectors.load_word2vec_format``; when that
    file does not exist the model is requested through ``gensim.downloader``
    (``api.load``) instead.

    Parameters
    ----------
    embedding_name : str
        Name of the model; also the base name of the local ``.gz`` file.
    binary : bool, optional
        Forwarded to ``KeyedVectors.load_word2vec_format`` for the local file.

    Returns
    -------
    The loaded model, or ``None`` when loading failed (the error is printed
    rather than raised).
    """
    models_dir = pathlib.Path().absolute() / 'data' / 'embedding_models'
    local_archive = models_dir / f'{embedding_name}.gz'

    try:
        # No local copy: let the gensim downloader resolve the name.
        if not local_archive.is_file():
            print("Loading from the Git repos with API")
            return api.load(embedding_name)

        print(f"Loading from {local_archive}")
        return KeyedVectors.load_word2vec_format(local_archive, binary=binary)
    except Exception as err:
        # Best-effort loader: report the problem and hand back None.
        print(f"The provided embedding model couldn't be loaded correctly: {err}")
        return None
        
def load_trained_glove(embedding_name):
    """Load text-format GloVe vectors into a ``{word: vector}`` dictionary.

    The model is looked up in ``<cwd>/data/embedding_models``, first as
    ``<embedding_name>.gz`` (gzip-compressed text) and then as
    ``<embedding_name>.txt``. Every usable line holds a word followed by its
    space-separated coefficients.

    Parameters
    ----------
    embedding_name : str
        Base name of the model file, without extension.

    Returns
    -------
    dict or None
        Mapping from each word to a float32 NumPy array of coefficients, or
        ``None`` when no file was found or loading failed (the error is
        printed rather than raised).
    """

    def parse_file(file):
        """Build the word -> coefficients mapping from an open text file."""
        embeddings_index = {}
        for line in file:
            fields = line.split(maxsplit=1)
            if len(fields) != 2:
                # Blank line or a token without coefficients: skip it rather
                # than abort the whole load on a stray line.
                continue
            word, coefs = fields
            embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")
        return embeddings_index

    try:
        models_dir = pathlib.Path(pathlib.Path().absolute(), 'data', 'embedding_models')

        embedding_path = models_dir / f'{embedding_name}.gz'
        if embedding_path.is_file():
            print(f"Loading from gZip: {embedding_path}")
            # Explicit UTF-8 so decoding does not depend on the platform's
            # default locale encoding.
            with gzip.open(embedding_path, mode='rt', encoding='utf-8') as f:
                return parse_file(f)

        embedding_path = models_dir / f'{embedding_name}.txt'
        if embedding_path.is_file():
            print(f"Loading from TXT: {embedding_path}")
            with open(embedding_path, encoding='utf-8') as f:
                return parse_file(f)

        raise FileNotFoundError(f"No such model found (it must be {embedding_name}.txt or {embedding_name}.gz)")

    except Exception as e:
        # Best-effort loader: report the problem and hand back None.
        print(f"The provided embedding model couldn't be loaded correctly: {e}")
        return None

In [95]:
# Load the "glove-twitter-25" vectors (local .gz file if present, otherwise via the gensim downloader).
em_model = load_genim_embedding("glove-twitter-25")

Loading from /home/valkea/Dev/OpenClassrooms/Projets_AI/P7/data/embedding_models/glove-twitter-25.gz


In [96]:
# Sanity check: ask for the single best match (topn=1) for 'fruit' and 'flower' taken together.
em_model.most_similar(positive=['fruit', 'flower'], topn=1)

[('cherry', 0.9183273911476135)]

In [None]:
em_model.get_vector('like') # known word: a vector is expected back

In [None]:
em_model.get_vector('likexxx') # made-up word: an error is the expected outcome

In [97]:
# Load the "glove-twitter-100" vectors (local .gz file if present, otherwise via the gensim downloader).
em_model_100 = load_genim_embedding("glove-twitter-100")

Loading from /home/valkea/Dev/OpenClassrooms/Projets_AI/P7/data/embedding_models/glove-twitter-100.gz


In [98]:
# Sanity check: ask for the single best match (topn=1) for 'fruit' and 'flower' taken together.
em_model_100.most_similar(positive=['fruit', 'flower'], topn=1)

[('peach', 0.766598641872406)]

In [None]:
em_model_100.get_vector('like') # known word: a vector is expected back

In [None]:
em_model_100.get_vector('likexxx') # made-up word: an error is the expected outcome

In [100]:
# Load the "fasttext-wiki-news-subwords-300" vectors (local .gz file if present, otherwise via the gensim downloader).
em_model_ft = load_genim_embedding("fasttext-wiki-news-subwords-300")

Loading from /home/valkea/Dev/OpenClassrooms/Projets_AI/P7/data/embedding_models/fasttext-wiki-news-subwords-300.gz


In [102]:
# Sanity check: ask for the single best match (topn=1) for 'fruit' and 'flower' taken together.
em_model_ft.most_similar(positive=['fruit', 'flower'], topn=1)

[('flowers', 0.7749223113059998)]

In [None]:
em_model_ft.get_vector('like') # known word: a vector is expected back

In [None]:
em_model_ft.get_vector('likexxx') # made-up word: an error is the expected outcome

In [49]:
# Load the "word2vec-google-news-300" vectors; binary=True is forwarded to the local-file reader.
em_model_w2v = load_genim_embedding("word2vec-google-news-300", binary=True)

Loading from /home/valkea/Dev/OpenClassrooms/Projets_AI/P7/data/embedding_models/word2vec-google-news-300.gz


In [50]:
# Sanity check: ask for the single best match (topn=1) for 'fruit' and 'flower' taken together.
em_model_w2v.most_similar(positive=['fruit', 'flower'], topn=1)

[('flowers', 0.6933968663215637)]

In [None]:
em_model_w2v.get_vector('like') # known word: a vector is expected back

In [None]:
em_model_w2v.get_vector('likexxx') # made-up word: an error is the expected outcome

In [69]:
# Load the "glove.6B.100d" text vectors into a plain {word: vector} dict (from .gz or .txt).
em_model_gv100 = load_trained_glove("glove.6B.100d")

Loading from gZip: /home/valkea/Dev/OpenClassrooms/Projets_AI/P7/data/embedding_models/glove.6B.100d.gz


In [None]:
em_model_gv100.get('like') # known word: a vector is expected back

In [139]:
em_model_gv100.get('likexxx') # made-up word: dict .get() yields None instead of raising