In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Word2vec et autres embeddings

![word2vec](https://cdn-images-1.medium.com/max/2600/1*sXNXYfAqfLUeiDXPCo130w.png)

Le `word embedding` est capable de capturer le contexte, la similarité sémantique et syntaxique (genre, synonymes, …) d’un mot et fournir une représentation vectorielle *dense*.

Il existe trois scénarios pour produire des représentations vectorielles des documents :
- Utiliser un modèle pré-entraîné
- Entraîner son propre modèle 
- Prendre un modèle pré-entraîné (par exemple, [fastText](https://fasttext.cc/docs/en/english-vectors.html) ou [GloVe](https://nlp.stanford.edu/projects/glove/) et l'affiner.

Pour récupérer un modèle pré-entraîné ou entraîner son propre modèle, on peut utiliser la librarie `gensim`, le module : `models.word2vec`. Plus de [détails](https://radimrehurek.com/gensim/models/word2vec.html).

## Modèle pré-entraîné avec la méthode word2vec

Pour charger des vecteurs de type `word2vec` et travailler avec on utilise la classe `KeyedVectors`. Cette classe contienne une méthode `load_word2vec_format` pour charger le modèle entraîné avec la méthode word2vec (à partir d'un fichier texte ou d'un fichier binaire) et `load` pour charger un modèle entraîné avec une autre méthode (*glove, fasttext*).

Source: 

    @misc{fauconnier_2015,
        author = {Fauconnier, Jean-Philippe},
        title = {French Word Embeddings},
        url = {http://fauconnier.github.io},
        year = {2015}}


In [None]:
import re
import logging
import pandas as pd

import nltk.data 
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, RegexpTokenizer
# nltk.download('punkt')

import gensim
from gensim.models import word2vec, KeyedVectors

In [None]:
model_fr = KeyedVectors.load_word2vec_formodel = KeyedVectors.load_word2vec_format("/content/drive/My Drive/NLP/frWac_non_lem_no_postag_no_phrase_200_cbow_cut100.bin", binary=True, unicode_errors="ignore")
print(model_fr.most_similar("intéressant"))
print(model_fr.similarity('formation', 'professionnalisante'))

[('intéressante', 0.6818215847015381), ('instructif', 0.6677659749984741), ('assez', 0.6302736401557922), ('intéressants', 0.6282306909561157), ('très', 0.6215654611587524), ('utile', 0.5970137119293213), ('rébarbatif', 0.5915195345878601), ('intéressantes', 0.5864251255989075), ('judicieux', 0.5620384216308594), ('beaucoup', 0.5607017874717712)]
0.6323689


In [None]:
# (roi - homme) + femme = ?
print(model_fr.most_similar(positive=['roi', 'femme'], negative=['homme'], topn=3))
# (espagne - madrid) + paris = ?
print(model_fr.most_similar(positive=['espagne', 'paris'], negative=['madrid'], topn=3))
# (basques - espagne) + bretagne = ?
print(model_fr.most_similar(positive=['basques', 'bretagne'], negative=['espagne'], topn=3))

[('reine', 0.6945513486862183), ('duchesse', 0.6247551441192627), ('épouse', 0.6083102822303772)]
[('france', 0.5135271549224854), ('nanterre', 0.45087674260139465), ('bd', 0.43005678057670593)]
[('bretons', 0.6322782039642334), ('bretonne', 0.6221523880958557), ('bretonnes', 0.5482804775238037)]


In [None]:
model_fr.doesnt_match('basque gascon ananas breton'.split())

'ananas'

In [None]:
del model_fr

In [None]:
model_fr_postag = KeyedVectors.load_word2vec_formodel = KeyedVectors.load_word2vec_format("/content/drive/My Drive/NLP/frWac_postag_no_phrase_1000_skip_cut100.bin", binary=True, unicode_errors="ignore")
print(model_fr_postag.most_similar("intéressant_a"))

del model_fr_postag

[('très_adv', 0.5709680914878845), ('assez_adv', 0.5124066472053528), ('inconvénient_a', 0.512165904045105), ('inconvénients_n', 0.5095574259757996), ('avantages_et', 0.48624387383461), ('avantages_n', 0.48457926511764526), ('plutôt_adv', 0.4844917356967926), ('ciao_n', 0.4828088879585266), ('instructif_a', 0.4705738127231598), ('beaucoup_adv', 0.468425452709198)]


## Modèle pré-entraîné : GloVe

Lien pour le téléchargement du modèle pré-entraîné: https://nlp.stanford.edu/projects/glove/

In [None]:
# import numpy as np

# glove = {}
# with open('/content/drive/My Drive/NLP/glove.6B/glove.6B.200d.txt', 'r') as f:
#     for line in f:
#         word, embedding = line.split(' ',1)
#         wordEmbedding = np.array([float(value) for value in embedding[1:].split(' ')])
#         glove[word] = wordEmbedding

# print(len(glove))

In [None]:
# from gensim.test.utils import datapath, get_tmpfile
# from gensim.models import KeyedVectors
# from gensim.scripts.glove2word2vec import glove2word2vec

# glove_file = datapath('/content/drive/My Drive/NLP/glove.6B/glove.6B.200d.txt')
# tmp_file = get_tmpfile("glove.6B.200d.vec")
# _ = glove2word2vec(glove_file, tmp_file)
# model_glove = KeyedVectors.load_word2vec_format(tmp_file)
# model_glove.save_word2vec_format("/content/drive/My Drive/NLP/glove.6B/glove.6B.200d.bin", binary=True)

In [None]:
model_glove = KeyedVectors.load_word2vec_formodel = KeyedVectors.load_word2vec_format("/content/drive/My Drive/NLP/glove.6B/glove.6B.200d.bin", binary=True, unicode_errors="ignore")

# (king - man) + woman = ?
print(model_glove.most_similar(positive=['king', 'woman'], negative=['man']))

[('queen', 0.6978678107261658), ('princess', 0.6081745028495789), ('monarch', 0.5889754891395569), ('throne', 0.5775108933448792), ('prince', 0.5750998258590698), ('elizabeth', 0.5463595986366272), ('daughter', 0.5399126410484314), ('kingdom', 0.5318052768707275), ('mother', 0.5168544054031372), ('crown', 0.5164473056793213)]


In [None]:
del model_glove

## Modèle pré-entraîné : fastText

FastText utilise non seulement des vecteurs de mots, mais aussi des vecteurs de n-grammes. Dans le corpus, chaque mot est automatiquement représenté comme un ensemble de n-grammes de caractères. Par exemple, si nous fixons n=3, un vecteur pour le mot "where" sera représenté par la somme des vecteurs des trigrammes suivants : "<wh", "whe", "her", "ere", "re>" (où "<" et ">" sont des symboles indiquant le début et la fin d'un mot). Cela permet de travailler efficacement avec des textes contenant des erreurs et des fautes de frappe.

* [Article](https://aclweb.org/anthology/Q17-1010)
* [Site](https://fasttext.cc/)
* [Get started](https://fasttext.cc/docs/en/support.html)
* [Github](https://github.com/facebookresearch/fasttext)

Pour python il existe une librarie - `fasttext`. Si on récupère des fichiers de modèles, on peut utiliser `gensim`.

Sur le [site](https://fasttext.cc/docs/en/crawl-vectors.html) on trouve des modèles pré-entraînés pour 157 langues (dont le français).

In [None]:
# ! pip install fasttext

In [None]:
import fasttext
# import fasttext.util
# fasttext.util.download_model('en', if_exists='ignore') 

In [None]:
# !cp "/content/cc.en.300.bin" "/content/drive/My Drive/NLP/"

In [None]:
ft_en = fasttext.load_model('/content/drive/My Drive/NLP/cc.en.300.bin')

# print(ft_en['model'])
print(ft_en.get_nearest_neighbors('intelligence'))
print(ft_en.get_analogies("woman", "man", "actor"))

# Comme le modèle est entraîné sur des n-grammes de caractères, 
# il n'y a pas de problème de OOV (out of vocabulary)
print(ft_en.get_nearest_neighbors('formatoin'))



[(0.7630482912063599, 'inteligence'), (0.7555618286132812, 'intelligence.The'), (0.7255445718765259, 'intelligence-'), (0.716752290725708, 'intellgence'), (0.7112532258033752, 'intellegence'), (0.709306001663208, 'Intelligence'), (0.7025996446609497, 'non-intelligence'), (0.6886445879936218, 'intelligence.'), (0.6828545928001404, 'intelligence.But'), (0.6662881970405579, 'intelligenc')]
[(0.8941132426261902, 'actress'), (0.7075303196907043, 'actresses'), (0.6792172193527222, 'actress-'), (0.6782666444778442, 'actresss'), (0.6682973504066467, 'actress.'), (0.6645402908325195, 'Actress'), (0.659998893737793, 'actress.She'), (0.6558604836463928, 'actres'), (0.646649956703186, 'actess'), (0.6443707346916199, 'actress.The')]
[(0.5347930192947388, 'informatoin'), (0.3836231231689453, 'Year.ReplyDeleteAdd'), (0.3629437983036041, 'itself.ReplyDeleteAdd'), (0.3616984486579895, 'serviceReplyDeleteAdd'), (0.3609582483768463, 'money.ReplyDeleteAdd'), (0.35249876976013184, 'service.ReplyDeleteAdd')

In [None]:
del ft_en

## Entraînement d'un modèle word2vec

* Dataset: IMDB Movie Reviews [source sur kaggle](https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews) ou [lien stanford](http://ai.stanford.edu/~amaas/data/sentiment/)

In [None]:
import pandas as pd

pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('max_colwidth', 800)

In [None]:
data = pd.read_csv("/content/drive/My Drive/NLP/IMDB Dataset.csv")
data.shape

(50000, 2)

In [None]:
data.head()

Unnamed: 0,review,sentiment
0,"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangst...",positive
1,"A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen- Michael Sheen not only ""has got all the polari"" but he has all the voices down pat too! You can truly see the seamless editing guided by the references to Williams' diary entries, not only is it well worth the watching but it is a terrificly written and performed piece. A masterful production about one of the great master's of comedy and his life. <br /><br />The realism really comes home with the little things: the fantasy of the guard which, rather than use the traditional 'dream' techniques remains solid then disappears. It plays on ou...",positive
2,"I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue is witty and the characters are likable (even the well bread suspected serial killer). While some may be disappointed when they realize this is not Match Point 2: Risk Addiction, I thought it was proof that Woody Allen is still fully in control of the style many of us have grown to love.<br /><br />This was the most I'd laughed at one of Woody's comedies in years (dare I say a decade?). While I've never been impressed with Scarlet Johanson, in this she managed to tone down her ""sexy"" image and jumped right into a average, but spirited young woman.<br /><br />This may not be the crown jewel of h...",positive
3,"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br /><br />OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.<br /><br />3 out of 10 just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them.",negative
4,"Petter Mattei's ""Love in the Time of Money"" is a visually stunning film to watch. Mr. Mattei offers us a vivid portrait about human relations. This is a movie that seems to be telling us what money, power and success do to people in the different situations we encounter. <br /><br />This being a variation on the Arthur Schnitzler's play about the same theme, the director transfers the action to the present time New York where all these different characters meet and connect. Each one is connected in one way, or another to the next person, but no one seems to know the previous point of contact. Stylishly, the film has a sophisticated luxurious look. We are taken to see how these people live and the world they live in their own habitat.<br /><br />The only thing one gets out of all these ...",positive


### Nettoyage et tokénisation des documents

Nettoyage

* supprimer des liens hypertexte et des balises html
* supprimer des caractères autres que des lettres
* mettre les textes en minuscules

In [None]:
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
from multiprocessing import Pool
import warnings
warnings.filterwarnings("ignore")

In [None]:
import nltk
nltk.download('punkt')
# Sentences tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
def review_to_words(review, remove_stopwords=False):
    # supprimer des liens hypertextes
    review = re.sub(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", " ", review)
    # supprimer des balises html
    review_text = BeautifulSoup(review, "lxml").get_text()
    # remplacer les caractères autres que des lettres par des expaces
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    # mettre en minuscules et convertir en liste de mots
    words = review_text.lower().split()
    if remove_stopwords:
        # supprimer les mots vides de la langue anglaise
        words = [w for w in words if not w in stopwords.words("english")]
    return(words)

def review_to_sentences(review, tokenizer=tokenizer, remove_stopwords=False):
    # token = phrase
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        if len(raw_sentence) > 0:
            sentences.append(review_to_words(raw_sentence, remove_stopwords))
    return sentences

In [None]:
with Pool(4) as p:
    sentences = list(tqdm(p.imap(review_to_sentences, data["review"]), total=len(data)))

print(len(sentences))
print(*sentences[:3])

  0%|          | 0/50000 [00:00<?, ?it/s]

50000
[['one', 'of', 'the', 'other', 'reviewers', 'has', 'mentioned', 'that', 'after', 'watching', 'just', 'oz', 'episode', 'you', 'll', 'be', 'hooked'], ['they', 'are', 'right', 'as', 'this', 'is', 'exactly', 'what', 'happened', 'with', 'me', 'the', 'first', 'thing', 'that', 'struck', 'me', 'about', 'oz', 'was', 'its', 'brutality', 'and', 'unflinching', 'scenes', 'of', 'violence', 'which', 'set', 'in', 'right', 'from', 'the', 'word', 'go'], ['trust', 'me', 'this', 'is', 'not', 'a', 'show', 'for', 'the', 'faint', 'hearted', 'or', 'timid'], ['this', 'show', 'pulls', 'no', 'punches', 'with', 'regards', 'to', 'drugs', 'sex', 'or', 'violence'], ['its', 'is', 'hardcore', 'in', 'the', 'classic', 'use', 'of', 'the', 'word', 'it', 'is', 'called', 'oz', 'as', 'that', 'is', 'the', 'nickname', 'given', 'to', 'the', 'oswald', 'maximum', 'security', 'state', 'penitentary'], ['it', 'focuses', 'mainly', 'on', 'emerald', 'city', 'an', 'experimental', 'section', 'of', 'the', 'prison', 'where', 'all', '

In [None]:
flat_sentences = [item for sublist in sentences for item in sublist]
len(flat_sentences)

537072

In [None]:
with open('clean_text.txt', 'w') as f:
    for s in flat_sentences:
        f.write(' '.join(s))
        f.write('\n')

On entraîne le modèle et sauvegarde le résultat. 

Paramètres principaux :
* data — doivent être un objet itérable
* vector_size — la taille du vecteur, 
* window — taille de la fenêtre d'observation,
* min_count — fréquence minimale d'un mot dans le corpus,
* sg — algorithme d'apprentissage utilisé (0 — CBOW, 1 — Skip-gram),
* sample — seuil de sous-échantillonnage des mots à haute fréquence,
* workers — nombre de threads,
* alpha — learning rate,
* iter — nombre d'itérations,
* max_vocab_size — permet de fixer une limite de mémoire lors de la création du dictionnaire (c'est-à-dire que si la limite est dépassée, les mots de basse fréquence seront écartés). À titre de comparaison : 10 millions de mots = 1 Go de RAM.

**Important !** Lors de l'apprentissage d'un modèle il n'y a pas de prétraitement. Cela doit être fait avant l'apprentissage.

In [None]:
print("Training model...")

%time model_imdb_en = word2vec.Word2Vec(flat_sentences, workers=4, vector_size=300, min_count=10, window=10, sample=1e-3)

Training model...
CPU times: user 4min 29s, sys: 1.12 s, total: 4min 30s
Wall time: 3min 15s


In [None]:
# La taille du vocabulaire obtenu
print(len(model_imdb_en.wv))

27864


On fait des premières vérifications manuelles.

In [None]:
print(model_imdb_en.wv.most_similar(positive=["woman", "actor"], negative=["man"], topn=3))
print(model_imdb_en.wv.most_similar(positive=["dogs", "man"], negative=["dog"], topn=3))
print(model_imdb_en.wv.most_similar("usa", topn=3))
print(model_imdb_en.wv.doesnt_match("comedy thriller western novel".split()))

[('actress', 0.7802090048789978), ('performer', 0.6007119417190552), ('dancer', 0.5078648924827576)]
[('men', 0.545170247554779), ('businessmen', 0.4426037073135376), ('americans', 0.43313008546829224)]
[('germany', 0.7680897116661072), ('japan', 0.7602989673614502), ('europe', 0.7565326690673828)]
novel


Cherchons les mots similaires pour le mot `star` :
- avec le modèle `model_imdb_en` entraîné sur les critiques des films
- avec un autre modèle (`fasttext`) entraîné sur wikipédia

In [None]:
print(*model_imdb_en.wv.most_similar("star", topn=10), sep='\n')

('stars', 0.5878415107727051)
('hudson', 0.42976656556129456)
('hardware', 0.40808263421058655)
('singer', 0.40657293796539307)
('starred', 0.4049895703792572)
('superstar', 0.4049648940563202)
('fame', 0.3929443955421448)
('napoleonic', 0.3907529413700104)
('stardom', 0.38516944646835327)
('celebrity', 0.3849700391292572)


In [None]:
print(model_imdb_en.wv.similarity('star', 'celebrity'))
print(model_imdb_en.wv.similarity('star', 'sky'))
print(model_imdb_en.wv.similarity('star', 'shine'))

0.38497004
0.13420257
0.16179784


In [None]:
ft_en = fasttext.load_model('/content/drive/My Drive/NLP/cc.en.300.bin')



In [None]:
from sklearn.metrics.pairwise import cosine_similarity

print(*cosine_similarity([ft_en['star']], [ft_en['celebrity']])[0])
print(*cosine_similarity([ft_en['star']], [ft_en['sky']])[0])
print(*cosine_similarity([ft_en['star']], [ft_en['shine']])[0])

0.48565808
0.29916823
0.28851572


In [None]:
del ft_en

### Fine-tuning d'un modèle pré-entraîné

Lors de l'apprentissage d'un modèle à partir de zéro, les poids sont initialisés de manière aléatoire. Il est possible d'utiliser des poids d'un modèle pré-entraîné pour initialiser les vecteurs.

Utilisons l'[ebook](https://www.gutenberg.org/files/11/11-0.txt) «Alice’s Adventures in Wonderland» pour effectuer un fine-tuning d'un modèle entraîné sur les textes des critiques IMDB.

In [None]:
with open("/content/drive/My Drive/NLP/alice.txt", 'r', encoding='utf-8') as f:
    text = f.read()

text = re.sub('\n', ' ', text)
sents = sent_tokenize(text)

punct = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~„“«»†*—/\-‘’'

alice_clean_sents = []
for sent in sents:
    s = [w.lower().strip(punct) for w in sent.split()]
    alice_clean_sents.append(s)
    
print(alice_clean_sents[:2])

[['alice’s', 'adventures', 'in', 'wonderland', 'by', 'lewis', 'carroll', 'the', 'millennium', 'fulcrum', 'edition', '3.0', 'contents', 'chapter', 'i'], ['down', 'the', 'rabbit-hole', 'chapter', 'ii']]


Pour faire un tuning d'un modèle, il faut d'abord le sauvegarder et ensuite charger. Tous les paramètres d'entraînement (taille du vecteur, fréquence minimale des mots etc.) seront pris du modèle chargé, on ne peut pas redéfinir leurs valeurs ([la documentation](https://radimrehurek.com/gensim/models/keyedvectors.html))

In [None]:
imdb_model_path = "/content/drive/My Drive/NLP/IMDB.word2vec"

print("Saving model...")
model_imdb_en.save(imdb_model_path)

Saving model...


In [None]:
model_imdb_alice_en = word2vec.Word2Vec.load(imdb_model_path)

model_imdb_alice_en.build_vocab(alice_clean_sents, update=True)
model_imdb_alice_en.train(alice_clean_sents, total_examples=model_imdb_alice_en.corpus_count, epochs=5)



(91636, 147270)

Comparons la similarité des mots `white` et `rabbit` calculée avec le modèle initial avec celle calculée avec le modèle fine-tuned.

In [None]:
print(model_imdb_en.wv.similarity('white', 'rabbit'))
print(model_imdb_alice_en.wv.similarity('white', 'rabbit'))

0.29565364
0.36994138


## Classification des critiques IMDB avec Word2Vec

### Préparation des données

* Faire un prétraitement des données.

* Diviser le dataset initial IMDB en ensemble pour l'apprentissage et pour le test.



### Entraînement d'un modèle de classification

* Entraîner et sauvegarder un modèle word2vec sur les données d'entraînement.

* Vectoriser les documents (le modèle `Word2Vec` contient les vecteurs des mots).

  * **Option 1** `MeanEmbeddingVectorizer` : pour chaque document calculez un vecteur correspondant à la moyenne des vecteurs des mots du vocabulaire qui apparaissent dans ce document.
  * **Option 2** `TfidfEmbeddingVectorizer` : pour chaque document calculez un vecteur correspondant à la moyenne pondérée des vecteurs des mots du vocabulaire qui apparaissent dans ce document, avec des poids *tf-idf*.

* Entraîner un modèle avec chacun des deux vectorisateurs. Pour cela, utiliser `sklearn.pipeline` avec deux étapes : vectorisation et application d'un algorithme de classiffication de votre choix.


#### MeanEmbeddingVectorizer

In [None]:
import numpy as np

class MeanEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = len(list(word2vec.values())[0])

    def fit(self, X, y):
        return self

    def transform(self, X):
        return np.array([
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X
        ])

#### TfidfEmbeddingVectorizer

In [None]:
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

class TfidfEmbeddingVectorizer(object):
    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = len(list(word2vec.values())[0])

    def fit(self, X, y):
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        return np.array([
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X
            ])

Convertir le modèle Word2Vec en dictionnaire qui sera utilisé ensuite pour la vectorisation des textes.

In [None]:
w2v_dict = dict(zip(w2v_model.wv.index_to_key, w2v_model.wv.vectors))

In [18]:
# your code here

### Evaluation du modèle de classification

* Faire des prédictions sur les données de test avec chacun des deux modèles obtenus à l'étape précédente

* Afficher la matrice de confusion et les valeurs des métriques suivantes : `precision_score`, `recall_score`, `f1_score`, `accuracy_score`

#### MeanEmbeddingVectorizer

In [19]:
# your code here

#### TfidfEmbeddingVectorizer 

In [20]:
# your code here