In [1]:
import os
import re
import pickle
import numpy as np
import pandas as pd

from collections import Counter
from gensim.models import KeyedVectors

from sklearn.metrics import f1_score, make_scorer, confusion_matrix, \
    classification_report
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score, \
    StratifiedShuffleSplit, RandomizedSearchCV

# Load data

In [73]:
# Read the three pickled inputs one file at a time: the learn sequences, the
# test sequences, and the vocabulary (iterated by word and indexed by word in
# the cells below).
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
loaded_objects = []
for pickle_path in (
    "../Data/Learn/sequences.pkl",
    "../Data/Test/sequences.pkl",
    "../Data/dict.pkl",
):
    with open(pickle_path, "rb") as handle:
        loaded_objects.append(pickle.load(handle))

learn_sequences, test_sequences, vocabulary = loaded_objects

# Embeddings

### Load fastText embeddings

In [38]:
# Parse the wiki.fr.vec vectors from the text (non-binary) word2vec format
# into a KeyedVectors lookup (word -> vector).
embeddings_dict = KeyedVectors.load_word2vec_format(
    "../Data/wiki.fr.vec",
    binary=False,
)

### Handle unknown words

In [159]:
# Re-index the vocabulary so that words having an embedding occupy the
# contiguous range [0, n_known) and every other word shares one extra index.
#   new_vocab_mapping:      original word index -> new index
#   new_reverse_vocabulary: new index -> word (or "<UNK>" for the shared slot)

# Known words, in vocabulary iteration order
known_words = [word for word in vocabulary if word in embeddings_dict]
new_vocab_mapping = {
    vocabulary[word]: new_index for new_index, word in enumerate(known_words)
}
new_reverse_vocabulary = dict(enumerate(known_words))
print("%d known words out of %d" % (len(known_words), len(vocabulary)))

# Unknown words all collapse onto the first index after the known ones
unknown_index = len(known_words)
for word in vocabulary:
    if word not in embeddings_dict:
        new_vocab_mapping[vocabulary[word]] = unknown_index
new_reverse_vocabulary[unknown_index] = "<UNK>"

28934 known words out of 30432


In [160]:
# Sanity-check the remapping: how many original indices were mapped, how many
# distinct new indices they map to, and the largest new index in use.
remapped_indices = new_vocab_mapping.values()
distinct_new_indices = set(remapped_indices)
print("%d original words" % len(new_vocab_mapping))
print("%d new words (1 unknown)" % len(distinct_new_indices))
print("%d max new words index" % max(remapped_indices))

30432 original words
28935 new words (1 unknown)
28934 max new words index


In [161]:
# Size of the remapped vocabulary: one entry per known word plus the single
# "<UNK>" slot. Derived from the reverse mapping instead of being hard-coded,
# so it stays correct if the vocabulary or the embeddings file changes.
num_new_words = len(new_reverse_vocabulary)
# The last index is expected to be the shared unknown-word slot.
new_reverse_vocabulary[num_new_words - 1]

'<UNK>'

### Apply new mappings to sequences

In [162]:
# Translate every original word index into its new index, sequence by sequence.
# dict.get is used for the lookup, so an index absent from new_vocab_mapping
# becomes None rather than raising.
new_learn_sequences = [
    [new_vocab_mapping.get(old_index) for old_index in sequence]
    for sequence in learn_sequences
]
new_test_sequences = [
    [new_vocab_mapping.get(old_index) for old_index in sequence]
    for sequence in test_sequences
]

### Create corresponding embeddings array

In [163]:
# Build the embedding matrix: row i holds the vector of the word with new
# index i. The width is read from the loaded model rather than hard-coded,
# so a different embeddings file does not silently break the shape.
embedding_dim = embeddings_dict.vector_size
embeddings = np.zeros((num_new_words, embedding_dim))
# Last row is for unknown words, we leave it to zero for the moment
for i in range(num_new_words - 1):
    embeddings[i, :] = embeddings_dict[new_reverse_vocabulary[i]]

### Verification

In [164]:
# Spot check on the learn set: show the first raw sentence, the first five
# remapped indices of its sequence, and whether row 2147 of the matrix equals
# the vector stored for "aurai".
# NOTE(review): `learn_sentences` is not defined in any cell visible in this
# notebook -- presumably loaded in a cell that was removed; confirm before a
# Restart & Run All.
# The vector is fetched by indexing (as in the matrix-building cell) instead of
# the `word_vec` method, which is deprecated in gensim 4.
(learn_sentences[0], new_learn_sequences[0][:5],
 (embeddings[2147] == embeddings_dict["aurai"]).all())

("J'aurai l'occasion de dire aux Français comment notre enseignement devra évoluer pour permettre à chaque jeune de trouver sa place, d'entrer dans le monde du travail, de savoir s'adapter et, à partir de là, d'acquérir, tout au long de la vie, de nouvelles compétences et de nouveaux savoirs.",
 [28934, 2147, 28934, 309, 1],
 True)

In [165]:
# Spot check on the test set: show the first raw sentence, the first five
# remapped indices of its sequence, and whether row 48 of the matrix equals
# the vector stored for "tous".
# NOTE(review): `test_sentences` is not defined in any cell visible in this
# notebook -- presumably loaded in a cell that was removed; confirm before a
# Restart & Run All.
# The vector is fetched by indexing (as in the matrix-building cell) instead of
# the `word_vec` method, which is deprecated in gensim 4.
(test_sentences[0], new_test_sequences[0][:5],
 (embeddings[48] == embeddings_dict["tous"]).all())

('Et tous se demandaient : " Après elle, après nous, qui se souviendra et comment ?',
 [4, 48, 36, 11372, 28934],
 True)

In [167]:
# Display the final row of the matrix, i.e. the slot shared by all unknown
# words (index num_new_words - 1, since the matrix has num_new_words rows).
embeddings[-1]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

### Save new data

In [174]:
# Persist the remapped sequences, the reverse vocabulary and the embedding
# matrix as pickles for later use.
# Create the output directory first so the writes do not fail with
# FileNotFoundError on a fresh checkout.
os.makedirs("../Data/generated", exist_ok=True)

for output_path, payload in (
    ("../Data/generated/my_learn_sequences.pkl", new_learn_sequences),
    ("../Data/generated/my_test_sequences.pkl", new_test_sequences),
    ("../Data/generated/my_reverse_vocabulary.pkl", new_reverse_vocabulary),
    ("../Data/generated/my_embeddings.pkl", embeddings),
):
    with open(output_path, "wb") as f:
        pickle.dump(payload, f)