# Menu <a class="anchor" id="menu"></a>
   
* [1. Préparatifs](#init)
* [2. Recherche du `plongement` le plus adapté](#EMBEDDING)
    * [2.1 Avec un embedding à `Zero`](#EMBEDDING_Zero)
    * [2.2 Pretrained Word2Vec `word2vec-google-news-300`](#EMBEDDING_Word2Vec)
    * [2.3 Pretrained FastText `fasttext-wiki-news-subwords-300`](#EMBEDDING_FastText)
    * [2.4 Pretrained Glove `Stanford's GloVe 100d`](#EMBEDDING_glove6B100d)
    * [2.5 Pretrained Glove `glove-twitter-25`](#EMBEDDING_glove25)
    * [2.6 Pretrained Glove `glove-twitter-100`](#EMBEDDING_glove25)
    * [2.7 Word2Vec `local training`](#EMBEDDING_Word2Vec_local)
    * [2.8 FastText `local training`](#EMBEDDING_FastText_local)
    * [2.9 Comparaison des scores](#EMBEDDING_scores)

In [41]:
import os
import time
import pathlib
import gzip

import joblib
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding
try:
    from keras.utils import pad_sequences
except ImportError:
    from keras.preprocessing.sequence import pad_sequences

import joblib

random_seed = 0
np.random.seed(random_seed)

# set random seed for keras reproductibility
try:
    keras.utils.set_random_seed(random_seed)
except Exception:
    tf.random.set_seed(random_seed)

#### Définissons une fonction permettant de charger les embeddings pre-calculés

In [2]:
from gensim.models import KeyedVectors
import gensim.downloader as api

def load_gensim_embedding(embedding_name, binary=False):
    
    embedding_path = pathlib.Path(pathlib.Path().absolute(), 'data', 'embedding_models', f'{embedding_name}.gz')
    try:
        if embedding_path.is_file():
            print(f"Loading from {embedding_path}")
            embedding_model = KeyedVectors.load_word2vec_format(embedding_path, binary=binary)
        else:
            print("Loading from the Git repos with API")
            embedding_model = api.load(embedding_name)
            
        return embedding_model
    except Exception as e:
        print(f"The provided embedding model couldn't be loaded correctly: {e}")
        
def load_trained_glove(embedding_name):
    
    def parse_file(file):
        embeddings_index = {}
        for line in file:
            word, coefs = line.split(maxsplit=1)
            coefs = np.fromstring(coefs, "f", sep=" ")
            embeddings_index[word] = coefs
        return embeddings_index

    try:
        embedding_path = pathlib.Path(pathlib.Path().absolute(), 'data', 'embedding_models', f'{embedding_name}.gz')
        if embedding_path.is_file():
            print(f"Loading from gZip: {embedding_path}")
            with gzip.open(embedding_path, mode='rt') as f:
                return parse_file(f)
        
        embedding_path = pathlib.Path(pathlib.Path().absolute(), 'data', 'embedding_models', f'{embedding_name}.txt')
        if embedding_path.is_file():
            print(f"Loading from TXT: {embedding_path}")
            with open(embedding_path) as f:
                return parse_file(f)
        else:
            raise FileNotFoundError(f"No such model found (it must be {embedding_name}.txt or {embedding_name}.gz)")
            
    except Exception as e:
        print(f"The provided embedding model couldn't be loaded correctly: {e}")

---
---
# 1. Préparatifs pour de la classification avec des réseaux de neurones <a class="anchor" id="init"></a> [⇪](#menu)

#### Chargeons les fonctions de classification écrites sur les projets précédents

In [3]:
from classification_utils import fit_model, get_scores, init_scores

init_scores("data/scores_NN_SelectEmbedding.csv")

#### Définissons les fonctions génériques de notre Spot Checking 

In [4]:
from keras import layers
from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback, ReduceLROnPlateau

In [5]:
def summarize_diagnostics(history):
    
    figure = plt.figure(figsize=(8,8))
            
    # plot loss
    plt.subplot(211)
    plt.title('Cross Entropy Loss')
    plt.plot(history.history['loss'], color='blue', label='train')
    plt.plot(history.history['val_loss'], color='orange', label='val')
    plt.legend()
    
    # plot accuracy
    plt.subplot(212)
    plt.title('Classification Accuracy')
    plt.plot(history.history['accuracy'], color='blue', label='train')
    plt.plot(history.history['val_accuracy'], color='orange', label='val')
    plt.legend()
    
    # save plot to file
    #filename = sys.argv[0].split('/')[-1]
    #plt.savefig(filename + '_plot.png')
    #plt.close()
    
    plt.tight_layout(pad=1.0)

In [6]:
def init_callbacks(model_name):

    # Define savepoints
    filepath = pathlib.Path("models", f"{model_name}.epoch{{epoch:02d}}-accuracy{{val_accuracy:.2f}}.hdf5")

    checkpoint = ModelCheckpoint(
        monitor="val_accuracy",
        mode="max",
        filepath=filepath,
        save_best_only=True,
        save_weights_only=True,
        verbose=1,
    )

    # Define EarlyStopping conditions
    es = EarlyStopping(
        monitor='val_loss', # 'binary_accuracy'
        mode='min', # 'max'
        patience=10,  # 6 because ReduceLROnPlateau is 5 
        min_delta=0.01, 
        restore_best_weights=True, 
        verbose=1,
    )

    # Define Automatic LearningRate adjustments
    lr_reducer = ReduceLROnPlateau(
        monitor='val_loss',
        mode='min',
        factor=0.1,
        cooldown=5,
        patience=5,
        min_lr= 0.1e-5,
        verbose=1,
    )
    
    return [checkpoint, es, lr_reducer]

#### Définissons une méthode de `cross-validation`

In [7]:
from sklearn.model_selection import StratifiedKFold

skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_seed)

#### Définissons la taille de batch utilisé

In [8]:
batch_size = 32 # 8192 # 4096

---
---
# 2. Recherche du `plongement` le plus adapté <a class="anchor" id="EMBEDDING"></a> [⇪](#menu)

> https://fr.wikipedia.org/wiki/Word_embedding
>
> Pour utiliser les données en apprentissage machine, il est nécessaire de leur trouver une représentation mathématique, typiquement des vecteurs. Certaines données s'y prêtent directement, comme par exemple les images, qui engendrent des vecteurs riches en information, encodant toutes les nuances et les couleurs qui les composent. Les mots, quant à eux, sont des éléments d'information isolés, et certaines représentations rudimentaires se limitent à un simple identifiant par mot. Par exemple le mot « chat » sera encodé par un seul identifiant arbitraire, disons X87. C'est une représentation discrète, relativement pauvre, qui ne permet notamment pas de comparer deux mots entre eux5. Les plongements lexicaux, eux, représentent un mot par un vecteur. Par exemple, un chat sera représenté par le vecteur [0,43 0,88 0,98 1,3]. Si l'on encode tous les mots d'un dictionnaire ainsi, il devient alors **possible de comparer les vecteurs des mots entre eux**, par exemple en mesurant l'angle entre les vecteurs. Une bonne représentation de mots permettra alors de trouver que le mot « chien » est plus près du mot « chat » qu'il ne l'est du mot « gratte-ciel »6. Qui plus est, ces représentations permettent d'espérer que, dans l'espace vectoriel où le plongement est fait, on aura l'équation roi - homme + femme = reine ou encore l'équation Paris - France + Pologne = Varsovie7.
> 
> Les plongements lexicaux sont également très **utiles pour mitiger le fléau de la dimension**, un problème récurrent en intelligence artificielle. Sans les plongements de mots, **les identifiants uniques représentant les mots engendrent des données éparses**, des points isolés dans un espace vaste et presque vide6. Avec les plongements de mots, en revanche, l'espace devient beaucoup plus restreint et il est plus facile pour un ordinateur d'y établir des regroupements, d'y découvrir des régularités, en apprentissage machine. 

In [9]:
def architecture01(f_opti, f_loss, f_metrics, embedding):
        
    inputs = keras.Input(shape=(None,), dtype="int64")
    
    x = embedding(inputs)
    x = layers.Bidirectional(layers.LSTM(64))(x)
    x = layers.Dense(24, activation='relu')(x)
    
    predictions = layers.Dense(1, activation='sigmoid', name='predictions')(x)
    
    model = keras.Model(inputs, predictions)
    model.compile(loss=f_loss, optimizer=f_opti, metrics=f_metrics)
    
    return model

### Chargeons le jeu de données avec le PRE-PROCESSING sélectionné

In [10]:
data_work = pd.read_csv(
    pathlib.Path(pathlib.Path().absolute(), 'data', 'data_nlp_1563108.csv'), 
    usecols=['target', 'lemmas_not_filtered'],
    encoding='ISO-8859-1',
)
data_work.rename(columns={'lemmas_not_filtered':'text'}, inplace=True)
display(data_work.head(), data_work.shape)

Unnamed: 0,target,text
0,0,"$ url$ - awww , that be a bummer . you shoulda..."
1,0,be upset that he can not update his facebook b...
2,0,I dive many time for the ball . manage to save...
3,0,my whole body feel itchy and like its on fire
4,0,"no , it be not behave at all . I be mad . why ..."


(1452791, 2)

### Définissons une fonction permattant de Tokenizer notre jeu de données
Nous avons établis que pour ce jeu de données il convient de choisir une tokenization avec environ 4500 mots retenus; c'est donc ce que nous allons faire.

In [13]:
def preprocess_data_RAW(X_train, X_test, tokenizer_num_words=4500, padding_size=50, verbose=1):
    tokenizer = Tokenizer(num_words=tokenizer_num_words)
    tokenizer.fit_on_texts(X_train)

    dictionary = tokenizer.word_index
    vocab_size = len(dictionary)+1 # Adding 1 because of reserved 0 index
    
    X_train_enc = tokenizer.texts_to_sequences(X_train)
    X_test_enc = tokenizer.texts_to_sequences(X_test)
    
    X_train_ready = pad_sequences(X_train_enc, padding='post', maxlen=padding_size)
    X_test_ready = pad_sequences(X_test_enc, padding='post', maxlen=padding_size)
    
    # y_train = np.asarray(y_train).astype('float32').reshape((-1,1))
    # y_test = np.asarray(y_test).astype('float32').reshape((-1,1))
    
    if verbose > 0:
        print("----- One sample outputs for demo -----")
        print(f">> Original sentence: {X_train.iloc[0]}\n")
        print(f">> Tokenized sentence: {tokenizer.sequences_to_texts(X_train_ready[:1])}\n")
        print(f">> X_train_enc: {X_train_enc[:1]}\n")
        print(f">> X_train_ready: {X_train_ready[:1]}")
        
        print(f"\nVocab size: {vocab_size}")
    
    return X_train_ready, X_test_ready, tokenizer, vocab_size

### Définissons une fonction permettant d'entrainer le modèle

In [14]:
def train_model(X_train_, X_test_, y_train_, y_test_, vocab_size_, embedding_layer_, preprocess_desc=""):
    
    print(f"Testing model with \"{preprocess_desc}\"".upper().center(100,"-"), end='\n\n')
    
    # Embedding simple
    #padding_size = 50
    #embedding_dim = 100
    #embedding = layers.Embedding(input_dim=vocab_size_, output_dim=embedding_dim, input_length=padding_size, trainable=True)
    
    # Prepare model
    f_opti = keras.optimizers.Adam(learning_rate=0.0005)
    f_loss = keras.losses.BinaryCrossentropy(from_logits=False)
    f_metrics = ['accuracy'] # in this context 'accuracy' == keras.metrics.BinaryAccuracy()
    
    model = architecture01(f_opti, f_loss, f_metrics, embedding_layer_)
    model.summary()
    
    t0 = time.perf_counter()
    history = model.fit(
        x=X_train_, y=y_train_, 
        validation_data=(X_test_, y_test_), 
        epochs=50, 
        batch_size=batch_size, 
        callbacks=init_callbacks(f"archi01_PREPROCESS_{preprocess_desc}"),
        verbose=1,
        # validation_split=0.2,
        # train_labels
    )
    train_time = time.perf_counter() - t0
    
    # Print accuracy scores
    loss, accuracy = model.evaluate(X_train_ready, y_train, verbose=False)
    print("\nTraining Accuracy: {:.4f}".format(accuracy))
    loss, accuracy = model.evaluate(X_test_ready, y_test, verbose=False)
    print("Testing Accuracy:  {:.4f}".format(accuracy))
    
    return history, model, train_time

### Divisons le jeu de données en `Train` et `Test` sets

In [15]:
X_train, X_test, y_train, y_test = train_test_split(data_work['text'], data_work.target, test_size=0.2, random_state=random_seed)

---
## 2.1 Avec un embedding simple <a class="anchor" id="EMBEDDING_Zero"></a> [⇪](#menu)

### Définissons une fonction permattant de Tokenizer notre jeu de données
Nous avons établis que pour ce jeu de données il convient de choisir une tokenization avec environ 4500 mots retenus; c'est donc ce que nous allons faire.

In [16]:
def preprocess_data_RAW(X_train, X_test, tokenizer_num_words=4500, padding_size=50, verbose=1):
    tokenizer = Tokenizer(num_words=tokenizer_num_words)
    tokenizer.fit_on_texts(X_train)

    dictionary = tokenizer.word_index
    vocab_size = len(dictionary)+1 # Adding 1 because of reserved 0 index
    
    X_train_enc = tokenizer.texts_to_sequences(X_train)
    X_test_enc = tokenizer.texts_to_sequences(X_test)
    
    X_train_ready = pad_sequences(X_train_enc, padding='post', maxlen=padding_size)
    X_test_ready = pad_sequences(X_test_enc, padding='post', maxlen=padding_size)
    
    # y_train = np.asarray(y_train).astype('float32').reshape((-1,1))
    # y_test = np.asarray(y_test).astype('float32').reshape((-1,1))
    
    if verbose > 0:
        print("----- One sample outputs for demo -----")
        print(f">> Original sentence: {X_train.iloc[0]}\n")
        print(f">> Tokenized sentence: {tokenizer.sequences_to_texts(X_train_ready[:1])}\n")
        print(f">> X_train_enc: {X_train_enc[:1]}\n")
        print(f">> X_train_ready: {X_train_ready[:1]}")
        
        print(f"\nVocab size: {vocab_size}")
    
    return X_train_ready, X_test_ready, tokenizer, vocab_size

In [17]:
X_train_ready, X_test_ready, tokenizer, vocab_size = preprocess_data_RAW(X_train, X_test)

----- One sample outputs for demo -----
>> Original sentence: you too be fake .. :x ... disguise your link to be something else .

>> Tokenized sentence: ['you too be fake x your link to be something else']

>> X_train_enc: [[10, 46, 2, 1411, 197, 44, 456, 3, 2, 199, 447]]

>> X_train_ready: [[  10   46    2 1411  197   44  456    3    2  199  447    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0]]

Vocab size: 228026


### Préparons l'embedding layer que nous allons utiliser

In [19]:
# Embedding simple
padding_size = 50
embedding_dim = 100
embedding_layer = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=padding_size, trainable=True)

### Entrainons notre modèle

In [None]:
history_BASE, model_BASE, train_time = train_model(X_train_ready, X_test_ready, y_train, y_test, vocab_size, embedding_layer, "BASE")

In [None]:
summarize_diagnostics(history_BASE)

### Affichons les scores du modèle

In [26]:
t0 = time.perf_counter()
y_preds_proba = model_BASE.predict(X_test_ready)
# y_preds_proba = pd.Series([x[0] for x in y_preds_proba])

y_preds = np.where(y_preds_proba > 0.5, 1,0)
# y_preds = pd.Series([1 if x > 0.5 else 0 for x in y_preds_proba])
inf_time = time.perf_counter() - t0

y_preds_proba.shape
y_preds.shape

(290559, 1)

In [None]:
model_name = "NN Archi01 (BASE + Lemma_nof + Tokenizer4500)"
get_scores(model_name, y_pred=y_preds, y_pred_proba=y_preds_proba, register=True, X_ref=X_test_ready, y_ref=y_test, training_time=train_time, inference_time=inf_time)

---
## 2.2 Pretrained Word2Vec `word2vec-google-news-300` <a class="anchor" id="EMBEDDING_Word2Vec"></a> [⇪](#menu)

### Préparons l'embedding layer que nous allons utiliser

In [122]:
em_model = load_gensim_embedding("word2vec-google-news-300", binary=True)

Loading from /home/valkea/Dev/OpenClassrooms/Projets_AI/P7/data/embedding_models/word2vec-google-news-300.gz


In [123]:
np.set_printoptions(threshold=25)
print(em_model.get_vector('like')) # Vector OK
np.set_printoptions(threshold=1000)

[ 0.10351562  0.13769531 -0.00297546 ...  0.04394531 -0.14550781
  0.07128906]


In [124]:
weights = em_model.vectors
embedding_layer = Embedding(
    input_dim=weights.shape[0],
    output_dim=weights.shape[1],
    weights=[weights],
    trainable=True,
)

### Préparons le jeu de données avec l'embedding à tester

In [26]:
def word_embedding_transform(embedding_model, sentences, max_len=65):

    print('\nPreparing the data with the provided embedding model...')
    data = np.zeros([len(sentences), max_len], dtype=np.int32)
    for i, (id,sentence) in enumerate(sentences.iteritems()):
        for t, word in enumerate(sentence.split()):
            if t == max_len:
                break
            if word in embedding_model.key_to_index:
                data[i, t] = embedding_model.key_to_index[word]
                
    return data

In [27]:
def print_embedding_demo(X_train, X_train_ready, index=0):
    print("----- One sample outputs for demo -----")
    print(f">> Original sentence: {X_train.iloc[index]}\n")
    print(f">> X_train_ready: {X_train_ready[index]}")

In [125]:
padding_size = 50

X_train_ready = word_embedding_transform(em_model, X_train, padding_size)
print('X_train_ready shape:', X_train_ready.shape, 'y_train shape:', y_train.shape)

X_test_ready = word_embedding_transform(em_model, X_test, padding_size)
print('X_test_ready shape:', X_test_ready.shape, 'y_test shape:', y_test.shape)


Preparing the data with the provided embedding model...
X_train_ready shape: (1162232, 50) y_train shape: (1162232,)

Preparing the data with the provided embedding model...
X_test_ready shape: (290559, 50) y_test shape: (290559,)


In [136]:
print_embedding_demo(X_train, X_train_ready)

----- One sample outputs for demo -----
>> Original sentence: you too be fake .. :x ... disguise your link to be something else .

>> X_train_ready: [   30   213    38  4656   623     0    90 16758    54   211     5    38
   252  1097     2     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0]


### Entrainons notre modèle

In [None]:
history_W2V300, model_W2V300, train_time = train_model(X_train_ready, X_test_ready, y_train, y_test, vocab_size, embedding_layer, "W2V300")

In [None]:
summarize_diagnostics(history_W2V300)

### Affichons les scores du modèle

In [26]:
t0 = time.perf_counter()
y_preds_proba = model_W2V300.predict(X_test_ready)
# y_preds_proba = pd.Series([x[0] for x in y_preds_proba])

y_preds = np.where(y_preds_proba > 0.5, 1,0)
# y_preds = pd.Series([1 if x > 0.5 else 0 for x in y_preds_proba])
inf_time = time.perf_counter() - t0

y_preds_proba.shape
y_preds.shape

(290559, 1)

In [None]:
model_name = "NN Archi01 (PT_W2V_300 + Lemma_nof + Tokenizer4500)"
get_scores(model_name, y_pred=y_preds, y_pred_proba=y_preds_proba, register=True, X_ref=X_test_ready, y_ref=y_test, training_time=train_time, inference_time=inf_time)

---
## 2.3 Pretrained FastText `fasttext-wiki-news-subwords-300` <a class="anchor" id="EMBEDDING_FastText"></a> [⇪](#menu)

### Préparons l'embedding layer que nous allons utiliser

In [126]:
em_model = load_gensim_embedding("fasttext-wiki-news-subwords-300")

Loading from /home/valkea/Dev/OpenClassrooms/Projets_AI/P7/data/embedding_models/fasttext-wiki-news-subwords-300.gz


In [127]:
np.set_printoptions(threshold=25)
print(em_model.get_vector('like')) # Vector OK
np.set_printoptions(threshold=1000)

[ 0.0095372  0.01431    0.066626  ...  0.025587   0.010337  -0.03046  ]


In [128]:
weights = em_model.vectors
embedding_layer = Embedding(
    input_dim=weights.shape[0],
    output_dim=weights.shape[1],
    weights=[weights],
    trainable=True,
)

### Préparons le jeu de données avec l'embedding à tester

In [129]:
padding_size = 50

X_train_ready = word_embedding_transform(em_model, X_train, padding_size)
print('X_train_ready shape:', X_train_ready.shape, 'y_train shape:', y_train.shape)

X_test_ready = word_embedding_transform(em_model, X_test, padding_size)
print('X_test_ready shape:', X_test_ready.shape, 'y_test shape:', y_test.shape)


Preparing the data with the provided embedding model...
X_train_ready shape: (1162232, 50) y_train shape: (1162232,)

Preparing the data with the provided embedding model...
X_test_ready shape: (290559, 50) y_test shape: (290559,)


In [136]:
print_embedding_demo(X_train, X_train_ready)

----- One sample outputs for demo -----
>> Original sentence: you too be fake .. :x ... disguise your link to be something else .

>> X_train_ready: [   30   213    38  4656   623     0    90 16758    54   211     5    38
   252  1097     2     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0]


### Entrainons notre modèle

In [None]:
history_FT300, model_FT300, train_time = train_model(X_train_ready, X_test_ready, y_train, y_test, vocab_size, embedding_layer, "FT300")

In [None]:
summarize_diagnostics(history_FT300)

### Affichons les scores du modèle

In [26]:
t0 = time.perf_counter()
y_preds_proba = model_FT300.predict(X_test_ready)
# y_preds_proba = pd.Series([x[0] for x in y_preds_proba])

y_preds = np.where(y_preds_proba > 0.5, 1,0)
# y_preds = pd.Series([1 if x > 0.5 else 0 for x in y_preds_proba])
inf_time = time.perf_counter() - t0

y_preds_proba.shape
y_preds.shape

(290559, 1)

In [None]:
model_name = "NN Archi01 (PT_FT_300 + Lemma_nof + Tokenizer4500)"
get_scores(model_name, y_pred=y_preds, y_pred_proba=y_preds_proba, register=True, X_ref=X_test_ready, y_ref=y_test, training_time=train_time, inference_time=inf_time)

---
## 2.4 Pretrained Glove `Stanford's GloVe 100d` <a class="anchor" id="EMBEDDING_glove6B100d"></a> [⇪](#menu)

### Préparons le jeu de données avec l'embedding à tester

In [None]:
padding_size = 50

try:
    from tensorflow.keras.layers import TextVectorization
except ImportError:
    from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

vectorizer = TextVectorization(max_tokens=4500, output_sequence_length=padding_size)
text_ds = tf.data.Dataset.from_tensor_slices(X_train).batch(128)
vectorizer.adapt(text_ds)

2022-08-03 21:22:51.398789: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-08-03 21:22:51.398816: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-08-03 21:22:51.398839: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (valkea-XPS): /proc/driver/nvidia/version does not exist
2022-08-03 21:22:51.400281: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [51]:
vectorizer.get_vocabulary()[:5]

['', '[UNK]', 'i', 'be', 'to']

In [58]:
output = vectorizer([["the cat sat on the mat"]])
output.numpy()[0, :6]

array([   5,  495, 1619,   20,    5,    1])

In [59]:
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

In [60]:
padding_size = 50

X_train_ready = vectorizer(np.array([[s] for s in X_train])).numpy()
print('X_train_ready shape:', X_train_ready.shape, 'y_train shape:', y_train.shape)
y_train_ready = np.array(y_train)

X_test_ready = vectorizer(np.array([[s] for s in X_test])).numpy()
print('X_test_ready shape:', X_test_ready.shape, 'y_test shape:', y_test.shape)
y_test_ready = np.array(y_test)

2022-08-03 21:28:35.219947: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 464892800 exceeds 10% of free system memory.


X_train_ready shape: (1162232, 50) y_train shape: (1162232,)
X_test_ready shape: (290559, 50) y_test shape: (290559,)


In [61]:
print_embedding_demo(X_train, X_train_ready)

----- One sample outputs for demo -----
>> Original sentence: you too be fake .. :x ... disguise your link to be something else .

>> X_train_ready: [  11   48    3 1404  199    1   46  450    4    3  196  441    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0]


### Préparons l'embedding layer que nous allons utiliser

In [45]:
em_model = load_trained_glove("glove.6B.100d")

Loading from gZip: /home/valkea/Dev/OpenClassrooms/Projets_AI/P7/data/embedding_models/glove.6B.100d.gz


In [47]:
np.set_printoptions(threshold=25)
print(em_model.get('like')) # Vector OK
np.set_printoptions(threshold=1000)

[-0.2687   0.81708  0.69896 ... -0.4011   0.74657  0.31122]


In [54]:
num_tokens = len(voc) + 2
embedding_dim = 100
hits = 0
misses = 0

# Prepare embedding matrix
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for word, i in word_index.items():
    embedding_vector = em_model.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

Converted 4333 words (167 misses)


In [55]:
embedding_layer = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    trainable=True,
)

### Entrainons notre modèle

In [None]:
history_GL100, model_GL100, train_time = train_model(X_train_ready, X_test_ready, y_train_ready, y_test_ready, vocab_size, embedding_layer, "GL100")

In [None]:
summarize_diagnostics(history_GL100)

### Affichons les scores du modèle

In [26]:
t0 = time.perf_counter()
y_preds_proba = model_GL100.predict(X_test_ready)
# y_preds_proba = pd.Series([x[0] for x in y_preds_proba])

y_preds = np.where(y_preds_proba > 0.5, 1,0)
# y_preds = pd.Series([1 if x > 0.5 else 0 for x in y_preds_proba])
inf_time = time.perf_counter() - t0

y_preds_proba.shape
y_preds.shape

(290559, 1)

In [None]:
model_name = "NN Archi01 (PT_GL_100 + Lemma_nof + Tokenizer4500)"
get_scores(model_name, y_pred=y_preds, y_pred_proba=y_preds_proba, register=True, X_ref=X_test_ready, y_ref=y_test_ready, training_time=train_time, inference_time=inf_time)

---
## 2.5 Pretrained Glove `glove-twitter-25` <a class="anchor" id="EMBEDDING_glove25"></a> [⇪](#menu)

### Préparons l'embedding layer que nous allons utiliser

In [92]:
em_model = load_gensim_embedding("glove-twitter-25")

Loading from /home/valkea/Dev/OpenClassrooms/Projets_AI/P7/data/embedding_models/glove-twitter-25.gz


In [119]:
np.set_printoptions(threshold=25)
print(em_model.get_vector('like')) # Vector OK
np.set_printoptions(threshold=1000)

[-0.21063  -0.010992 -0.17552  ... -0.37547   0.58029   0.16067 ]


In [93]:
weights = em_model.vectors
embedding_layer = Embedding(
    input_dim=weights.shape[0],
    output_dim=weights.shape[1],
    weights=[weights],
    trainable=True,
)

### Préparons le jeu de données avec l'embedding à tester

In [96]:
padding_size = 50

X_train_ready = word_embedding_transform(em_model, X_train, padding_size)
print('X_train_ready shape:', X_train_ready.shape, 'y_train shape:', y_train.shape)

X_test_ready = word_embedding_transform(em_model, X_test, padding_size)
print('X_test_ready shape:', X_test_ready.shape, 'y_test shape:', y_test.shape)


Preparing the data with the provided embedding model...
X_train_ready shape: (1162232, 50) y_train shape: (1162232,)

Preparing the data with the provided embedding model...
X_test_ready shape: (290559, 50) y_test shape: (290559,)


In [136]:
print_embedding_demo(X_train, X_train_ready)

----- One sample outputs for demo -----
>> Original sentence: you too be fake .. :x ... disguise your link to be something else .

>> X_train_ready: [   30   213    38  4656   623     0    90 16758    54   211     5    38
   252  1097     2     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0]


### Entrainons notre modèle

In [None]:
history_GLT25, model_GLT25, train_time = train_model(X_train_ready, X_test_ready, y_train, y_test, vocab_size, embedding_layer, "GLT25")

In [None]:
summarize_diagnostics(history_GL25)

### Affichons les scores du modèle

In [26]:
t0 = time.perf_counter()
y_preds_proba = model_GLT25.predict(X_test_ready)
# y_preds_proba = pd.Series([x[0] for x in y_preds_proba])

y_preds = np.where(y_preds_proba > 0.5, 1,0)
# y_preds = pd.Series([1 if x > 0.5 else 0 for x in y_preds_proba])
inf_time = time.perf_counter() - t0

y_preds_proba.shape
y_preds.shape

(290559, 1)

In [None]:
model_name = "NN Archi01 (PT_GLT_25 + Lemma_nof + Tokenizer4500)"
get_scores(model_name, y_pred=y_preds, y_pred_proba=y_preds_proba, register=True, X_ref=X_test_ready, y_ref=y_test, training_time=train_time, inference_time=inf_time)

---
## 2.6 Pretrained Glove `glove-twitter-100` <a class="anchor" id="EMBEDDING_glove25"></a> [⇪](#menu)

### Préparons l'embedding layer que nous allons utiliser

In [101]:
em_model = load_gensim_embedding("glove-twitter-100")

Loading from /home/valkea/Dev/OpenClassrooms/Projets_AI/P7/data/embedding_models/glove-twitter-100.gz


In [119]:
np.set_printoptions(threshold=25)
print(em_model.get_vector('like')) # Vector OK
np.set_printoptions(threshold=1000)

[-0.21063  -0.010992 -0.17552  ... -0.37547   0.58029   0.16067 ]


In [120]:
weights = em_model.vectors
embedding_layer = Embedding(
    input_dim=weights.shape[0],
    output_dim=weights.shape[1],
    weights=[weights],
    trainable=True,
)

### Préparons le jeu de données avec l'embedding à tester

In [121]:
padding_size = 50

X_train_ready = word_embedding_transform(em_model, X_train, padding_size)
print('X_train_ready shape:', X_train_ready.shape, 'y_train shape:', y_train.shape)

X_test_ready = word_embedding_transform(em_model, X_test, padding_size)
print('X_test_ready shape:', X_test_ready.shape, 'y_test shape:', y_test.shape)


Preparing the data with the provided embedding model...
X_train_ready shape: (1162232, 50) y_train shape: (1162232,)

Preparing the data with the provided embedding model...
X_test_ready shape: (290559, 50) y_test shape: (290559,)


In [136]:
print_embedding_demo(X_train, X_train_ready)

----- One sample outputs for demo -----
>> Original sentence: you too be fake .. :x ... disguise your link to be something else .

>> X_train_ready: [   30   213    38  4656   623     0    90 16758    54   211     5    38
   252  1097     2     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0]


### Entrainons notre modèle

In [None]:
history_GLT100, model_GLT100, train_time = train_model(X_train_ready, X_test_ready, y_train, y_test, vocab_size, embedding_layer, "GLT100")

In [None]:
summarize_diagnostics(history_GL100)

### Affichons les scores du modèle

In [26]:
t0 = time.perf_counter()
y_preds_proba = model_GLT100.predict(X_test_ready)
# y_preds_proba = pd.Series([x[0] for x in y_preds_proba])

y_preds = np.where(y_preds_proba > 0.5, 1,0)
# y_preds = pd.Series([1 if x > 0.5 else 0 for x in y_preds_proba])
inf_time = time.perf_counter() - t0

y_preds_proba.shape
y_preds.shape

(290559, 1)

In [None]:
model_name = "NN Archi01 (PT_GLT_100 + Lemma_nof + Tokenizer4500)"
get_scores(model_name, y_pred=y_preds, y_pred_proba=y_preds_proba, register=True, X_ref=X_test_ready, y_ref=y_test, training_time=train_time, inference_time=inf_time)

---
## 2.7 Word2Vec `local training` <a class="anchor" id="EMBEDDING_Word2Vec_local"></a> [⇪](#menu)

### Préparons l'embedding layer que nous allons utiliser

In [20]:
from gensim.models import Word2Vec

In [22]:
w2v_model = Word2Vec(X_train, vector_size=150, min_count=5, window=5, sg=0, epochs=100)

In [23]:
pretrained_weights = w2v_model.wv.vectors
vocab_size, emdedding_size = pretrained_weights.shape
print('Embedding shape:', pretrained_weights.shape)

Embedding shape: (130, 256)


In [42]:
weights = w2v_model.wv.vectors
embedding_layer = Embedding(
    input_dim=weights.shape[0],
    output_dim=weights.shape[1],
    weights=[weights],
    trainable=True,
)

#### Regardons un peu ce que notre modèle propose comme mots similaires pour quelques mots choisi

In [24]:
for word in ['good', 'bad', 'sad', 'fabulous', 'difficult', 'easy', 'boring', 'fun']:
    if word in w2v_model.wv:
        most_similar = ', '.join('%s (%.2f)' % (similar, dist) for similar, dist in w2v_model.wv.most_similar(word)[:8])
        print(f'{word.rjust(15)} -> {most_similar}')
    else:
        print(f'⚠️ {word} is not in the Word2Vec vocabulary')

⚠️ good is not in the Word2Vec vocabulary
⚠️ bad is not in the Word2Vec vocabulary
⚠️ sad is not in the Word2Vec vocabulary
⚠️ fabulous is not in the Word2Vec vocabulary
⚠️ difficult is not in the Word2Vec vocabulary
⚠️ easy is not in the Word2Vec vocabulary
⚠️ boring is not in the Word2Vec vocabulary
⚠️ fun is not in the Word2Vec vocabulary


### Préparons le jeu de données avec l'embedding à tester

In [29]:
padding_size = 50

X_train_ready = word_embedding_transform(w2v_model.wv, X_train, padding_size)
print('X_train_ready shape:', X_train_ready.shape, 'y_train shape:', y_train.shape)

X_test_ready = word_embedding_transform(w2v_model.wv, X_test, padding_size)
print('X_test_ready shape:', X_test_ready.shape, 'y_test shape:', y_test.shape)


Preparing the data with the provided embedding model...
X_train_ready shape: (1162232, 50) y_train shape: (1162232,)

Preparing the data with the provided embedding model...
X_test_ready shape: (290559, 50) y_test shape: (290559,)


In [30]:
print_embedding_demo(X_train, X_train_ready)

----- One sample outputs for demo -----
>> Original sentence: you too be fake .. :x ... disguise your link to be something else .

>> X_train_ready: [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0 17  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0]


### Entrainons notre modèle

In [None]:
history_W2Vlocal, model_W2Vlocal, train_time = train_model(X_train_ready, X_test_ready, y_train, y_test, vocab_size, embedding_layer, "W2Vlocal")

In [None]:
summarize_diagnostics(history_W2Vlocal)

### Affichons les scores du modèle

In [26]:
t0 = time.perf_counter()
y_preds_proba = model_W2Vlocal.predict(X_test_ready)
# y_preds_proba = pd.Series([x[0] for x in y_preds_proba])

y_preds = np.where(y_preds_proba > 0.5, 1,0)
# y_preds = pd.Series([1 if x > 0.5 else 0 for x in y_preds_proba])
inf_time = time.perf_counter() - t0

y_preds_proba.shape
y_preds.shape

(290559, 1)

In [None]:
model_name = "NN Archi01 (W2V_150 + Lemma_nof + Tokenizer4500)"
get_scores(model_name, y_pred=y_preds, y_pred_proba=y_preds_proba, register=True, X_ref=X_test_ready, y_ref=y_test, training_time=train_time, inference_time=inf_time)

---
## 2.8 FastText `local training` <a class="anchor" id="EMBEDDING_FastText_local"></a> [⇪](#menu)

### Préparons l'embedding layer que nous allons utiliser

In [31]:
from gensim.models import FastText

In [32]:
ft_model = FastText(X_train, vector_size=150, min_count=5, window=5, sg=0, epochs=100)

In [33]:
pretrained_weights = ft_model.wv.vectors
vocab_size, emdedding_size = pretrained_weights.shape
print('Embedding shape:', pretrained_weights.shape)

Embedding shape: (130, 150)


In [42]:
weights = ft_model.wv.vectors
embedding_layer = Embedding(
    input_dim=weights.shape[0],
    output_dim=weights.shape[1],
    weights=[weights],
    trainable=True,
)

#### Regardons un peu ce que notre modèle propose comme mots similaires pour quelques mots choisi

In [34]:
for word in ['good', 'bad', 'sad', 'fabulous', 'difficult', 'easy', 'boring', 'fun']:
    if word in ft_model.wv:
        most_similar = ', '.join('%s (%.2f)' % (similar, dist) for similar, dist in ft_model.wv.most_similar(word)[:8])
        print(f'{word.rjust(15)} -> {most_similar}')
    else:
        print(f'⚠️ {word} is not in the Word2Vec vocabulary')

           good -> z (0.27), $ (0.18), & (0.15), < (0.12), ¹ (0.09), ¯ (0.09),  (0.09), ¸ (0.09)
            bad ->  (0.20), & (0.17), u (0.15), % (0.14), m (0.12), ! (0.12), 2 (0.12), ~ (0.12)
            sad -> « (0.13),  (0.12),  (0.12),  (0.12),  (0.12), $ (0.12),  (0.11), ¶ (0.10)
       fabulous -> * (0.27), : (0.22), 5 (0.18), 6 (0.16), 7 (0.16), | (0.15), 9 (0.14), \ (0.13)
      difficult -> 1 (0.24), 4 (0.23), # (0.22), 8 (0.22), 2 (0.22), 9 (0.21), 6 (0.20), 7 (0.20)
           easy -> < (0.16), w (0.13), t (0.12), l (0.12), * (0.10), , (0.09), ´ (0.09), ? (0.09)
         boring -> ] (0.17), | (0.16), Ã (0.16), [ (0.15), } (0.15), { (0.14), \ (0.14),  (0.13)
            fun -> k (0.19), l (0.19), h (0.17), ) (0.12), ^ (0.12), e (0.10), I (0.09), a (0.09)


### Préparons le jeu de données avec l'embedding à tester

In [37]:
padding_size = 50

X_train_ready = word_embedding_transform(ft_model.wv, X_train, padding_size)
print('X_train_ready shape:', X_train_ready.shape, 'y_train shape:', y_train.shape)

X_test_ready = word_embedding_transform(ft_model.wv, X_test, padding_size)
print('X_test_ready shape:', X_test_ready.shape, 'y_test shape:', y_test.shape)


Preparing the data with the provided embedding model...
X_train_ready shape: (1162232, 50) y_train shape: (1162232,)

Preparing the data with the provided embedding model...
X_test_ready shape: (290559, 50) y_test shape: (290559,)


In [38]:
print_embedding_demo(X_train, X_train_ready)

----- One sample outputs for demo -----
>> Original sentence: you too be fake .. :x ... disguise your link to be something else .

>> X_train_ready: [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0 17  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0]


### Entrainons notre modèle

In [None]:
history_FTlocal, model_FTlocal, train_time = train_model(X_train_ready, X_test_ready, y_train, y_test, vocab_size, embedding_layer, "FTlocal")

In [None]:
summarize_diagnostics(history_FTlocal)

### Affichons les scores du modèle

In [26]:
t0 = time.perf_counter()
y_preds_proba = model_FTlocal.predict(X_test_ready)
# y_preds_proba = pd.Series([x[0] for x in y_preds_proba])

y_preds = np.where(y_preds_proba > 0.5, 1,0)
# y_preds = pd.Series([1 if x > 0.5 else 0 for x in y_preds_proba])
inf_time = time.perf_counter() - t0

y_preds_proba.shape
y_preds.shape

(290559, 1)

In [None]:
model_name = "NN Archi01 (FT_150 + Lemma_nof + Tokenizer4500)"
get_scores(model_name, y_pred=y_preds, y_pred_proba=y_preds_proba, register=True, X_ref=X_test_ready, y_ref=y_test, training_time=train_time, inference_time=inf_time)

---
## 2.9 Comparaison des scores <a class="anchor" id="EMBEDDING_scores"></a> [⇪](#menu)

In [None]:
scores_df

>#### Conclusion de la sélection du plongement:
> TODO

----
----
----
----

----
----
----
----