# Menu <a class="anchor" id="menu"></a>
   
* [1. Préparatifs](#init)
* [2. Modèles `Bag of words` ou `Bag of N-gram`](#EMBEDDING_BOW)
    * [2.1 `EMB-A0` NN | Ngram=2 | int](#EMBEDDING_BOW_A0)
    * [2.2 `EMB-A1` NN | Ngram=1 | multi_hot](#EMBEDDING_BOW_A1)
    * [2.3 `EMB-A2` NN | Ngram=2 | multi_hot](#EMBEDDING_BOW_A2)
    * [2.4 `EMB-A3` NN | Ngram=2 | count](#EMBEDDING_BOW_A3)
    * [2.5 `EMB-A4` NN | Ngram=2 | tf-idf](#EMBEDDING_BOW_A4)
* [3. Modèles `Séquentiels`](#EMBEDDING_SEQ)
    * [3.1 `EMB-B0` RNN | Ngram=2 | int | One-hot-embedding](#EMBEDDING_SEQ_B0)
    * [3.2 `EMB-B1` RNN | Ngram=2 | int | Keras embedding](#EMBEDDING_SEQ_B1)
    * [3.3 `EMB-B2` RNN | Ngram=2 | int | Keras embedding + MASK](#EMBEDDING_SEQ_B2)
    * [3.4 `EMB-B3` RNN | Ngram=2 | int | Glove 100d + MASK + NOT trainable](#EMBEDDING_SEQ_B3)
    * [3.5 `EMB-B4` RNN | Ngram=1 | int | Glove 100d + MASK + NOT trainable](#EMBEDDING_SEQ_B4)
    * [3.6 `EMB-B5` RNN | Ngram=1 | int | GloveTwitter 25d + MASK + NOT trainable](#EMBEDDING_SEQ_B5)
    * [3.7 `EMB-B6` RNN | Ngram=1 | int | GloveTwitter 100d + MASK + NOT trainable](#EMBEDDING_SEQ_B6)
    * [3.8 `EMB-B7` RNN | Ngram=1 | int | GloveTwitter 200d + MASK + NOT trainable](#EMBEDDING_SEQ_B7)
    * [3.9 `EMB-B8` RNN | Ngram=1 | int | FastText300 + MASK + NOT trainable](#EMBEDDING_SEQ_B8)
    * [3.10 `EMB-B9` RNN | Ngram=1 | int | Word2Vec300_GoogleNews + MASK + NOT trainable](#EMBEDDING_SEQ_B9)
* [4. Modèles `classiques` pour comparaison](#NO_EMBEDDING)
    * [4.1 `EMB-C0` LogisticRegression | TfidVectorize](#NO_EMBEDDING_C0)
* [5. Comparaison des scores](#EMBEDDING_scores)

In [None]:
import os
import time
import pathlib
import gzip

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras import layers
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from dagshub import dagshub_logger, DAGsHubLogger

try:
    from tensorflow.keras.layers import TextVectorization
except ImportError:
    from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Set the random seeds for reproducibility
random_seed = 0
np.random.seed(random_seed)

try:
    # Preferred path: a single Keras call to seed the run.
    keras.utils.set_random_seed(random_seed)
except Exception:
    # Fallback when the call above fails (presumably a Keras version that
    # lacks it — TODO confirm): seed TensorFlow only; NumPy is seeded above.
    tf.random.set_seed(random_seed)

In [None]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

---
---
# 1. Préparatifs pour de la classification avec des réseaux de neurones <a class="anchor" id="init"></a> [⇪](#menu)

### Chargeons quelques fonctions *maison* pour entrainer et évaluer les modèles, et initialisons un fichier pour enregistrer les scores
Ces fonctions ont été déportées dans le fichier `classification_utils.py` pour alléger le notebook

In [None]:
from classification_utils import fit_model, get_scores, init_scores, find_best_threshold
init_scores("data/scores_NN_SelectEmbedding.csv")

### Chargeons le jeu de données et préparons un `échantillon de travail` représentatif

In [None]:
# Load only the label and the lemmatized text from the CSV stored under
# ./data (relative to the current working directory).
data_work = pd.read_csv(
    pathlib.Path(pathlib.Path().absolute(), 'data', 'data_nlp_1563108.csv'), 
    usecols=['target', 'lemmas_not_filtered'],
    encoding='ISO-8859-1',
    #nrows=100000,
)
# The rest of the notebook refers to the text column as 'text'.
data_work.rename(columns={'lemmas_not_filtered':'text'}, inplace=True)
display(data_work.head(), data_work.shape)

# Draw sample_size // 2 rows from every 'target' group, using the same seed
# for each group (sample_size rows in total if the target has two classes).
sample_size = 100000
data_work = data_work.groupby('target', group_keys=False).apply(lambda x: x.sample(sample_size//2, random_state=random_seed))
display(data_work.head(), data_work.shape)

In [None]:
data_work.target.value_counts(dropna=False)

### Définissons les différentes `architectures` dont nous allons avoir besoin

In [None]:
def architecture001(f_opti, f_loss, f_metrics, max_tokens=20000, hidden_dim=16):
    """Build and compile a one-hidden-layer dense binary classifier.

    Args:
        f_opti: optimizer forwarded to ``model.compile``.
        f_loss: loss forwarded to ``model.compile``.
        f_metrics: metrics forwarded to ``model.compile``.
        max_tokens: width of the input vector (number of features per sample).
        hidden_dim: number of units of the hidden ``Dense`` layer.

    Returns:
        A ``(model, description)`` tuple: the compiled ``keras.Model`` and a
        short text describing the architecture.
    """
    # The input is declared as float32 rather than int64: a functional model
    # converts what it is fed to the declared input dtype, so an integer input
    # would truncate fractional features such as tf-idf weights. Integer
    # encodings (int / multi_hot / count) remain valid and are cast to float.
    inputs = keras.Input(shape=(max_tokens,), dtype="float32")

    x = layers.Dense(hidden_dim, activation='relu')(inputs)
    x = layers.Dropout(0.5)(x)

    # Single sigmoid unit: the model outputs one probability per sample.
    predictions = layers.Dense(1, activation='sigmoid', name='predictions')(x)

    model = keras.Model(inputs, predictions)
    model.compile(loss=f_loss, optimizer=f_opti, metrics=f_metrics)

    return model, f"Dense {hidden_dim} + Dropout 0.5"

In [None]:
def architecture002_oh(f_opti, f_loss, f_metrics, voc_size):
    """Build and compile a bidirectional-LSTM classifier fed with one-hot tokens.

    Args:
        f_opti: optimizer forwarded to ``model.compile``.
        f_loss: loss forwarded to ``model.compile``.
        f_metrics: metrics forwarded to ``model.compile``.
        voc_size: depth of the one-hot encoding applied to the token ids.

    Returns:
        A ``(model, description)`` tuple: the compiled ``keras.Model`` and a
        short text describing the architecture.
    """
    # Variable-length sequences of integer token ids.
    token_ids = keras.Input(shape=(None,), dtype="int64")

    one_hot_tokens = tf.one_hot(token_ids, depth=voc_size)

    # 32 LSTM units per direction.
    encoder = layers.Bidirectional(layers.LSTM(32))
    encoded = encoder(one_hot_tokens)
    regularized = layers.Dropout(0.5)(encoded)

    output_layer = layers.Dense(1, activation='sigmoid', name='predictions')
    predictions = output_layer(regularized)

    model = keras.Model(inputs=token_ids, outputs=predictions)
    model.compile(optimizer=f_opti, loss=f_loss, metrics=f_metrics)

    return model, "one_hot + Bi-LSTM 32 + Dropout 0.5"

In [None]:
def architecture002(f_opti, f_loss, f_metrics, embedding):
    """Build and compile a bidirectional-LSTM classifier on top of ``embedding``.

    Args:
        f_opti: optimizer forwarded to ``model.compile``.
        f_loss: loss forwarded to ``model.compile``.
        f_metrics: metrics forwarded to ``model.compile``.
        embedding: callable layer applied to the integer token-id input.

    Returns:
        A ``(model, description)`` tuple: the compiled ``keras.Model`` and a
        short text describing the architecture.
    """
    # Variable-length sequences of integer token ids.
    token_ids = keras.Input(shape=(None,), dtype="int64")

    embedded = embedding(token_ids)

    # 32 LSTM units per direction.
    encoder = layers.Bidirectional(layers.LSTM(32))
    encoded = encoder(embedded)
    regularized = layers.Dropout(0.5)(encoded)

    output_layer = layers.Dense(1, activation='sigmoid', name='predictions')
    predictions = output_layer(regularized)

    model = keras.Model(inputs=token_ids, outputs=predictions)
    model.compile(optimizer=f_opti, loss=f_loss, metrics=f_metrics)

    return model, "Bi-LSTM 32 + Dropout 0.5"

In [None]:
def architecture003(f_opti, f_loss, f_metrics, embedding):
    """Build and compile a Bi-LSTM + dense classifier on top of ``embedding``.

    Args:
        f_opti: optimizer forwarded to ``model.compile``.
        f_loss: loss forwarded to ``model.compile``.
        f_metrics: metrics forwarded to ``model.compile``.
        embedding: callable layer applied to the integer token-id input.

    Returns:
        A ``(model, description)`` tuple: the compiled ``keras.Model`` and a
        short text describing the architecture.
    """
    # Variable-length sequences of integer token ids.
    token_ids = keras.Input(shape=(None,), dtype="int64")

    embedded = embedding(token_ids)

    # 64 LSTM units per direction, followed by a 24-unit ReLU layer.
    encoder = layers.Bidirectional(layers.LSTM(64))
    encoded = encoder(embedded)
    hidden = layers.Dense(24, activation='relu')(encoded)

    output_layer = layers.Dense(1, activation='sigmoid', name='predictions')
    predictions = output_layer(hidden)

    model = keras.Model(inputs=token_ids, outputs=predictions)
    model.compile(optimizer=f_opti, loss=f_loss, metrics=f_metrics)

    return model, "Bi-LSTM 64 + Dense 24"

### Définissons les fonctions utiles à l'`embedding`

In [None]:
from gensim.models import KeyedVectors
import gensim.downloader as api

def load_gensim_embedding(embedding_name, binary=False):
    """Load an embedding model from a local archive or through the gensim API.

    Args:
        embedding_name: name of the model; also the stem of the local file
            ``data/embedding_models/<embedding_name>.gz`` looked up under the
            current working directory.
        binary: forwarded to ``KeyedVectors.load_word2vec_format`` when the
            local file is used.

    Returns:
        The loaded model, or ``None`` when loading failed (the error is
        printed, not raised).
    """
    models_dir = pathlib.Path().absolute() / 'data' / 'embedding_models'
    embedding_path = models_dir / f'{embedding_name}.gz'

    try:
        # No local copy: fall back to the gensim downloader.
        if not embedding_path.is_file():
            print("Loading from the Git repos with API")
            return api.load(embedding_name)

        print(f"Loading from {embedding_path}")
        return KeyedVectors.load_word2vec_format(embedding_path, binary=binary)
    except Exception as e:
        print(f"The provided embedding model couldn't be loaded correctly: {e}")
        
def load_trained_glove(embedding_name):
    """Load GloVe-style text embeddings into a ``{word: vector}`` dictionary.

    The file is looked up under ``data/embedding_models`` in the current
    working directory, first as ``<embedding_name>.gz`` (gzip-compressed
    text), then as ``<embedding_name>.txt``. Each non-blank line must hold a
    word followed by its space-separated coefficients.

    Args:
        embedding_name: stem of the embedding file (without extension).

    Returns:
        A dict mapping each word to a float32 ``numpy`` vector, or ``None``
        when the file is missing or cannot be parsed (the error is printed,
        not raised).
    """

    def parse_file(file):
        embeddings_index = {}
        for line in file:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            word, coefs = line.split(maxsplit=1)
            embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")
        return embeddings_index

    models_dir = pathlib.Path(pathlib.Path().absolute(), 'data', 'embedding_models')

    try:
        embedding_path = models_dir / f'{embedding_name}.gz'
        if embedding_path.is_file():
            print(f"Loading from gZip: {embedding_path}")
            # Explicit UTF-8: do not depend on the platform default encoding.
            with gzip.open(embedding_path, mode='rt', encoding='utf-8') as f:
                return parse_file(f)

        embedding_path = models_dir / f'{embedding_name}.txt'
        if embedding_path.is_file():
            print(f"Loading from TXT: {embedding_path}")
            with open(embedding_path, encoding='utf-8') as f:
                return parse_file(f)

        raise FileNotFoundError(f"No such model found (it must be {embedding_name}.txt or {embedding_name}.gz)")

    except Exception as e:
        # Best effort by design: report the problem and return None.
        print(f"The provided embedding model couldn't be loaded correctly: {e}")

In [None]:
def convert_embedding_matrix(vocabulary, embeddings_index, max_tokens, embedding_dim=100):
    """Build an embedding matrix aligned with a vocabulary.

    Row ``i`` of the result holds the pre-trained vector of the token at
    position ``i`` in ``vocabulary``; rows of tokens that are absent from
    ``embeddings_index`` stay at zero. If a token appears several times in
    ``vocabulary``, only its last position is filled.

    Args:
        vocabulary: sequence of tokens; a token's position is its row index.
        embeddings_index: source of the vectors. Either an object exposing
            ``get_vector(word)`` (raising ``KeyError`` for unknown words) or a
            mapping exposing ``get(word)`` (returning ``None`` for unknown
            words).
        max_tokens: number of rows of the matrix. Tokens whose index is
            ``>= max_tokens`` do not fit and are ignored (they count neither
            as hits nor as misses).
        embedding_dim: number of columns of the matrix.

    Returns:
        A ``(max_tokens, embedding_dim)`` ``numpy`` array.
    """

    def lookup(word):
        # Try the get_vector() interface first, then fall back to .get().
        try:
            return embeddings_index.get_vector(word)
        except AttributeError:
            return embeddings_index.get(word)
        except KeyError:
            return None

    hits = 0
    misses_list = []

    word_index = dict(zip(vocabulary, range(len(vocabulary))))
    embedding_matrix = np.zeros((max_tokens, embedding_dim))

    for word, i in word_index.items():
        # Skip tokens that have no row in the matrix. Previously the
        # hit/miss handling ran for them too, reusing the vector of the
        # preceding word and writing outside the matrix.
        if i >= max_tokens:
            continue

        embedding_vector = lookup(word)
        if embedding_vector is not None:
            hits += 1
            embedding_matrix[i] = embedding_vector
        else:
            misses_list.append(word)

    misses = len(misses_list)
    print(f"Converted {hits} words, and couldn't find {misses} words")
    print(f"Missing words: {misses_list}")

    return embedding_matrix

### Définissons les `fonctions et paramètres communs` à l'entrainement des différents modèles que l'on veut tester

In [None]:
# DagsHub 

def save_hyperparameters_to_dagshub(dlogger, model_name, archi_desc, layers, batch_size, sample_size):
    """Record the hyperparameters of a run, one logger call per parameter.

    Args:
        dlogger: logger exposing ``log_hyperparams(**kwargs)``.
        model_name: logged under ``model_name``.
        archi_desc: logged under ``archi_desc``.
        layers: logged under ``layers``.
        batch_size: logged under ``batch_size``.
        sample_size: logged under ``sample_size``.
    """
    hyperparams = {
        "model_name": model_name,
        "archi_desc": archi_desc,
        "layers": layers,
        "batch_size": batch_size,
        "sample_size": sample_size,
    }
    for key, value in hyperparams.items():
        dlogger.log_hyperparams(**{key: value})
    
def save_metrics_to_dagshub(dlogger, scores, best_threshold, history, step_num=1000, close=True):
    """Log the final scores, the chosen threshold and the per-epoch curves.

    Args:
        dlogger: logger exposing ``log_metrics``.
        scores: metrics logged at ``step_num``.
        best_threshold: logged as ``best_threshold`` at ``step_num``.
        history: training history forwarded to ``add_scores_to_dagshub``.
        step_num: step under which the final scores and threshold are logged.
        close: currently unused by this function.
    """
    summary_step = {"step_num": step_num}
    dlogger.log_metrics(scores, **summary_step)
    dlogger.log_metrics(best_threshold=best_threshold, **summary_step)

    # Per-epoch training curves.
    add_scores_to_dagshub(dlogger, history)

def get_layers_for_dagshub(model):
    """Summarize the layers of a model as ``(class_name, settings)`` pairs.

    Every entry of ``model.get_config()['layers']`` that has a ``config`` is
    reported with the subset of its ``units``, ``output_dim`` and ``rate``
    settings that are present. When a config wraps another layer under the
    ``layer`` key, the wrapped layer is reported right after its wrapper.

    Args:
        model: object exposing ``get_config()`` with a ``layers`` list.

    Returns:
        A list of ``(class_name, settings_dict)`` tuples.
    """
    tracked_settings = ('units', 'output_dim', 'rate')
    summary = []

    for entry in model.get_config()['layers']:
        node = entry
        # Follow the chain wrapper -> wrapped layer until it ends.
        while True:
            node = dict(node)
            if 'config' not in node:
                break

            config = node['config']
            settings = {key: config[key] for key in tracked_settings if key in config}
            summary.append((node['class_name'], settings))

            if 'layer' not in config:
                break
            node = config['layer']

    return summary


def add_scores_to_dagshub(dlogger, history):
    """Log the per-epoch loss and accuracy curves of a training history.

    Args:
        dlogger: logger exposing ``log_metrics(metrics, step_num=...)``.
        history: object with an ``epoch`` sequence and a ``history`` dict
            holding the ``loss``, ``accuracy``, ``val_loss`` and
            ``val_accuracy`` series.
    """
    metric_names = ('loss', 'accuracy', 'val_loss', 'val_accuracy')

    epochs = history.epoch
    curves = [history.history[name] for name in metric_names]

    # One logger call per epoch, using the epoch number as the step.
    for epoch, *values in zip(epochs, *curves):
        dlogger.log_metrics(dict(zip(metric_names, values)), step_num=epoch)

In [None]:
def summarize_diagnostics(history):
    """Plot the training and validation curves of a fit history.

    Draws two stacked panels on a new 8x8 figure: the loss on top and the
    accuracy below, each with the train series in blue and the validation
    series in orange.

    Args:
        history: object whose ``history`` dict holds the ``loss``,
            ``val_loss``, ``accuracy`` and ``val_accuracy`` series.
    """
    plt.figure(figsize=(8,8))

    # (subplot position, panel title, train key, validation key)
    panels = (
        (211, 'Cross Entropy Loss', 'loss', 'val_loss'),
        (212, 'Classification Accuracy', 'accuracy', 'val_accuracy'),
    )
    for position, title, train_key, val_key in panels:
        plt.subplot(position)
        plt.title(title)
        plt.plot(history.history[train_key], color='blue', label='train')
        plt.plot(history.history[val_key], color='orange', label='val')
        plt.legend()

    plt.tight_layout(pad=1.0)

In [None]:
from keras import layers
from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback, ReduceLROnPlateau

def init_callbacks(model_name):
    """Create the training callbacks shared by all the experiments.

    Args:
        model_name: stem of the checkpoint file ``models/<model_name>.keras``.

    Returns:
        ``[checkpoint, early_stopping]``: a ``ModelCheckpoint`` keeping the
        model with the highest ``val_accuracy`` and an ``EarlyStopping``
        watching ``val_loss``.
    """
    best_model_path = pathlib.Path("models", f"{model_name}.keras")

    # Save the model only when val_accuracy improves.
    checkpoint_options = dict(
        monitor="val_accuracy",
        mode="max",
        filepath=best_model_path,
        save_best_only=True,
        verbose=1,
    )

    # Stop when val_loss has not decreased by at least 0.01 for 5 epochs,
    # and restore the best weights.
    early_stopping_options = dict(
        monitor='val_loss',
        mode='min',
        patience=5,
        min_delta=0.01,
        restore_best_weights=True,
        verbose=1,
    )

    return [
        ModelCheckpoint(**checkpoint_options),
        EarlyStopping(**early_stopping_options),
    ]

In [None]:
batch_size=1024

### Divisons le jeu de données en `Train`, `Valid` et `Test` sets

In [None]:
# Hold out 20% of the sample as the test set, then carve a validation set of
# the same size out of the remainder; both splits are stratified on the target.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    data_work['text'],
    data_work['target'],
    test_size=0.2,
    random_state=random_seed,
    stratify=data_work['target'],
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full,
    y_train_full,
    test_size=X_test.shape[0],
    random_state=random_seed,
    stratify=y_train_full,
)

# Sanity check: the three splits cover the whole sample.
assert X_train.shape[0] + X_valid.shape[0] + X_test.shape[0] == data_work.shape[0]
assert y_train.shape[0] + y_valid.shape[0] + y_test.shape[0] == data_work.shape[0]

print(f"X_train: {X_train.shape} + X_valid: {X_valid.shape} + X_test: {X_test.shape}")
# Report the label shapes (this line used to print the X_* shapes again).
print(f"y_train: {y_train.shape} + y_valid: {y_valid.shape} + y_test: {y_test.shape}")

### Préparons les différentes versions du text_vectorizer que l'on veut tester

In [None]:
text_dataset = tf.data.Dataset.from_tensor_slices(list(data_work.text)) 

In [None]:
text_vectorization0 = TextVectorization(
    output_mode='int', # int, multi_hot, count, tf_idf
    max_tokens=20000,
    ngrams=2,
    output_sequence_length=79 # search the max size by default
)

In [None]:
text_vectorization0.adapt(text_dataset.batch(batch_size), True)

In [None]:
text_vectorization1 = TextVectorization(
    output_mode='multi_hot', # int, multi_hot, count, tf_idf
    max_tokens=20000,
    ngrams=1,
    # output_sequence_length=60 # search the max size by default
)

In [None]:
text_vectorization1.adapt(text_dataset.batch(batch_size), True)

In [None]:
text_vectorization2 = TextVectorization(
    output_mode='multi_hot', # int, multi_hot, count, tf_idf
    max_tokens=20000,
    ngrams=2,
    # output_sequence_length=60 # search the max size by default
)

In [None]:
text_vectorization2.adapt(text_dataset.batch(batch_size), True)

In [None]:
text_vectorization3 = TextVectorization(
    output_mode='count', # int, multi_hot, count, tf_idf
    max_tokens=20000,
    ngrams=2,
    # output_sequence_length=60 # search the max size by default
)

In [None]:
text_vectorization3.adapt(text_dataset.batch(batch_size), True)

In [None]:
text_vectorization4 = TextVectorization(
    output_mode='tf_idf', # int, multi_hot, count, tf_idf
    max_tokens=20000,
    ngrams=2,
    # output_sequence_length=60 # search the max size by default
)

In [None]:
text_vectorization4.adapt(text_dataset.batch(batch_size), True)

In [None]:
max_length = 60
max_tokens = 20000

text_vectorizationBng2= TextVectorization(
    output_mode='int', # int, multi_hot, count, tf_idf
    max_tokens=max_tokens,
    ngrams=2,
    output_sequence_length=max_length # search the max size by default
)

In [None]:
text_vectorizationBng2.adapt(text_dataset.batch(batch_size), True)

In [None]:
max_length = 60
max_tokens = 20000

text_vectorizationBng1= TextVectorization(
    output_mode='int', # int, multi_hot, count, tf_idf
    max_tokens=max_tokens,
    ngrams=1,
    output_sequence_length=max_length # search the max size by default
)

In [None]:

text_vectorizationBng1.adapt(text_dataset.batch(batch_size), True)

---
---
# 2. Recherche de l'`embedding` le plus adapté avec des modèles `Bag of words` ou `Bag of N-gram` <a class="anchor" id="EMBEDDING_BOW"></a> [⇪](#menu)

---
## 2.1 `EMB-A0` NN | Ngram=2 | int <a class="anchor" id="EMBEDDING_BOW_A0"></a> [⇪](#menu)

### Standardize & Tokenize

In [None]:
# Done before
print(text_vectorization0.get_vocabulary()[:20], "...")

### Prepare dataset

In [None]:
X_train_ready = text_vectorization0(X_train)
X_valid_ready = text_vectorization0(X_valid)
X_test_ready = text_vectorization0(X_test)

In [None]:
X_train[:2]

In [None]:
X_train_ready[:2]

### Define model

In [None]:
model_name = "EmbedA0"
model, archi_desc = architecture001("adam", "binary_crossentropy", ['accuracy'], X_train_ready[:0].shape[1])
archi_desc = "No Embedding + Bigrams | " + archi_desc

In [None]:
model.summary()

#### Définissons un logger DagsHub pour enregistrer notre essai

In [None]:
dagslogger = DAGsHubLogger()

In [None]:
save_hyperparameters_to_dagshub(dagslogger, model_name, archi_desc, get_layers_for_dagshub(model), batch_size, data_work.shape[0])

### Train model

In [None]:
# Train the model and measure the wall-clock training time.
# NOTE(review): validation_data is the *test* split, so the checkpoint and
# early-stopping callbacks select the model on the data later used for the
# reported scores — consider validating on the held-out validation split.
t0 = time.perf_counter()
history = model.fit(
        x=X_train_ready, y=y_train, 
        validation_data=(X_test_ready, y_test), 
        epochs=50, 
        batch_size=batch_size, 
        callbacks=init_callbacks(f"{model_name}"),
        verbose=1,
)
train_time = time.perf_counter() - t0

### Evaluate

In [None]:
model = keras.models.load_model(f"models/{model_name}.keras")

In [None]:
model.evaluate(X_test_ready, y_test)

In [None]:
raw_txt = tf.convert_to_tensor(["I hate it", "I love it"])
raw_txt = text_vectorization0(raw_txt)
model.predict(raw_txt)

In [None]:
_ = get_scores(
    model_name,
    model=model,
    X_ref=X_test_ready,
    y_ref=y_test, 
    threshold=0.5,
    # y_pred=y_preds, 
    # y_pred_proba=y_preds_proba, 
    training_time=train_time, 
    # inference_time=inf_time, 
    dagslogger=dagslogger,
    register=True,
)

#### Trouvons le meilleur seuil de séparation pour les prédictions (sur la base de l'accuracy)

In [None]:
acc_value, best_threshold = find_best_threshold(model, X_valid_ready, y_valid, accuracy_score)
print(f"The best Accuracy score ({acc_value}) for this model is reached with a threshold of {best_threshold}")

In [None]:
scores = get_scores(
    f"{model_name} [w. threshold]",
    model=model,
    X_ref=X_test_ready,
    y_ref=y_test, 
    threshold=best_threshold,
    # y_pred=y_preds, 
    # y_pred_proba=y_preds_proba, 
    training_time=train_time, 
    # inference_time=inf_time, 
    dagslogger=dagslogger,
    register=True,
    show_roccurves=False,
)

#### Enregistrons les scores pour DagsHub

In [None]:
save_metrics_to_dagshub(dagslogger, scores, best_threshold, history)
dagslogger.save() 
dagslogger.close()

### Compare

In [None]:
from classification_utils import scores_df
scores_df

---
## 2.2 `EMB-A1` NN | Ngram=1 | multi_hot <a class="anchor" id="EMBEDDING_BOW_A1"></a> [⇪](#menu)

### Standardize & Tokenize

In [None]:
# Done before
print(text_vectorization1.get_vocabulary()[:20], "...")

### Prepare dataset

In [None]:
# Vectorize the three splits with the same unigram multi-hot vectorizer
# (the validation split used to go through text_vectorization0 by mistake).
X_train_ready = text_vectorization1(X_train)
X_valid_ready = text_vectorization1(X_valid)
X_test_ready = text_vectorization1(X_test)

In [None]:
X_train[:2]

In [None]:
X_train_ready[:2]

### Define model

In [None]:
model_name = "EmbedA1"
model, archi_desc = architecture001("adam", "binary_crossentropy", ['accuracy'], X_train_ready[:0].shape[1])
archi_desc = "No Embedding + Unigrams | " + archi_desc

In [None]:
model.summary()

#### Définissons un logger DagsHub pour enregistrer notre essai

In [None]:
dagslogger = DAGsHubLogger()

In [None]:
save_hyperparameters_to_dagshub(dagslogger, model_name, archi_desc, get_layers_for_dagshub(model), batch_size, data_work.shape[0])

### Train model

In [None]:
# Train the model and measure the wall-clock training time.
# NOTE(review): validation_data is the *test* split, so the checkpoint and
# early-stopping callbacks select the model on the data later used for the
# reported scores — consider validating on the held-out validation split.
t0 = time.perf_counter()
history = model.fit(
        x=X_train_ready, y=y_train, 
        validation_data=(X_test_ready, y_test), 
        epochs=50, 
        batch_size=batch_size, 
        callbacks=init_callbacks(f"{model_name}"),
        verbose=1,
)
train_time = time.perf_counter() - t0

### Evaluate

In [None]:
model = keras.models.load_model(f"models/{model_name}.keras")

In [None]:
model.evaluate(X_test_ready, y_test)

In [None]:
raw_txt = tf.convert_to_tensor(["I hate it", "I love it"])
raw_txt = text_vectorization1(raw_txt)
model.predict(raw_txt)

In [None]:
_ = get_scores(
    model_name,
    model=model,
    X_ref=X_test_ready,
    y_ref=y_test, 
    threshold=0.5,
    # y_pred=y_preds, 
    # y_pred_proba=y_preds_proba, 
    training_time=train_time, 
    # inference_time=inf_time, 
    dagslogger=dagslogger,
    register=True,
)

#### Trouvons le meilleur seuil de séparation pour les prédictions (sur la base de l'accuracy)

In [None]:
acc_value, best_threshold = find_best_threshold(model, X_valid_ready, y_valid, accuracy_score)
print(f"The best Accuracy score ({acc_value}) for this model is reached with a threshold of {best_threshold}")

In [None]:
scores = get_scores(
    f"{model_name} [w. threshold]",
    model=model,
    X_ref=X_test_ready,
    y_ref=y_test, 
    threshold=best_threshold,
    # y_pred=y_preds, 
    # y_pred_proba=y_preds_proba, 
    training_time=train_time, 
    # inference_time=inf_time, 
    dagslogger=dagslogger,
    register=True,
    show_roccurves=False,
)

#### Enregistrons les scores pour DagsHub

In [None]:
save_metrics_to_dagshub(dagslogger, scores, best_threshold, history)
dagslogger.save() 
dagslogger.close()

### Compare

In [None]:
from classification_utils import scores_df
scores_df

---
## 2.3 `EMB-A2` NN | Ngram=2 | multi_hot <a class="anchor" id="EMBEDDING_BOW_A2"></a> [⇪](#menu)

### Standardize & Tokenize

In [None]:
# Done before
print(text_vectorization2.get_vocabulary()[:20], "...")

### Prepare dataset

In [None]:
# Vectorize the three splits with the same bigram multi-hot vectorizer
# (the validation split used to go through text_vectorization0 by mistake).
X_train_ready = text_vectorization2(X_train)
X_valid_ready = text_vectorization2(X_valid)
X_test_ready = text_vectorization2(X_test)

In [None]:
X_train[:2]

In [None]:
X_train_ready[:2]

### Define model

In [None]:
model_name = "EmbedA2"
model, archi_desc = architecture001("adam", "binary_crossentropy", ['accuracy'], X_train_ready[:0].shape[1])
archi_desc = "No Embedding + Bigrams | " + archi_desc

In [None]:
model.summary()

#### Définissons un logger DagsHub pour enregistrer notre essai

In [None]:
dagslogger = DAGsHubLogger()

In [None]:
save_hyperparameters_to_dagshub(dagslogger, model_name, archi_desc, get_layers_for_dagshub(model), batch_size, data_work.shape[0])

### Train model

In [None]:
# Train the model and measure the wall-clock training time.
# NOTE(review): validation_data is the *test* split, so the checkpoint and
# early-stopping callbacks select the model on the data later used for the
# reported scores — consider validating on the held-out validation split.
t0 = time.perf_counter()
history = model.fit(
        x=X_train_ready, y=y_train, 
        validation_data=(X_test_ready, y_test), 
        epochs=50, 
        batch_size=batch_size, 
        callbacks=init_callbacks(f"{model_name}"),
        verbose=1,
)
train_time = time.perf_counter() - t0

### Evaluate

In [None]:
model = keras.models.load_model(f"models/{model_name}.keras")

In [None]:
model.evaluate(X_test_ready, y_test)

In [None]:
raw_txt = tf.convert_to_tensor(["I hate it", "I love it"])
raw_txt = text_vectorization2(raw_txt)
model.predict(raw_txt)

In [None]:
_ = get_scores(
    model_name,
    model=model,
    X_ref=X_test_ready,
    y_ref=y_test, 
    threshold=0.5,
    # y_pred=y_preds, 
    # y_pred_proba=y_preds_proba, 
    training_time=train_time, 
    # inference_time=inf_time, 
    dagslogger=dagslogger,
    register=True,
)

#### Trouvons le meilleur seuil de séparation pour les prédictions (sur la base de l'accuracy)

In [None]:
acc_value, best_threshold = find_best_threshold(model, X_valid_ready, y_valid, accuracy_score)
print(f"The best Accuracy score ({acc_value}) for this model is reached with a threshold of {best_threshold}")

In [None]:
scores = get_scores(
    f"{model_name} [w. threshold]",
    model=model,
    X_ref=X_test_ready,
    y_ref=y_test, 
    threshold=best_threshold,
    # y_pred=y_preds, 
    # y_pred_proba=y_preds_proba, 
    training_time=train_time, 
    # inference_time=inf_time, 
    dagslogger=dagslogger,
    register=True,
    show_roccurves=False,
)

#### Enregistrons les scores pour DagsHub

In [None]:
save_metrics_to_dagshub(dagslogger, scores, best_threshold, history)
dagslogger.save() 
dagslogger.close()

### Compare

In [None]:
from classification_utils import scores_df
scores_df

---
## 2.4 `EMB-A3` NN | Ngram=2 | count <a class="anchor" id="EMBEDDING_BOW_A3"></a> [⇪](#menu)

### Standardize & Tokenize

In [None]:
# Done before
print(text_vectorization3.get_vocabulary()[:20], "...")

### Prepare dataset

In [None]:
# Vectorize the three splits with the same bigram count vectorizer
# (the validation split used to go through text_vectorization0 by mistake).
X_train_ready = text_vectorization3(X_train)
X_valid_ready = text_vectorization3(X_valid)
X_test_ready = text_vectorization3(X_test)

In [None]:
X_train[:2]

In [None]:
X_train_ready[:2]

### Define model

In [None]:
model_name = "EmbedA3"
model, archi_desc = architecture001("adam", "binary_crossentropy", ['accuracy'], X_train_ready[:0].shape[1])
archi_desc = "No Embedding + Bigrams | " + archi_desc

In [None]:
model.summary()

#### Définissons un logger DagsHub pour enregistrer notre essai

In [None]:
dagslogger = DAGsHubLogger()

In [None]:
save_hyperparameters_to_dagshub(dagslogger, model_name, archi_desc, get_layers_for_dagshub(model), batch_size, data_work.shape[0])

### Train model

In [None]:
# Train the model and measure the wall-clock training time.
# NOTE(review): validation_data is the *test* split, so the checkpoint and
# early-stopping callbacks select the model on the data later used for the
# reported scores — consider validating on the held-out validation split.
t0 = time.perf_counter()
history = model.fit(
        x=X_train_ready, y=y_train, 
        validation_data=(X_test_ready, y_test), 
        epochs=50, 
        batch_size=batch_size, 
        callbacks=init_callbacks(f"{model_name}"),
        verbose=1,
)
train_time = time.perf_counter() - t0

### Evaluate

In [None]:
model = keras.models.load_model(f"models/{model_name}.keras")

In [None]:
model.evaluate(X_test_ready, y_test)

In [None]:
raw_txt = tf.convert_to_tensor(["I hate it", "I love it"])
raw_txt = text_vectorization3(raw_txt)
model.predict(raw_txt)

In [None]:
_ = get_scores(
    model_name,
    model=model,
    X_ref=X_test_ready,
    y_ref=y_test, 
    threshold=0.5,
    # y_pred=y_preds, 
    # y_pred_proba=y_preds_proba, 
    training_time=train_time, 
    # inference_time=inf_time, 
    dagslogger=dagslogger,
    register=True,
)

#### Trouvons le meilleur seuil de séparation pour les prédictions (sur la base de l'accuracy)

In [None]:
acc_value, best_threshold = find_best_threshold(model, X_valid_ready, y_valid, accuracy_score)
print(f"The best Accuracy score ({acc_value}) for this model is reached with a threshold of {best_threshold}")

In [None]:
scores = get_scores(
    f"{model_name} [w. threshold]",
    model=model,
    X_ref=X_test_ready,
    y_ref=y_test, 
    threshold=best_threshold,
    # y_pred=y_preds, 
    # y_pred_proba=y_preds_proba, 
    training_time=train_time, 
    # inference_time=inf_time, 
    dagslogger=dagslogger,
    register=True,
    show_roccurves=False,
)

#### Enregistrons les scores pour DagsHub

In [None]:
save_metrics_to_dagshub(dagslogger, scores, best_threshold, history)
dagslogger.save() 
dagslogger.close()

### Compare

In [None]:
from classification_utils import scores_df
scores_df

---
## 2.5 `EMB-A4` NN | Ngram=2 | tf-idf <a class="anchor" id="EMBEDDING_BOW_A4"></a> [⇪](#menu)

### Standardize & Tokenize

In [None]:
# Done before
print(text_vectorization4.get_vocabulary()[:20], "...")

### Prepare dataset

In [None]:
# Vectorize the three splits with the same bigram tf-idf vectorizer
# (the validation split used to go through text_vectorization0 by mistake).
X_train_ready = text_vectorization4(X_train)
X_valid_ready = text_vectorization4(X_valid)
X_test_ready = text_vectorization4(X_test)

In [None]:
X_train[:2]

In [None]:
X_train_ready[:2]

### Define model

In [None]:
model_name = "EmbedA4"
model, archi_desc = architecture001("adam", "binary_crossentropy", ['accuracy'], X_train_ready[:0].shape[1])
archi_desc = "No Embedding + Bigrams | " + archi_desc

In [None]:
model.summary()

#### Définissons un logger DagsHub pour enregistrer notre essai

In [None]:
dagslogger = DAGsHubLogger()

In [None]:
save_hyperparameters_to_dagshub(dagslogger, model_name, archi_desc, get_layers_for_dagshub(model), batch_size, data_work.shape[0])

### Train model

In [None]:
# Train the model and measure the wall-clock training time.
# NOTE(review): validation_data is the *test* split, so the checkpoint and
# early-stopping callbacks select the model on the data later used for the
# reported scores — consider validating on the held-out validation split.
t0 = time.perf_counter()
history = model.fit(
        x=X_train_ready, y=y_train, 
        validation_data=(X_test_ready, y_test), 
        epochs=50, 
        batch_size=batch_size, 
        callbacks=init_callbacks(f"{model_name}"),
        verbose=1,
)
train_time = time.perf_counter() - t0

### Evaluate

In [None]:
model = keras.models.load_model(f"models/{model_name}.keras")

In [None]:
model.evaluate(X_test_ready, y_test)

In [None]:
raw_txt = tf.convert_to_tensor(["I hate it", "I love it"])
raw_txt = text_vectorization4(raw_txt)
model.predict(raw_txt)

In [None]:
_ = get_scores(
    model_name,
    model=model,
    X_ref=X_test_ready,
    y_ref=y_test, 
    threshold=0.5,
    # y_pred=y_preds, 
    # y_pred_proba=y_preds_proba, 
    training_time=train_time, 
    # inference_time=inf_time, 
    dagslogger=dagslogger,
    register=True,
)

#### Trouvons le meilleur seuil de séparation pour les prédictions (sur la base de l'accuracy)

In [None]:
acc_value, best_threshold = find_best_threshold(model, X_valid_ready, y_valid, accuracy_score)
print(f"The best Accuracy score ({acc_value}) for this model is reached with a threshold of {best_threshold}")

In [None]:
scores = get_scores(
    f"{model_name} [w. threshold]",
    model=model,
    X_ref=X_test_ready,
    y_ref=y_test, 
    threshold=best_threshold,
    # y_pred=y_preds, 
    # y_pred_proba=y_preds_proba, 
    training_time=train_time, 
    # inference_time=inf_time, 
    dagslogger=dagslogger,
    register=True,
    show_roccurves=False,
)

#### Enregistrons les scores pour DagsHub

In [None]:
save_metrics_to_dagshub(dagslogger, scores, best_threshold, history)
dagslogger.save() 
dagslogger.close()

### Compare

In [None]:
from classification_utils import scores_df
scores_df

---
---
# 3. Recherche de l'`embedding` le plus adapté avec des modèles `Séquentiels` <a class="anchor" id="EMBEDDING_SEQ"></a> [⇪](#menu)

---
## 3.1 `EMB-B0` RNN | Ngram=2 | int | One-hot-embedding <a class="anchor" id="EMBEDDING_SEQ_B0"></a> [⇪](#menu)

### Standardize & Tokenize

In [None]:
# Done before
print(text_vectorizationBng2.get_vocabulary()[:20], "...")

### Prepare dataset

In [None]:
# Vectorize the three splits with the same bigram integer-sequence vectorizer
# (the validation split used to go through text_vectorization0 by mistake).
X_train_ready = text_vectorizationBng2(X_train)
X_valid_ready = text_vectorizationBng2(X_valid)
X_test_ready = text_vectorizationBng2(X_test)

In [None]:
X_train[:2]

In [None]:
X_train_ready[:2]

### Define model

In [None]:
model_name = "EmbedB0"
model, archi_desc = architecture002_oh("adam", "binary_crossentropy", ['accuracy'], max_tokens)
archi_desc = "One-Hot Embedding layer + Bigrams | " + archi_desc

In [None]:
model.summary()

#### Définissons un logger DagsHub pour enregistrer notre essai

In [None]:
dagslogger = DAGsHubLogger()

In [None]:
save_hyperparameters_to_dagshub(dagslogger, model_name, archi_desc, get_layers_for_dagshub(model), batch_size, data_work.shape[0])

### Train model

In [None]:
# Train the model and measure the wall-clock training time.
# NOTE(review): validation_data is the *test* split, so the checkpoint and
# early-stopping callbacks select the model on the data later used for the
# reported scores — consider validating on the held-out validation split.
t0 = time.perf_counter()
history = model.fit(
        x=X_train_ready, y=y_train, 
        validation_data=(X_test_ready, y_test), 
        epochs=50, 
        batch_size=batch_size, 
        callbacks=init_callbacks(f"{model_name}"),
        verbose=1,
)
train_time = time.perf_counter() - t0

### Evaluate

In [None]:
model = keras.models.load_model(f"models/{model_name}.keras")

In [None]:
model.evaluate(X_test_ready, y_test)

In [None]:
raw_txt = tf.convert_to_tensor(["I hate it", "I love it"])
raw_txt = text_vectorizationBng2(raw_txt)
model.predict(raw_txt)

In [None]:
_ = get_scores(
    model_name,
    model=model,
    X_ref=X_test_ready,
    y_ref=y_test, 
    threshold=0.5,
    # y_pred=y_preds, 
    # y_pred_proba=y_preds_proba, 
    training_time=train_time, 
    # inference_time=inf_time, 
    dagslogger=dagslogger,
    register=True,
)

#### Trouvons le meilleur seuil de séparation pour les prédictions (sur la base de l'accuracy)

In [None]:
acc_value, best_threshold = find_best_threshold(model, X_valid_ready, y_valid, accuracy_score)
print(f"The best Accuracy score ({acc_value}) for this model is reached with a threshold of {best_threshold}")

In [None]:
scores = get_scores(
    f"{model_name} [w. threshold]",
    model=model,
    X_ref=X_test_ready,
    y_ref=y_test, 
    threshold=best_threshold,
    # y_pred=y_preds, 
    # y_pred_proba=y_preds_proba, 
    training_time=train_time, 
    # inference_time=inf_time, 
    dagslogger=dagslogger,
    register=True,
    show_roccurves=False,
)

#### Enregistrons les scores pour DagsHub

In [None]:
save_metrics_to_dagshub(dagslogger, scores, best_threshold, history)
dagslogger.save() 
dagslogger.close()

### Compare

In [None]:
from classification_utils import scores_df
scores_df

---
## 3.2 `EMB-B1` RNN | Ngram=2 | int | Keras embedding <a class="anchor" id="EMBEDDING_SEQ_B1"></a> [⇪](#menu)

### Standardize & Tokenize

In [None]:
# Done before
print(text_vectorizationBng2.get_vocabulary()[:20], "...")

### Prepare dataset

In [None]:
# Vectorize every split with the SAME layer (bigram / int) so that the
# validation token ids come from the same vocabulary as the train/test ids
# the model is fitted and scored on.
# Fix: the validation split previously went through `text_vectorization0`.
X_train_ready = text_vectorizationBng2(X_train)
X_valid_ready = text_vectorizationBng2(X_valid)
X_test_ready = text_vectorizationBng2(X_test)

In [None]:
X_train[:2]

In [None]:
X_train_ready[:2]

### Embedding

In [None]:
# Embedding simple
embedding_layer = layers.Embedding(
    input_dim=max_tokens,
    output_dim=128, 
    input_length=X_train_ready.shape[1],
    # mask_zero=True,
    # trainable=True, # <== default is True
)

### Define model

In [None]:
model_name = "EmbedB1"
model, archi_desc = architecture002("adam", "binary_crossentropy", ['accuracy'], embedding_layer)
archi_desc = "Keras Embedding + Bigrams | " + archi_desc

In [None]:
model.summary()

#### Définissons un logger DagsHub pour enregistrer notre essai

In [None]:
dagslogger = DAGsHubLogger()

In [None]:
save_hyperparameters_to_dagshub(dagslogger, model_name, archi_desc, get_layers_for_dagshub(model), batch_size, data_work.shape[0])

### Train model

In [None]:
# Time the whole fit() call with a high-resolution clock.
t0 = time.perf_counter()
history = model.fit(
        x=X_train_ready, y=y_train, 
        # NOTE(review): the TEST split is passed as validation data. If the
        # callbacks built by init_callbacks() stop/checkpoint on val_* metrics
        # (not visible here), the test scores reported afterwards are
        # optimistically biased -- consider the validation split. TODO confirm.
        validation_data=(X_test_ready, y_test), 
        epochs=50, 
        batch_size=batch_size, 
        callbacks=init_callbacks(f"{model_name}"),
        verbose=1,
)
# Elapsed training time in seconds; passed to get_scores() as training_time.
train_time = time.perf_counter() - t0

### Evaluate

In [None]:
model = keras.models.load_model(f"models/{model_name}.keras")

In [None]:
model.evaluate(X_test_ready, y_test)

In [None]:
raw_txt = tf.convert_to_tensor(["I hate it", "I love it"])
raw_txt = text_vectorizationBng2(raw_txt)
model.predict(raw_txt)

In [None]:
_ = get_scores(
    model_name,
    model=model,
    X_ref=X_test_ready,
    y_ref=y_test, 
    threshold=0.5,
    # y_pred=y_preds, 
    # y_pred_proba=y_preds_proba, 
    training_time=train_time, 
    # inference_time=inf_time, 
    dagslogger=dagslogger,
    register=True,
)

#### Trouvons le meilleur seuil de séparation pour les prédictions (sur la base de l'Accuracy)

In [None]:
acc_value, best_threshold = find_best_threshold(model, X_valid_ready, y_valid, accuracy_score)
print(f"The best Accuracy score ({acc_value}) for this model is reached with a threshold of {best_threshold}")

In [None]:
scores = get_scores(
    f"{model_name} [w. threshold]",
    model=model,
    X_ref=X_test_ready,
    y_ref=y_test, 
    threshold=best_threshold,
    # y_pred=y_preds, 
    # y_pred_proba=y_preds_proba, 
    training_time=train_time, 
    # inference_time=inf_time, 
    dagslogger=dagslogger,
    register=True,
    show_roccurves=False,
)

#### Enregistrons les scores pour DagsHub

In [None]:
save_metrics_to_dagshub(dagslogger, scores, best_threshold, history)
dagslogger.save() 
dagslogger.close()

### Compare

In [None]:
from classification_utils import scores_df
scores_df

---
## 3.3 `EMB-B2` RNN | Ngram=2 | int | Keras embedding + MASK <a class="anchor" id="EMBEDDING_SEQ_B2"></a> [⇪](#menu)

### Standardize & Tokenize

In [None]:
# Done before
print(text_vectorizationBng2.get_vocabulary()[:20], "...")

### Prepare dataset

In [None]:
# Vectorize every split with the SAME layer (bigram / int) so that the
# validation token ids come from the same vocabulary as the train/test ids
# the model is fitted and scored on.
# Fix: the validation split previously went through `text_vectorization0`.
X_train_ready = text_vectorizationBng2(X_train)
X_valid_ready = text_vectorizationBng2(X_valid)
X_test_ready = text_vectorizationBng2(X_test)

In [None]:
X_train[:2]

In [None]:
X_train_ready[:2]

### Embedding

In [None]:
# Embedding avec masque
embedding_layer = layers.Embedding(
    input_dim=max_tokens,
    output_dim=128, 
    input_length=X_train_ready.shape[1],
    mask_zero=True,
    # trainable=True, # <== default is True
)

### Define model

In [None]:
model_name = "EmbedB2"
model, archi_desc = architecture002("adam", "binary_crossentropy", ['accuracy'], embedding_layer)
archi_desc = "Keras Embedding + MASK + Bigrams | " + archi_desc

In [None]:
model.summary()

#### Définissons un logger DagsHub pour enregistrer notre essai

In [None]:
dagslogger = DAGsHubLogger()

In [None]:
save_hyperparameters_to_dagshub(dagslogger, model_name, archi_desc, get_layers_for_dagshub(model), batch_size, data_work.shape[0])

### Train model

In [None]:
# Time the whole fit() call with a high-resolution clock.
t0 = time.perf_counter()
history = model.fit(
        x=X_train_ready, y=y_train, 
        # NOTE(review): the TEST split is passed as validation data. If the
        # callbacks built by init_callbacks() stop/checkpoint on val_* metrics
        # (not visible here), the test scores reported afterwards are
        # optimistically biased -- consider the validation split. TODO confirm.
        validation_data=(X_test_ready, y_test), 
        epochs=50, 
        batch_size=batch_size, 
        callbacks=init_callbacks(f"{model_name}"),
        verbose=1,
)
# Elapsed training time in seconds; passed to get_scores() as training_time.
train_time = time.perf_counter() - t0

### Evaluate

In [None]:
model = keras.models.load_model(f"models/{model_name}.keras")

In [None]:
model.evaluate(X_test_ready, y_test)

In [None]:
raw_txt = tf.convert_to_tensor(["I hate it", "I love it"])
raw_txt = text_vectorizationBng2(raw_txt)
model.predict(raw_txt)

In [None]:
_ = get_scores(
    model_name,
    model=model,
    X_ref=X_test_ready,
    y_ref=y_test, 
    threshold=0.5,
    # y_pred=y_preds, 
    # y_pred_proba=y_preds_proba, 
    training_time=train_time, 
    # inference_time=inf_time, 
    dagslogger=dagslogger,
    register=True,
)

#### Trouvons le meilleur seuil de séparation pour les prédictions (sur la base de l'Accuracy)

In [None]:
acc_value, best_threshold = find_best_threshold(model, X_valid_ready, y_valid, accuracy_score)
print(f"The best Accuracy score ({acc_value}) for this model is reached with a threshold of {best_threshold}")

In [None]:
scores = get_scores(
    f"{model_name} [w. threshold]",
    model=model,
    X_ref=X_test_ready,
    y_ref=y_test, 
    threshold=best_threshold,
    # y_pred=y_preds, 
    # y_pred_proba=y_preds_proba, 
    training_time=train_time, 
    # inference_time=inf_time, 
    dagslogger=dagslogger,
    register=True,
    show_roccurves=False,
)

#### Enregistrons les scores pour DagsHub

In [None]:
save_metrics_to_dagshub(dagslogger, scores, best_threshold, history)
dagslogger.save() 
dagslogger.close()

### Compare

In [None]:
from classification_utils import scores_df
scores_df

---
## 3.4 `EMB-B3` RNN | Ngram=2 | int | Glove 100d + MASK + NOT trainable <a class="anchor" id="EMBEDDING_SEQ_B3"></a> [⇪](#menu)

### Standardize & Tokenize

In [None]:
# Done before
print(text_vectorizationBng2.get_vocabulary()[:20], "...")

### Prepare dataset

In [None]:
# Vectorize every split with the SAME layer (bigram / int) so that the
# validation token ids come from the same vocabulary as the train/test ids
# the model is fitted and scored on.
# Fix: the validation split previously went through `text_vectorization0`.
X_train_ready = text_vectorizationBng2(X_train)
X_valid_ready = text_vectorizationBng2(X_valid)
X_test_ready = text_vectorizationBng2(X_test)

In [None]:
X_train[:2]

In [None]:
X_train_ready[:2]

### Embedding

In [None]:
embeddings_index = load_trained_glove("glove.6B.100d")

In [None]:
embedding_dim = 100
vocabulary = text_vectorizationBng2.get_vocabulary()
embedding_matrix = convert_embedding_matrix(vocabulary, embeddings_index, max_tokens, embedding_dim)
embedding_matrix

In [None]:
# Embedding avec masque
embedding_layer = layers.Embedding(
    input_dim=max_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    # input_length=max_length,
    mask_zero=True,
    trainable=False, # <== default is True
)

### Define model

In [None]:
model_name = "EmbedB3"
model, archi_desc = architecture002("adam", "binary_crossentropy", ['accuracy'], embedding_layer)
archi_desc = "GloVe 100d + MASK + NT + Bigrams| " + archi_desc

In [None]:
model.summary()

#### Définissons un logger DagsHub pour enregistrer notre essai

In [None]:
dagslogger = DAGsHubLogger()

In [None]:
save_hyperparameters_to_dagshub(dagslogger, model_name, archi_desc, get_layers_for_dagshub(model), batch_size, data_work.shape[0])

### Train model

In [None]:
# Time the whole fit() call with a high-resolution clock.
t0 = time.perf_counter()
history = model.fit(
        x=X_train_ready, y=y_train, 
        # NOTE(review): the TEST split is passed as validation data. If the
        # callbacks built by init_callbacks() stop/checkpoint on val_* metrics
        # (not visible here), the test scores reported afterwards are
        # optimistically biased -- consider the validation split. TODO confirm.
        validation_data=(X_test_ready, y_test), 
        epochs=50, 
        batch_size=batch_size, 
        callbacks=init_callbacks(f"{model_name}"),
        verbose=1,
)
# Elapsed training time in seconds; passed to get_scores() as training_time.
train_time = time.perf_counter() - t0

### Evaluate

In [None]:
model = keras.models.load_model(f"models/{model_name}.keras")

In [None]:
model.evaluate(X_test_ready, y_test)

In [None]:
raw_txt = tf.convert_to_tensor(["I hate it", "I love it"])
raw_txt = text_vectorizationBng2(raw_txt)
model.predict(raw_txt)

In [None]:
_ = get_scores(
    model_name,
    model=model,
    X_ref=X_test_ready,
    y_ref=y_test, 
    threshold=0.5,
    # y_pred=y_preds, 
    # y_pred_proba=y_preds_proba, 
    training_time=train_time, 
    # inference_time=inf_time, 
    dagslogger=dagslogger,
    register=True,
)

#### Trouvons le meilleur seuil de séparation pour les prédictions (sur la base de l'Accuracy)

In [None]:
acc_value, best_threshold = find_best_threshold(model, X_valid_ready, y_valid, accuracy_score)
print(f"The best Accuracy score ({acc_value}) for this model is reached with a threshold of {best_threshold}")

In [None]:
scores = get_scores(
    f"{model_name} [w. threshold]",
    model=model,
    X_ref=X_test_ready,
    y_ref=y_test, 
    threshold=best_threshold,
    # y_pred=y_preds, 
    # y_pred_proba=y_preds_proba, 
    training_time=train_time, 
    # inference_time=inf_time, 
    dagslogger=dagslogger,
    register=True,
    show_roccurves=False,
)

#### Enregistrons les scores pour DagsHub

In [None]:
save_metrics_to_dagshub(dagslogger, scores, best_threshold, history)
dagslogger.save() 
dagslogger.close()

### Compare

In [None]:
from classification_utils import scores_df
scores_df

---
## 3.5 `EMB-B4` RNN | Ngram=1 | int | Glove 100d + MASK + NOT trainable <a class="anchor" id="EMBEDDING_SEQ_B4"></a> [⇪](#menu)

### Standardize & Tokenize

In [None]:
# Done before
print(text_vectorizationBng1.get_vocabulary()[:20], "...")

### Prepare dataset

In [None]:
# Vectorize every split with the SAME layer (unigram / int) so that the
# validation token ids come from the same vocabulary as the train/test ids
# the model is fitted and scored on.
# Fix: the validation split previously went through `text_vectorization0`.
X_train_ready = text_vectorizationBng1(X_train)
X_valid_ready = text_vectorizationBng1(X_valid)
X_test_ready = text_vectorizationBng1(X_test)

In [None]:
X_train[:2]

In [None]:
X_train_ready[:2]

### Embedding

In [None]:
embeddings_index = load_trained_glove("glove.6B.100d")

In [None]:
embedding_dim = 100
vocabulary = text_vectorizationBng1.get_vocabulary()
embedding_matrix = convert_embedding_matrix(vocabulary, embeddings_index, max_tokens, embedding_dim)
embedding_matrix

In [None]:
# Embedding avec masque
embedding_layer = layers.Embedding(
    input_dim=max_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    # input_length=max_length,
    mask_zero=True,
    trainable=False, # <== default is True
)

### Define model

In [None]:
model_name = "EmbedB4"
model, archi_desc = architecture002("adam", "binary_crossentropy", ['accuracy'], embedding_layer)
archi_desc = "GloVe 100d + MASK + NT + Unigrams| " + archi_desc

In [None]:
model.summary()

#### Définissons un logger DagsHub pour enregistrer notre essai

In [None]:
dagslogger = DAGsHubLogger()

In [None]:
save_hyperparameters_to_dagshub(dagslogger, model_name, archi_desc, get_layers_for_dagshub(model), batch_size, data_work.shape[0])

### Train model

In [None]:
# Time the whole fit() call with a high-resolution clock.
t0 = time.perf_counter()
history = model.fit(
        x=X_train_ready, y=y_train, 
        # NOTE(review): the TEST split is passed as validation data. If the
        # callbacks built by init_callbacks() stop/checkpoint on val_* metrics
        # (not visible here), the test scores reported afterwards are
        # optimistically biased -- consider the validation split. TODO confirm.
        validation_data=(X_test_ready, y_test), 
        epochs=50, 
        batch_size=batch_size, 
        callbacks=init_callbacks(f"{model_name}"),
        verbose=1,
)
# Elapsed training time in seconds; passed to get_scores() as training_time.
train_time = time.perf_counter() - t0

### Evaluate

In [None]:
model = keras.models.load_model(f"models/{model_name}.keras")

In [None]:
model.evaluate(X_test_ready, y_test)

In [None]:
raw_txt = tf.convert_to_tensor(["I hate it", "I love it"])
raw_txt = text_vectorizationBng1(raw_txt)
model.predict(raw_txt)

In [None]:
_ = get_scores(
    model_name,
    model=model,
    X_ref=X_test_ready,
    y_ref=y_test, 
    threshold=0.5,
    # y_pred=y_preds, 
    # y_pred_proba=y_preds_proba, 
    training_time=train_time, 
    # inference_time=inf_time, 
    dagslogger=dagslogger,
    register=True,
)

#### Trouvons le meilleur seuil de séparation pour les prédictions (sur la base de l'Accuracy)

In [None]:
acc_value, best_threshold = find_best_threshold(model, X_valid_ready, y_valid, accuracy_score)
print(f"The best Accuracy score ({acc_value}) for this model is reached with a threshold of {best_threshold}")

In [None]:
scores = get_scores(
    f"{model_name} [w. threshold]",
    model=model,
    X_ref=X_test_ready,
    y_ref=y_test, 
    threshold=best_threshold,
    # y_pred=y_preds, 
    # y_pred_proba=y_preds_proba, 
    training_time=train_time, 
    # inference_time=inf_time, 
    dagslogger=dagslogger,
    register=True,
    show_roccurves=False,
)

#### Enregistrons les scores pour DagsHub

In [None]:
save_metrics_to_dagshub(dagslogger, scores, best_threshold, history)
dagslogger.save() 
dagslogger.close()

### Compare

In [None]:
from classification_utils import scores_df
scores_df

---
## 3.6 `EMB-B5` RNN | Ngram=1 | int | GloveTwitter 25d + MASK + NOT trainable <a class="anchor" id="EMBEDDING_SEQ_B5"></a> [⇪](#menu)

### Standardize & Tokenize

In [None]:
# Done before
print(text_vectorizationBng1.get_vocabulary()[:20], "...")

### Prepare dataset

In [None]:
# Vectorize every split with the SAME layer (unigram / int) so that the
# validation token ids come from the same vocabulary as the train/test ids
# the model is fitted and scored on.
# Fix: the validation split previously went through `text_vectorization0`.
X_train_ready = text_vectorizationBng1(X_train)
X_valid_ready = text_vectorizationBng1(X_valid)
X_test_ready = text_vectorizationBng1(X_test)

In [None]:
X_train[:2]

In [None]:
X_train_ready[:2]

### Embedding

In [None]:
embeddings_index = load_gensim_embedding("glove-twitter-25")

In [None]:
embedding_dim = 25
vocabulary = text_vectorizationBng1.get_vocabulary()
embedding_matrix = convert_embedding_matrix(vocabulary, embeddings_index, max_tokens, embedding_dim)
embedding_matrix

In [None]:
# Embedding avec masque
embedding_layer = layers.Embedding(
    input_dim=max_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    # input_length=max_length,
    mask_zero=True,
    trainable=False, # <== default is True
)

### Define model

In [None]:
model_name = "EmbedB5"
model, archi_desc = architecture002("adam", "binary_crossentropy", ['accuracy'], embedding_layer)
archi_desc = "GloVeTwitter 25d + MASK + NT + Unigrams| " + archi_desc

In [None]:
model.summary()

#### Définissons un logger DagsHub pour enregistrer notre essai

In [None]:
dagslogger = DAGsHubLogger()

In [None]:
save_hyperparameters_to_dagshub(dagslogger, model_name, archi_desc, get_layers_for_dagshub(model), batch_size, data_work.shape[0])

### Train model

In [None]:
# Time the whole fit() call with a high-resolution clock.
t0 = time.perf_counter()
history = model.fit(
        x=X_train_ready, y=y_train, 
        # NOTE(review): the TEST split is passed as validation data. If the
        # callbacks built by init_callbacks() stop/checkpoint on val_* metrics
        # (not visible here), the test scores reported afterwards are
        # optimistically biased -- consider the validation split. TODO confirm.
        validation_data=(X_test_ready, y_test), 
        epochs=50, 
        batch_size=batch_size, 
        callbacks=init_callbacks(f"{model_name}"),
        verbose=1,
)
# Elapsed training time in seconds; passed to get_scores() as training_time.
train_time = time.perf_counter() - t0

### Evaluate

In [None]:
model = keras.models.load_model(f"models/{model_name}.keras")

In [None]:
model.evaluate(X_test_ready, y_test)

In [None]:
raw_txt = tf.convert_to_tensor(["I hate it", "I love it"])
raw_txt = text_vectorizationBng1(raw_txt)
model.predict(raw_txt)

In [None]:
_ = get_scores(
    model_name,
    model=model,
    X_ref=X_test_ready,
    y_ref=y_test, 
    threshold=0.5,
    # y_pred=y_preds, 
    # y_pred_proba=y_preds_proba, 
    training_time=train_time, 
    # inference_time=inf_time, 
    dagslogger=dagslogger,
    register=True,
)

#### Trouvons le meilleur seuil de séparation pour les prédictions (sur la base de l'Accuracy)

In [None]:
acc_value, best_threshold = find_best_threshold(model, X_valid_ready, y_valid, accuracy_score)
print(f"The best Accuracy score ({acc_value}) for this model is reached with a threshold of {best_threshold}")

In [None]:
scores = get_scores(
    f"{model_name} [w. threshold]",
    model=model,
    X_ref=X_test_ready,
    y_ref=y_test, 
    threshold=best_threshold,
    # y_pred=y_preds, 
    # y_pred_proba=y_preds_proba, 
    training_time=train_time, 
    # inference_time=inf_time, 
    dagslogger=dagslogger,
    register=True,
    show_roccurves=False,
)

#### Enregistrons les scores pour DagsHub

In [None]:
save_metrics_to_dagshub(dagslogger, scores, best_threshold, history)
dagslogger.save() 
dagslogger.close()

### Compare

In [None]:
from classification_utils import scores_df
scores_df

---
## 3.7 `EMB-B6` RNN | Ngram=1 | int | GloveTwitter 100d + MASK + NOT trainable <a class="anchor" id="EMBEDDING_SEQ_B6"></a> [⇪](#menu)

### Standardize & Tokenize

In [None]:
# Done before
print(text_vectorizationBng1.get_vocabulary()[:20], "...")

### Prepare dataset

In [None]:
# Vectorize every split with the SAME layer (unigram / int) so that the
# validation token ids come from the same vocabulary as the train/test ids
# the model is fitted and scored on.
# Fix: the validation split previously went through `text_vectorization0`.
X_train_ready = text_vectorizationBng1(X_train)
X_valid_ready = text_vectorizationBng1(X_valid)
X_test_ready = text_vectorizationBng1(X_test)

In [None]:
X_train[:2]

In [None]:
X_train_ready[:2]

### Embedding

In [None]:
embeddings_index = load_gensim_embedding("glove-twitter-100")

In [None]:
embedding_dim = 100
vocabulary = text_vectorizationBng1.get_vocabulary()
embedding_matrix = convert_embedding_matrix(vocabulary, embeddings_index, max_tokens, embedding_dim)
embedding_matrix

In [None]:
# Embedding avec masque
embedding_layer = layers.Embedding(
    input_dim=max_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    # input_length=max_length,
    mask_zero=True,
    trainable=False, # <== default is True
)

### Define model

In [None]:
model_name = "EmbedB6"
model, archi_desc = architecture002("adam", "binary_crossentropy", ['accuracy'], embedding_layer)
archi_desc = "GloVeTwitter 100d + MASK + NT + Unigrams| " + archi_desc

In [None]:
model.summary()

#### Définissons un logger DagsHub pour enregistrer notre essai

In [None]:
dagslogger = DAGsHubLogger()

In [None]:
save_hyperparameters_to_dagshub(dagslogger, model_name, archi_desc, get_layers_for_dagshub(model), batch_size, data_work.shape[0])

### Train model

In [None]:
# Time the whole fit() call with a high-resolution clock.
t0 = time.perf_counter()
history = model.fit(
        x=X_train_ready, y=y_train, 
        # NOTE(review): the TEST split is passed as validation data. If the
        # callbacks built by init_callbacks() stop/checkpoint on val_* metrics
        # (not visible here), the test scores reported afterwards are
        # optimistically biased -- consider the validation split. TODO confirm.
        validation_data=(X_test_ready, y_test), 
        epochs=50, 
        batch_size=batch_size, 
        callbacks=init_callbacks(f"{model_name}"),
        verbose=1,
)
# Elapsed training time in seconds; passed to get_scores() as training_time.
train_time = time.perf_counter() - t0

### Evaluate

In [None]:
model = keras.models.load_model(f"models/{model_name}.keras")

In [None]:
model.evaluate(X_test_ready, y_test)

In [None]:
raw_txt = tf.convert_to_tensor(["I hate it", "I love it"])
raw_txt = text_vectorizationBng1(raw_txt)
model.predict(raw_txt)

In [None]:
_ = get_scores(
    model_name,
    model=model,
    X_ref=X_test_ready,
    y_ref=y_test, 
    threshold=0.5,
    # y_pred=y_preds, 
    # y_pred_proba=y_preds_proba, 
    training_time=train_time, 
    # inference_time=inf_time, 
    dagslogger=dagslogger,
    register=True,
)

#### Trouvons le meilleur seuil de séparation pour les prédictions (sur la base de l'Accuracy)

In [None]:
acc_value, best_threshold = find_best_threshold(model, X_valid_ready, y_valid, accuracy_score)
print(f"The best Accuracy score ({acc_value}) for this model is reached with a threshold of {best_threshold}")

In [None]:
scores = get_scores(
    f"{model_name} [w. threshold]",
    model=model,
    X_ref=X_test_ready,
    y_ref=y_test, 
    threshold=best_threshold,
    # y_pred=y_preds, 
    # y_pred_proba=y_preds_proba, 
    training_time=train_time, 
    # inference_time=inf_time, 
    dagslogger=dagslogger,
    register=True,
    show_roccurves=False,
)

#### Enregistrons les scores pour DagsHub

In [None]:
save_metrics_to_dagshub(dagslogger, scores, best_threshold, history)
dagslogger.save() 
dagslogger.close()

### Compare

In [None]:
from classification_utils import scores_df
scores_df

---
## 3.8 `EMB-B7` RNN | Ngram=1 | int | GloveTwitter 200d + MASK + NOT trainable <a class="anchor" id="EMBEDDING_SEQ_B7"></a> [⇪](#menu)

### Standardize & Tokenize

In [None]:
# Done before
print(text_vectorizationBng1.get_vocabulary()[:20], "...")

### Prepare dataset

In [None]:
# Vectorize every split with the SAME layer (unigram / int) so that the
# validation token ids come from the same vocabulary as the train/test ids
# the model is fitted and scored on.
# Fix: the validation split previously went through `text_vectorization0`.
X_train_ready = text_vectorizationBng1(X_train)
X_valid_ready = text_vectorizationBng1(X_valid)
X_test_ready = text_vectorizationBng1(X_test)

In [None]:
X_train[:2]

In [None]:
X_train_ready[:2]

### Embedding

In [None]:
embeddings_index = load_gensim_embedding("glove-twitter-200")

In [None]:
embedding_dim = 200
vocabulary = text_vectorizationBng1.get_vocabulary()
embedding_matrix = convert_embedding_matrix(vocabulary, embeddings_index, max_tokens, embedding_dim)
embedding_matrix

In [None]:
# Embedding avec masque
embedding_layer = layers.Embedding(
    input_dim=max_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    # input_length=max_length,
    mask_zero=True,
    trainable=False, # <== default is True
)

### Define model

In [None]:
model_name = "EmbedB7"
model, archi_desc = architecture002("adam", "binary_crossentropy", ['accuracy'], embedding_layer)
archi_desc = "GloVeTwitter 200d + MASK + NT + Unigrams| " + archi_desc

In [None]:
model.summary()

#### Définissons un logger DagsHub pour enregistrer notre essai

In [None]:
dagslogger = DAGsHubLogger()

In [None]:
save_hyperparameters_to_dagshub(dagslogger, model_name, archi_desc, get_layers_for_dagshub(model), batch_size, data_work.shape[0])

### Train model

In [None]:
# Time the whole fit() call with a high-resolution clock.
t0 = time.perf_counter()
history = model.fit(
        x=X_train_ready, y=y_train, 
        # NOTE(review): the TEST split is passed as validation data. If the
        # callbacks built by init_callbacks() stop/checkpoint on val_* metrics
        # (not visible here), the test scores reported afterwards are
        # optimistically biased -- consider the validation split. TODO confirm.
        validation_data=(X_test_ready, y_test), 
        epochs=50, 
        batch_size=batch_size, 
        callbacks=init_callbacks(f"{model_name}"),
        verbose=1,
)
# Elapsed training time in seconds; passed to get_scores() as training_time.
train_time = time.perf_counter() - t0

### Evaluate

In [None]:
model = keras.models.load_model(f"models/{model_name}.keras")

In [None]:
model.evaluate(X_test_ready, y_test)

In [None]:
raw_txt = tf.convert_to_tensor(["I hate it", "I love it"])
raw_txt = text_vectorizationBng1(raw_txt)
model.predict(raw_txt)

In [None]:
_ = get_scores(
    model_name,
    model=model,
    X_ref=X_test_ready,
    y_ref=y_test, 
    threshold=0.5,
    # y_pred=y_preds, 
    # y_pred_proba=y_preds_proba, 
    training_time=train_time, 
    # inference_time=inf_time, 
    dagslogger=dagslogger,
    register=True,
)

#### Trouvons le meilleur seuil de séparation pour les prédictions (sur la base de l'Accuracy)

In [None]:
acc_value, best_threshold = find_best_threshold(model, X_valid_ready, y_valid, accuracy_score)
print(f"The best Accuracy score ({acc_value}) for this model is reached with a threshold of {best_threshold}")

In [None]:
scores = get_scores(
    f"{model_name} [w. threshold]",
    model=model,
    X_ref=X_test_ready,
    y_ref=y_test, 
    threshold=best_threshold,
    # y_pred=y_preds, 
    # y_pred_proba=y_preds_proba, 
    training_time=train_time, 
    # inference_time=inf_time, 
    dagslogger=dagslogger,
    register=True,
    show_roccurves=False,
)

#### Enregistrons les scores pour DagsHub

In [None]:
save_metrics_to_dagshub(dagslogger, scores, best_threshold, history)
dagslogger.save() 
dagslogger.close()

### Compare

In [None]:
from classification_utils import scores_df
scores_df

---
## 3.9 `EMB-B8` RNN | Ngram=1 | int | FastText300 + MASK + NOT trainable <a class="anchor" id="EMBEDDING_SEQ_B8"></a> [⇪](#menu)

### Standardize & Tokenize

In [None]:
# Done before
print(text_vectorizationBng1.get_vocabulary()[:20], "...")

### Prepare dataset

In [None]:
# Vectorize every split with the SAME layer (unigram / int) so that the
# validation token ids come from the same vocabulary as the train/test ids
# the model is fitted and scored on.
# Fix: the validation split previously went through `text_vectorization0`.
X_train_ready = text_vectorizationBng1(X_train)
X_valid_ready = text_vectorizationBng1(X_valid)
X_test_ready = text_vectorizationBng1(X_test)

In [None]:
X_train[:2]

In [None]:
X_train_ready[:2]

### Embedding

In [None]:
embeddings_index = load_gensim_embedding("fasttext-wiki-news-subwords-300")

In [None]:
embedding_dim = 300
vocabulary = text_vectorizationBng1.get_vocabulary()
embedding_matrix = convert_embedding_matrix(vocabulary, embeddings_index, max_tokens, embedding_dim)
embedding_matrix

In [None]:
# Embedding avec masque
embedding_layer = layers.Embedding(
    input_dim=max_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    # input_length=max_length,
    mask_zero=True,
    trainable=False, # <== default is True
)

### Define model

In [None]:
model_name = "EmbedB8"
model, archi_desc = architecture002("adam", "binary_crossentropy", ['accuracy'], embedding_layer)
archi_desc = "FastText 300d + MASK + NT + Unigrams| " + archi_desc

In [None]:
model.summary()

#### Définissons un logger DagsHub pour enregistrer notre essai

In [None]:
dagslogger = DAGsHubLogger()

In [None]:
save_hyperparameters_to_dagshub(dagslogger, model_name, archi_desc, get_layers_for_dagshub(model), batch_size, data_work.shape[0])

### Train model

In [None]:
# Time the whole fit() call with a high-resolution clock.
t0 = time.perf_counter()
history = model.fit(
        x=X_train_ready, y=y_train, 
        # NOTE(review): the TEST split is passed as validation data. If the
        # callbacks built by init_callbacks() stop/checkpoint on val_* metrics
        # (not visible here), the test scores reported afterwards are
        # optimistically biased -- consider the validation split. TODO confirm.
        validation_data=(X_test_ready, y_test), 
        epochs=50, 
        batch_size=batch_size, 
        callbacks=init_callbacks(f"{model_name}"),
        verbose=1,
)
# Elapsed training time in seconds; passed to get_scores() as training_time.
train_time = time.perf_counter() - t0

### Evaluate

In [None]:
model = keras.models.load_model(f"models/{model_name}.keras")

In [None]:
model.evaluate(X_test_ready, y_test)

In [None]:
raw_txt = tf.convert_to_tensor(["I hate it", "I love it"])
raw_txt = text_vectorizationBng1(raw_txt)
model.predict(raw_txt)

In [None]:
_ = get_scores(
    model_name,
    model=model,
    X_ref=X_test_ready,
    y_ref=y_test, 
    threshold=0.5,
    # y_pred=y_preds, 
    # y_pred_proba=y_preds_proba, 
    training_time=train_time, 
    # inference_time=inf_time, 
    dagslogger=dagslogger,
    register=True,
)

#### Trouvons le meilleur seuil de séparation pour les prédictions (sur la base de l'Accuracy)

In [None]:
acc_value, best_threshold = find_best_threshold(model, X_valid_ready, y_valid, accuracy_score)
print(f"The best Accuracy score ({acc_value}) for this model is reached with a threshold of {best_threshold}")

In [None]:
scores = get_scores(
    f"{model_name} [w. threshold]",
    model=model,
    X_ref=X_test_ready,
    y_ref=y_test, 
    threshold=best_threshold,
    # y_pred=y_preds, 
    # y_pred_proba=y_preds_proba, 
    training_time=train_time, 
    # inference_time=inf_time, 
    dagslogger=dagslogger,
    register=True,
    show_roccurves=False,
)

#### Enregistrons les scores pour DagsHub

In [None]:
save_metrics_to_dagshub(dagslogger, scores, best_threshold, history)
dagslogger.save() 
dagslogger.close()

### Compare

In [None]:
from classification_utils import scores_df
scores_df

---
## 3.10 `EMB-B9` RNN | Ngram=1 | int | Word2Vec300_GoogleNews + MASK + NOT trainable <a class="anchor" id="EMBEDDING_SEQ_B9"></a> [⇪](#menu)

### Standardize & Tokenize

In [None]:
# Done before
print(text_vectorizationBng1.get_vocabulary()[:20], "...")

### Prepare dataset

In [None]:
# Vectorize every split with the SAME layer (unigram / int) so that the
# validation token ids come from the same vocabulary as the train/test ids
# the model is fitted and scored on.
# Fix: the validation split previously went through `text_vectorization0`.
X_train_ready = text_vectorizationBng1(X_train)
X_valid_ready = text_vectorizationBng1(X_valid)
X_test_ready = text_vectorizationBng1(X_test)

In [None]:
X_train[:2]

In [None]:
X_train_ready[:2]

### Embedding

In [None]:
embeddings_index = load_gensim_embedding("word2vec-google-news-300", binary=True)

In [None]:
embedding_dim = 300
vocabulary = text_vectorizationBng1.get_vocabulary()
embedding_matrix = convert_embedding_matrix(vocabulary, embeddings_index, max_tokens, embedding_dim)
embedding_matrix

In [None]:
# Embedding avec masque
embedding_layer = layers.Embedding(
    input_dim=max_tokens,
    output_dim=embedding_dim,
    embeddings_initializer=keras.initializers.Constant(embedding_matrix),
    # input_length=max_length,
    mask_zero=True,
    trainable=False, # <== default is True
)

### Define model

In [None]:
model_name = "EmbedB9"
model, archi_desc = architecture002("adam", "binary_crossentropy", ['accuracy'], embedding_layer)
archi_desc = "Word2Vec 300d google + MASK + NT + Unigrams| " + archi_desc

In [None]:
model.summary()

#### Définissons un logger DagsHub pour enregistrer notre essai

In [None]:
dagslogger = DAGsHubLogger()

In [None]:
save_hyperparameters_to_dagshub(dagslogger, model_name, archi_desc, get_layers_for_dagshub(model), batch_size, data_work.shape[0])

### Train model

In [None]:
# Time the whole fit() call with a high-resolution clock.
t0 = time.perf_counter()
history = model.fit(
        x=X_train_ready, y=y_train, 
        # NOTE(review): the TEST split is passed as validation data. If the
        # callbacks built by init_callbacks() stop/checkpoint on val_* metrics
        # (not visible here), the test scores reported afterwards are
        # optimistically biased -- consider the validation split. TODO confirm.
        validation_data=(X_test_ready, y_test), 
        epochs=50, 
        batch_size=batch_size, 
        callbacks=init_callbacks(f"{model_name}"),
        verbose=1,
)
# Elapsed training time in seconds; passed to get_scores() as training_time.
train_time = time.perf_counter() - t0

### Evaluate

In [None]:
model = keras.models.load_model(f"models/{model_name}.keras")

In [None]:
model.evaluate(X_test_ready, y_test)

In [None]:
raw_txt = tf.convert_to_tensor(["I hate it", "I love it"])
raw_txt = text_vectorizationBng1(raw_txt)
model.predict(raw_txt)

In [None]:
_ = get_scores(
    model_name,
    model=model,
    X_ref=X_test_ready,
    y_ref=y_test, 
    threshold=0.5,
    # y_pred=y_preds, 
    # y_pred_proba=y_preds_proba, 
    training_time=train_time, 
    # inference_time=inf_time, 
    dagslogger=dagslogger,
    register=True,
)

#### Trouvons le meilleur seuil de séparation pour les prédictions (sur la base de l'Accuracy)

In [None]:
acc_value, best_threshold = find_best_threshold(model, X_valid_ready, y_valid, accuracy_score)
print(f"The best Accuracy score ({acc_value}) for this model is reached with a threshold of {best_threshold}")

In [None]:
scores = get_scores(
    f"{model_name} [w. threshold]",
    model=model,
    X_ref=X_test_ready,
    y_ref=y_test, 
    threshold=best_threshold,
    # y_pred=y_preds, 
    # y_pred_proba=y_preds_proba, 
    training_time=train_time, 
    # inference_time=inf_time, 
    dagslogger=dagslogger,
    register=True,
    show_roccurves=False,
)

#### Enregistrons les scores pour DagsHub

In [None]:
save_metrics_to_dagshub(dagslogger, scores, best_threshold, history)
dagslogger.save() 
dagslogger.close()

### Compare

In [None]:
from classification_utils import scores_df
scores_df

---
---
# 4. Modèles `classiques` (pour comparaison) <a class="anchor" id="NO_EMBEDDING"></a> [⇪](#menu)

---
## 4.1 `EMB-C0` LogisticRegression | RAW_lemmas_not_filtered <a class="anchor" id="NO_EMBEDDING_C0"></a> [⇪](#menu)

In [None]:
from sklearn.model_selection import StratifiedKFold
# 5-fold stratified, shuffled cross-validation splitter seeded with `random_seed`;
# it is handed to fit_model() as `cv` further down.
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_seed)

### Préparons le jeu de données en `Train` et `Test` sets

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build the TF-IDF features. The vectorizer is fitted on the training texts
# only; the same fitted vectorizer then transforms the test texts.
# min_df / max_df bound the document frequency of the terms that are kept.
# Note: this rebinds X_train_ready / X_test_ready used by the earlier sections.
tfidf_vectorizer_train = TfidfVectorizer(use_idf=True, min_df=5, max_df=0.5) #, max_features=2000)
X_train_ready = tfidf_vectorizer_train.fit_transform(X_train)
X_test_ready = tfidf_vectorizer_train.transform(X_test)

In [None]:
print(f"TF-IDF features: {tfidf_vectorizer_train.get_feature_names_out()}")
print(f"Sets shapes >> X_train_ready: {X_train_ready.shape}, X_test_ready: {X_test_ready.shape}")

### Appliquons la Logistic Regression avec ce jeu de données

In [None]:
from sklearn.linear_model import LogisticRegression
from scipy.stats import uniform, loguniform

In [None]:
max_lr_iter = 3000

# One search space per solver. Every space pins the seed and the iteration
# cap and samples C log-uniformly in [1e-5, 1e2]; only the solver, the
# penalties it is paired with and (for saga/elasticnet) the l1_ratio
# distribution differ. The 'none' penalty is not part of the search.
param_grid = [
    {
        'random_state': [random_seed],
        'max_iter': [max_lr_iter],
        'solver': [solver_name],
        'penalty': penalties,
        'C': loguniform(1e-5, 1e2),
        **solver_specific,
    }
    for solver_name, penalties, solver_specific in (
        ('lbfgs', ['l2'], {}),
        ('liblinear', ['l1', 'l2'], {}),
        ('saga', ['elasticnet'], {'l1_ratio': uniform(0, 1)}),
    )
]

In [None]:
# Label under which this run is registered by get_scores().
# NOTE(review): the other runs in this notebook are labelled "Embed<ID>";
# confirm that "Test C00" is the intended label for EMB-C0.
model_name = "Test C00"

# `max_lr_iter` is already defined (same value) in the cell that builds
# `param_grid` and is not used here, so the duplicate assignment was removed.

# Hyper-parameter search for the logistic regression on the TF-IDF training
# features, cross-validated with the stratified splitter `skfold`.
grd_log = fit_model(
    LogisticRegression(), 
    param_grid=param_grid,  
    cv = skfold,
    X_ref = X_train_ready,
    y_ref = y_train,
)

### Evaluate

In [None]:
get_scores(model_name, **grd_log, register=True, X_ref=X_test_ready, y_ref=y_test)

---
---
# 5. Comparaison des `scores` <a class="anchor" id="EMBEDDING_scores"></a> [⇪](#menu)

In [None]:
from classification_utils import scores_df

In [None]:
scores_df

In [None]:
scores_df.sort_values(['ROC AUC'], ascending=False)