# 1. Importation des Bibliothèques et Chargement des Données

In [25]:
!pip install fasttext

# Required libraries
import fasttext
import fasttext.util
import mlflow
import mlflow.keras
import nltk
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, LSTM, Dense, Conv1D, MaxPooling1D, Flatten, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Make sure the NLTK resources this notebook relies on are downloaded
for nltk_resource in ('punkt', 'wordnet'):
    nltk.download(nltk_resource)

# Configuration parameters
MAX_SEQUENCE_LENGTH = 100  # maximum length of a token sequence
EMBEDDING_DIM = 300  # dimensionality of the FastText embedding vectors
VOCAB_SIZE = 10000  # maximum vocabulary size

# Load the already-preprocessed dataset
data = pd.read_csv('../data/database_p7_rework.csv')


Collecting fasttext
  Downloading fasttext-0.9.3.tar.gz (73 kB)
     ---------------------------------------- 0.0/73.4 kB ? eta -:--:--
     --------------- --------------------- 30.7/73.4 kB 660.6 kB/s eta 0:00:01
     ------------------------------------- 73.4/73.4 kB 810.2 kB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.13.5-py3-none-any.whl.metadata (9.5 kB)
Using cached pybind11-2.13.5-py3-none-any.whl (240 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (pyproject.toml): started
  Building wheel for fasttext (pyproject.toml): finished with status 'error'
Failed to build fasttext


  error: subprocess-exited-with-error
  
  Building wheel for fasttext (pyproject.toml) did not run successfully.
  exit code: 1
  
  [33 lines of output]
  !!
  
          ********************************************************************************
          Usage of dash-separated 'description-file' will not be supported in future
          versions. Please use the underscore name 'description_file' instead.
  
          By 2024-Sep-26, you need to update your project and remove deprecated calls
          or your builds will no longer be supported.
  
          See https://setuptools.pypa.io/en/latest/userguide/declarative_config.html for details.
          ********************************************************************************
  
  !!
    opt = self.warn_dash_deprecation(opt, section)
  running bdist_wheel
  running build
  running build_py
  creating build
  creating build\lib.win-amd64-cpython-311
  creating build\lib.win-amd64-cpython-311\fasttext
  copying python\fa

ModuleNotFoundError: No module named 'fasttext'

# 2. Préparation des séquences

In [None]:
# Columns holding the lemmatized and the stemmed variant of each text
text_column_lemma = 'text_lemmatized'
text_column_stem = 'text_stemmed'


def tokenize_and_pad(texts, num_words=VOCAB_SIZE, maxlen=MAX_SEQUENCE_LENGTH):
    """Fit a Keras tokenizer on `texts` and turn them into padded index sequences.

    Parameters
    ----------
    texts : iterable of str
        Raw documents; missing values are treated as empty documents.
    num_words : int
        Vocabulary cap passed to ``Tokenizer``.
    maxlen : int
        Length every sequence is padded/truncated to.

    Returns
    -------
    tuple
        ``(tokenizer, sequences, padded)`` — the fitted tokenizer, the list of
        integer sequences, and the array returned by ``pad_sequences``.
    """
    # `pd.read_csv` reads empty fields back as NaN (a float), which the tokenizer
    # cannot process; normalise every entry to a string first.
    clean_texts = pd.Series(texts).fillna('').astype(str)
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(clean_texts)
    sequences = tokenizer.texts_to_sequences(clean_texts)
    padded = pad_sequences(sequences, maxlen=maxlen)
    return tokenizer, sequences, padded


# NOTE(review): both tokenizers are fitted on the full dataset *before* the
# train/test split below, so test-set vocabulary influences the word index.
# Consider fitting on the training portion only.
tokenizer_lemma, sequences_lemma, X_lemma = tokenize_and_pad(data[text_column_lemma])
tokenizer_stem, sequences_stem, X_stem = tokenize_and_pad(data[text_column_stem])

# Target labels
y = data['target']

# Train/test split for both preprocessing variants; the shared random_state
# keeps the row partition identical across the two.
X_train_lemma, X_test_lemma, y_train_lemma, y_test_lemma = train_test_split(X_lemma, y, test_size=0.2, random_state=42)
X_train_stem, X_test_stem, y_train_stem, y_test_stem = train_test_split(X_stem, y, test_size=0.2, random_state=42)


# 3. Préparation des Embeddings Textuels

## 3.1 Embeddings avec Word2Vec

In [None]:
def build_word2vec_matrix(texts, word_index, vocab_size=VOCAB_SIZE, embedding_dim=EMBEDDING_DIM):
    """Train a Word2Vec model on `texts` and build the matching embedding matrix.

    Parameters
    ----------
    texts : iterable of str
        Whitespace-tokenizable documents; missing values are treated as empty.
    word_index : dict
        Mapping word -> integer index (the tokenizer's ``word_index``).
    vocab_size : int
        Number of rows of the matrix; words whose index is not below this
        value are skipped.
    embedding_dim : int
        Vector size used for training and number of matrix columns.

    Returns
    -------
    tuple
        ``(model, matrix)`` — the trained Word2Vec model and a
        ``(vocab_size, embedding_dim)`` array whose row ``i`` holds the vector
        of the word with index ``i`` (rows stay zero for words the model does
        not know).
    """
    # NaN entries (empty CSV fields) are floats and have no `.split()`;
    # turn them into empty documents instead of crashing.
    tokenized_texts = [str(text).split() for text in pd.Series(texts).fillna('')]
    model = Word2Vec(sentences=tokenized_texts, vector_size=embedding_dim, window=5, min_count=1, workers=4)

    matrix = np.zeros((vocab_size, embedding_dim))
    for word, index in word_index.items():
        if index >= vocab_size:
            continue
        try:
            matrix[index] = model.wv[word]
        except KeyError:
            # The tokenizer and the whitespace split can disagree on tokens;
            # leave the row at zero for words Word2Vec never saw.
            continue
    return model, matrix


# Word2Vec model + embedding matrix for the lemmatized texts (300 features)
w2v_model_lemma, embedding_matrix_word2vec_lemma = build_word2vec_matrix(
    data[text_column_lemma], tokenizer_lemma.word_index
)

# Word2Vec model + embedding matrix for the stemmed texts (300 features)
w2v_model_stem, embedding_matrix_word2vec_stem = build_word2vec_matrix(
    data[text_column_stem], tokenizer_stem.word_index
)


## 3.2 Embeddings avec FastText

In [None]:
# Download (skipped when the file already exists) and load the FastText vectors
# for English
fasttext.util.download_model('en', if_exists='ignore')
ft = fasttext.load_model('cc.en.300.bin')

# One zero-initialised matrix per preprocessing variant (300 features each)
embedding_matrix_fasttext_lemma = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))
embedding_matrix_fasttext_stem = np.zeros((VOCAB_SIZE, EMBEDDING_DIM))

# Fill row `row` of each matrix with the FastText vector of the word that the
# corresponding tokenizer maps to index `row`.
for fasttext_matrix, vocabulary in (
    (embedding_matrix_fasttext_lemma, tokenizer_lemma.word_index),
    (embedding_matrix_fasttext_stem, tokenizer_stem.word_index),
):
    for token, row in vocabulary.items():
        if row >= VOCAB_SIZE:
            continue
        fasttext_matrix[row] = ft.get_word_vector(token)


## 3.3 Concaténation de FastText et Word2Vec

In [None]:
# Side-by-side stack of the FastText and Word2Vec matrices, lemmatized variant
# (FastText columns first, then Word2Vec: 600 features)
embedding_matrix_concat_lemma = np.hstack(
    (embedding_matrix_fasttext_lemma, embedding_matrix_word2vec_lemma)
)

# Same column-wise stack for the stemmed variant (600 features)
embedding_matrix_concat_stem = np.hstack(
    (embedding_matrix_fasttext_stem, embedding_matrix_word2vec_stem)
)


# 4. Entraînement des Modèles LSTM et CNN