# 1. Importation des Bibliothèques et Chargement des Données

In [8]:
# Importation des bibliothèques nécessaires
import numpy as np
import pandas as pd
import mlflow
import mlflow.keras
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Embedding, LSTM, Dense, Conv1D, MaxPooling1D, Flatten, Dropout, Input
from tensorflow.keras.models import Sequential, Model
from sklearn.metrics import accuracy_score, classification_report
from gensim.models import Word2Vec, FastText
import nltk
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Assume every sequence has 100 feature dimensions.
features_w2v = 100  # Dimensionality of the Word2Vec embeddings
features_ft = 100  # Dimensionality of the FastText embeddings
timesteps = 1  # Handle each sequence independently

# Make sure the required NLTK resources are downloaded
for nltk_resource in ('punkt', 'wordnet'):
    nltk.download(nltk_resource)

# Load the data that has already been preprocessed
data = pd.read_csv('../data/database_p7_rework.csv')

# Last expression of the cell: shows the first rows of the DataFrame
data.head()


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\trist\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\trist\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,target,id,date,flag,user,text,year,month,word_count,hour,sentiment,text_cleaned,text_lemmatized,text_stemmed
0,0,1467810369,2009-04-06 22:19:45,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",2009,4,19,22,0.216667,switchfoot http twitpic com 2y1zl awww that s...,switchfoot http twitpic com 2y1zl awww that s ...,switchfoot http twitpic com 2y1zl awww that s ...
1,0,1467810672,2009-04-06 22:19:49,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,2009,4,21,22,0.0,is upset that he can t update his facebook by ...,is upset that he can t update his facebook by ...,is upset that he can t updat hi facebook by te...
2,0,1467810917,2009-04-06 22:19:53,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,2009,4,18,22,0.5,kenichan i dived many times for the ball mana...,kenichan i dived many time for the ball manage...,kenichan i dive mani time for the ball manag t...
3,0,1467811184,2009-04-06 22:19:57,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,2009,4,10,22,0.2,my whole body feels itchy and like its on fire,my whole body feel itchy and like it on fire,my whole bodi feel itchi and like it on fire
4,0,1467811193,2009-04-06 22:19:57,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",2009,4,21,22,-0.625,nationwideclass no it s not behaving at all i...,nationwideclass no it s not behaving at all i ...,nationwideclass no it s not behav at all i m m...


# 2. Extraction des Embeddings avec Word2Vec et FastText

In [None]:
def _mean_document_vectors(keyed_vectors, tokenized_docs, dim):
    """Average the word vectors of each document.

    Args:
        keyed_vectors: Trained gensim vectors (the ``.wv`` attribute of a model).
        tokenized_docs: List of token lists, one per document.
        dim: Length of the zero vector used for documents in which no token
            passes the ``token in keyed_vectors`` check.

    Returns:
        A NumPy array with one row per document.
    """
    doc_vectors = []
    for tokens in tokenized_docs:
        known = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
        # A document without any usable token falls back to a zero vector so
        # that every input row still gets an output row.
        doc_vectors.append(np.mean(known, axis=0) if known else np.zeros(dim))
    return np.array(doc_vectors)


def _train_and_embed(model_cls, texts, dim):
    """Train an embedding model on a text column and embed every document.

    Args:
        model_cls: Embedding model class to instantiate (``Word2Vec`` or
            ``FastText``).
        texts: Iterable of documents, e.g. a DataFrame column of strings.
        dim: Size of the word vectors to learn.

    Returns:
        A ``(model, matrix)`` tuple: the trained model and the array of
        per-document mean vectors.
    """
    # pandas reads empty CSV fields back as NaN (a float); calling .split() on
    # such a cell would raise AttributeError, so non-string cells are treated
    # as empty documents. Tokenizing once here also avoids splitting every
    # text a second time when averaging.
    tokenized = [text.split() if isinstance(text, str) else [] for text in texts]
    model = model_cls(sentences=tokenized, vector_size=dim, window=5, min_count=1)
    return model, _mean_document_vectors(model.wv, tokenized, dim)


# Word2Vec - lemmatized text
model_w2v_lemma, embedding_matrix_lemma_w2v = _train_and_embed(
    Word2Vec, data['text_lemmatized'], features_w2v
)

# Word2Vec - stemmed text
model_w2v_stem, embedding_matrix_stem_w2v = _train_and_embed(
    Word2Vec, data['text_stemmed'], features_w2v
)

# FastText - lemmatized text
model_ft_lemma, embedding_matrix_lemma_ft = _train_and_embed(
    FastText, data['text_lemmatized'], features_ft
)

# FastText - stemmed text
model_ft_stem, embedding_matrix_stem_ft = _train_and_embed(
    FastText, data['text_stemmed'], features_ft
)
