In [1]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.model_selection import KFold, train_test_split, GridSearchCV
import spacy
from io import StringIO
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score, auc, roc_auc_score, roc_curve, classification_report
from fastapi import FastAPI, Request
import re
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import mlflow
import mlflow.pytorch
import mlflow.sklearn
import joblib
from sklearn.naive_bayes import MultinomialNB
import numpy as np
import mlflow.keras
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding, SpatialDropout1D, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import os
from gensim.models import Word2Vec
import joblib
import mlflow.sklearn

  from .autonotebook import tqdm as notebook_tqdm


# Préparation

In [2]:
# Charger le modèle anglais de spaCy
nlp = spacy.load('en_core_web_sm')

# Fonction pour nettoyer le texte
def clean_text(text):
    # Supprimer les URL
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    # Supprimer les mentions (@user)
    text = re.sub(r'@\w+', '', text)
    # Supprimer la ponctuation, mettre en minuscule, et conserver les hashtags
    text = re.sub(r'[^\w\s#]', '', text).lower()
    # Tokenisation du texte
    tokens = word_tokenize(text)
    # Suppression des mots vides (stop words)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    return tokens

# Fonction pour lemmatiser le texte
def lemmatize_text(tokens):
    doc = nlp(" ".join(tokens))
    return [token.lemma_ for token in doc]

# Paramètres du modèle
max_words = 15000  # Nombre maximal de mots dans le tokenizer
max_len = 60      # Longueur maximale des séquences

# Chemin vers le fichier CSV contenant les données de catégories
fichier = 'C:/Users/DELL/Desktop/OpenClass/Formation/Projet_007/training_1600000_processed_noemoticon.csv'

with open(fichier, 'rb') as file:
    bytes_data = file.read()  # lire le fichier en mode binaire
    decoded_data = bytes_data.decode('latin-1')  # essayer de décoder en utilisant latin-1


data = pd.read_csv(StringIO(decoded_data))

# Liste des noms de colonnes
noms_colonnes = ['Groupe', 'id_uti', 'date_voyage', 'requete', 'pseudo_uti', 'tweet']

# Lire le fichier CSV avec les noms de colonnes spécifiés
data = pd.read_csv(fichier, names=noms_colonnes, header=None)

# Utiliser seulement x% des données pour le développement initial
data = data.sample(frac=0.2, random_state=42)

# Application des fonctions de nettoyage sur la colonne 'tweet'
data['tweet'] = data['tweet'].fillna('')
data['cleaned_tokens'] = data['tweet'].apply(clean_text)
data['cleaned_text'] = data['cleaned_tokens'].apply(lambda x: ' '.join(x))

# Tokenizer pour convertir les tweets en séquences de tokens
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data['cleaned_text'])
sequences = tokenizer.texts_to_sequences(data['cleaned_text'])

# Padding des séquences pour les rendre de longueur égale
X_train = pad_sequences(sequences, maxlen=max_len)
y_train = np.array(data['Groupe'].map({0: 0, 4: 1}))

# Lemmatisation
data['lemmatized_tokens'] = data['cleaned_text'].apply(lemmatize_text)

# Entrainement du modèle Word2Vec
word2vec_model = Word2Vec(sentences=data['lemmatized_tokens'], vector_size=100, window=5, min_count=1, workers=4)

# Transformation des textes en vecteurs
data['word2vec_vectors'] = data['lemmatized_tokens'].apply(lambda x: np.mean([word2vec_model.wv[word] for word in x if word in word2vec_model.wv], axis=0).tolist())

# Assurez-vous de remplacer 'local_path_to_use_model' par le chemin réel du modèle téléchargé
use_model = hub.load("https://www.kaggle.com/models/google/universal-sentence-encoder/TensorFlow2/universal-sentence-encoder/2")

# Fonction pour obtenir l'embedding USE
def embed_use(text):
    return use_model([text]).numpy().flatten()

# Embedding des textes avec USE
data['use_embeddings'] = data['lemmatized_tokens'].apply(lambda x: embed_use(" ".join(x)))

# Préparation des données pour l'entraînement
data['labels'] = data['Groupe'].apply(lambda x: 1 if x == 4 else 0)  # Convertir en 0 et 1
data = data[['tweet', 'labels', 'word2vec_vectors', 'use_embeddings']]

# Séparer les données en jeu d’entraînement (70%), jeu de validation (15%) et jeu de test (15%)
train_texts, temp_texts, train_labels, temp_labels = train_test_split(data['tweet'], data['labels'], test_size=0.3, random_state=42)
val_texts, test_texts, val_labels, test_labels = train_test_split(temp_texts, temp_labels, test_size=0.5, random_state=42)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [3]:

# Initialiser MLFlow
mlflow.set_experiment("Approche Classique")

# Vectorisation des textes (Bag of Words et TF-IDF)
vectorizer = CountVectorizer(max_df=0.5, min_df=5, max_features=1000, ngram_range=(1, 2))
X_train_bow = vectorizer.fit_transform(train_texts)
X_val_bow = vectorizer.transform(val_texts)
X_test_bow = vectorizer.transform(test_texts)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_bow)
X_val_tfidf = tfidf_transformer.transform(X_val_bow)
X_test_tfidf = tfidf_transformer.transform(X_test_bow)

# Extraction des labels
y_train = train_labels
y_val = val_labels
y_test = test_labels

# Définir les paramètres pour GridSearchCV
param_grid = {
    'alpha': [0.01, 0.1, 0.5, 1.0]  # Paramètres de la recherche en grille pour Naive Bayes
}

# Initialiser le modèle Naive Bayes
nb = MultinomialNB()

# Initialiser GridSearchCV
grid_search = GridSearchCV(nb, param_grid, cv=5, scoring='accuracy')

# Entraînement du modèle avec GridSearchCV
with mlflow.start_run() as run:
    run_id = run.info.run_id
    
    # Effectuer la recherche en grille
    grid_search.fit(X_train_tfidf, y_train)
    
    # Meilleur modèle trouvé par GridSearchCV
    best_model = grid_search.best_estimator_
    
    # Prédictions sur le jeu de validation
    y_val_pred = best_model.predict(X_val_tfidf)
    
    # Évaluation du modèle sur le jeu de validation
    val_accuracy = accuracy_score(y_val, y_val_pred)
    val_report = classification_report(y_val, y_val_pred)
    
    print("Best Parameters:", grid_search.best_params_)
    print("Validation Accuracy:", val_accuracy)
    print("Validation Classification Report:\n", val_report)
    
    # Enregistrement des métriques dans MLFlow
    mlflow.log_metric("val_accuracy", val_accuracy)
    
    # Prédictions sur le jeu de test
    y_test_pred = best_model.predict(X_test_tfidf)
    
    # Évaluation du modèle sur le jeu de test
    test_accuracy = accuracy_score(y_test, y_test_pred)
    test_report = classification_report(y_test, y_test_pred)
    
    print("Test Accuracy:", test_accuracy)
    print("Test Classification Report:\n", test_report)
    
    # Enregistrement des métriques du test dans MLFlow
    mlflow.log_metric("test_accuracy", test_accuracy)
    
    # Enregistrement du modèle, vectorizer et transformer dans MLFlow
    mlflow.sklearn.log_model(best_model, "model")
    joblib.dump(vectorizer, "vectorizer.pkl")
    joblib.dump(tfidf_transformer, "tfidf_transformer.pkl")
    mlflow.log_artifact("vectorizer.pkl")
    mlflow.log_artifact("tfidf_transformer.pkl")
    
    # Enregistrement des paramètres du vectorizer et du transformer
    mlflow.log_param("vectorizer_max_df", 0.5)
    mlflow.log_param("vectorizer_min_df", 5)
    mlflow.log_param("vectorizer_max_features", 1000)
    mlflow.log_param("vectorizer_ngram_range", (1, 2))
    
    # Enregistrement des meilleurs paramètres du modèle
    mlflow.log_param("best_alpha", grid_search.best_params_['alpha'])
    
    # Afficher le Run ID
    print(f"Run ID: {run_id}")

# Fin du run MLFlow
mlflow.end_run()

# Dictionnaire de mappage des groupes aux sentiments
group_to_sentiment = {
    0: "Négatif",
    1: "Positif"
}


Best Parameters: {'alpha': 0.01}
Validation Accuracy: 0.7381875
Validation Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.75      0.74     23916
           1       0.75      0.72      0.74     24084

    accuracy                           0.74     48000
   macro avg       0.74      0.74      0.74     48000
weighted avg       0.74      0.74      0.74     48000

Test Accuracy: 0.7416041666666666
Test Classification Report:
               precision    recall  f1-score   support

           0       0.73      0.76      0.74     23931
           1       0.75      0.73      0.74     24069

    accuracy                           0.74     48000
   macro avg       0.74      0.74      0.74     48000
weighted avg       0.74      0.74      0.74     48000





Run ID: cd9f7981061e482a99d29ce43b520168


In [7]:
# Charger le modèle, vectorizer et transformateur depuis MLFlow
model = mlflow.sklearn.load_model(f"runs:/{run_id}/model")
vectorizer = joblib.load("vectorizer.pkl")
tfidf_transformer = joblib.load("tfidf_transformer.pkl")

# Fonction de prédiction
def predict_sentiment(text):
    # Nettoyage du texte
    cleaned_tokens = clean_text(text)
    cleaned_text = ' '.join(cleaned_tokens)
    text_bow = vectorizer.transform([cleaned_text])
    text_tfidf = tfidf_transformer.transform(text_bow)
    
    # Prédire le score de probabilité
    probas = model.predict_proba(text_tfidf)[0]
    
    # Retourner le score de probabilité pour le sentiment positif
    return probas[1]

# Liste de tweets à tester
test_tweets = [
    "I can't believe how terrible this service is. Absolutely awful!",
    "Worst experience ever. Totally disappointing and frustrating.",
    "Everything about this product is just so bad. Waste of money.",
    "I'm really not happy with how things turned out. Expected better.",
    "Not impressed with the quality at all. Quite disappointing.",
    "This is not what I ordered. Very misleading.",
    "I was hoping for more, but it's just okay. Not quite what I wanted.",
    "It's alright, but it could be a lot better. Needs improvement.",
    "Service was slow and unresponsive, but the product was decent.",
    "It’s fine, nothing too special but not too bad either.",
    "I had an average experience, nothing to write home about.",
    "It’s okay, does the job but doesn’t exceed expectations.",
    "Pretty good overall, just a few minor issues here and there.",
    "Decent product for the price, happy with the purchase.",
    "Met most of my expectations, would recommend with some reservations.",
    "Really satisfied with this. It’s just what I needed.",
    "Great product, would definitely buy again. Very happy.",
    "Good service and quality, met my expectations well.",
    "Absolutely love this! Exceeded all my expectations.",
    "Best purchase I've made in a while. Highly recommend!",
    "Fantastic experience from start to finish. Couldn't be happier!"
]

# Prédire et afficher le sentiment pour chaque tweet
for tweet in test_tweets:
    prediction = predict_sentiment(tweet)
    sentiment = "Positif" if prediction > 0.5 else "Négatif"
    print(f"Tweet: {tweet}")
    print(f"Score: {prediction:.4f}, Sentiment: {sentiment}")


Tweet: I can't believe how terrible this service is. Absolutely awful!
Score: 0.2863, Sentiment: Négatif
Tweet: Worst experience ever. Totally disappointing and frustrating.
Score: 0.5061, Sentiment: Positif
Tweet: Everything about this product is just so bad. Waste of money.
Score: 0.2627, Sentiment: Négatif
Tweet: I'm really not happy with how things turned out. Expected better.
Score: 0.5900, Sentiment: Positif
Tweet: Not impressed with the quality at all. Quite disappointing.
Score: 0.6192, Sentiment: Positif
Tweet: This is not what I ordered. Very misleading.
Score: 0.5016, Sentiment: Positif
Tweet: I was hoping for more, but it's just okay. Not quite what I wanted.
Score: 0.4619, Sentiment: Négatif
Tweet: It's alright, but it could be a lot better. Needs improvement.
Score: 0.4142, Sentiment: Négatif
Tweet: Service was slow and unresponsive, but the product was decent.
Score: 0.5016, Sentiment: Positif
Tweet: It’s fine, nothing too special but not too bad either.
Score: 0.2919, S