In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
import re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [2]:
df =pd.read_csv('train_set.csv',index_col=0)
df['note'] = df['note'].apply(lambda x: int(x))
df.head(2)

Unnamed: 0,note,avis,assureur,produit,date_publication,date_exp,avis_en,tokens_en,bigrams_en,tokens_fr,bigrams_fr
0,4,la personne au téléphone était clair et sympat...,L'olivier Assurance,auto,06/10/2021,01/10/2021,the person on the phone was clear and friendly...,"['person', 'phone', 'clear', 'friendly', 'expl...","['person_phone', 'phone_clear', 'clear_friendl...","['personne', 'téléphone', 'clair', 'sympathiqu...","['personne_téléphone', 'téléphone_clair', 'cla..."
1,4,satisfaitréactivité simplicité prix attractif ...,APRIL Moto,moto,09/07/2021,01/07/2021,satisfiedreactivity simplicity attractive pric...,"['satisfiedreactivity', 'simplicity', 'attract...","['satisfiedreactivity_simplicity', 'simplicity...","['satisfaitréactivité', 'simplicité', 'prix', ...","['satisfaitréactivité_simplicité', 'simplicité..."


### Différents preprocessing

In [7]:
# Preprocessing the text with lemmatization
def preprocess(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    words_list = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return " ".join(words_list)

In [11]:
# Preprocessing the text with stemming
def preprocess(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)    
    stemmer = SnowballStemmer('english')
    stemmed_tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(stemmed_tokens)

In [6]:
# Preprocessing the text without stemming and lemmatizing
def preprocess(text):
    text = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text) 
    toks = [word for word in tokens if word not in stop_words]
    return ' '.join(toks)

## TF_IDF 

### SVM

In [8]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(df['avis_en'], df['note'], test_size=0.2, random_state=42)
# Preprocessing
X_train_pre = X_train.apply(preprocess)
X_test_pre = X_test.apply(preprocess)
# TF IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train_pre)
X_test_tfidf = vectorizer.transform(X_test_pre)
# Entrainement modèle
from sklearn.svm import SVC
model = SVC(class_weight='balanced')
model.fit(X_train_tfidf, y_train)
predictions = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, predictions))

Accuracy: 0.5098527276498652
              precision    recall  f1-score   support

           1       0.64      0.74      0.68      1444
           2       0.36      0.37      0.37       716
           3       0.32      0.22      0.26       665
           4       0.45      0.41      0.43       999
           5       0.54      0.58      0.56       997

    accuracy                           0.51      4821
   macro avg       0.46      0.46      0.46      4821
weighted avg       0.49      0.51      0.50      4821



In [6]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(df['avis_en'], df['note'], test_size=0.2, random_state=42)
# Preprocessing
X_train_pre = X_train.apply(preprocess)
X_test_pre = X_test.apply(preprocess)
# Ajouter les bigrams
vectorizer = TfidfVectorizer(ngram_range=(2,2))
X_train_tfidf = vectorizer.fit_transform(X_train_pre)
X_test_tfidf = vectorizer.transform(X_test_pre)
# Entrainement modèle
from sklearn.svm import SVC
model = SVC(class_weight='balanced')
model.fit(X_train_tfidf, y_train)
predictions = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, predictions))

Accuracy: 0.4662933001451981
              precision    recall  f1-score   support

           1       0.47      0.94      0.62      1444
           2       0.34      0.08      0.13       716
           3       0.39      0.05      0.09       665
           4       0.45      0.33      0.38       999
           5       0.51      0.48      0.49       997

    accuracy                           0.47      4821
   macro avg       0.43      0.37      0.34      4821
weighted avg       0.44      0.47      0.40      4821



## LTSM

In [8]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional,Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint,EarlyStopping,ReduceLROnPlateau
from tensorflow.keras.models import load_model
from tensorflow.keras.utils import to_categorical
import tensorflow_hub as hub
import numpy as np
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Reshape

#### V1

In [10]:
sample_df = df.sample(n=8000, random_state=42)
# Separate the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    sample_df['avis_en'],
    sample_df['note'],
    test_size=0.2,
    random_state=42
)
# Apply preprocessing to the training and testing data
X_train_processed= X_train.apply(preprocess)
X_test_processed = X_test.apply(preprocess)
# Convert text to USE embeddings
use = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
X_train_use = np.array(use(X_train_processed.tolist()))
X_test_use = np.array(use(X_test_processed.tolist()))
# Adjust labels to start from 0
y_train_adjusted = y_train - 1
y_test_adjusted = y_test - 1
# LSTM model with USE embeddings
model = Sequential()
model.add(Reshape((1, X_train_use.shape[1]), input_shape=(X_train_use.shape[1],)))  
model.add(LSTM(128, activation='relu', return_sequences=True))
model.add(LSTM(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(5, activation='softmax'))
# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Train model
model.fit(X_train_use, y_train_adjusted, validation_split=0.2, epochs=50, batch_size=200, verbose=1)
# Evaluate the model on the test set
accuracy = model.evaluate(X_test_use, y_test_adjusted)[1]
print(f'Accuracy: {accuracy}')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Accuracy: 0.42375001311302185


#### V2

In [13]:
# Separate the data into training and testing sets
sample_df = df.sample(n=8000, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    sample_df['avis_en'],
    sample_df['note'],
    test_size=0.2,
    random_state=42
)

# Apply preprocessing to the training and testing data
X_train_processed = X_train.apply(preprocess)
X_test_processed = X_test.apply(preprocess)

# Load Universal Sentence Encoder
use = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

# Convert text to USE embeddings
X_train_use = use(X_train_processed.tolist())
X_test_use = use(X_test_processed.tolist())

y_train_adjusted = y_train-1
y_test_adjusted = y_test-1

es = EarlyStopping(monitor='val_accuracy', mode='auto', verbose=0,patience=15)
mc = ModelCheckpoint('best_model.h5', monitor='val_accuracy', mode='max', verbose=1, save_best_only=True)
lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.2, patience=3, min_lr=0.0001)

from tensorflow.keras.layers import Input, Reshape, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential

# LSTM model with USE embeddings
model = Sequential()
model.add(Input(shape=(X_train_use.shape[1],)))
model.add(Reshape((1, X_train_use.shape[1])))
model.add(LSTM(128, return_sequences=True, activation='relu'))  # return_sequences must be True for stacking
model.add(LSTM(64, return_sequences=True, activation='relu'))  # another LSTM layer
model.add(LSTM(32, activation='relu'))  # final LSTM layer
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(5, activation='softmax'))  # Output layer with 5 neurons

optimizer = keras.optimizers.Adam(learning_rate=0.001)

# Compile the model
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Train the model
history = model.fit(X_train_use, y_train_adjusted, validation_split=0.2, epochs=200, batch_size=100, verbose=1, callbacks=[es, mc, lr])
# Evaluate the model on the test set
accuracy = model.evaluate(X_test_use, y_test_adjusted)[1]
print(f'Accuracy: {accuracy}')

Epoch 1/200
Epoch 1: val_accuracy improved from -inf to 0.28906, saving model to best_model.h5
Epoch 2/200
Epoch 2: val_accuracy improved from 0.28906 to 0.45156, saving model to best_model.h5
Epoch 3/200
Epoch 3: val_accuracy improved from 0.45156 to 0.47344, saving model to best_model.h5
Epoch 4/200
Epoch 4: val_accuracy improved from 0.47344 to 0.48203, saving model to best_model.h5
Epoch 5/200
Epoch 5: val_accuracy did not improve from 0.48203
Epoch 6/200
Epoch 6: val_accuracy did not improve from 0.48203
Epoch 7/200
Epoch 7: val_accuracy did not improve from 0.48203
Epoch 8/200
Epoch 8: val_accuracy improved from 0.48203 to 0.48672, saving model to best_model.h5
Epoch 9/200
Epoch 9: val_accuracy did not improve from 0.48672
Epoch 10/200
Epoch 10: val_accuracy did not improve from 0.48672
Epoch 11/200
Epoch 11: val_accuracy did not improve from 0.48672
Epoch 12/200
Epoch 12: val_accuracy did not improve from 0.48672
Epoch 13/200
Epoch 13: val_accuracy did not improve from 0.48672
E

## Bert Hugging face

In [3]:
# Use a pipeline as a high-level helper
from transformers import pipeline
# Load the sentiment analysis pipeline
sentiment_pipeline = pipeline("text-classification", model="nlptown/bert-base-multilingual-uncased-sentiment")
# Example usage
review = str(df["avis_en"].iloc[3])
result = sentiment_pipeline(review)
result




[{'label': '2 stars', 'score': 0.341024249792099}]