In [1]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Input, Attention, GlobalAveragePooling1D
from transformers import TFBertForSequenceClassification, BertTokenizer




- #### Unzip is a Unix command so to run it use : bash terminal instead of powershell / cmd

<img src="./bash_commands_to_execute.png" style="height:auto; width:1600px">

- #### Charger les Donn√©es √† l'aide de Python

- #### Cette fois si j'ai utulis√© un chemin relatif üòÖ

In [2]:
df = pd.read_csv("imdb_reviews/IMDB Dataset.csv")
df.head(10)
# print(df.head(10))
# print(df.tail(10))


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


In [3]:
df.shape

(50000, 2)

- #### Nettoyage et Pr√©traitement des Textes

In [4]:
nltk.download('stopwords')
nltk.download('punkt')

stop_words = set(stopwords.words('english'))
df['cleaned_reviews'] = df['review'].apply(lambda x: ' '.join([word for word in word_tokenize(x.lower()) if word not in stop_words and word not in string.punctuation]))
print(df['cleaned_reviews'].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\soula\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\soula\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


0    one reviewers mentioned watching 1 oz episode ...
1    wonderful little production br br filming tech...
2    thought wonderful way spend time hot summer we...
3    basically 's family little boy jake thinks 's ...
4    petter mattei 's `` love time money '' visuall...
Name: cleaned_reviews, dtype: object


- #### Encodage des Labels

In [5]:
le = LabelEncoder()
df['sentiment'] = le.fit_transform(df['sentiment'])
df.head()

Unnamed: 0,review,sentiment,cleaned_reviews
0,One of the other reviewers has mentioned that ...,1,one reviewers mentioned watching 1 oz episode ...
1,A wonderful little production. <br /><br />The...,1,wonderful little production br br filming tech...
2,I thought this was a wonderful way to spend ti...,1,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,0,basically 's family little boy jake thinks 's ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter mattei 's `` love time money '' visuall...


- #### S√©paration des Donn√©es

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.2, random_state=42)

- #### Padding et Troncature des S√©quences

In [7]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=100)
X_test_pad = pad_sequences(X_test_seq, maxlen=100)

- #### Entra√Ænement de 3 Mod√®les

- #### Impl√©mentation de 2 Callbacks

In [8]:
# D√©finition des callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=0.001)

- #### Mod√®le LSTM from Scratch

In [9]:
maxlen = 100

In [10]:
model_lstm = Sequential([
    Embedding(input_dim=5000, output_dim=128, input_length=100),
    LSTM(128, return_sequences=True),
    Dropout(0.2),
    LSTM(64),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])



- #### Mod√®le LSTM avec Attention

In [11]:
input_layer = Input(shape=(100,))

embedding_layer = Embedding(input_dim=5000, output_dim=128)(input_layer)
lstm_layer = LSTM(128, return_sequences=True)(embedding_layer)
attention_layer = Attention()([lstm_layer, lstm_layer])

#reduce dimennsion
lstm_layer_2 = LSTM(64)(attention_layer)

output_layer = Dense(1, activation='sigmoid')(lstm_layer_2)

model_lstm_attention = Model(inputs=input_layer, outputs=output_layer)

- #### Mod√®le Transformer

In [12]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model_transformer = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')




All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


- #### Entra√Ænement des 3 Mod√®les

    Compile the LSTM Model

In [13]:
# Compile the LSTM model
model_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    Compile LSTM with Attention Model

In [14]:
# Compile the LSTM with Attention model
model_lstm_attention.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    Compile Transformer Model 
( <span style="color:red">Erreur lors la compilation du transforme</span> )

In [15]:
# Compile the transformer model
model_transformer.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
                          loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                          metrics=['accuracy'])




ValueError: Could not interpret optimizer identifier: <keras.src.optimizers.adam.Adam object at 0x000001DF7F0B9160>

In [16]:
# Entra√Ænement du mod√®le LSTM
model_lstm.fit(X_train_pad, y_train, validation_split=0.2, epochs=10, batch_size=32, callbacks=[early_stopping, reduce_lr])

# Entra√Ænement du mod√®le LSTM avec Attention
model_lstm_attention.fit(X_train_pad, y_train, validation_split=0.2, epochs=10, batch_size=32, callbacks=[early_stopping, reduce_lr])

# Entra√Ænement du mod√®le Transformer
model_transformer.fit({'input_ids': X_train_enc['input_ids'], 'attention_mask': X_train_enc['attention_mask']}, y_train, validation_split=0.2, epochs=3, batch_size=16, callbacks=[early_stopping, reduce_lr])


Epoch 1/10
[1m1000/1000[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m89s[0m 86ms/step - accuracy: 0.7477 - loss: 0.4902 - val_accuracy: 0.8553 - val_loss: 0.3443 - learning_rate: 0.0010
Epoch 2/10
[1m1000/1000[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m87s[0m 87ms/step - accuracy: 0.8804 - loss: 0.2921 - val_accuracy: 0.8484 - val_loss: 0.3437 - learning_rate: 0.0010
Epoch 3/10
[1m1000/1000[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m89s[0m 89ms/step - accuracy: 0.9123 - loss: 0.2247 - val_accuracy: 0.8621 - val_loss: 0.3313 - learning_rate: 0.0010
Epoch 4/10
[1m1000/1000[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m89s[0m 89ms/step - accuracy: 0.9341 - loss: 0.1714 - val_accuracy: 0.8558 - val_loss: 0.3865 - learning_rate: 0.0010
Epoch 5/10
[1m1000/1000[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚

NameError: name 'X_train_enc' is not defined