## Data Afrique Hub Project
This notebook is used to create a model for analysing the sentiments of film commentaries. 


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense,Dropout
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, precision_recall_curve,confusion_matrix, roc_curve, auc
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import learning_curve
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
import pickle

2024-08-06 11:18:47.133610: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2024-08-06 11:18:47.133653: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2024-08-06 11:19:16.703670: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2024-08-06 11:19:16.704007: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory
[nltk_data] Downloading package punkt to /home/anderson/nltk_data...
[nltk_data]   Package pun

In [2]:
# Load the CSV file in a pandas DataFrame
data = pd.read_csv('../dataset/IMDB_Dataset.csv')
sentiment_mapping = {'positive': 1, 'negative': 0} #transforming sentiment into binary
data['sentiment'] = data['sentiment'].map(sentiment_mapping)
print(data.describe())
data

          sentiment
count  50000.000000
mean       0.500000
std        0.500005
min        0.000000
25%        0.000000
50%        0.500000
75%        1.000000
max        1.000000


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


## Data pre-processing

In [3]:
def preprocess_text(text):
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    filtered_text = [word for word in tokens if word.lower() not in stop_words]
    text = ' '.join(filtered_text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove single letters
    text = re.sub(r'\b\w\b', '', text)
    return text

# Apply preprocessing to the reviews
data['review'] = data['review'].apply(preprocess_text)
print(data)

                                                  review  sentiment
0      One reviewers mentioned watching  Oz episode l...          1
1      wonderful little production  filming technique...          1
2      thought wonderful way spend time hot summer we...          1
3      Basically  family little boy  Jake  thinks  zo...          0
4      Petter Mattei   Love Time Money  visually stun...          1
...                                                  ...        ...
49995  thought movie right good job  nt creative orig...          1
49996  Bad plot  bad dialogue  bad acting  idiotic di...          0
49997  Catholic taught parochial elementary schools n...          0
49998   going disagree previous comment side Maltin o...          0
49999  one expects Star Trek movies high art  fans ex...          0

[50000 rows x 2 columns]


## Subdivision of data into training, validation and test sets

In [4]:
X = data['review'].values
y = data['sentiment'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
print(X_train)

['fault actors  put great performances   overall story well executed  movie opens great zinger  crazy old guy forces young Aborigine girl  car road   re forced endure 40 minutes character development entirely new group characters  nt know 40 minutes  turns ones eventually discover girl  body  story progresses thereWhile story pick point  really goes nowhere   hours  asked  point  see characters struggle accusations racism stupidity handled discovery  story ultimately unsatisfying felt unfinished  well acted   strong enough backbone film warrant recommending '
 'first thing thought saw films  really film  least imagine spontaneously hear word  film   entirely symbolic  everything figurative meaning  used express thing symbolic way  find strange  acquainted philosophy  religion  spiritual life  think  fairytale  even weird one  chaotic   legend Zu  perfectly transparent  like  tells us images story fight light darkness  fight old humanity  every one search sens life confronted  film obvi

## Tokenization

In [5]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_val_seq = tokenizer.texts_to_sequences(X_val)
vocab_size = len(tokenizer.word_index) + 1

max_len = 100  # Define the maximum length of sequences
X_train_seq = pad_sequences(X_train_seq, maxlen=max_len)
X_test_seq = pad_sequences(X_test_seq, maxlen=max_len)
X_val_seq = pad_sequences(X_val_seq, maxlen=max_len)
X_train_seq

array([[    0,     0,     0, ...,     2,  6887, 12423],
       [    0,     0,     0, ...,     9,  1154,   770],
       [  810,  1476,    47, ...,   243,  1773,   326],
       ...,
       [    0,     0,     0, ...,  1036,  6698,     4],
       [    0,     0,     0, ...,  2289,     4,   579],
       [    0,     6,   209, ...,  1533,  5134,  5524]], dtype=int32)

## Creating the model

In [6]:
# Creating the sequential model
model = Sequential()
# Embedding layer:
# - input_dim = size of the vocabulary
# - output_dim = dimension of the embedding vectors (here 128)
# - input_length = maximum length of the input sequences
model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=max_len))
# LSTM (Long Short-Term Memory) layer:
# - 128 units in the LSTM layer
# - Captures long-term dependencies in text sequences
model.add(LSTM(128))
# Fully connected output layer:
# - 1 output unit (binary prediction)
# - Sigmoid activation to obtain a prediction between 0 and 1
model.add(Dense(1, activation='sigmoid'))
# Dropout regularization layer:
# - Dropout rate of 0.3 (30%) to reduce overfitting
model.add(Dropout(0.3))
# Compiling the model:
# - Adam optimizer
# - Loss function: Binary Cross-Entropy (for binary prediction)
# - Evaluation metric: Accuracy
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

2024-08-06 11:47:03.861925: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:267] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2024-08-06 11:47:04.007042: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (anderson-HP-ProBook-4540s): /proc/driver/nvidia/version does not exist
2024-08-06 11:47:29.880160: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 72816640 exceeds 10% of free system memory.
2024-08-06 11:47:30.956931: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 72816640 exceeds 10% of free system memory.
2024-08-06 11:47:31.200514: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 72816640 exceeds 10% of free system memory.


In [7]:
# Entraînement du modèle
history =model.fit(X_train_seq, y_train, batch_size=64, epochs=2, validation_data=(X_val_seq, y_val))

Epoch 1/2


2024-08-06 11:50:56.970694: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 72816640 exceeds 10% of free system memory.
2024-08-06 11:50:57.059981: W tensorflow/tsl/framework/cpu_allocator_impl.cc:82] Allocation of 72816640 exceeds 10% of free system memory.


Epoch 2/2


## Model evaluation

In [8]:
# Model evaluation
# Getting the model predictions on the test data
y_pred = model.predict(X_test_seq)
y_pred_binary = np.round(y_pred)

# Calculating evaluation metrics
accuracy = accuracy_score(y_test, y_pred_binary)
precision = precision_score(y_test, y_pred_binary)
recall = recall_score(y_test, y_pred_binary)
f1 = f1_score(y_test, y_pred_binary)

# Displaying the metrics
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 score:", f1)

Accuracy: 0.8375
Precision: 0.9210656142081894
Recall: 0.7410200436594563
F1 score: 0.8212911030462994


## Real test with a review

In [14]:
model.save('../api/review_model/review_model.h5')
with open ('../api/review_model/tokenizer.pkl','wb') as f:
    pickle.dump(tokenizer,f)

In [10]:
new_review = "good film"
# Preprocessing the review (ignored in this case)
preprocessed_review = new_review
# Encoding the review
encoded_review = tokenizer.texts_to_sequences([preprocessed_review])
# Adjusting the length of the sequence
encoded_review = pad_sequences(encoded_review, maxlen=max_len)
# Passing the review to the model
prediction = model.predict(encoded_review)
print(prediction)
# Interpreting the prediction
if prediction > 0.5:
    print("The review is positive.")
else:
    print("The review is negative.")

[[0.5366965]]
La revue est positive.


## Plotting important curves

In [1]:
# Extracting training and validation metrics from the history
train_acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
train_loss = history.history['loss']
val_loss = history.history['val_loss']

# Plotting the learning and loss curves on the same figure
epochs = range(1, len(train_acc) + 1)

plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.plot(epochs, train_acc, 'b', label='Training')
plt.plot(epochs, val_acc, 'g', label='Validation')
plt.title('Learning Curve - Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()

plt.subplot(1, 2, 2)
plt.plot(epochs, train_loss, 'b', label='Training')
plt.plot(epochs, val_loss, 'g', label='Validation')
plt.title('Loss Curve')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()

plt.tight_layout()
plt.show()

NameError: name 'history' is not defined

In [26]:
# Sauvegarde du modèle




INFO:tensorflow:Assets written to: ../api/review/assets


INFO:tensorflow:Assets written to: ../api/review/assets
