Modèle TM

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from bs4 import BeautifulSoup
import os
import chardet

In [2]:
# Load every paragraph (<p> element) from the HTML files under `path`.
# Each paragraph is labelled with the part of the file name that precedes
# the first "-" (presumably the author name -- confirm against the data folder).
path = "data"
textes = []
labels = []

# os.listdir returns entries in arbitrary order; sorting makes the paragraph
# order (and therefore the later grouping and the seeded train/test split)
# reproducible from one machine/run to the next.
for file in sorted(os.listdir(path)):
    path_file = os.path.join(path, file)

    # Skip sub-directories and anything else that is not a regular file.
    if not os.path.isfile(path_file):
        continue

    # Detect the file encoding from its raw bytes.
    with open(path_file, 'rb') as f:
        encoding = chardet.detect(f.read())['encoding']

    # chardet reports None when it cannot decide; fall back to UTF-8 explicitly
    # instead of silently depending on the platform's locale encoding.
    if encoding is None:
        encoding = "utf-8"

    # Re-open the file in text mode with the detected encoding and parse it.
    with open(path_file, encoding=encoding) as fp:
        soup = BeautifulSoup(fp, "html.parser")

    label = file.split("-")[0]
    paragraphes = soup.select("p")
    for p in paragraphes:
        textes.append(p.get_text())
        labels.append(label)


In [3]:
def rassembler_textes_et_labels(textes, labels, taille_minimale=1000):
    """Merge consecutive texts sharing a label into longer samples.

    Texts are consumed in order, pairwise with their labels. While the label
    stays the same, each text is appended to a running buffer (preceded by a
    single space). As soon as the buffer holds at least ``taille_minimale``
    characters it is emitted as one sample and the buffer is cleared.

    When the label changes, the pending buffer is emitted only if it is
    already long enough (otherwise it is dropped), and a new buffer is
    started from the current text.

    Args:
        textes: iterable of strings.
        labels: iterable of labels, aligned with ``textes``.
        taille_minimale: minimum number of characters per emitted sample.

    Returns:
        A tuple ``(textes_rassembles, labels_rassembles)`` of two lists of
        equal length.
    """
    echantillons = []  # (text, label) pairs, in emission order
    tampon_texte, tampon_label = "", ""

    for morceau, etiquette in zip(textes, labels):
        # An empty label marks "no buffer in progress": adopt the incoming one.
        if tampon_label == "":
            tampon_label = etiquette

        if tampon_label != etiquette:
            # Label changed: keep the pending buffer only if it is long enough,
            # then restart from the current text.
            if len(tampon_texte) >= taille_minimale:
                echantillons.append((tampon_texte, tampon_label))
            tampon_texte, tampon_label = morceau, etiquette
            continue

        # Same label: extend the buffer and emit it once it is long enough.
        tampon_texte = tampon_texte + " " + morceau
        if len(tampon_texte) >= taille_minimale:
            echantillons.append((tampon_texte, tampon_label))
            tampon_texte, tampon_label = "", ""

    # Emit whatever is left over, provided it has a label and is long enough.
    if tampon_label and len(tampon_texte) >= taille_minimale:
        echantillons.append((tampon_texte, tampon_label))

    textes_rassembles = [texte for texte, _ in echantillons]
    labels_rassembles = [label for _, label in echantillons]
    return textes_rassembles, labels_rassembles

# Group the raw paragraphs into samples of at least 1000 characters per label.
textes_rassembles, labels_rassembles = rassembler_textes_et_labels(
    textes=textes,
    labels=labels,
    taille_minimale=1000,
)

In [4]:
# Report how many grouped samples each label ended up with.
print("Nombre d'échantillons rassemblés")
for auteur in ("balzac", "flaubert", "maupassant", "sand", "zola"):
    # The display name is the label with its first letter upper-cased.
    print(f"{auteur.capitalize()} :", labels_rassembles.count(auteur))

Nombre d'échantillons rassemblés
Balzac : 1667
Flaubert : 1887
Maupassant : 966
Sand : 1922
Zola : 3826


In [5]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# KFold is used at the bottom of this cell but was never imported anywhere.
from sklearn.model_selection import KFold

# Tokenisation
max_len = 500  # length every sequence is padded to by pad_sequences below
max_words = 10000  # vocabulary size handed to the tokenizer (num_words)

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(textes_rassembles)
sequences = tokenizer.texts_to_sequences(textes_rassembles)

word_index = tokenizer.word_index
print(f"{len(word_index)} mots uniques trouvés.")

# Padding: bring every sequence to max_len.
data = pad_sequences(sequences, maxlen=max_len)

# Label encoding: string labels -> integer ids -> one-hot rows.
le = LabelEncoder()
labels_encoded = le.fit_transform(labels_rassembles)
labels_categorical = to_categorical(labels_encoded)

# Train/test split. The original cell split the undefined names `X` and `y`,
# which raises NameError on a fresh kernel; the arrays built above are
# `data` and `labels_categorical`.
X_train, X_test, y_train, y_test = train_test_split(
    data, labels_categorical, test_size=0.2, random_state=42
)
cv = KFold(n_splits=5, shuffle=True, random_state=42)


70037 mots uniques trouvés.


Construction et entraînement du modèle LSTM

In [45]:
import numpy as np
import os
import pickle
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical


In [46]:
# Paramètres
# Hyper-parameters
MAX_NB_WORDS = 10000  # passed to Tokenizer(num_words=...) and used as the Embedding input size
MAX_SEQUENCE_LENGTH = 1000  # length every sequence is padded to
EMBEDDING_DIM = 100  # width of the embedding vectors
VALIDATION_SPLIT = 0.2  # NOTE(review): not referenced anywhere in the visible cells

# Text -> integer sequences
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(textes_rassembles)
sequences = tokenizer.texts_to_sequences(textes_rassembles)

word_index = tokenizer.word_index
print(f"Found {len(word_index)} unique tokens.")

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# String labels -> integer ids -> one-hot rows
encoder = LabelEncoder()
encoded_labels = encoder.fit_transform(labels_rassembles)
categorical_labels = to_categorical(encoded_labels)

# Hold out 20% of the samples for testing; the rest is used for
# cross-validation and for the final fit.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    categorical_labels,
    test_size=0.2,
    random_state=42,
)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


Found 70036 unique tokens.


In [47]:
# Modèle LSTM
def create_model():
    """Build and compile a fresh, untrained LSTM classifier.

    The architecture is Embedding -> LSTM(128) -> Dense(softmax). It reads the
    notebook-level settings ``MAX_NB_WORDS``, ``EMBEDDING_DIM`` and
    ``MAX_SEQUENCE_LENGTH``, and sizes the output layer from the number of
    classes known to the fitted ``encoder``.

    Returns:
        A compiled Keras ``Sequential`` model (Adam optimizer, categorical
        cross-entropy loss, accuracy metric).
    """
    # The notebook's import cell only brings in Embedding, GRU and Dense, so
    # LSTM would be undefined on a fresh kernel; import it where it is used.
    from tensorflow.keras.layers import LSTM

    model = Sequential()
    model.add(Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
    model.add(LSTM(128))
    # One output unit per class, in the order of encoder.classes_.
    model.add(Dense(len(encoder.classes_), activation='softmax'))
    model.compile(optimizer=Adam(), loss='categorical_crossentropy', metrics=['accuracy'])
    return model


In [50]:

# Cross-validated training: a new model is built and fitted for every fold,
# then scored on that fold's held-out part of the training split.
print("Entraînement du modèle avec cross-validation...")
accuracies = []

# The splitter is given class indices, so the one-hot rows are collapsed first.
fold_targets = np.argmax(y_train, axis=1)

for fit_idx, holdout_idx in cv.split(X_train, fold_targets):
    model = create_model()
    model.fit(
        x=X_train[fit_idx],
        y=y_train[fit_idx],
        batch_size=64,
        epochs=20,
    )
    # evaluate() returns [loss, accuracy] for the model compiled in create_model.
    scores = model.evaluate(x=X_train[holdout_idx], y=y_train[holdout_idx])
    accuracies.append(scores[1])

mean_accuracy = np.mean(accuracies)
print(f"Mean accuracy from cross-validation: {mean_accuracy}")


Entraînement du modèle avec cross-validation...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Epoch 1/20
Epoch 2/20
Epoch 

In [51]:
# Final fit: train a fresh model on the whole training split.
print("Entraînement du modèle avec X_train et y_train...")
model = create_model()
model.fit(x=X_train, y=y_train, batch_size=64, epochs=20)

Entraînement du modèle avec X_train et y_train...
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x1e8adb969a0>

In [57]:
# Predict on the held-out test set and turn both the predicted probabilities
# and the one-hot ground truth into class indices.
print("Prédictions sur l'ensemble de test...")
y_pred = model.predict(X_test)
y_pred_classes, y_test_classes = (
    np.argmax(one_hot_like, axis=1) for one_hot_like in (y_pred, y_test)
)

Prédictions sur l'ensemble de test...


In [58]:
# Overall accuracy on the test set.
accuracy = accuracy_score(y_test_classes, y_pred_classes)
print(f"Test accuracy: {accuracy}")

# Row-normalised confusion matrix: each row (true class) is divided by its
# total, so the diagonal reads as per-class recall.
conf_matrix = confusion_matrix(y_test_classes, y_pred_classes)
row_totals = conf_matrix.sum(axis=1, keepdims=True)
conf_matrix_normalized = conf_matrix.astype('float') / row_totals
print("Normalized confusion matrix:")
print(conf_matrix_normalized)

# Make sure the output directory exists. exist_ok=True replaces the separate
# "check, then create" steps, which could fail if the directory appeared
# between the check and the creation.
model_directory = "model"
os.makedirs(model_directory, exist_ok=True)

Test accuracy: 0.8568646543330087
Normalized confusion matrix:
[[0.92241379 0.00287356 0.0316092  0.03735632 0.00574713]
 [0.00539084 0.86522911 0.06469003 0.01347709 0.05121294]
 [0.0776699  0.15048544 0.55825243 0.05825243 0.15533981]
 [0.07435897 0.03333333 0.06153846 0.78717949 0.04358974]
 [0.00405954 0.0202977  0.01894452 0.01488498 0.94181326]]


In [60]:
# Sauvegarde du modèle
print("Sauvegarde du modèle...")
model.save_weights(f"{model_directory}/LSTM_model_weights.h5")
print("Modèle sauvegardé.")

Sauvegarde du modèle...
Modèle sauvegardé.


Modèle BERT