In [None]:
def clean_and_balance_data(pdf_dataframe):
    """Cleans the 'Title' column, replaces values, and balances the dataset"""
    # Nettoyage de la colonne Title
    pdf_dataframe['Title'] = pdf_dataframe['Title'].str.replace(r'\d+', '', regex=True)  # Supprimer les nombres
    pdf_dataframe['Title'] = pdf_dataframe['Title'].str.replace(r'\.pdf', '', regex=True)  # Supprimer ".pdf"
    pdf_dataframe['Title'] = pdf_dataframe['Title'].str.replace(r'[_-]', '', regex=True)  # Supprimer "_" et "-"
    
    # Remplacer les valeurs sp√©cifiques dans la colonne Title
    pdf_dataframe['Title'] = pdf_dataframe['Title'].replace({
        'assignation': 0,
        'assignationsansVices': 1,
        'NotificationVice': 2,
        'Notification': 3
    })
    
    # **Afficher la r√©partition des classes avant √©quilibrage**
    print("R√©partition des classes avant √©quilibrage :")
    print(pdf_dataframe['Title'].value_counts())
    
    # S√©parer les classes majoritaires et minoritaires
    majority = pdf_dataframe[pdf_dataframe['Title'] == 0]
    minority_1 = pdf_dataframe[pdf_dataframe['Title'] == 1]
    minority_2 = pdf_dataframe[pdf_dataframe['Title'] == 2]
    minority_3 = pdf_dataframe[pdf_dataframe['Title'] == 3]

    # Sur√©chantillonner les classes minoritaires
    minority_1_upsampled = resample(minority_1, 
                                    replace=True,     # Permet le sur√©chantillonnage
                                    n_samples=len(majority),  # Faire correspondre la taille de la classe majoritaire
                                    random_state=123)
    minority_2_upsampled = resample(minority_2, 
                                    replace=True, 
                                    n_samples=len(majority), 
                                    random_state=123)
    minority_3_upsampled = resample(minority_3, 
                                    replace=True, 
                                    n_samples=len(majority), 
                                    random_state=123)

    # Fusionner toutes les classes apr√®s le sur√©chantillonnage
    data_balanced = pd.concat([majority, minority_1_upsampled, minority_2_upsampled, minority_3_upsampled])

    return data_balanced


## Legal BERT V2

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch

# Charger le dataset
df = pd.read_csv('pdf_contents_final.csv')

# Diviser les donn√©es en ensembles d'entra√Ænement et de test
X_train, X_test, y_train, y_test = train_test_split(df['Content'], df['Title'], test_size=0.2, random_state=42)

# Charger le tokenizer de LegalBERT
tokenizer = BertTokenizer.from_pretrained('nlpaueb/legal-bert-base-uncased')

# Tokenisation des donn√©es
def tokenize_function(texts):
    return tokenizer(texts, padding=True, truncation=True, max_length=512)

train_encodings = tokenize_function(X_train.tolist())
test_encodings = tokenize_function(X_test.tolist())

# Cr√©er un Dataset personnalis√© pour BERT
class LegalBERTDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels.iloc[idx])
        return item

# Cr√©er les datasets pour l'entra√Ænement et l'√©valuation
train_dataset = LegalBERTDataset(train_encodings, y_train)
test_dataset = LegalBERTDataset(test_encodings, y_test)

# Charger le mod√®le pr√©-entra√Æn√© de LegalBERT pour la classification
model = BertForSequenceClassification.from_pretrained('nlpaueb/legal-bert-base-uncased', num_labels=4)

# Arguments d'entra√Ænement
training_args = TrainingArguments(
    output_dir='./results',          # R√©pertoire de sortie
    evaluation_strategy="epoch",     # Strat√©gie d'√©valuation par √©poque
    learning_rate=2e-5,              # Taux d'apprentissage
    per_device_train_batch_size=8,   # Taille du batch pour l'entra√Ænement
    per_device_eval_batch_size=8,    # Taille du batch pour l'√©valuation
    num_train_epochs=3,              # Nombre d'√©poques
    weight_decay=0.01,               # D√©croissance du poids
    logging_dir='./logs',            # R√©pertoire des logs
)

# Initialiser le Trainer
trainer = Trainer(
    model=model, 
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Entra√Æner le mod√®le
trainer.train()

# √âvaluer le mod√®le
trainer.evaluate()

# Pr√©dictions sur le jeu de test
predictions = trainer.predict(test_dataset)

# Afficher les r√©sultats des pr√©dictions
print(predictions.predictions.argmax(axis=-1))

# Sauvegarder le mod√®le et le tokenizer
model.save_pretrained('./legal_bert_model')
tokenizer.save_pretrained('./legal_bert_model')


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at nlpaueb/legal-bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/111 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from sklearn.metrics import accuracy_score, confusion_matrix

# Charger le dataset
df = pd.read_csv('pdf_contents_final.csv')

# Charger le mod√®le et le tokenizer sauvegard√©s
model = BertForSequenceClassification.from_pretrained('./legal_bert_model')
tokenizer = BertTokenizer.from_pretrained('./legal_bert_model')

# Fonction pour pr√©dire la classe d'un texte
def predict(texts):
    # Tokeniser les nouveaux textes
    encodings = tokenizer(texts, padding=True, truncation=True, max_length=512, return_tensors="pt")
    
    # Obtenir les pr√©dictions du mod√®le
    with torch.no_grad():
        outputs = model(**encodings)
        logits = outputs.logits
    
    # Obtenir les indices des classes pr√©dominantes
    predictions = torch.argmax(logits, dim=-1)
    return predictions

# Tester le mod√®le avec le contenu du dataset
new_texts = df['Content'].tolist()  # Liste des contenus des PDFs
predicted_labels = predict(new_texts)

# Ajouter les pr√©dictions dans le dataframe
df['Predicted_Label'] = predicted_labels.numpy()

# Afficher les r√©sultats avec les titres
print("Exemples de pr√©dictions compar√©es aux v√©ritables titres :")
print(df[['Title', 'Predicted_Label']].head())

# Calcul de la pr√©cision
accuracy = accuracy_score(df['Title'], df['Predicted_Label'])
print(f"Pr√©cision du mod√®le : {accuracy * 100:.2f}%")

# Confusion Matrix pour visualiser les bonnes et mauvaises pr√©dictions
conf_matrix = confusion_matrix(df['Title'], df['Predicted_Label'])
print("\nMatrice de confusion :")
print(conf_matrix)

# Nombre de bonnes r√©ponses
correct_predictions = (df['Title'] == df['Predicted_Label']).sum()
print(f"\nNombre de bonnes r√©ponses : {correct_predictions}")

# Nombre de mauvaises r√©ponses
incorrect_predictions = len(df) - correct_predictions
print(f"Nombre de mauvaises r√©ponses : {incorrect_predictions}")


Exemples de pr√©dictions compar√©es aux v√©ritables titres :
   Title  Predicted_Label
0      0                0
1      0                0
2      0                0
3      0                0
4      0                0
Pr√©cision du mod√®le : 89.13%

Matrice de confusion :
[[52 40  0  0]
 [ 0 92  0  0]
 [ 0  0 92  0]
 [ 0  0  0 92]]

Nombre de bonnes r√©ponses : 328
Nombre de mauvaises r√©ponses : 40
