In [None]:
# Import necessary libraries
import os

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, Dataset
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import BertModel, BertTokenizer, CamembertModel, CamembertTokenizer
from transformers import CamembertForSequenceClassification

# Import custom modules
import load_data
from utils import *
from training_text_model import *

# Pick the compute device: CUDA when PyTorch can see a GPU, CPU otherwise
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(torch.cuda.is_available())

# Folder that holds the transcription files
transcr_path = 'paco-cheese/transcr'

# Read the transcriptions through the project-local loader (load_words=True)
data = load_data.load_all_ipus(load_words=True, folder_path=transcr_path)

In [None]:
# Peek at ten consecutive entries (positions 275 to 284) of the loaded data
data[275:285]

# Création des y à prédire

In [None]:
# Wrap the loaded data in a DataFrame
df = pd.DataFrame(data=data)

# Derive the labels to predict from that DataFrame (helper comes from a star import)
y = create_y(df)

In [None]:
# Flatten the 'text_words' column into a plain Python list, substituting the
# '[UNK]' token for missing entries so the tokenizer never receives a null value
features = [
    '[UNK]' if pd.isnull(word) else word
    for word in data['text_words'].to_numpy().reshape(-1).tolist()
]

# Fixed seed so the split is reproducible
seed = 42

# Hold out 20% of the samples for testing
X_train, X_test, y_train, y_test = train_test_split(
    features, y, random_state=seed, test_size=0.2
)

# Turn each split into text sequences.
# NOTE(review): train_test_split shuffles by default, so the sequences are built
# from shuffled entries; if create_sequences relies on neighbouring entries for
# context, it should probably run before the split -- confirm against utils.
X_train, X_test = create_sequences(X_train), create_sequences(X_test)

# Sanity check: sequences and labels should line up in both splits
print(f"Number of training sequences: {len(X_train)}, labels: {len(y_train)}")
print(f"Number of test sequences: {len(X_test)}, labels: {len(y_test)}")

In [None]:
# Show only a few training sequences: printing the whole training set floods
# the cell output and bloats the saved notebook
print(X_train[:5])

# Labels at positions 365 to 374 of the training split
print(y_train[365:375])

In [None]:
# Hyper-parameters for the run
model_name = 'camembert-base'  # pre-trained checkpoint identifier
max_length = 256               # maximum sequence length handed to TextDataset
batch_size = 16                # samples per batch
epochs = 3                     # number of training epochs
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# CamemBERT tokenizer matching the checkpoint above
tokenizer = CamembertTokenizer.from_pretrained(model_name)

# Training split: dataset plus a shuffling loader
train_dataset = TextDataset(X_train, y_train, tokenizer, max_length)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

# Test split: dataset plus a loader that keeps the original order
test_dataset = TextDataset(X_test, y_test, tokenizer, max_length)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Number of training sequences, then number of training labels (one per line)
print(len(X_train), len(y_train), sep='\n')

# Entrainement

In [None]:
# Load pre-trained CamemBERT with a 2-way sequence-classification head
model = CamembertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# To resume from a previously saved checkpoint instead, load it here, e.g.:
# model = torch.load('modele/camembert_epoch_3.bin')

# Move the model to the selected device (GPU if available, otherwise CPU)
model.to(device)

# Directory where per-epoch checkpoints are written; exist_ok avoids the
# check-then-create race of a separate os.path.exists() test
model_save_path = 'modele'
os.makedirs(model_save_path, exist_ok=True)

# torch.optim.AdamW replaces the deprecated transformers.AdamW.
# eps=1e-6 and weight_decay=0.0 reproduce the defaults of the transformers
# implementation (torch's own defaults are eps=1e-8 and weight_decay=0.01).
optimizer = optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# One scheduler step is taken per batch, so the schedule spans every batch of every epoch
total_steps = len(train_loader) * epochs

# Linear decay of the learning rate to zero over total_steps, without warmup
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Per-class loss weights: errors on class 1 count 15x more than errors on class 0
class_weights = torch.tensor([1.0, 15.0], device=device)

# Weighted cross-entropy loss
loss_fn = nn.CrossEntropyLoss(weight=class_weights)

# Define the function for training one epoch
def train_epoch(model, data_loader, optimizer, device, scheduler, loss_fn):
    """Run one optimisation pass over ``data_loader``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes a ``logits`` attribute.
        data_loader: Iterable of batches; each batch is a mapping with the
            keys ``"input_ids"``, ``"attention_mask"`` and ``"labels"``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar loss.

    Returns:
        A ``(accuracy, mean_loss)`` tuple. ``accuracy`` is a float64 tensor
        computed over the samples actually iterated, so it stays correct when
        the loader drops a last partial batch or uses a sampler.
        ``mean_loss`` is the mean of the per-batch losses, or NaN if the
        loader yielded no batch.
    """
    model.train()

    batch_losses = []
    # Tensor accumulator so the returned accuracy is a tensor even when the
    # loader is empty (a plain int has no .double()).
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)
    samples_seen = 0

    for batch in data_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Clear gradients left over from the previous step
        model.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)

        # Predicted class = index of the largest logit
        preds = outputs.logits.argmax(dim=1)
        correct_predictions += (preds == labels).sum()
        samples_seen += labels.size(0)

        batch_losses.append(loss.item())

        loss.backward()

        # Clip the gradient norm to 1.0 to guard against exploding gradients
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        scheduler.step()

    # max(..., 1) avoids a 0/0 when nothing was iterated
    accuracy = correct_predictions.double() / max(samples_seen, 1)
    mean_loss = np.mean(batch_losses) if batch_losses else float("nan")
    return accuracy, mean_loss

# Training loop: one pass per configured epoch, so the run matches both the
# "Epoch i/epochs" banner and the scheduler horizon (len(train_loader) * epochs).
# A hard-coded single pass would stop before the learning rate finishes decaying.
for epoch in range(epochs):

    print(f'Epoch {epoch + 1}/{epochs}')
    print('-' * 10)

    # Train for one epoch
    train_acc, train_loss = train_epoch(
        model,
        train_loader,
        optimizer,
        device,
        scheduler,
        loss_fn
    )

    # Report the training loss and accuracy for this epoch
    print(f'Train loss {train_loss} accuracy {train_acc}')

    # Checkpoint after every epoch. torch.save(model) pickles the whole module,
    # so reloading with torch.load requires the same class definitions to be importable.
    save_file_path = os.path.join(model_save_path, f'camembert_epoch_{epoch+1}.bin')
    torch.save(model, save_file_path)

# Score F1 et matrice de confusion sur le jeu de test

In [None]:
# To evaluate a saved checkpoint instead of the in-memory model, load it here, e.g.:
# model = torch.load('modele/camembert_epoch_1.bin')

# Switch to evaluation mode before predicting
model.eval()

# Predictions and ground-truth labels over the test loader (project helper)
all_preds_text, all_labels = prediction_model_text(model, test_loader, device)

# F1 score of the predictions
f1 = f1_score(all_labels, all_preds_text)

# labels=[0, 1] pins a 2x2 matrix even when one class is absent from the test
# labels and predictions, so the row/column indexing below cannot go out of bounds
conf_matrix = confusion_matrix(all_labels, all_preds_text, labels=[0, 1])

# Print the F1 score and confusion matrix
print(f'Test F1 Score: {f1}')
print(f'Confusion Matrix:\n{conf_matrix}')

# Row sums = number of true instances of each class
total_class_0 = np.sum(conf_matrix[0])
total_class_1 = np.sum(conf_matrix[1])

# Diagonal = instances of each class that were predicted as that class
detected_class_0 = conf_matrix[0, 0]
detected_class_1 = conf_matrix[1, 1]

# Print the number of correctly detected instances for each class
print(f'Nombre d\'éléments de classe 0 détectés : {detected_class_0} sur {total_class_0}')
print(f'Nombre d\'éléments de classe 1 détectés : {detected_class_1} sur {total_class_1}')

# Prédictions sur quelques données afin de voir ce qui se passe

In [None]:
# Inspect the model's outputs on a handful of training sequences.
# The tokenizer and max_length from the configuration cell are reused so these
# inputs are tokenized and truncated with the same settings as the datasets.

# Make sure dropout is disabled even if this cell runs right after training
model.eval()

# Class probabilities collected for each inspected sequence
all_probs = []

# Use a dedicated name here rather than `data`, which holds the loaded corpus
# that earlier cells read; overwriting it would break re-running those cells
sample_sentences = X_train[273:277]

# No gradients are needed for inference
with torch.no_grad():
    for sentence in sample_sentences:
        print(sentence)

        # Tokenize to PyTorch tensors, truncating to the training max_length
        inputs = tokenizer(
            sentence,
            return_tensors='pt',
            padding=True,
            truncation=True,
            max_length=max_length,
        )

        # Move the input tensors to the model's device
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Softmax over the class dimension turns logits into probabilities
        probs = torch.softmax(outputs.logits, dim=1)
        all_probs.append(probs.cpu().numpy())

        # Show the raw model outputs
        print(outputs)