In [None]:
# Import necessary libraries
import os
import torch
import numpy as np
import pandas as pd
from torch import nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from transformers import BertModel, BertTokenizer, CamembertModel, CamembertTokenizer, AdamW, get_linear_schedule_with_warmup, CamembertForSequenceClassification

# Import custom modules
import load_data
from utils_ipu import *
from training_text_model_ipu import *

# Pick the compute device: CUDA when a usable GPU is present, CPU otherwise
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# Report whether CUDA was detected
print(torch.cuda.is_available())

# Folder that holds the transcription files
transcr_path = 'paco-cheese/transcr'

# Read the IPUs from the transcription folder through the project's load_data module
data = load_data.load_all_ipus(load_words=True, folder_path=transcr_path)

In [None]:
# Inspect the slice 275:285 of the loaded data as a quick sanity check
data[275:285]

# Création des y à prédire

In [None]:
# Wrap the loaded data in a pandas DataFrame
df = pd.DataFrame(data)

# Derive the prediction target from the DataFrame (create_y is a project helper)
y = create_y(df)

# Show the first three rows of the DataFrame
df.head(3)

In [None]:
# Create sequences from the DataFrame using the custom function create_sequences
features = create_sequences(df)

# Set the random seed for reproducibility
seed = 42
# Seed NumPy and PyTorch as well: passing `seed` to train_test_split only fixes
# the split, while model initialisation and DataLoader shuffling draw from the
# PyTorch RNG and would otherwise differ on every run.
np.random.seed(seed)
torch.manual_seed(seed)

# Split the data into training (80%) and test (20%) sets
X_train, X_test, y_train, y_test = train_test_split(features, y, test_size=0.2, random_state=seed)

# Print the number of training and test sequences
print(f"Number of training sequences: {len(X_train)}, labels: {len(y_train)}")
print(f"Number of test sequences: {len(X_test)}, labels: {len(y_test)}")

In [None]:
# Preview a handful of training sequences instead of dumping the whole
# training set into the cell output
print(X_train[:5])
# Labels found at slice 365:375 of the training split
print(y_train[365:375])

In [None]:
# --- Hyper-parameters ---
# Name of the pre-trained checkpoint to fine-tune
model_name = 'camembert-base'
# Maximum sequence length handed to the datasets
max_length = 256
# Mini-batch size shared by both dataloaders
batch_size = 16
# Number of training epochs
epochs = 3
# Run on the GPU when CUDA is available, otherwise on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# CamemBERT tokenizer matching the chosen checkpoint
tokenizer = CamembertTokenizer.from_pretrained(model_name)

# Training split: dataset plus a shuffling dataloader
train_dataset = TextDataset(X_train, y_train, tokenizer, max_length)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)

# Test split: dataset plus an order-preserving dataloader
test_dataset = TextDataset(X_test, y_test, tokenizer, max_length)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Show the number of training sequences and training labels, one per line
print(len(X_train), len(y_train), sep='\n')

# Entrainement

In [None]:
# Initialize the CamemBERT model for sequence classification with two labels
model = CamembertForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

# Directory in which the model files are saved
model_save_path = 'modele'
# exist_ok=True creates the directory when missing and is a no-op otherwise,
# without the check-then-create race of a separate os.path.exists test
os.makedirs(model_save_path, exist_ok=True)

# Optimizer plus a linear learning-rate schedule without warmup, sized to
# one scheduler step per batch over all epochs
optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

# Weighted cross-entropy: errors on class 1 count twice as much as on class 0
class_weights = torch.tensor([1.0, 2.0], device=device)
loss_fn = nn.CrossEntropyLoss(weight=class_weights)

# Define a function to train the model for one epoch
def train_epoch(model, data_loader, optimizer, device, scheduler, loss_fn):
    """Run one training pass over ``data_loader`` and report accuracy and loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``
            whose output exposes a ``.logits`` tensor.
        data_loader: Iterable of batches, each a mapping with ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` tensors. It must also expose
            a ``dataset`` attribute supporting ``len()``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar loss.

    Returns:
        tuple: ``(accuracy, mean_loss)``. ``accuracy`` is a 0-dim double
        tensor (correct predictions divided by ``len(data_loader.dataset)``)
        and ``mean_loss`` is the mean of the per-batch losses. Both are NaN
        when there is nothing to average over.
    """
    model.train()
    losses = []
    # Start from a tensor rather than the int 0 so that .double() below also
    # works when the loader yields no batch at all.
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)

        # Clear gradients left over from the previous batch
        model.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = loss_fn(outputs.logits, labels)

        # Predicted class = index of the largest logit
        _, preds = torch.max(outputs.logits, dim=1)
        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        # Cap the gradient norm at 1.0 before the parameter update
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    n_samples = len(data_loader.dataset)
    if n_samples == 0:
        # Accuracy is undefined without samples
        accuracy = torch.full((), float("nan"), dtype=torch.double, device=device)
    else:
        accuracy = correct_predictions.double() / n_samples
    # np.mean([]) would emit a RuntimeWarning; return NaN explicitly instead
    mean_loss = np.mean(losses) if losses else np.float64("nan")

    return accuracy, mean_loss

# Train the model for the configured number of epochs.
# The loop runs `epochs` times (not a hard-coded 3) so it always matches the
# total step count the learning-rate scheduler was built with.
for epoch in range(epochs):
    print(f'Epoch {epoch + 1}/{epochs}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        model,
        train_loader,
        optimizer,
        device,
        scheduler,
        loss_fn
    )

    print(f'Train loss {train_loss} accuracy {train_acc}')
    # Save the whole model object after every epoch, one file per epoch
    save_file_path = os.path.join(model_save_path, f'camembert_ipu_epoch_{epoch+1}.bin')
    torch.save(model, save_file_path)

# Score F1 et matrice de confusion sur le jeu de test

In [None]:
# Load the model saved after the first training epoch.
# NOTE(review): this evaluates the epoch-1 checkpoint, not the last epoch
# trained — confirm that this is the intended model.
# The file is a fully pickled model object, so only load checkpoints you
# produced yourself. map_location lets a checkpoint saved on a GPU be loaded
# on a CPU-only machine.
model = torch.load('modele/camembert_ipu_epoch_1.bin', map_location=device)

# Set the model to evaluation mode
model.eval()

# Get the predictions and labels using the custom function prediction_model_text
all_preds_text, all_labels = prediction_model_text(model, test_loader, device, proba=False)

# Calculate the F1 score for the test set
f1 = f1_score(all_labels, all_preds_text)

# Calculate the confusion matrix for the test set (rows: true class, columns: predicted class)
conf_matrix = confusion_matrix(all_labels, all_preds_text)

# Print the F1 score and confusion matrix
print(f'Test F1 Score: {f1}')
print(f'Confusion Matrix:\n{conf_matrix}')

# Total number of test instances whose true class is 0 / 1 (row sums)
total_class_0 = np.sum(conf_matrix[0])
total_class_1 = np.sum(conf_matrix[1])

# Correctly classified instances of each class (diagonal entries)
detected_class_0 = conf_matrix[0, 0]  # class 0 predicted as class 0
detected_class_1 = conf_matrix[1, 1]  # class 1 predicted as class 1

# Print the number of correctly detected instances for each class
print(f'Number of class 0 instances detected: {detected_class_0} out of {total_class_0}')
print(f'Number of class 1 instances detected: {detected_class_1} out of {total_class_1}')

# Prédictions sur quelques données afin de voir ce qui se passe

In [None]:
# Initialize the CamemBERT tokenizer from the same checkpoint name used for training
tokenizer = CamembertTokenizer.from_pretrained(model_name)

# Class probabilities collected for each inspected sentence
all_probs = []

# Select a small subset of the training data. It gets its own name so the raw
# `data` loaded at the top of the notebook is not overwritten, which would
# break re-running the earlier cells.
sample_sentences = X_train[273:277]

# Make sure the model is in evaluation mode before predicting
model.eval()

# Disable gradient calculations
with torch.no_grad():
    for sentence in sample_sentences:
        # Print the sentence
        print(sentence)

        # Tokenize the sentence with the same maximum length as the
        # training/test datasets and convert to tensors
        inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True, max_length=max_length)
        input_ids = inputs['input_ids'].to(device)
        attention_mask = inputs['attention_mask'].to(device)

        # Get the model outputs
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Turn the logits into class probabilities and keep them on the CPU
        probs = torch.softmax(logits, dim=1)
        all_probs.append(probs.cpu().numpy())

        # Print the raw model outputs
        print(outputs)