In [None]:
<a href="https://colab.research.google.com/drive/1s846jzD2EeppSBldjnh7UUrMhXRMpBIP#scrollTo=xA850ZrbKfIL" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a> 

# FlauBERT

The code begins by importing the necessary libraries: pandas for data handling, PyTorch for deep learning, and Transformers for access to pretrained language models. Functions for evaluation metrics such as precision, recall, and F1-score are also imported.

The data preparation phase maps the language levels (A1–C2) to numeric labels and back, because the model is trained on integer class labels while the final predictions must be reported as level names. The custom TextDataset class tokenizes each text and converts it to PyTorch tensors, which are suitable for input into the FlauBERT model.

FlauBERT is chosen for its specialization in the French language. Unlike generic models, FlauBERT has been pre-trained on a diverse set of French texts, making it adept at understanding the nuances and context of the language. This pre-training allows it to effectively grasp various styles and levels of French, which is critical for this project.

In the training phase, DataLoaders feed the data to the model in batches. The AdamW optimizer is used together with a linear learning-rate scheduler. In each epoch the model is trained and then validated, and the average training loss and the validation accuracy are computed and printed.

For predictions, a function is defined that processes new text inputs and generates predictions with the trained model. The model is then applied to the test data, and the predictions are saved to a CSV file. Finally, the model's weights are saved for future use, avoiding the need for retraining.

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import pandas as pd
# Download the three project CSV files from GitHub. Each frame is read first
# and then stripped of every row that contains a missing value.
df_train = pd.read_csv(
    "https://raw.githubusercontent.com/Oglo/Project-DSML/main/Data/training_data_2.csv"
)
df_train = df_train.dropna()

df_test = pd.read_csv(
    "https://raw.githubusercontent.com/Oglo/Project-DSML/main/Data/unlabelled_test_data.csv"
)
df_test = df_test.dropna()

# NOTE(review): df_final is not used anywhere in the visible code; presumably
# it is the submission template — confirm before removing.
df_final = pd.read_csv(
    "https://raw.githubusercontent.com/Oglo/Project-DSML/main/Data/sample_submission.csv"
)
df_final = df_final.dropna()

In [None]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from tqdm import tqdm
from transformers import FlaubertTokenizer, FlaubertForSequenceClassification, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from transformers import CamembertTokenizer, CamembertForSequenceClassification, get_linear_schedule_with_warmup
import string
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

import sentencepiece as spm
import joblib

# Level name -> integer class id; the id is the level's position in the tuple.
label_map = {
    level: index
    for index, level in enumerate(('A1', 'A2', 'B1', 'B2', 'C1', 'C2'))
}
# Integer class id -> level name (exact inverse of ``label_map``).
reverse_label_map = dict(zip(label_map.values(), label_map.keys()))

def convert_labels_to_numeric(df, label_column, mapping=None):
    """Replace the level names in ``df[label_column]`` with integer class ids.

    The column is overwritten in place and the same DataFrame is returned.

    Args:
        df: DataFrame holding the label column.
        label_column: Name of the column to convert.
        mapping: Optional ``{label: id}`` dict. Defaults to the module-level
            ``label_map``.

    Returns:
        ``df`` itself, with ``label_column`` converted.

    Raises:
        ValueError: If the column contains a non-missing value that is not a
            key of the mapping. This also catches calling the function a
            second time on an already-converted column (e.g. re-running a
            notebook cell), which previously turned every label into NaN
            without any warning.
    """
    if mapping is None:
        mapping = label_map

    original = df[label_column]
    converted = original.map(mapping)

    # A value that was present before mapping but is NaN afterwards had no
    # entry in the mapping. Missing inputs stay missing, as before.
    unmapped = converted.isna() & original.notna()
    if unmapped.any():
        bad_values = sorted(original[unmapped].astype(str).unique())
        raise ValueError(
            f"Column {label_column!r} contains values that are not in the "
            f"label mapping: {bad_values}"
        )

    df[label_column] = converted
    return df

def convert_numeric_to_labels(labels):
    """Translate integer class ids back to level names via ``reverse_label_map``.

    Returns a new list in the same order as ``labels``. An id that is not a
    key of ``reverse_label_map`` raises ``KeyError``.
    """
    level_names = []
    for class_id in labels:
        level_names.append(reverse_label_map[class_id])
    return level_names

class TextDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Sequence (list, pandas Series, ...) of input texts.
        labels: Sequence of integer class labels, parallel to ``texts``.
        tokenizer: Callable tokenizer; it is invoked as
            ``tokenizer(text, ...)`` with ``return_tensors='pt'`` and must
            return a mapping with ``'input_ids'`` and ``'attention_mask'``.
        max_len: Length every encoded sequence is padded or truncated to.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        # Materialise to plain lists so that ``__getitem__`` indexes by
        # position. Indexing a pandas Series with ``[i]`` is label-based and
        # fails (or picks the wrong row) when the index was not reset.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, got "
                f"{len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        """Return the raw text, its token ids, attention mask and label."""
        text = str(self.texts[item])
        label = self.labels[item]

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            # Without truncation a text longer than ``max_len`` produces a
            # longer tensor than the padded ones, which breaks batching.
            truncation=True,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            # The tokenizer returns a leading batch axis; flatten it away so
            # each item is one-dimensional.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

def _run_training_epoch(model, loader, optimizer, scheduler, device):
    """Run one optimisation pass over ``loader`` and return the mean batch loss."""
    model.train()
    total_loss = 0.0
    for batch in tqdm(loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        # The scheduler is stepped once per batch, matching ``total_steps``
        # computed by the caller as batches-per-epoch times epochs.
        scheduler.step()

    num_batches = len(loader)
    return total_loss / num_batches if num_batches else float('nan')


def _count_correct_predictions(model, loader, device):
    """Return how many samples in ``loader`` the model classifies correctly."""
    model.eval()
    correct = 0
    for batch in loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        with torch.no_grad():
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)

        predictions = torch.argmax(outputs.logits, dim=-1)
        correct += (predictions == labels).sum().item()
    return correct


def train_model(model, train_dataset, val_dataset, device, learning_rate=2e-5, epochs=3,
                train_batch_size=2, val_batch_size=32):
    """Fine-tune ``model`` and report validation accuracy after every epoch.

    Training uses AdamW with a linear learning-rate schedule without warm-up.
    The model is updated in place.

    Args:
        model: Model whose forward call accepts ``input_ids``,
            ``attention_mask`` and ``labels`` and returns an object exposing
            ``.loss`` and ``.logits``.
        train_dataset: Dataset yielding dicts with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'``; it is shuffled each epoch.
        val_dataset: Dataset of the same form, used for accuracy only.
        device: Device the batches are moved to; the model is expected to be
            on it already.
        learning_rate: AdamW learning rate.
        epochs: Number of passes over ``train_dataset``.
        train_batch_size: Batch size for training (previously fixed at 2).
        val_batch_size: Batch size for validation (previously fixed at 32).

    Returns:
        A list with one dict per epoch holding ``'epoch'``, ``'train_loss'``
        and ``'val_accuracy'``. A metric is ``nan`` when its dataset is empty.
    """
    train_loader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=val_batch_size)
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    total_steps = len(train_loader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

    num_val_samples = len(val_dataset)
    history = []
    for epoch in range(epochs):
        avg_train_loss = _run_training_epoch(model, train_loader, optimizer, scheduler, device)
        print(f'Epoch {epoch+1}/{epochs} - Training Loss: {avg_train_loss}')

        correct = _count_correct_predictions(model, val_loader, device)
        # Guard against an empty validation set instead of dividing by zero.
        avg_val_accuracy = correct / num_val_samples if num_val_samples else float('nan')
        print(f'Epoch {epoch+1}/{epochs} - Validation Accuracy: {avg_val_accuracy}')

        history.append({
            'epoch': epoch + 1,
            'train_loss': avg_train_loss,
            'val_accuracy': avg_val_accuracy,
        })

    return history

def predict(texts, tokenizer, model, device):
    """Classify each text on its own and return the predicted class indices.

    The model is switched to evaluation mode first. Every text is tokenized
    separately (truncated to at most 512 tokens), moved to ``device`` and run
    through the model without gradient tracking. The index of the largest
    softmax probability of the first model output is taken as the prediction.

    Returns:
        A list of Python ints, one per entry of ``texts``, in input order.
    """
    model.eval()

    def _classify(sentence):
        encoded = tokenizer(sentence, return_tensors="pt", max_length=512, truncation=True, padding=True)
        encoded = encoded.to(device)
        with torch.no_grad():
            result = model(**encoded)
        class_probs = result[0].softmax(1)
        return class_probs.argmax().item()

    return [_classify(sentence) for sentence in texts]

# --- Prepare the labelled data ---------------------------------------------
# Convert the 'difficulty' column from level names to integer class ids.
training_data = convert_labels_to_numeric(df_train, 'difficulty')

# Hold out 10% of the rows for validation.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    training_data['sentence'],
    training_data['difficulty'],
    test_size=0.1,
)
# TextDataset indexes with ``series[i]``, so every split needs a fresh
# 0..n-1 index after the shuffle done by train_test_split.
for _split in (train_texts, train_labels, val_texts, val_labels):
    _split.reset_index(drop=True, inplace=True)

# --- Build tokenizer, model and datasets -----------------------------------
tokenizer = FlaubertTokenizer.from_pretrained('flaubert/flaubert_base_cased')
model = FlaubertForSequenceClassification.from_pretrained(
    'flaubert/flaubert_base_cased',
    num_labels=6,
)

train_dataset = TextDataset(train_texts, train_labels, tokenizer)
val_dataset = TextDataset(val_texts, val_labels, tokenizer)

# --- Train on the GPU when one is available --------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

train_model(model, train_dataset, val_dataset, device)

# --- Predict the test sentences and write the result -----------------------
test_texts = df_test['sentence']
predicted_difficulties_numeric = predict(test_texts, tokenizer, model, device)
predicted_difficulties = convert_numeric_to_labels(predicted_difficulties_numeric)

result_df = pd.DataFrame({'id': df_test['id'], 'difficulty': predicted_difficulties})
# NOTE(review): the file name says "CamenBERT" although the model trained
# here is FlauBERT — confirm nothing reads this path before renaming it.
result_df.to_csv('predicted_difficultiesCamenBERT.csv', index=False)

# Persist only the weights (state_dict), not the full model object.
torch.save(model.state_dict(), 'FlauBERT.pth')