<a href="https://colab.research.google.com/drive/1cP2OoW75gnQF7nYUCwHNly-OG6xA_1Gi#scrollTo=7zENRfQhl1h4" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **BERT**

In this new attempt, we employed BERT (Bidirectional Encoder Representations from Transformers), a cutting-edge model in natural language processing, combined with a classification layer. BERT is renowned for its ability to deeply understand the context and nuances of language, making it highly suitable for complex tasks like text difficulty classification. We preprocessed our text data to fit BERT’s requirements, involving lowercasing, removing punctuation and numbers, and condensing spaces.

Then, we used BERT’s tokenizer to convert our sentences into token IDs and attention masks, padding each tokenized sentence to a fixed length of 256 tokens so that every input has the same size. We fed this data to a pretrained multilingual BERT model (`bert-base-multilingual-cased`) with a six-class classification head and fine-tuned the whole model to predict the difficulty level (A1 to C2). Fine-tuning required a careful choice of hyperparameters (here: 4 epochs, batch size 16, the AdamW optimizer with a learning rate of 3e-5 and a linearly decaying learning-rate schedule), but BERT’s contextual understanding of language offered us a significant advantage in accurately classifying text difficulty.

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import pandas as pd
# Labelled training set; later code reads its 'sentence' and 'difficulty' columns.
df_train = pd.read_csv("https://raw.githubusercontent.com/Oglo/Project-DSML/main/Data/training_data.csv")
df_train = df_train.dropna()

# Unlabelled test set; later code reads its 'id' and 'sentence' columns.
df_test = pd.read_csv("https://raw.githubusercontent.com/Oglo/Project-DSML/main/Data/unlabelled_test_data.csv")
df_test = df_test.dropna()

# Sample submission file (not referenced again in the visible part of the notebook).
df_final = pd.read_csv("https://raw.githubusercontent.com/Oglo/Project-DSML/main/Data/sample_submission.csv")
df_final = df_final.dropna()

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
import re
import torch

def preprocess_text(text):
    """Return a normalised copy of *text*.

    The text is lowercased, then every character that is neither a word
    character nor whitespace is removed, then digit runs are removed, and
    finally whitespace runs are collapsed to a single space and the result
    is stripped at both ends.
    """
    # Order matters: digits are dropped before whitespace is collapsed so that
    # the gaps they leave behind are squeezed as well.
    substitutions = (
        (r'[^\w\s]', ''),
        (r'\d+', ''),
        (r'\s+', ' '),
    )
    cleaned = text.lower()
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def encode_sentences(tokenizer, sentences, max_length):
    """Tokenize sentences into fixed-length id and attention-mask tensors.

    Args:
        tokenizer: A callable Hugging Face tokenizer (for example a
            ``BertTokenizer``) that accepts a list of strings.
        sentences: Iterable of strings to encode (a list or a pandas Series).
        max_length: Number of tokens per encoded sentence. Shorter sentences
            are padded to this length and longer ones are truncated to it.

    Returns:
        A tuple ``(input_ids, attention_masks)``; each is the tensor returned
        by the tokenizer with one row per sentence. For empty input, two
        ``(0, max_length)`` long tensors are returned.
    """
    sentences = list(sentences)
    if not sentences:
        # Nothing to tokenize: return correctly shaped empty tensors instead
        # of failing on an empty batch.
        empty = torch.empty((0, max_length), dtype=torch.long)
        return empty, empty.clone()

    # One batched call replaces the per-sentence loop. `padding='max_length'`
    # is the supported spelling of the deprecated `pad_to_max_length=True`,
    # and `truncation=True` makes the length cap explicit so every row has
    # exactly `max_length` tokens.
    encoded = tokenizer(
        sentences,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    return encoded['input_ids'], encoded['attention_mask']

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# `training_data` is the same object as `df_train`, so the cleaned column is
# visible through both names.
training_data = df_train
training_data['cleaned_sentence'] = training_data['sentence'].apply(preprocess_text)

# Tokenizer and sequence-classification model from the same multilingual
# checkpoint; the model gets six output classes, one per difficulty level.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=6,
)
model.to(device)

# Every sentence is padded to this many tokens.
max_length = 256

train_inputs, train_masks = encode_sentences(tokenizer, training_data['cleaned_sentence'], max_length)

# Encode the difficulty labels with an explicit, fixed order (0 = A1 ... 5 = C2).
# `factorize()` numbers labels in order of first appearance in the CSV, which
# only matches the index -> level table used to decode predictions if the file
# happens to list the levels in exactly that order.
difficulty_to_index = {'A1': 0, 'A2': 1, 'B1': 2, 'B2': 3, 'C1': 4, 'C2': 5}
label_codes = training_data['difficulty'].map(difficulty_to_index)
if label_codes.isna().any():
    # Fail loudly rather than train on labels that cannot be decoded later.
    unknown_labels = training_data.loc[label_codes.isna(), 'difficulty'].unique().tolist()
    raise ValueError(f"Unexpected difficulty labels: {unknown_labels}")
train_labels = torch.tensor(label_codes.to_numpy(), dtype=torch.long)

# Shuffled mini-batches of (input_ids, attention_mask, label).
batch_size = 16
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

epochs = 4
lr = 3e-5
# torch's AdamW replaces the deprecated `transformers.AdamW`; weight_decay=0.0
# keeps the default of the transformers implementation used previously.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=1e-8, weight_decay=0.0)

# Linear decay of the learning rate to zero over all optimizer steps, no warm-up.
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Fine-tune the model: one pass over the shuffled training batches per epoch.
for epoch in range(epochs):
    model.train()
    total_loss = 0

    for step, batch in enumerate(train_dataloader):
        # Each batch holds (input ids, attention mask, labels); move all to the device.
        batch_input_ids, batch_input_mask, batch_labels = (tensor.to(device) for tensor in batch)

        # Clear gradients left over from the previous step.
        model.zero_grad()

        # Passing `labels` makes the model return the loss as its first output.
        outputs = model(
            batch_input_ids,
            token_type_ids=None,
            attention_mask=batch_input_mask,
            labels=batch_labels,
        )
        loss = outputs[0]
        total_loss += loss.item()

        loss.backward()
        # Cap the gradient norm at 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

    # Mean loss per batch for this epoch.
    avg_train_loss = total_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}/{epochs} - Loss: {avg_train_loss}')

# Clean and encode the unlabelled sentences exactly like the training data.
# `unlabelled_data` is the same object as `df_test`.
unlabelled_data = df_test
unlabelled_data['cleaned_sentence'] = unlabelled_data['sentence'].apply(preprocess_text)
test_inputs, test_masks = encode_sentences(
    tokenizer,
    unlabelled_data['cleaned_sentence'],
    max_length,
)
test_data = TensorDataset(test_inputs, test_masks)
# No sampler here: batches keep the row order of `unlabelled_data`.
test_dataloader = DataLoader(test_data, batch_size=batch_size)

# Switch to evaluation mode and collect one logits row per test sentence.
model.eval()
all_logits = []

with torch.no_grad():  # inference only, no gradients needed
    for batch in test_dataloader:
        batch_input_ids, batch_input_mask = (tensor.to(device) for tensor in batch)
        outputs = model(batch_input_ids, token_type_ids=None, attention_mask=batch_input_mask)
        # Without labels, the first output is the logits.
        logits = outputs[0]
        all_logits.extend(logits.detach().cpu().numpy())

# The predicted class of each sentence is the index of its largest logit.
predicted_labels = np.argmax(all_logits, axis=1)

# Class index -> difficulty level. This order must match the integer encoding
# of the labels the model was trained on.
label_mapping = {0: 'A1', 1: 'A2', 2: 'B1', 3: 'B2', 4: 'C1', 5: 'C2'}

# Start from the numeric class indices, then translate them to level names.
predictions_df = pd.DataFrame(
    {
        'id': unlabelled_data['id'],
        'predicted_difficulty': predicted_labels,
    }
)
decoded_levels = predictions_df['predicted_difficulty'].map(label_mapping)
predictions_df['predicted_difficulty'] = decoded_levels