<a href="https://colab.research.google.com/github/michalis0/DataScience_and_MachineLearning/blob/master/Assignements/Part%204/Assignment_part_four.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a> 

**BERT for sequence classification**

Finally, we used the `BertForSequenceClassification` model from the BERT (Bidirectional Encoder Representations from Transformers) family, specifically the `bert-base-multilingual-cased` checkpoint, configured for sequence classification. This model is designed to handle complex natural language processing tasks and can understand and analyze text in multiple languages. We used it to classify text into six categories (the difficulty levels A1, A2, B1, B2, C1 and C2), as set by the `num_labels=6` parameter. This setup allows the model to learn and predict the difficulty level of a text, harnessing BERT's sophisticated understanding of language context and structure.

BERT's architecture, combined with the sequence classification layer, provided a powerful tool for our text classification needs. The model's ability to process and analyze textual data in depth, considering the contextual nuances of language, made it an ideal choice for accurately categorizing texts into predefined difficulty levels.

In [None]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

import pandas as pd
# Download the three project CSV files (training set, unlabelled test set,
# sample submission) in that order and discard rows with missing values.
df_train, df_test, df_final = (
    pd.read_csv(csv_url).dropna()
    for csv_url in (
        "https://raw.githubusercontent.com/Oglo/Project-DSML/main/Data/training_data.csv",
        "https://raw.githubusercontent.com/Oglo/Project-DSML/main/Data/unlabelled_test_data.csv",
        "https://raw.githubusercontent.com/Oglo/Project-DSML/main/Data/sample_submission.csv",
    )
)

In [None]:
!pip install transformers
!pip install torch

from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim

# Alias the loaded frames under the names used by the rest of this cell
# (no copy is made; both names refer to the same DataFrame objects).
training_data, unlabelled_test_data = df_train, df_test

class FrenchTextDataset(Dataset):
    """Torch dataset exposing tokenizer encodings and optional labels.

    Args:
        encodings: Mapping from feature name to a per-example sequence. It
            must provide ``.items()`` and an ``'input_ids'`` entry, whose
            length defines the dataset size.
        labels: Optional per-example labels, indexable by position. When
            ``None`` (the default) or empty, items carry no ``'labels'`` key.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, one per encoding key.

        A ``'labels'`` tensor is added when labels were supplied.
        """
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Test for presence explicitly instead of relying on truthiness:
        # ``if self.labels:`` raises for multi-element NumPy arrays and
        # tensors. The length check keeps an empty sequence meaning
        # "unlabelled", as it did before.
        if self.labels is not None and len(self.labels) > 0:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples (length of ``'input_ids'``)."""
        return len(self.encodings['input_ids'])

# Difficulty label -> class index, in increasing order of difficulty.
difficulty_mapping = {
    level: index
    for index, level in enumerate(('A1', 'A2', 'B1', 'B2', 'C1', 'C2'))
}
training_labels = (
    training_data['difficulty']
    .map(difficulty_mapping)
    .tolist()
)

# Tokenise every training sentence with the multilingual BERT vocabulary;
# sequences are padded to a common length and cut at 128 tokens.
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
encoded_training_data = tokenizer(
    training_data['sentence'].tolist(),
    max_length=128,
    padding=True,
    truncation=True,
)

# Wrap encodings and labels, then serve them in shuffled batches of 16.
train_dataset = FrenchTextDataset(encodings=encoded_training_data, labels=training_labels)

train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)

# Load multilingual BERT configured for sequence classification with 6 labels.
model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-cased', num_labels=6)

# Run on the GPU when one is available, otherwise stay on the CPU. The model
# is moved *before* the optimizer is created so the optimizer is built from
# the parameters that are actually trained. The model remains on `device`
# after this cell.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = optim.Adam(model.parameters(), lr=5e-5)
# NOTE: `criterion` is not referenced below; training uses `outputs.loss`,
# returned by the model when `labels` are passed. Kept so the name stays defined.
criterion = torch.nn.CrossEntropyLoss()

# ---- Fine-tuning: 3 passes over the training loader ----
model.train()
for epoch in range(3):
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        # Batches are built on the CPU; move them to the model's device.
        input_ids = batch['input_ids'].to(device=device, dtype=torch.int64)
        attention_mask = batch['attention_mask'].to(device=device, dtype=torch.int64)
        labels = batch['labels'].to(device=device, dtype=torch.int64)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Mean per-batch loss for the epoch.
    print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}")

# ---- Inference on the unlabelled test sentences ----
encoded_test_data = tokenizer(unlabelled_test_data['sentence'].tolist(), truncation=True, padding=True, max_length=128)
test_dataset = FrenchTextDataset(encoded_test_data)
# shuffle=False keeps predictions in the same order as the test rows.
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

model.eval()
predicted_difficulties = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device=device, dtype=torch.int64)
        attention_mask = batch['attention_mask'].to(device=device, dtype=torch.int64)
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Highest-scoring class index per sentence, brought back to the CPU.
        predictions = torch.argmax(logits, dim=1).cpu().numpy()
        predicted_difficulties.extend(predictions)

# Invert the label mapping (class index -> difficulty label) and translate
# the predicted class indices back into their labels.
reverse_difficulty_mapping = dict(
    zip(difficulty_mapping.values(), difficulty_mapping.keys())
)
predicted_difficulties = list(
    map(reverse_difficulty_mapping.__getitem__, predicted_difficulties)
)

# Pair each test-row id with its predicted difficulty label.
result_df = pd.DataFrame(
    {
        'id': unlabelled_test_data['id'],
        'difficulty': predicted_difficulties,
    }
)
