In [2]:
!pip install transformers



In [3]:
from transformers import BertTokenizer
import pandas as pd

# Load the data; the code below reads its 'sentence' and 'difficulty' columns.
train_data = pd.read_csv('training_data.csv')

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Tokenize the sentences (truncated to at most 128 tokens, padded to a common length)
train_encodings = tokenizer(train_data['sentence'].tolist(), truncation=True, padding=True, max_length=128)

# Convert labels to numeric values if they are not already.
difficulty_levels = {'A1': 0, 'A2': 1, 'B1': 2, 'B2': 3, 'C1': 4, 'C2': 5}
# Map known level names to their class index; values that are not keys of the
# mapping (e.g. labels that are already numeric) pass through unchanged.
mapped_labels = train_data['difficulty'].map(lambda level: difficulty_levels.get(level, level))

# Fail early on anything that is not a valid class index (typos, missing values)
# instead of letting it surface later as an opaque tensor-conversion error.
invalid_labels = mapped_labels[~mapped_labels.isin(list(difficulty_levels.values()))]
if not invalid_labels.empty:
    raise ValueError(
        f"Unrecognized difficulty labels: {sorted(invalid_labels.astype(str).unique())}"
    )

# Explicit int64 so the labels become long tensors downstream.
train_labels = mapped_labels.astype('int64').to_numpy()


  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)okenizer_config.json: 100%|██████████| 29.0/29.0 [00:00<00:00, 78.8kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 996k/996k [00:00<00:00, 1.96MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.96M/1.96M [00:00<00:00, 3.21MB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 625/625 [00:00<00:00, 3.57MB/s]


In [4]:
import torch
from torch.utils.data import Dataset, DataLoader

class FrenchSentencesDataset(Dataset):
    """Map-style dataset pairing per-sample encodings with their labels.

    ``encodings`` must expose ``.items()`` and each of its values must be
    indexable by sample position; ``labels`` must be indexable by sample
    position and support ``len()``.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The number of samples is taken from the labels container.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, with its label under ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized sentences and their labels, then serve them in
# shuffled mini-batches of 16.
train_dataset = FrenchSentencesDataset(encodings=train_encodings, labels=train_labels)
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=16,
    shuffle=True,
)


In [5]:
from transformers import BertForSequenceClassification

# Size the classification head from the label map so the number of output
# classes cannot drift out of sync with `difficulty_levels`.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-multilingual-cased',
    num_labels=len(difficulty_levels),
)


Downloading model.safetensors: 100%|██████████| 714M/714M [02:53<00:00, 4.11MB/s] 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# `transformers.AdamW` is deprecated and no longer shipped by recent
# transformers releases; PyTorch's own implementation is the replacement.
from torch.optim import AdamW

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# eps and weight_decay are pinned to the defaults of the legacy
# transformers optimizer (torch's own defaults are 1e-8 and 0.01).
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)

num_epochs = 3  # Number of training epochs
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    num_batches = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Clear gradients before the forward pass so that gradients left over
        # from an interrupted run of this cell never leak into the next step.
        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    # Report progress once per epoch; guard against an empty loader.
    mean_loss = running_loss / max(num_batches, 1)
    print(f"Epoch {epoch + 1}/{num_epochs} - mean training loss: {mean_loss:.4f}")




KeyboardInterrupt: 