In [1]:
import pandas as pd

# Function to load CoNLL data
def load_conll_data(file_path):
    """Read a CoNLL-style file into parallel lists of tokens and tags.

    Every non-blank line describes one token as whitespace-separated columns:
    the token is taken from the first column and its tag from the last one, so
    both two-column files and files with extra columns in between are
    accepted. One or more blank lines end the current sentence.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A ``(sentences, labels)`` tuple of lists. ``sentences[i]`` holds the
        tokens of the i-th sentence and ``labels[i]`` the matching tags; both
        inner lists always have the same length. Empty sentences are never
        returned.

    Raises:
        ValueError: If a non-blank line has fewer than two columns.
    """
    sentences = []
    labels = []
    sentence = []
    label = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file, start=1):
            columns = line.split()

            if not columns:
                # Blank line: close the running sentence. Runs of blank lines
                # (or a blank line at the top of the file) must not produce
                # empty sentences.
                if sentence:
                    sentences.append(sentence)
                    labels.append(label)
                    sentence = []
                    label = []
                continue

            if len(columns) < 2:
                raise ValueError(
                    f"{file_path}:{line_number}: expected a token and a tag, "
                    f"got {line.strip()!r}"
                )

            sentence.append(columns[0])
            label.append(columns[-1])

    # The file may end without a trailing blank line; keep the last sentence.
    if sentence:
        sentences.append(sentence)
        labels.append(label)

    return sentences, labels

# Path of the annotated corpus; a relative path resolves against the
# notebook's working directory.
conll_path = 'labeled_dataset.conll'
sentences, labels = load_conll_data(conll_path)


In [None]:
from transformers import XLMRobertaTokenizerFast

# Every sentence is padded / truncated to this many sub-word positions.
MAX_LENGTH = 128
# Label for positions that must not contribute to the loss (special tokens,
# padding, non-initial word pieces). -100 is the default ignore index of the
# cross-entropy loss and of DataCollatorForTokenClassification.
IGNORE_INDEX = -100

# A fast tokenizer is required: only fast tokenizers expose `word_ids()`,
# which maps each sub-word position back to the word it came from.
tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-base")

# Deterministic tag -> integer id map built from the tags present in the data.
label2id = {
    tag: idx
    for idx, tag in enumerate(sorted({tag for tags in labels for tag in tags}))
}

tokenized_inputs = []
label_list = []

for sentence, label in zip(sentences, labels):
    if not sentence:
        # Nothing to tokenize or align for an empty sentence.
        continue

    encoding = tokenizer(
        sentence,
        is_split_into_words=True,
        padding='max_length',
        truncation=True,
        return_tensors="pt",
        max_length=MAX_LENGTH,
    )
    tokenized_inputs.append(encoding)

    # One label per sub-word position. A word may be split into several
    # pieces, so labels are aligned through word indices rather than by
    # comparing piece strings with the original words.
    aligned_labels = []
    previous_word_id = None
    for word_id in encoding.word_ids(batch_index=0):
        if word_id is None or word_id == previous_word_id:
            # Special / padding position, or a continuation piece of the
            # word that was already labelled on its first piece.
            aligned_labels.append(IGNORE_INDEX)
        else:
            aligned_labels.append(label2id[label[word_id]])
        previous_word_id = word_id

    # word_ids() yields exactly one entry per position, so the label row
    # already has the padded length and needs no extra padding.
    label_list.append(aligned_labels)


In [None]:
from transformers import TrainingArguments

# Hyper-parameters for the fine-tuning run, grouped by concern.
training_args = TrainingArguments(
    # Output location
    output_dir='./results',
    # Schedule: three passes over the training data, evaluating per epoch.
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` — confirm against the installed version.
    num_train_epochs=3,
    evaluation_strategy='epoch',
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes (per device)
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)


In [None]:
from transformers import XLMRobertaForTokenClassification, Trainer, DataCollatorForTokenClassification


# Build the tag inventory from the `labels` loaded from the CoNLL file instead
# of a hand-written sample, so the classification head gets one output per tag
# that actually occurs in the corpus and `labels` is not overwritten.
all_labels = [tag for sentence_tags in labels for tag in sentence_tags]
# Sorted so the tag -> id assignment is identical on every run; iterating a
# plain set of strings can yield a different order in each interpreter session.
unique_labels = sorted(set(all_labels))
id2label = dict(enumerate(unique_labels))
label2id = {tag: idx for idx, tag in id2label.items()}

model = XLMRobertaForTokenClassification.from_pretrained(
    "xlm-roberta-base",
    num_labels=len(unique_labels),
    # Stored in the model config so predictions can be decoded back to tags.
    id2label=id2label,
    label2id=label2id,
)

data_collator = DataCollatorForTokenClassification(tokenizer)

# NOTE(review): `train_dataset` and `eval_dataset` are not defined in any cell
# visible in this notebook. They must be created before this cell runs
# (presumably from `tokenized_inputs` and `label_list`), otherwise a fresh
# Restart & Run All stops here with a NameError.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Start training
trainer.train()
