In [2]:
def read_conll(filepath):
    """Read a CoNLL-style file into parallel token and tag sequences.

    Every non-blank line is split on whitespace; the first field is taken as
    the token and the second as its tag. Lines with fewer than two fields are
    skipped. A blank line ends the current sentence.

    Args:
        filepath: Path to a UTF-8 encoded text file.

    Returns:
        A ``(sentences, labels)`` tuple. ``sentences`` is a list of token
        lists and ``labels`` is the list of matching tag lists, one entry per
        sentence, in file order.
    """
    sentences = []
    labels = []
    tokens = []
    tags = []

    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # Blank line: close the sentence collected so far (if any).
                if tokens:
                    sentences.append(tokens)
                    labels.append(tags)
                    tokens, tags = [], []
            else:
                splits = line.split()
                if len(splits) >= 2:
                    tokens.append(splits[0])
                    tags.append(splits[1])

    # Flush the final sentence: a file that does not end with a blank line
    # would otherwise lose its last sentence.
    if tokens:
        sentences.append(tokens)
        labels.append(tags)

    return sentences, labels

# Parse the labelled file: `tokens` holds one token list per sentence and
# `ner_tags` the matching tag list for each of those sentences.
tokens, ner_tags = read_conll(
    filepath="../data/labeled_telegram_product_price_location.txt"
)


In [9]:
from transformers import AutoTokenizer

model_checkpoint = "xlm-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Build the tag vocabulary from the data itself: collect every distinct tag,
# sort for a stable order, and number the tags 0..N-1 in that order.
unique_labels = sorted({tag for sentence_tags in ner_tags for tag in sentence_tags})
id2label = dict(enumerate(unique_labels))
label2id = {label: idx for idx, label in id2label.items()}


In [10]:
from datasets import Dataset

def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize pre-split sentences and align word-level tags to sub-tokens.

    Uses the module-level ``tokenizer`` and ``label2id`` mapping.

    Args:
        examples: Batch mapping with ``"tokens"`` (list of word lists) and
            ``"ner_tags"`` (list of tag-string lists, parallel to the words).
        label_all_tokens: When True (default, the behaviour this notebook has
            always had) every sub-token of a word receives that word's label
            id. When False only the first sub-token of each word is labelled
            and the remaining sub-tokens get ``-100``.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: one list of
        label ids per sentence, with ``-100`` at positions whose word id is
        ``None`` (tokens that do not map to any input word).
    """
    tokenized_inputs = tokenizer(examples["tokens"], is_split_into_words=True, truncation=True)
    labels = []

    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # No source word for this position: mark it with -100.
                label_ids.append(-100)
            elif label_all_tokens or word_idx != previous_word_idx:
                # First sub-token of a word, or any sub-token when all
                # sub-tokens are to be labelled.
                label_ids.append(label2id[label[word_idx]])
            else:
                # Continuation sub-token with label_all_tokens disabled.
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Convert to Hugging Face dataset
dataset = Dataset.from_dict({"tokens": tokens, "ner_tags": ner_tags})
# Pin the shuffle seed so the 80/20 train/test split is the same on every run;
# without it each execution evaluates on a different set of sentences.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
# Tokenize both splits in batches and attach the aligned "labels" column.
tokenized_dataset = dataset.map(tokenize_and_align_labels, batched=True)


Map: 100%|██████████| 2532/2532 [00:01<00:00, 1548.34 examples/s]
Map: 100%|██████████| 634/634 [00:00<00:00, 1843.87 examples/s]


In [2]:
# Cell 6: Define variables, load model, and set up Trainer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification, AutoTokenizer

# Define the pretrained model checkpoint
model_checkpoint = "xlm-roberta-base"  # or your chosen model checkpoint

# Derive the label inventory from the data with the same rule (sorted unique
# tags) that produced the ids stored in `tokenized_dataset["labels"]`.
# A hand-written list in a different order would make the model's
# id2label/label2id disagree with the ids it is trained on, so predictions
# would decode to the wrong tag names.
unique_labels = sorted(set(tag for seq in ner_tags for tag in seq))

# Create label-to-id and id-to-label mappings
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Load the pretrained model for token classification (NER)
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id
)

# Define training arguments
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
)

# Define the data collator to dynamically pad inputs & labels
data_collator = DataCollatorForTokenClassification(tokenizer)

# Set up the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)


ModuleNotFoundError: No module named 'transformers'