In [3]:
import json
import torch
from transformers import (
    DistilBertForTokenClassification,
    DistilBertTokenizerFast,
    Trainer,
    TrainingArguments
)
from datasets import Dataset
from torch.optim import AdamW

# Configuration (keep your existing values)
MODEL_NAME = "distilbert-base-uncased"
TRAIN_DATA_PATH = "./tasteset_final.jsonl"
OUTPUT_DIR = "./recipe_ner_model"
LABELS = ["O", "B-AMOUNT", "B-UNIT", "B-INGREDIENT"]

# Custom Trainer class
class FixedTrainer(Trainer):
    def training_step(self, model, inputs):
        return super().training_step(model, inputs)

# Load dataset (keep your existing code)
def load_dataset(file_path):
    data = []
    with open(file_path, "r") as f:
        for line in f:
            data.append(json.loads(line))
    return Dataset.from_list(data)

dataset = load_dataset(TRAIN_DATA_PATH)

# Initialize tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained(MODEL_NAME)

# Tokenize and align labels (keep your existing code)
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
        is_split_into_words=False
    )
    labels = []
    for i, entities in enumerate(examples["entities"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            else:
                for entity in entities:
                    if entity["start"] <= word_idx < entity["end"]:
                        label_ids.append(LABELS.index(entity["label"]))
                        break
                else:
                    label_ids.append(LABELS.index("O"))
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_dataset = dataset.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=dataset.column_names
)

# Create model
model = DistilBertForTokenClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(LABELS),
    id2label={i: label for i, label in enumerate(LABELS)},
    label2id={label: i for i, label in enumerate(LABELS)}
)

# Training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,
    per_device_train_batch_size=16,
    save_steps=500,
    logging_steps=100,
    disable_tqdm=False,
    learning_rate=2e-5,
    weight_decay=0.01,
)

# Use the fixed trainer
trainer = FixedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    optimizers=(AdamW(model.parameters(), lr=2e-5), None)
)

# Start training
print("Starting training...")
trainer.train()
model.save_pretrained(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Model saved to {OUTPUT_DIR}")

Map:   0%|          | 0/1310 [00:00<?, ? examples/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


TypeError: FixedTrainer.training_step() takes 3 positional arguments but 4 were given