In [None]:
import ast
import json

import pandas as pd
import torch
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer

# --- Step 1: Load and Preprocess the Dataset ---
# Read the raw CSV into a pandas DataFrame
dataset = pd.read_csv(filepath_or_buffer="TASTEset.csv")

# Parse the entities column (convert JSON-like strings to Python objects)
def parse_entities(row):
    """Decode the ``ingredients_entities`` field of one row.

    The value is first read as a Python literal. If that fails it is retried
    as JSON, so documents containing ``null``/``true``/``false`` (which are
    not Python literals) are still accepted.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing an
            ``"ingredients_entities"`` key.

    Returns:
        The decoded value, or an empty list when the field cannot be
        decoded by either parser (a message is printed in that case).
    """
    raw = row["ingredients_entities"]

    try:
        return ast.literal_eval(raw)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # Not a Python literal (or not a string at all); try JSON below.
        pass

    try:
        return json.loads(raw)
    except (ValueError, TypeError, RecursionError):
        # Best effort: report the bad value and continue with no entities.
        print(f"Error parsing row: {raw}")
        return []

# Decode every row's annotations into a new "entities" column
dataset["entities"] = dataset.apply(parse_entities, axis=1)

# Wrap the DataFrame in a Hugging Face Dataset
hf_dataset = Dataset.from_pandas(dataset)

# --- Step 2: Tokenize and Align Labels ---
# Checkpoint name used for the tokenizer (and later for the model)
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Label names listed in id order; both lookup tables are derived from this list
label_names = [
    "O",
    "B-QUANTITY",
    "I-QUANTITY",
    "B-UNIT",
    "I-UNIT",
    "B-FOOD",
    "I-FOOD",
]
label2id = {name: idx for idx, name in enumerate(label_names)}
id2label = dict(enumerate(label_names))

def _label_id_for(entity_type, starts_entity):
    """Translate an entity type into a label id.

    Types that are already keys of ``label2id`` are used unchanged. Bare
    types (e.g. ``"FOOD"``) are expanded to their ``B-``/``I-`` form. Anything
    that still has no entry maps to ``"O"``.
    """
    if entity_type in label2id:
        return label2id[entity_type]
    prefix = "B-" if starts_entity else "I-"
    return label2id.get(f"{prefix}{entity_type}", label2id["O"])


def tokenize_and_align_labels(examples):
    """Tokenize a batch of ingredient strings and attach per-token labels.

    Args:
        examples: Batch dict with an ``"ingredients"`` list of strings and an
            ``"entities"`` list holding, per example, a list of entity dicts
            with at least a ``"type"`` key.

    Returns:
        The tokenizer output with an added ``"labels"`` entry: one list of
        label ids per example, ``-100`` on tokens that belong to no word.

    Alignment:
        * When every entity of an example has non-null ``"start"`` and
          ``"end"`` values, these are treated as character offsets into the
          ingredient string (end exclusive -- NOTE(review): confirm against
          the dataset). Each token gets the label of the entity it overlaps;
          the first token of an entity is ``B-``, the following ones ``I-``.
        * Otherwise the entity list is indexed by word position, one entity
          per word, and its ``"type"`` is looked up directly in ``label2id``.
    """
    tokenized_inputs = tokenizer(
        examples["ingredients"],
        truncation=True,
        padding="max_length",
        max_length=128,
        return_offsets_mapping=True,
    )
    # Offsets are only needed for alignment; drop them so the returned
    # columns are the same as without return_offsets_mapping.
    offset_mappings = tokenized_inputs.pop("offset_mapping")
    labels = []

    for i, entity_list in enumerate(examples["entities"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to words
        entity_list = entity_list or []
        has_spans = bool(entity_list) and all(
            isinstance(ent, dict)
            and ent.get("start") is not None
            and ent.get("end") is not None
            for ent in entity_list
        )

        label_ids = []
        previous_entity = None  # index of the entity the previous token fell in
        for word_idx, (tok_start, tok_end) in zip(word_ids, offset_mappings[i]):
            if word_idx is None:
                label_ids.append(-100)  # Ignore special/padding tokens
                previous_entity = None
            elif has_spans:
                # First entity whose character span overlaps this token.
                entity_idx = next(
                    (
                        k
                        for k, ent in enumerate(entity_list)
                        if tok_start < ent["end"] and tok_end > ent["start"]
                    ),
                    None,
                )
                if entity_idx is None:
                    label_ids.append(label2id["O"])
                else:
                    label_ids.append(
                        _label_id_for(
                            entity_list[entity_idx]["type"],
                            starts_entity=entity_idx != previous_entity,
                        )
                    )
                previous_entity = entity_idx
            elif word_idx < len(entity_list):
                # Positional fallback: one entity per word, type used as-is.
                label_ids.append(label2id.get(entity_list[word_idx]["type"], 0))
            else:
                label_ids.append(0)  # No entity for this word -> "O"

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Run tokenization and label alignment over the whole dataset, batch by batch
tokenized_dataset = hf_dataset.map(tokenize_and_align_labels, batched=True)

# --- Step 3: Split Dataset into Train and Validation ---
# 80/20 split; the fixed seed makes the partition repeatable
split_dataset = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset, val_dataset = split_dataset["train"], split_dataset["test"]

# Report how many examples ended up in each partition
print(f"Training set size: {len(train_dataset)}")
print(f"Validation set size: {len(val_dataset)}")

# --- Step 4: Fine-Tune DistilBERT ---
# Load DistilBERT with a token-classification head. Passing the label maps
# stores the real label names in the model config, so the saved checkpoint
# reports e.g. "B-FOOD" instead of a generic "LABEL_5".
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)

# Define training arguments
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # evaluate on the validation set after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Save the trained model and its tokenizer side by side so the directory
# can be reloaded with from_pretrained()
model.save_pretrained("./fine_tuned_distilbert")
tokenizer.save_pretrained("./fine_tuned_distilbert")

# --- Step 5: Test the Model ---
def extract_ingredients(text):
    """Tag every token of ``text`` with its predicted entity label.

    Args:
        text: Raw recipe/ingredient string.

    Returns:
        A list of ``(token, label)`` tuples, one per tokenizer token
        (special tokens included), with labels taken from ``id2label``.
    """
    # Truncate over-long inputs to the tokenizer's maximum length, and put
    # the tensors on the same device as the model (the Trainer may have
    # moved it to an accelerator).
    inputs = tokenizer(text, return_tensors="pt", truncation=True)
    inputs = inputs.to(model.device)

    # Inference only: switch off dropout and gradient tracking, then restore
    # whatever train/eval mode the model was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        model.train(was_training)

    predictions = torch.argmax(outputs.logits, dim=-1)

    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())
    entities = [id2label[label_id] for label_id in predictions[0].tolist()]

    return list(zip(tokens, entities))

recipe_text = "Mix 2 cups of flour with 1 tsp salt"
print(extract_ingredients(recipe_text))




Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Training set size: 800
Validation set size: 200


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
