In [1]:
import pandas as pd
import re
import json

# Load the CSV; the "ingredients" column supplies the text to annotate.
df = pd.read_csv("tasteset.csv")

# Regex pattern to match AMOUNT, UNIT, INGREDIENT at the start of the text.
pattern = r"(?P<amount>\d+/\d+|\d+)\s+(?P<unit>\w+)\s+(?P<ingredient>.+)"
# Compile once instead of handing the raw string to re.match for every row.
ingredient_re = re.compile(pattern)

clean_data = []

# Only one column is needed, so iterate it directly rather than via iterrows().
for text in df["ingredients"]:
    # pandas loads empty CSV cells as NaN (a float); re.match would raise
    # TypeError on those, so skip anything that is not a string.
    if not isinstance(text, str):
        continue

    match = ingredient_re.match(text)
    if match is None:
        continue

    # Trim surrounding whitespace from the ingredient and shift its offsets
    # accordingly, so the span never covers a trailing "\r" or spaces.
    ingredient_raw = match.group("ingredient")
    ingredient = ingredient_raw.strip()
    if not ingredient:
        # Nothing but whitespace after the unit: no ingredient to label.
        continue
    ingredient_start = match.start("ingredient") + (
        len(ingredient_raw) - len(ingredient_raw.lstrip())
    )
    ingredient_end = ingredient_start + len(ingredient)

    # Format for NER training: character offsets into the unmodified text.
    entities = [
        {"start": match.start("amount"), "end": match.end("amount"), "label": "B-AMOUNT"},
        {"start": match.start("unit"), "end": match.end("unit"), "label": "B-UNIT"},
        {"start": ingredient_start, "end": ingredient_end, "label": "B-INGREDIENT"},
    ]

    clean_data.append({
        "text": text,
        "entities": entities
    })

# Save as JSONL (one JSON object per line)
with open("clean_tasteset.jsonl", "w", encoding="utf-8") as f:
    for item in clean_data:
        f.write(json.dumps(item) + "\n")

In [3]:
import json
import re

def reformat_tasteset(input_file, output_file):
    """Split each record's text into lines and write one NER example per parsable line.

    ``input_file`` is read as JSONL: every non-blank line must be a JSON object
    with a ``"text"`` key. The text is split on newlines; each resulting line is
    stripped of surrounding whitespace and matched against an
    ``AMOUNT UNIT INGREDIENT`` pattern. Lines that do not match are dropped.
    For each matching line a JSON object is written to ``output_file`` with the
    stripped line as ``"text"`` and three ``"entities"`` (``B-AMOUNT``,
    ``B-UNIT``, ``B-INGREDIENT``) given as character offsets into that text.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the JSONL file to write; opened in "w" mode, so
            existing content is replaced.

    Raises:
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
        KeyError: If a record has no ``"text"`` key.
    """
    # Built once per call rather than once per ingredient line. Every unit
    # accepts both singular and plural, so "1 teaspoon salt" is kept just like
    # "2 teaspoons salt".
    line_re = re.compile(
        r"(?P<amount>\d+/\d+|\d+\s*-\s*\d+|\d+)\s+"
        r"(?P<unit>ounces?|tablespoons?|teaspoons?|cups?|pieces?|sheets?|tubes?)\s+"
        r"(?P<ingredient>.+)"
    )

    with open(input_file, "r", encoding="utf-8") as f_in, \
            open(output_file, "w", encoding="utf-8") as f_out:
        for line in f_in:
            # Tolerate blank lines in the JSONL instead of failing in json.loads.
            if not line.strip():
                continue

            data = json.loads(line)

            # Split by newline and process each ingredient separately
            for ingredient_line in data["text"].strip().split("\n"):
                # Strip each line so indentation does not block the anchored
                # match and a trailing "\r" or space cannot leak into the
                # stored text or the ingredient span.
                ingredient_line = ingredient_line.strip()
                if not ingredient_line:
                    continue

                match = line_re.match(ingredient_line)
                if match is None:
                    continue

                # Write clean example; offsets index into ingredient_line.
                f_out.write(json.dumps({
                    "text": ingredient_line,
                    "entities": [
                        {"start": match.start("amount"), "end": match.end("amount"), "label": "B-AMOUNT"},
                        {"start": match.start("unit"), "end": match.end("unit"), "label": "B-UNIT"},
                        {"start": match.start("ingredient"), "end": match.end("ingredient"), "label": "B-INGREDIENT"}
                    ]
                }) + "\n")

# Run the conversion: read clean_tasteset.jsonl and write clean_food.jsonl.
reformat_tasteset(
    input_file="clean_tasteset.jsonl",
    output_file="clean_food.jsonl",
)