In [33]:
from huggingface_hub import InferenceClient
from getpass import getpass
import pandas as pd
import io
import csv
import time

In [35]:
HUGGINGFACE_API_KEY = getpass("Enter your Hugging Face API key: ")

In [36]:
MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"
client = InferenceClient(
    model=MODEL,
    token=HUGGINGFACE_API_KEY
)

In [42]:
SYSTEM_PROMPT = """You are a food science expert specializing in culinary chemistry and taste perception. You will enhance a taste profile database by adding preparation method variations for ingredients.

CONTEXT: 
- Each ingredient has 5 taste dimensions: salty, umami, sweet, sour, bitter (scored 0-1)
- Preparation methods chemically transform ingredients, altering their taste profiles
- Common transformations include: fermentation, pickling, roasting, caramelization, curing, smoking, drying, blanching, grilling

RULES:
1.  **Analyze the Base Ingredient:** Understand what it is (e.g., a vegetable, a dairy product, a meat).
2.  **Generate Variations:** Brainstorm common cooking methods, processing techniques, or specific types related to the ingredient. The new name should be "Processed Ingredient" (e.g., "Fermented Cabbage", "Smoked Salmon", "Blue Cheese"). Include both traditional and modern preparation techniques.
3.  **Estimate Taste Profile:** For each new variation, create a new 5-point taste profile (salty, umami, sweet, sour, bitter) with scores from 0.0 to 1.0. The scores MUST logically reflect the change caused by the process. Consider how these methods chemically transform the taste compounds. For example, fermenting increases sourness, curing increases saltiness, and roasting can increase sweetness and bitterness (caramelization).
4.  **Format Output Correctly:** The output MUST be only the new CSV lines. Do not include headers, explanations, or the original line. 

IMPORTANT RULES:
- Only generate preparations that actually exist in culinary practice
- Score adjustments should reflect real chemical changes, not assumptions
- For ingredients rarely processed (e.g., salt), return only the original
- Include a note explaining the dominant chemical/taste change

**OUTPUT FORMAT** STRICTLY ONLY RETURN A LIST OF NEW CSV LINES with the following columns:
entity_name,entity_type,salty,umami,sweet,sour,bitter,notes

EXAMPLES:
Input: cabbage,ingredient,0.0,0.2,0.3,0.1,0.4,"Category: Vegetable"
Output:
cabbage_fermented,ingredient_processed,0.2,0.6,0.1,0.8,0.3,"Lactic acid fermentation creates glutamates (umami) and acids (sour)"
cabbage_pickled,ingredient_processed,0.4,0.1,0.2,0.9,0.2,"Vinegar pickling adds acetic acid (sour) and salt"'
cabbage_roasted,ingredient_processed,0.1,0.4,0.5,0.0,0.5,"Maillard reaction creates savory-sweet compounds"'
cabbage_blanched,ingredient_processed,0.0,0.1,0.2,0.0,0.2,"Blanching reduces bitter glucosinolates"

Input: pork,ingredient,0.1,0.7,0.2,0.0,0.1,"Category: Meat"
Output:
pork_cured,ingredient_processed,0.9,0.8,0.1,0.0,0.2,"Salt curing and nitrites enhance umami"
pork_smoked,ingredient_processed,0.5,0.9,0.3,0.0,0.4,"Smoke compounds add umami and slight bitterness"
pork_caramelized,ingredient_processed,0.2,0.8,0.6,0.0,0.3,"Surface caramelization creates sweet-savory crust"
"""

In [25]:
USER_PROMPT_TEMPLATE = """
Generate the processed variations for this ingredient:
{ingredient_row}

**IMPORTANT**: Your response must contain ONLY the raw CSV lines and nothing else. Do not include any analysis, explanations, or introductory text.
"""

In [43]:

def expand_ingredient(row, client):
    row_str = f"{row['entity_name']},{row['entity_type']},{row['salty']},"
    row_str += f"{row['umami']},{row['sweet']},{row['sour']},{row['bitter']},\"{row['notes']}\""
    
    messages = [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": USER_PROMPT_TEMPLATE.format(ingredient_row=row_str)}
    ]
    
    response = client.chat_completion(
        model=MODEL,
        messages=messages,
        max_tokens=2048,
        temperature=0.0,
        top_p=0.9
    )
    output = response.choices[0].message.content
    clean_output = output.strip().strip("[]'\" ")
    
    string_io = io.StringIO(clean_output)
    reader = csv.reader(string_io, skipinitialspace=True)
    
    new_rows_as_dicts = []
    headers = ['entity_name', 'entity_type', 'salty', 'umami', 'sweet', 'sour', 'bitter', 'notes']
    
    for row_values in reader:
        # 3. Ensure the row has the correct number of columns
        if len(row_values) == len(headers):
            # 4. Create a dictionary and append it
            new_rows_as_dicts.append(dict(zip(headers, row_values)))
    
    return new_rows_as_dicts


In [46]:
df = pd.read_csv('knowledge_base_average.csv')


all_rows = []
    
print("Starting knowledge base expansion...")
for index, row in df.iterrows():
    print(f"[{index + 1}/{len(df)}] Expanding '{row['entity_name']}'...")
    
    # 1. Add the original row to our new list
    all_rows.append(row.to_dict())
    
    # 2. Get the new, expanded rows
    new_rows = expand_ingredient(row, client)
    
    if new_rows:
        print(f"  -> Generated {len(new_rows)} new variations.")
        all_rows.extend(new_rows)
    else:
        print("  -> No new variations generated or an error occurred.")
    
    # 3. Be polite to the API to avoid rate limits
    time.sleep(1) 

print("\nExpansion complete.")

# Create the final augmented DataFrame
augmented_df = pd.DataFrame(all_rows)

# Save to the new CSV file
augmented_df.to_csv("knowledge_base_average_processed.csv", index=False)
print(f"Augmented knowledge base saved to 'temp_new'.")
print(f"Original size: {len(df)} rows. New size: {len(augmented_df)} rows.")

Starting knowledge base expansion...
[1/935] Expanding 'bakery products'...
  -> Generated 3 new variations.
[2/935] Expanding 'bread'...
  -> Generated 4 new variations.
[3/935] Expanding 'rye bread'...
  -> Generated 4 new variations.
[4/935] Expanding 'wheaten bread'...
  -> Generated 3 new variations.
[5/935] Expanding 'white bread'...
  -> Generated 3 new variations.
[6/935] Expanding 'wholewheat bread'...
  -> Generated 3 new variations.
[7/935] Expanding 'wort'...
  -> Generated 3 new variations.
[8/935] Expanding 'arrack'...
  -> Generated 3 new variations.
[9/935] Expanding 'beer'...
  -> Generated 3 new variations.
[10/935] Expanding 'bantu beer'...
  -> Generated 3 new variations.
[11/935] Expanding 'brandy'...
  -> Generated 3 new variations.
[12/935] Expanding 'anise brandy'...
  -> Generated 2 new variations.
[13/935] Expanding 'apple brandy'...
  -> Generated 2 new variations.
[14/935] Expanding 'armagnac brandy'...
  -> Generated 4 new variations.
[15/935] Expanding 'bl

HfHubHTTPError: 402 Client Error: Payment Required for url: https://router.huggingface.co/together/v1/chat/completions (Request ID: Root=1-68b8e0f7-4256b6af6aca660e6cf2dea4;b9a2d966-41f3-4d85-b9f6-aee3c4385f78)

You have exceeded your monthly included credits for Inference Providers. Subscribe to PRO to get 20x more monthly included credits.