# Homework - Data Collection

In [23]:
import pandas as pd
import numpy as np
import os

In [24]:
data_path = "AppetIte_Dataset.csv"
appetite_df = pd.read_csv(data_path)

In [25]:
print("=== Basic Dataset Information ===")
print(f"Total Records: {appetite_df.shape[0]}")
print(f"Total Features: {appetite_df.shape[1]}")
print("\nColumn Names:")
print(appetite_df.columns.tolist())

=== Basic Dataset Information ===
Total Records: 13501
Total Features: 8

Column Names:
['recipe_id', 'recipe_name', 'ingredients', 'instructions', 'image_path', 'category', 'storage_tips', 'nutrition_score']


In [26]:
print("\n🔍 Data Types & Non-Null Counts:")
print(appetite_df.info())


🔍 Data Types & Non-Null Counts:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13501 entries, 0 to 13500
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   recipe_id        13501 non-null  int64  
 1   recipe_name      13501 non-null  int64  
 2   ingredients      13501 non-null  object 
 3   instructions     13493 non-null  object 
 4   image_path       13501 non-null  object 
 5   category         13501 non-null  object 
 6   storage_tips     13501 non-null  object 
 7   nutrition_score  13501 non-null  float64
dtypes: float64(1), int64(2), object(5)
memory usage: 843.9+ KB
None


In [27]:
print("\n=== Summary Statistics (for numeric columns) ===")
print(appetite_df.describe(include='all').transpose())


=== Summary Statistics (for numeric columns) ===
                   count unique  \
recipe_id        13501.0    NaN   
recipe_name      13501.0    NaN   
ingredients        13501  13473   
instructions       13493  13464   
image_path         13501  13472   
category           13501      4   
storage_tips       13501      1   
nutrition_score  13501.0    NaN   

                                                               top   freq  \
recipe_id                                                      NaN    NaN   
recipe_name                                                    NaN    NaN   
ingredients                                                     []     12   
instructions     place ingredients in blender in the order list...      5   
image_path                                                  #NAME?     30   
category                                                 Indulgent  10685   
storage_tips     Store ingredients in airtight containers; refr...  13501   
nutrition_score   

In [28]:
print("\n=== Sample Data (first 5 rows) ===")
display(appetite_df.head())
curated_path = "data/curated/AppetIte_Dataset_v1.csv"
os.makedirs("data/curated", exist_ok=True)
appetite_df.to_csv(curated_path, index=False)


=== Sample Data (first 5 rows) ===


Unnamed: 0,recipe_id,recipe_name,ingredients,instructions,image_path,category,storage_tips,nutrition_score
0,1,0,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...","pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,Indulgent,Store ingredients in airtight containers; refr...,0.63
1,2,1,"['2 large egg whites', '1 pound new potatoes (...",preheat oven to 400°f and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,Indulgent,Store ingredients in airtight containers; refr...,0.83
2,3,2,"['1 cup evaporated milk', '1 cup whole milk', ...",place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,Indulgent,Store ingredients in airtight containers; refr...,0.68
3,4,3,"['1 (¾- to 1-pound) round italian loaf, cut in...",preheat oven to 350°f with rack in middle. gen...,italian-sausage-and-bread-stuffing-240559,Healthy,Store ingredients in airtight containers; refr...,0.69
4,5,4,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,Quick Meals,Store ingredients in airtight containers; refr...,0.65


# Homework - Model development (The very first steps)

In [29]:
!pip install transformers torch --upgrade




In [30]:
from transformers import BartTokenizer, BartForConditionalGeneration
import torch

In [31]:
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

In [32]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = model.to(device)

print(f"✅ Model '{model_name}' loaded successfully on device: {device}")
print(f"Number of parameters: {sum(p.numel() for p in model.parameters()):,}")

✅ Model 'facebook/bart-large-cnn' loaded successfully on device: mps
Number of parameters: 406,290,432


In [33]:
sample_inputs = [
    "ingredients: chicken, rice, soy sauce, garlic, egg",
    "ingredients: spinach, tomato, feta cheese, olive oil",
    "ingredients: oats, honey, banana, milk"
]

In [34]:
for text in sample_inputs:
    inputs = tokenizer(text, return_tensors="pt").to(device)
    summary_ids = model.generate(
        **inputs,
        max_length=30,
        num_beams=4,
        early_stopping=True
    )
    output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    print(f"\n🧾 Input: {text}")
    print(f" Generated Recipe Suggestion: {output}")




🧾 Input: ingredients: chicken, rice, soy sauce, garlic, egg
 Generated Recipe Suggestion: ingredients: chicken, rice, soy sauce, garlic, egg, egg and rice. Serves 8 people at a time

🧾 Input: ingredients: spinach, tomato, feta cheese, olive oil
 Generated Recipe Suggestion: ingredients: spinach, tomato, feta cheese, olive oil and olive oil. Serves 2-3 people at a

🧾 Input: ingredients: oats, honey, banana, milk
 Generated Recipe Suggestion: ingredients: oats, honey, banana, milk, milk. Serves 4 people. For more information, visit www.


In [35]:
with torch.no_grad():
    inputs = tokenizer(sample_inputs[0], return_tensors="pt").to(device)
    outputs = model.model.encoder(**inputs, output_hidden_states=True)
    # Grab last hidden state
    embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()

print("\n Embedding shape:", embeddings.shape)
print("These embeddings can be used for clustering or category classifiers (Healthy, Quick, etc.).")


 Embedding shape: (1, 1024)
These embeddings can be used for clustering or category classifiers (Healthy, Quick, etc.).


In [36]:
test_input = "ingredients: pasta, tomato, garlic, olive oil, basil"
inputs = tokenizer(test_input, return_tensors="pt").to(device)
generated_ids = model.generate(**inputs, max_length=50, num_beams=4, early_stopping=True)
recipe_output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print("\n Example Generated Output:")
print(recipe_output)





 Example Generated Output:
ingredients: pasta, tomato, garlic, olive oil, basil, basil. Serves 4-6 people. For more information, go to www.gofundme.com/sauceof pasta. For


## Next steps:

- Fine-tune on curated AppetIte_Dataset.csv (input_text → target_text)
- Evaluate recipe coherence & category alignment
- Optionally distill or prune model for lower latency (<2 s goal)

# Risk Management and Trustworthiness

In [44]:
import pandas as pd
import re
import json
import os
from datetime import datetime

def check_dataset_quality(df):
    print("Data Quality Check")
    print("Rows:", len(df), ", Columns:", len(df.columns))
    print("Missing Values:\n", df.isnull().sum())
    print("Duplicate Rows:", df.duplicated().sum())
    
    # Fix here — handle both naming cases safely
    category_col = None
    for col in df.columns:
        if col.lower() == 'category':
            category_col = col
            break
    
    if category_col:
        print("Category Distribution:\n", df[category_col].value_counts())
    else:
        print("No 'category' column found in dataset.")
    
    print("==========================")


**Safety & Allergen Detection**

In [45]:
ALLERGENS = ['peanut', 'milk', 'egg', 'soy', 'fish', 'shellfish', 'wheat', 'gluten', 'sesame']

def detect_allergens(text):
    found = [a for a in ALLERGENS if re.search(rf'\b{a}\b', str(text).lower())]
    return found

def safety_check(recipe_text):
    if detect_allergens(recipe_text):
        print(f"Warning: Contains allergens: {detect_allergens(recipe_text)}")
    if any(bad in recipe_text.lower() for bad in ['kill', 'poison', 'suicide']):
        print("Unsafe content detected! Review required.")

**Simple Logging**

In [46]:
def log_prediction(input_text, output_text):
    os.makedirs("logs", exist_ok=True)
    entry = {
        "timestamp": datetime.utcnow().isoformat(),
        "input": input_text,
        "output": output_text
    }
    with open("logs/predictions.jsonl", "a") as f:
        f.write(json.dumps(entry) + "\n")
    print(" Logged prediction for monitoring.")

**Trustworthy Model Card Generator**

In [47]:
def create_model_card(name="AppetIte-BART", version="v1.0"):
    card = f"""
# Model Card: {name}
**Version:** {version}
**Purpose:** Generate recipes from ingredients.
**Training Data:** Curated AppetIte_Dataset.csv
**Risks:** May include allergen ingredients or biased cuisine categories.
**Mitigations:** Allergen filter, user feedback, manual review.
**Contact:** Sharath / Project Maintainer
"""
    with open("MODEL_CARD.md", "w") as f:
        f.write(card)
    print(" Model Card created (MODEL_CARD.md)")


**Example Usage**

In [48]:
df = pd.read_csv("data/curated/AppetIte_Dataset_v1.csv")

check_dataset_quality(df)

sample_input = "ingredients: peanut butter, banana, honey"
sample_output = "peanut butter banana smoothie"

safety_check(sample_output)

log_prediction(sample_input, sample_output)

create_model_card()

Data Quality Check
Rows: 13501 , Columns: 8
Missing Values:
 recipe_id          0
recipe_name        0
ingredients        0
instructions       8
image_path         0
category           0
storage_tips       0
nutrition_score    0
dtype: int64
Duplicate Rows: 0
Category Distribution:
 category
Indulgent          10685
Healthy             1437
Quick Meals         1308
Family-Friendly       71
Name: count, dtype: int64
 Logged prediction for monitoring.
 Model Card created (MODEL_CARD.md)


  "timestamp": datetime.utcnow().isoformat(),
