# Homework - Data Collection

In [25]:
import pandas as pd
import numpy as np
import os

In [26]:
data_path = "AppetIte_Dataset.csv"
appetite_df = pd.read_csv(data_path)

In [27]:
print("=== Basic Dataset Information ===")
print(f"Total Records: {appetite_df.shape[0]}")
print(f"Total Features: {appetite_df.shape[1]}")
print("\nColumn Names:")
print(appetite_df.columns.tolist())

=== Basic Dataset Information ===
Total Records: 13501
Total Features: 8

Column Names:
['recipe_id', 'recipe_name', 'ingredients', 'instructions', 'image_path', 'category', 'storage_tips', 'nutrition_score']


In [29]:
print("\nData Types & Non-Null Counts:")
print(appetite_df.info())


Data Types & Non-Null Counts:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13501 entries, 0 to 13500
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   recipe_id        13501 non-null  int64  
 1   recipe_name      13501 non-null  int64  
 2   ingredients      13501 non-null  object 
 3   instructions     13493 non-null  object 
 4   image_path       13501 non-null  object 
 5   category         13501 non-null  object 
 6   storage_tips     13501 non-null  object 
 7   nutrition_score  13501 non-null  float64
dtypes: float64(1), int64(2), object(5)
memory usage: 843.9+ KB
None


In [30]:
print("\n=== Summary Statistics (for numeric columns) ===")
print(appetite_df.describe(include='all').transpose())


=== Summary Statistics (for numeric columns) ===
                   count unique  \
recipe_id        13501.0    NaN   
recipe_name      13501.0    NaN   
ingredients        13501  13473   
instructions       13493  13464   
image_path         13501  13472   
category           13501      4   
storage_tips       13501      1   
nutrition_score  13501.0    NaN   

                                                               top   freq  \
recipe_id                                                      NaN    NaN   
recipe_name                                                    NaN    NaN   
ingredients                                                     []     12   
instructions     place ingredients in blender in the order list...      5   
image_path                                                  #NAME?     30   
category                                                 Indulgent  10685   
storage_tips     Store ingredients in airtight containers; refr...  13501   
nutrition_score   

In [31]:
print("\n=== Sample Data (first 5 rows) ===")
display(appetite_df.head())
curated_path = "data/curated/AppetIte_Dataset_v1.csv"
os.makedirs("data/curated", exist_ok=True)
appetite_df.to_csv(curated_path, index=False)


=== Sample Data (first 5 rows) ===


Unnamed: 0,recipe_id,recipe_name,ingredients,instructions,image_path,category,storage_tips,nutrition_score
0,1,0,"['1 (3½–4-lb.) whole chicken', '2¾ tsp. kosher...","pat chicken dry with paper towels, season all ...",miso-butter-roast-chicken-acorn-squash-panzanella,Indulgent,Store ingredients in airtight containers; refr...,0.63
1,2,1,"['2 large egg whites', '1 pound new potatoes (...",preheat oven to 400°f and line a rimmed baking...,crispy-salt-and-pepper-potatoes-dan-kluger,Indulgent,Store ingredients in airtight containers; refr...,0.83
2,3,2,"['1 cup evaporated milk', '1 cup whole milk', ...",place a rack in middle of oven; preheat to 400...,thanksgiving-mac-and-cheese-erick-williams,Indulgent,Store ingredients in airtight containers; refr...,0.68
3,4,3,"['1 (¾- to 1-pound) round italian loaf, cut in...",preheat oven to 350°f with rack in middle. gen...,italian-sausage-and-bread-stuffing-240559,Healthy,Store ingredients in airtight containers; refr...,0.69
4,5,4,"['1 teaspoon dark brown sugar', '1 teaspoon ho...",stir together brown sugar and hot water in a c...,newtons-law-apple-bourbon-cocktail,Quick Meals,Store ingredients in airtight containers; refr...,0.65


# Homework - Model development (The very first steps)

In [7]:
!pip install transformers torch --upgrade




In [8]:
from transformers import BartTokenizer, BartForConditionalGeneration
import torch

In [9]:
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

In [10]:
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model = model.to(device)

print(f"Model '{model_name}' loaded successfully on device: {device}")
print(f"Number of parameters: {sum(p.numel() for p in model.parameters()):,}")

Model 'facebook/bart-large-cnn' loaded successfully on device: mps
Number of parameters: 406,290,432


In [11]:
sample_inputs = [
    "ingredients: chicken, rice, soy sauce, garlic, egg",
    "ingredients: spinach, tomato, feta cheese, olive oil",
    "ingredients: oats, honey, banana, milk"
]

In [12]:
for text in sample_inputs:
    inputs = tokenizer(text, return_tensors="pt").to(device)
    summary_ids = model.generate(
        **inputs,
        max_length=30,
        num_beams=4,
        early_stopping=True
    )
    output = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    print(f"\nInput: {text}")
    print(f" Generated Recipe Suggestion: {output}")




Input: ingredients: chicken, rice, soy sauce, garlic, egg
 Generated Recipe Suggestion: ingredients: chicken, rice, soy sauce, garlic, egg, egg and rice. Serves 8 people at a time

Input: ingredients: spinach, tomato, feta cheese, olive oil
 Generated Recipe Suggestion: ingredients: spinach, tomato, feta cheese, olive oil and olive oil. Serves 2-3 people at a

Input: ingredients: oats, honey, banana, milk
 Generated Recipe Suggestion: ingredients: oats, honey, banana, milk, milk. Serves 4 people. For more information, visit www.


In [13]:
with torch.no_grad():
    inputs = tokenizer(sample_inputs[0], return_tensors="pt").to(device)
    outputs = model.model.encoder(**inputs, output_hidden_states=True)
    # Grab last hidden state
    embeddings = outputs.last_hidden_state.mean(dim=1).cpu().numpy()

print("\n Embedding shape:", embeddings.shape)
print("These embeddings can be used for clustering or category classifiers (Healthy, Quick, etc.).")


 Embedding shape: (1, 1024)
These embeddings can be used for clustering or category classifiers (Healthy, Quick, etc.).


In [14]:
test_input = "ingredients: pasta, tomato, garlic, olive oil, basil"
inputs = tokenizer(test_input, return_tensors="pt").to(device)
generated_ids = model.generate(**inputs, max_length=50, num_beams=4, early_stopping=True)
recipe_output = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print("\n Example Generated Output:")
print(recipe_output)





 Example Generated Output:
ingredients: pasta, tomato, garlic, olive oil, basil, basil. Serves 4-6 people. For more information, go to www.gofundme.com/sauceof pasta. For


## Next steps:

- Fine-tune on curated AppetIte_Dataset.csv (input_text → target_text)
- Evaluate recipe coherence & category alignment
- Optionally distill or prune model for lower latency (<2 s goal)

# Fine-Tuning the Model

In [1]:
import os
import json
import time
import torch
import pandas as pd
from tqdm.auto import tqdm
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import BartForConditionalGeneration, BartTokenizer, get_linear_schedule_with_warmup

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("Using device:", device)

df = pd.read_csv("data/curated/AppetIte_Dataset_v1.csv")

def prepare_input_text(row):
    ingredients = row['ingredients'] if pd.notna(row['ingredients']) else 'no ingredients listed'
    category = row['category'] if pd.notna(row['category']) else 'general'
    return f"Generate a {category} recipe using: {ingredients}"

def prepare_target_text(row):
    recipe_name = row['recipe_name'] if pd.notna(row['recipe_name']) else 'Delicious Recipe'
    instructions = row['instructions'] if pd.notna(row['instructions']) else 'Instructions not available'
    return f"Recipe: {recipe_name}. Instructions: {instructions}"

df['input_text'] = df.apply(prepare_input_text, axis=1)
df['target_text'] = df.apply(prepare_target_text, axis=1)

train_df, val_df = train_test_split(df, test_size=0.15, random_state=42, stratify=df['category'])

train_df = train_df.head(2000).reset_index(drop=True)
val_df = val_df.head(500).reset_index(drop=True)

print(f"Training samples: {len(train_df):,}")
print(f"Validation samples: {len(val_df):,}")

model_name = "facebook/bart-base"  # faster than large-cnn
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name).to(device)

class RecipeDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_input_length=128, max_target_length=256):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        input_text = self.data.loc[idx, 'input_text']
        target_text = self.data.loc[idx, 'target_text']
        
        input_encoding = self.tokenizer(input_text,
                                        max_length=self.max_input_length,
                                        padding="max_length",
                                        truncation=True,
                                        return_tensors="pt")
        target_encoding = self.tokenizer(target_text,
                                         max_length=self.max_target_length,
                                         padding="max_length",
                                         truncation=True,
                                         return_tensors="pt")
        
        labels = target_encoding["input_ids"].squeeze()
        labels[labels == self.tokenizer.pad_token_id] = -100
        
        return {'input_ids': input_encoding['input_ids'].squeeze(),
                'attention_mask': input_encoding['attention_mask'].squeeze(),
                'labels': labels}

train_dataset = RecipeDataset(train_df, tokenizer)
val_dataset = RecipeDataset(val_df, tokenizer)

BATCH_SIZE = 1
GRADIENT_ACCUMULATION_STEPS = 8
EPOCHS = 2
LEARNING_RATE = 3e-5
WARMUP_STEPS = 100

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=0.01)
total_steps = (len(train_loader) // GRADIENT_ACCUMULATION_STEPS) * EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=total_steps)

def train_epoch(model, train_loader, optimizer, scheduler, device, epoch):
    model.train()
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch} Training")
    for batch_idx, batch in enumerate(progress_bar):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss / GRADIENT_ACCUMULATION_STEPS
        total_loss += loss.item() * GRADIENT_ACCUMULATION_STEPS
        loss.backward()
        
        if (batch_idx + 1) % GRADIENT_ACCUMULATION_STEPS == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
        
        avg_loss = total_loss / (batch_idx + 1)
        progress_bar.set_postfix({'Loss': f'{avg_loss:.4f}'})
    if hasattr(torch, 'mps') and torch.backends.mps.is_available():
        torch.mps.empty_cache()
    return total_loss / len(train_loader)

def validate(model, val_loader, device):
    model.eval()
    total_loss = 0
    progress_bar = tqdm(val_loader, desc="Validating")
    with torch.no_grad():
        for batch_idx, batch in enumerate(progress_bar):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            total_loss += outputs.loss.item()
            progress_bar.set_postfix({'Val Loss': f'{outputs.loss.item():.4f}'})
            if batch_idx % 10 == 0 and hasattr(torch, 'mps') and torch.backends.mps.is_available():
                torch.mps.empty_cache()
    return total_loss / len(val_loader)

os.makedirs("models", exist_ok=True)
os.makedirs("training_logs", exist_ok=True)

print(f"Training started")

best_val_loss = float('inf')
training_history = {'train_loss': [], 'val_loss': [], 'epoch_times': []}

for epoch in range(1, EPOCHS + 1):
    start_time = time.time()
    
    print(f"\nEpoch {epoch}/{EPOCHS}")
    train_loss = train_epoch(model, train_loader, optimizer, scheduler, device, epoch)
    val_loss = validate(model, val_loader, device)
    
    training_history['train_loss'].append(train_loss)
    training_history['val_loss'].append(val_loss)
    epoch_time = time.time() - start_time
    training_history['epoch_times'].append(epoch_time)
    
    print(f"Train Loss: {train_loss:.4f}")
    print(f"Validation Loss: {val_loss:.4f}")
    print(f"Epoch Duration: {epoch_time/60:.2f} minutes")
    
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        print("New best model - saving...")
        model.save_pretrained("models/appetite_bart_best")
        tokenizer.save_pretrained("models/appetite_bart_best")
        model_info = {
            'epoch': epoch,
            'train_loss': train_loss,
            'val_loss': val_loss,
            'training_time': sum(training_history['epoch_times']),
            'date_saved': time.strftime("%Y-%m-%d %H:%M:%S")
        }
        with open('models/appetite_bart_best/training_info.json', 'w') as f:
            json.dump(model_info, f, indent=2)

with open('training_logs/training_history.json', 'w') as f:
    json.dump(training_history, f, indent=2)

print("Training complete")


Using device: mps
Training samples: 2,000
Validation samples: 500
Training started

Epoch 1/2


Epoch 1 Training:   0%|          | 0/2000 [00:00<?, ?it/s]

Validating:   0%|          | 0/500 [00:00<?, ?it/s]

Train Loss: 17.9904
Validation Loss: 20.9060
Epoch Duration: 8.07 minutes
New best model - saving...





Epoch 2/2


Epoch 2 Training:   0%|          | 0/2000 [00:00<?, ?it/s]

Validating:   0%|          | 0/500 [00:00<?, ?it/s]

Train Loss: 22.0239
Validation Loss: 22.2065
Epoch Duration: 7.84 minutes
Training complete


In [2]:
from transformers import BartForConditionalGeneration, BartTokenizer

finetuned_model = BartForConditionalGeneration.from_pretrained("models/appetite_bart_best")
finetuned_tokenizer = BartTokenizer.from_pretrained("models/appetite_bart_best")
finetuned_model = finetuned_model.to(device)

print("Fine-tuned model loaded")

try:
    with open('models/appetite_bart_best/training_info.json', 'r') as f:
        training_info = json.load(f)
    print(f"Best epoch: {training_info['epoch']}")
    print(f"Validation loss: {training_info['val_loss']:.4f}")
    print(f"Training time: {training_info['training_time']/60:.1f} minutes")
except:
    pass

Fine-tuned model loaded
Best epoch: 1
Validation loss: 20.9060
Training time: 8.1 minutes


In [5]:
import re

ALLERGENS = ['peanut', 'milk', 'egg', 'soy', 'fish', 'shellfish', 'wheat', 'gluten', 'sesame']

def detect_allergens(text):
    found = [a for a in ALLERGENS if re.search(rf'\\b{a}\\b', text.lower())]
    return found


In [23]:
import torch
from transformers import BartForConditionalGeneration, BartTokenizer

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

finetuned_model = BartForConditionalGeneration.from_pretrained("models/appetite_bart_best")
finetuned_tokenizer = BartTokenizer.from_pretrained("models/appetite_bart_best")
finetuned_model = finetuned_model.to(device)
finetuned_model.eval()

test_recipes = [
    "Generate a Healthy recipe using: ['chicken breast', 'broccoli', 'olive oil', 'garlic', 'lemon']",
    "Generate a Quick Meals recipe using: ['pasta', 'tomato sauce', 'basil', 'mozzarella cheese']",
    "Generate an Indulgent recipe using: ['chocolate', 'cream', 'butter', 'vanilla', 'eggs']"
]

print("Testing fine-tuned model:\n")

for i, test_input in enumerate(test_recipes, 1):
    print(f"Test {i}")
    print(f"Input: {test_input}")
    
    inputs = finetuned_tokenizer(test_input, return_tensors="pt", max_length=128, truncation=True).to(device)
    
    with torch.no_grad():
        generated_ids = finetuned_model.generate(
            **inputs,
            max_length=150,
            num_beams=5,
            no_repeat_ngram_size=3,
            early_stopping=True,
            temperature=0.8,
            do_sample=True
            )
    ai_recipe = finetuned_tokenizer.decode(generated_ids[0], skip_special_tokens=True)

Testing fine-tuned model:

Test 1
Input: Generate a Healthy recipe using: ['chicken breast', 'broccoli', 'olive oil', 'garlic', 'lemon']
Test 2
Input: Generate a Quick Meals recipe using: ['pasta', 'tomato sauce', 'basil', 'mozzarella cheese']
Test 3
Input: Generate an Indulgent recipe using: ['chocolate', 'cream', 'butter', 'vanilla', 'eggs']


In [17]:
import numpy as np

try:
    from rouge_score import rouge_scorer
except ImportError:
    import subprocess
    subprocess.run(['pip', 'install', 'rouge-score'], check=True)
    from rouge_score import rouge_scorer

def simple_evaluation(model, tokenizer, val_df, device, num_samples=50):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge1_scores = []
    rouge2_scores = []
    rougeL_scores = []
    
    model.eval()
    sample_indices = np.random.choice(len(val_df), min(num_samples, len(val_df)), replace=False)
    
    print(f"Evaluating on {len(sample_indices)} samples...")
    
    for idx in sample_indices:
        input_text = val_df.iloc[idx]['input_text']
        reference_text = val_df.iloc[idx]['target_text']
        
        inputs = tokenizer(input_text, return_tensors="pt", max_length=128, truncation=True).to(device)
        
        with torch.no_grad():
            generated_ids = model.generate(**inputs, max_length=150, num_beams=4, early_stopping=True)
        
        prediction = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
        
        try:
            scores = scorer.score(reference_text, prediction)
            rouge1_scores.append(scores['rouge1'].fmeasure)
            rouge2_scores.append(scores['rouge2'].fmeasure)
            rougeL_scores.append(scores['rougeL'].fmeasure)
        except:
            pass
    
    results = {
        'ROUGE-1': np.mean(rouge1_scores),
        'ROUGE-2': np.mean(rouge2_scores),
        'ROUGE-L': np.mean(rougeL_scores)
    }
    
    return results
eval_results = simple_evaluation(finetuned_model, finetuned_tokenizer, val_df, device, num_samples=50)

print("\nEvaluation Results:")
print(f"ROUGE-1: {eval_results['ROUGE-1']:.4f}")
print(f"ROUGE-2: {eval_results['ROUGE-2']:.4f}")
print(f"ROUGE-L: {eval_results['ROUGE-L']:.4f}")

overall_rouge = (eval_results['ROUGE-1'] + eval_results['ROUGE-2'] + eval_results['ROUGE-L']) / 3
print(f"\nOverall ROUGE: {overall_rouge:.4f}")


Evaluating on 50 samples...

Evaluation Results:
ROUGE-1: 0.0000
ROUGE-2: 0.0000
ROUGE-L: 0.0000

Overall ROUGE: 0.0000


# Model Development Complete

## Completed Steps

- Data preparation with train/validation split
- PyTorch dataset implementation
- Model fine-tuning with progress tracking
- Training visualization
- Model testing on diverse examples
- Comprehensive ROUGE evaluation
- Safety checks and logging

# Risk Management and Trustworthiness

In [18]:
import pandas as pd
import re
import json
import os
from datetime import datetime

def check_dataset_quality(df):
    print("Data Quality Check")
    print("Rows:", len(df), ", Columns:", len(df.columns))
    print("Missing Values:\n", df.isnull().sum())
    print("Duplicate Rows:", df.duplicated().sum())
    
    category_col = None
    for col in df.columns:
        if col.lower() == 'category':
            category_col = col
            break
    
    if category_col:
        print("Category Distribution:\n", df[category_col].value_counts())
    else:
        print("No 'category' column found in dataset.")
    
    print("==========================")


**Safety & Allergen Detection**

In [19]:
ALLERGENS = ['peanut', 'milk', 'egg', 'soy', 'fish', 'shellfish', 'wheat', 'gluten', 'sesame']

def detect_allergens(text):
    found = [a for a in ALLERGENS if re.search(rf'\b{a}\b', str(text).lower())]
    return found

def safety_check(recipe_text):
    if detect_allergens(recipe_text):
        print(f"Warning: Contains allergens: {detect_allergens(recipe_text)}")
    if any(bad in recipe_text.lower() for bad in ['kill', 'poison', 'suicide']):
        print("Unsafe content detected! Review required.")

**Simple Logging**

In [20]:
def log_prediction(input_text, output_text):
    os.makedirs("logs", exist_ok=True)
    entry = {
        "timestamp": datetime.utcnow().isoformat(),
        "input": input_text,
        "output": output_text
    }
    with open("logs/predictions.jsonl", "a") as f:
        f.write(json.dumps(entry) + "\n")
    print(" Logged prediction for monitoring.")

**Trustworthy Model Card Generator**

In [21]:
def create_model_card(name="AppetIte-BART", version="v1.0"):
    card = f"""
# Model Card: {name}
**Version:** {version}
**Purpose:** Generate recipes from ingredients.
**Training Data:** Curated AppetIte_Dataset.csv
**Risks:** May include allergen ingredients or biased cuisine categories.
**Mitigations:** Allergen filter, user feedback, manual review.
**Contact:** Sharath / Project Maintainer
"""
    with open("MODEL_CARD.md", "w") as f:
        f.write(card)
    print(" Model Card created (MODEL_CARD.md)")


**Example Usage**

In [22]:
df = pd.read_csv("data/curated/AppetIte_Dataset_v1.csv")

check_dataset_quality(df)

sample_input = "ingredients: peanut butter, banana, honey"
sample_output = "peanut butter banana smoothie"

safety_check(sample_output)

log_prediction(sample_input, sample_output)

create_model_card()

Data Quality Check
Rows: 13501 , Columns: 8
Missing Values:
 recipe_id          0
recipe_name        0
ingredients        0
instructions       8
image_path         0
category           0
storage_tips       0
nutrition_score    0
dtype: int64
Duplicate Rows: 0
Category Distribution:
 category
Indulgent          10685
Healthy             1437
Quick Meals         1308
Family-Friendly       71
Name: count, dtype: int64
 Logged prediction for monitoring.
 Model Card created (MODEL_CARD.md)


  "timestamp": datetime.utcnow().isoformat(),
