# Import Libraries

In [1]:
!pip install transformers datasets torch spacy gradio
!python -m spacy download en_core_web_sm

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting gradio
  Downloading gradio-5.9.1-py3-none-any.whl.metadata (16 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecti

In [2]:
import pandas as pd
import numpy as np
import spacy
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from tqdm import tqdm
import torch
import gradio as gr
import os
import json

# Data Preprocessing

In [3]:
data = pd.read_csv('recipe.csv')

required_columns = [
    'title', 'category', 'ingredients', 'directions', 'servings',
    'calories', 'carbohydrates_g', 'sugars_g', 'fat_g', 'saturated_fat_g',
    'cholesterol_mg', 'protein_g', 'dietary_fiber_g', 'sodium_mg',
    'calcium_mg', 'iron_mg', 'magnesium_mg', 'potassium_mg'
]
data = data[required_columns].dropna()
nlp = spacy.load("en_core_web_sm")

def segment_sentences(directions):
    doc = nlp(directions)
    sentences = [sent.text.strip() for sent in doc.sents]
    return sentences

data['segmented_directions'] = data['directions'].apply(segment_sentences)
data['parsed_ingredients'] = data['ingredients'].apply(lambda x: [item.strip() for item in x.split(';')])

train_data, val_data = train_test_split(data, test_size=0.1, random_state=42)

preprocessed_dir = './preprocessed_data'
os.makedirs(preprocessed_dir, exist_ok=True)

train_data.to_csv(f'{preprocessed_dir}/train_data.csv', index=False)
val_data.to_csv(f'{preprocessed_dir}/val_data.csv', index=False)

print(f"Preprocessed datasets saved to {preprocessed_dir}.")

Preprocessed datasets saved to ./preprocessed_data.


# Tokenization

In [5]:
tokenizer = T5Tokenizer.from_pretrained('t5-small')

class RecipeDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data.iloc[idx]

        ingredients = "; ".join(item['parsed_ingredients'])
        category = item['category']
        directions = "\n".join(item['segmented_directions'])
        nutritional_info = f"Calories: {item['calories']} kcal, Carbohydrates: {item['carbohydrates_g']}g, Sugars: {item['sugars_g']}g, Fat: {item['fat_g']}g, Protein: {item['protein_g']}g"

        input_text = f"Generate a recipe with the following criteria: Category: {category}, Ingredients: {ingredients}"
        output_text = f"Category: {category}\nTitle: {item['title']}\nIngredients: {ingredients}\nInstructions: {directions}\nNutritional Information: {nutritional_info}\nServings: {item['servings']}"

        input_encoding = self.tokenizer(input_text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')
        output_encoding = self.tokenizer(output_text, padding='max_length', truncation=True, max_length=self.max_length, return_tensors='pt')

        return {
            'input_ids': input_encoding['input_ids'].squeeze(),
            'attention_mask': input_encoding['attention_mask'].squeeze(),
            'labels': output_encoding['input_ids'].squeeze()
        }

train_dataset = RecipeDataset(train_data, tokenizer)
val_dataset = RecipeDataset(val_data, tokenizer)

tokenized_data_dir = './tokenized_data'
os.makedirs(tokenized_data_dir, exist_ok=True)

train_tokenized_data = [
    {
        'input_ids': item['input_ids'].tolist(),
        'attention_mask': item['attention_mask'].tolist(),
        'labels': item['labels'].tolist()
    }
    for item in train_dataset
]
torch.save(train_tokenized_data, f'{tokenized_data_dir}/train_tokenized_data.pth')

val_tokenized_data = [
    {
        'input_ids': item['input_ids'].tolist(),
        'attention_mask': item['attention_mask'].tolist(),
        'labels': item['labels'].tolist()
    }
    for item in val_dataset
]
torch.save(val_tokenized_data, f'{tokenized_data_dir}/val_tokenized_data.pth')

print(f"Tokenized datasets saved in {tokenized_data_dir}.")

train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Tokenized datasets saved in ./tokenized_data.


# Model Training

In [8]:
import os

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
checkpoint_dir = './checkpoints'
os.makedirs(checkpoint_dir, exist_ok=True)

model = T5ForConditionalGeneration.from_pretrained('t5-small').to(device)
tokenizer = T5Tokenizer.from_pretrained('t5-small')

optimizer = AdamW(model.parameters(), lr=1e-4)

checkpoint_path = os.path.join(checkpoint_dir, 'checkpoint.pth')

if not os.path.exists(f"{checkpoint_dir}/checkpoint.pth"):
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'epoch': 0,
    }, checkpoint_path)

if os.path.exists(checkpoint_path):
    print("Loading checkpoint...")
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    start_epoch = checkpoint['epoch'] + 1
    print(f"Resuming from epoch {start_epoch}")
else:
    start_epoch = 0

Loading checkpoint...


  checkpoint = torch.load(checkpoint_path, map_location=device)


Resuming from epoch 70


In [9]:
epochs = 70
for epoch in range(start_epoch, epochs):
    model.train()
    total_train_loss = 0
    with tqdm(train_dataloader, unit="batch") as tepoch:
        tepoch.set_description(f"Epoch {epoch+1}")
        for batch in tepoch:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()
            tepoch.set_postfix(loss=loss.item())

    print(f"Epoch {epoch+1} - Average Training Loss: {total_train_loss / len(train_dataloader)}")

    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }
    torch.save(checkpoint, checkpoint_path)
    print(f"Checkpoint saved for epoch {epoch+1}")

In [None]:
model.eval()
total_val_loss = 0

with tqdm(val_dataloader, unit="batch") as tepoch:
    tepoch.set_description("Validation")
    with torch.no_grad():
        for batch in tepoch:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_val_loss += loss.item()
            tepoch.set_postfix(loss=loss.item())

print(f"Validation Loss: {total_val_loss / len(val_dataloader)}")

Validation:   0%|          | 0/333 [00:00<?, ?batch/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Validation: 100%|██████████| 333/333 [01:22<00:00,  4.05batch/s, loss=0.456]

Validation Loss: 0.46333030009412907





# Rule-Based Integration

In [10]:
def rule_based_output(user_input, data):
    filtered_data = data[data['ingredients'].str.contains(user_input, case=False)]
    if filtered_data.empty:
        return "Sorry, no recipes match your query."

    recipe = filtered_data.iloc[0]
    return f"Title: {recipe['title']}\nCategory: {recipe['category']}\nIngredients: {recipe['ingredients']}\nInstructions: {recipe['directions']}"

# Gradio UI

In [11]:
model_dir = './checkpoints'
tokenizer = T5Tokenizer.from_pretrained('t5-small')
model = T5ForConditionalGeneration.from_pretrained('t5-small')

checkpoint_path = os.path.join(model_dir, 'checkpoint.pth')
if os.path.exists(checkpoint_path):
    checkpoint = torch.load(checkpoint_path, map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    print("Model checkpoint loaded successfully.")
else:
    print("Checkpoint not found. Ensure the path is correct and the model is saved.")

model = model.to(device)
model.eval()

conversation_history = []

def generate_response(input_text):
    input_ids = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids.to(device)

    output_ids = model.generate(input_ids, max_length=512, num_beams=5, early_stopping=True)

    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return response

def conversational_interface(user_input):
    global conversation_history
    conversation_history.append(f"User: {user_input}")

    if user_input.lower() in ["reset", "clear"]:
        conversation_history = []
        return "Conversation reset."

    chatbot_response = generate_response(user_input)
    conversation_history.append(f"ChefBot: {chatbot_response}")

    return "\n".join(conversation_history)

iface = gr.Interface(
    fn=conversational_interface,
    inputs="text",
    outputs="text",
    title="ChefBot ",
    description="Chat with ChefBot about recipes and more! Type 'reset' to start a new conversation.",
    theme="compact"
)

iface.launch()

  checkpoint = torch.load(checkpoint_path, map_location=device)


Model checkpoint loaded successfully.



Sorry, we can't find the page you are looking for.


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://b18389464b74ee14bb.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




# Model Evaluation

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
from nltk.translate.bleu_score import sentence_bleu

def evaluate_model(model, dataloader, tokenizer, device):
    """Evaluate the model's performance metrics."""
    model.eval()
    all_predictions = []
    all_labels = []
    bleu_scores = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=512)
            predictions = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
            actuals = [tokenizer.decode(label, skip_special_tokens=True) for label in labels]

            for pred, actual in zip(predictions, actuals):
                bleu_scores.append(sentence_bleu([actual.split()], pred.split()))

            all_predictions.extend(predictions)
            all_labels.extend(actuals)

    y_pred = [" ".join(set(pred.split())) for pred in all_predictions]
    y_true = [" ".join(set(label.split())) for label in all_labels]

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    avg_bleu = sum(bleu_scores) / len(bleu_scores)

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "naturalness_bleu": avg_bleu
    }

metrics = evaluate_model(model, val_dataloader, tokenizer, device)
print(f"Accuracy: {metrics['accuracy']:.4f}")
print(f"Precision: {metrics['precision']:.4f}")
print(f"Recall: {metrics['recall']:.4f}")
print(f"Naturalness (BLEU Score): {metrics['naturalness_bleu']:.4f}")

Evaluating:  23%|██▎       | 77/333 [07:37<24:41,  5.79s/it]

Created dataset file at: .gradio/flagged/dataset1.csv


Evaluating: 100%|██████████| 333/333 [31:08<00:00,  5.61s/it]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.0000
Precision: 0.0000
Recall: 0.0000
Naturalness (BLEU Score): 0.4198


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import json
import os

metrics_log_path = './metrics_log.json'

def log_metrics(epoch, metrics):
    """Log metrics for learning capability tracking."""
    if os.path.exists(metrics_log_path):
        with open(metrics_log_path, 'r') as f:
            logs = json.load(f)
    else:
        logs = {}

    logs[epoch] = metrics

    with open(metrics_log_path, 'w') as f:
        json.dump(logs, f, indent=4)

log_metrics(start_epoch, metrics)

print(f"Logged Metrics for Epoch {start_epoch}:")
print(metrics)

Logged Metrics for Epoch 70:
{'accuracy': 0.0, 'precision': 0.0, 'recall': 0.0, 'naturalness_bleu': 0.41977058516953225}
