In [1]:
import json
from tqdm import tqdm
import torch
import numpy as np
import pandas as pd
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
import ast
import pickle
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import re
from sklearn.model_selection import train_test_split
from transformers import pipeline
import matplotlib.pyplot as plt

# Load the recipe collection from Filtered_recipes.json.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open('Filtered_recipes.json', 'r', encoding='utf-8') as file:
    recipes = json.load(file)

# Keep the first two records around for a quick look at the schema.
test_recipes = recipes[:2]

print(len(test_recipes))
print(test_recipes[0])
print(test_recipes[0]['cook_time'])


2024-12-07 19:59:52.193508: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


2
{'title': 'Simple Macaroni and Cheese', 'category': 'main-dish', 'ingredients': '1 (8 ounce) box elbow macaroni ; 1/4 cup butter ; 1/4 cup all-purpose flour ; 1/2 teaspoon salt ;   ground black pepper to taste ; 2 cups milk ; 2 cups shredded Cheddar cheese', 'directions': 'Bring a large pot of lightly salted water to a boil. Cook elbow macaroni in the boiling water, stirring occasionally until cooked through but firm to the bite, 8 minutes. Drain. Melt butter in a saucepan over medium heat; stir in flour, salt, and pepper until smooth, about 5 minutes. Slowly pour milk into butter-flour mixture while continuously stirring until mixture is smooth and bubbling, about 5 minutes. Add Cheddar cheese to milk mixture and stir until cheese is melted, 2 to 4 minutes. Fold macaroni into cheese sauce until coated.', 'prep_time': '10 mins', 'cook_time': '20 mins', 'total_time': '30 mins', 'servings': '4', 'yields': '4 servings', 'calories': '630.2', 'instructions_list': '[\'Bring a large pot of 

In [2]:

def time_to_minutes(time_str):
    """Convert a duration string such as ``"1 hr 30 mins"`` into minutes.

    The text is lower-cased and scanned for an integer followed (optionally
    after whitespace) by a unit starting with ``d`` (days), ``h`` (hours) or
    ``m`` (minutes). For each unit only the first match is used.

    Args:
        time_str: Duration text, e.g. ``"20 mins"``, ``"2 hrs"`` or
            ``"1 day 4 hrs"``. Falsy values (``None``, ``""``) mean "no time".

    Returns:
        int: Total number of minutes; ``0`` for falsy input or when no unit
        is recognised.
    """
    if not time_str:
        return 0
    time_str = time_str.lower()

    # (regex, minutes per unit). Days were previously ignored, which silently
    # under-reported multi-day durations such as "1 day 2 hrs".
    units = (
        (r'(\d+)\s*d', 24 * 60),
        (r'(\d+)\s*h', 60),
        (r'(\d+)\s*m', 1),
    )

    total_minutes = 0
    for pattern, minutes_per_unit in units:
        found = re.findall(pattern, time_str)
        if found:
            total_minutes += int(found[0]) * minutes_per_unit
    return total_minutes

# Regression target: the cook time of every recipe, expressed in minutes.
y = np.array(
    [time_to_minutes(entry["cook_time"]) for entry in recipes]
)


print(len(y))

25636


**Recipe BERT**

In [3]:
# Min-max scale the targets: subtract the minimum, divide by the value range.
y = np.array(y)  # make sure we are working with a numpy array
y_normalized = np.divide(y - y.min(), np.ptp(y))  # np.ptp(y) == y.max() - y.min()

In [4]:
# One input string per recipe: its ingredients followed by its directions.
texts = [
    f"Ingredients: {recipe['ingredients']}. Instructions: {recipe['directions']}"
    for recipe in recipes
]

In [5]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset

class RecipeDataset(Dataset):
    """Dataset pairing recipe texts with float regression targets.

    Items are tokenized lazily on access and padded/truncated to
    ``max_length`` tokens.

    Args:
        texts: Indexable collection of input strings.
        labels: Indexable collection of numeric targets, aligned with ``texts``.
        tokenizer: Callable accepting ``(text, truncation=, padding=,
            max_length=, return_tensors=)`` and returning a mapping with
            ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_length: Sequence length passed to the tokenizer.
    """

    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``label`` of item ``idx``."""
        tokenized = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        # The tokenizer output carries a leading batch dimension; drop it so
        # the DataLoader can stack items itself.
        sample = {
            key: tokenized[key].squeeze(0)
            for key in ("input_ids", "attention_mask")
        }
        sample["label"] = torch.tensor(self.labels[idx], dtype=torch.float32)
        return sample

# Hold out 20% of the recipes for validation (fixed seed for repeatability)
recipes_train, recipes_val, labels_train, labels_val = train_test_split(
    texts,
    y_normalized,
    test_size=0.2,
    random_state=42,
)

# Load the tokenizer registered under the "alexdseo/RecipeBERT" id
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("alexdseo/RecipeBERT")

# Wrap each split in a dataset, then batch it; only the training split is shuffled
train_dataset = RecipeDataset(texts=recipes_train, labels=labels_train, tokenizer=tokenizer)
val_dataset = RecipeDataset(texts=recipes_val, labels=labels_val, tokenizer=tokenizer)

train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=16, shuffle=False)


In [6]:
import torch
import torch.nn as nn
from transformers import AutoModel

class RecipeBERTForRegression(nn.Module):
    """Transformer encoder with a small MLP head that predicts one scalar.

    The encoder's token embeddings are mean-pooled over the non-padding
    positions and passed through ``Linear -> ReLU -> Dropout -> Linear``.

    Args:
        base_model_name: Name or path handed to ``AutoModel.from_pretrained``.
        local_files_only: Forwarded to ``AutoModel.from_pretrained``. With the
            default ``True`` the checkpoint must already be on disk / in the
            local cache. (This argument used to be accepted but ignored.)

    NOTE(review): the default checkpoint name differs from the tokenizer id
    used in this notebook ("alexdseo/RecipeBERT") — confirm both share the
    same vocabulary.
    """

    def __init__(self, base_model_name='CookBERT-checkpoint', local_files_only=True):
        super().__init__()
        self.base_model = AutoModel.from_pretrained(
            base_model_name, local_files_only=local_files_only
        )
        embedding_dim = self.base_model.config.hidden_size
        self.regressor = nn.Sequential(
            nn.Linear(embedding_dim, embedding_dim // 2),
            nn.ReLU(),
            nn.Dropout(0.5),
            nn.Linear(embedding_dim // 2, 1),  # Output a single regression value
        )

    def forward(self, input_ids, attention_mask):
        """Return the regressor output for a batch.

        Args:
            input_ids: Token ids, passed straight to the base model.
            attention_mask: Mask with 1 for real tokens and 0 for padding;
                indexed as (batch_size, seq_len) by the pooling below.

        Returns:
            Tensor produced by ``self.regressor`` from the mean-pooled
            embeddings (one row per batch element).
        """
        # Get the embeddings for all tokens
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        hidden_states = outputs.last_hidden_state  # (batch_size, seq_len, hidden_dim)

        # Zero out padding positions so they do not contribute to the sum
        mask = attention_mask.unsqueeze(-1).to(hidden_states.dtype)  # (batch_size, seq_len, 1)
        summed_hidden_states = (hidden_states * mask).sum(dim=1)  # (batch_size, hidden_dim)

        # Number of real tokens per example; clamped to 1 so a fully masked
        # row yields a zero vector instead of 0/0 = NaN
        non_padding_counts = mask.sum(dim=1).clamp(min=1)  # (batch_size, 1)

        mean_pooled_output = summed_hidden_states / non_padding_counts  # (batch_size, hidden_dim)

        # Pass through the regressor
        return self.regressor(mean_pooled_output)

In [None]:
from transformers import AdamW
from tqdm import tqdm
from sklearn.metrics import r2_score

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RecipeBERTForRegression().to(device)

criterion = nn.MSELoss()  # Mean Squared Error for regression
# torch.optim.AdamW is used instead of the deprecated transformers.AdamW;
# eps=1e-6 keeps the epsilon the transformers implementation defaulted to.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01, eps=1e-6)
# Multiply the learning rate by 0.8 every 2 epochs (requires scheduler.step() per epoch)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.8)

epochs = 30
for epoch in range(epochs):
    # Training phase
    model.train()
    total_train_loss = 0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch + 1}/{epochs}"):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        optimizer.zero_grad()
        # squeeze(-1) drops only the trailing size-1 output dim, so a batch
        # containing a single example keeps shape (1,) and matches `labels`
        outputs = model(input_ids, attention_mask).squeeze(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_train_loss += loss.item()

    # Validation phase
    model.eval()
    total_val_loss = 0
    all_true_labels = []
    all_pred_labels = []
    with torch.no_grad():
        for batch in tqdm(val_loader, desc=f"Validating Epoch {epoch + 1}/{epochs}"):
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["label"].to(device)

            outputs = model(input_ids, attention_mask).squeeze(-1)
            loss = criterion(outputs, labels)
            total_val_loss += loss.item()

            all_true_labels.extend(labels.cpu().tolist())
            all_pred_labels.extend(outputs.cpu().tolist())

    # Calculate R² score (on the normalized targets)
    r2 = r2_score(all_true_labels, all_pred_labels)
    print(f"Epoch {epoch + 1}/{epochs}: Train Loss = {total_train_loss / len(train_loader):.4f}, "
          f"Val Loss = {total_val_loss / len(val_loader):.4f}, R² = {r2:.4f}")

    # Advance the LR schedule once per epoch; without this call the StepLR
    # decay configured above never takes effect
    scheduler.step()