# Regression using Transformers

The goal for this notebook is to predict how long will it take to do a recipe based on the [Recipe NLG Dataset](https://recipenlg.cs.put.poznan.pl/)

## ⬇️ Imports

In [17]:
%%bash
pip install --upgrade jupyter ipywidgets

Collecting jupyter
  Downloading jupyter-1.1.1-py2.py3-none-any.whl.metadata (2.0 kB)
Collecting ipywidgets
  Downloading ipywidgets-8.1.7-py3-none-any.whl.metadata (2.4 kB)
Collecting notebook (from jupyter)
  Downloading notebook-7.4.1-py3-none-any.whl.metadata (10 kB)
Collecting jupyter-console (from jupyter)
  Downloading jupyter_console-6.6.3-py3-none-any.whl.metadata (5.8 kB)
Collecting nbconvert (from jupyter)
  Using cached nbconvert-7.16.6-py3-none-any.whl.metadata (8.5 kB)
Collecting jupyterlab (from jupyter)
  Downloading jupyterlab-4.4.1-py3-none-any.whl.metadata (16 kB)
Collecting widgetsnbextension~=4.0.14 (from ipywidgets)
  Downloading widgetsnbextension-4.0.14-py3-none-any.whl.metadata (1.6 kB)
Collecting jupyterlab_widgets~=3.0.15 (from ipywidgets)
  Downloading jupyterlab_widgets-3.0.15-py3-none-any.whl.metadata (20 kB)
Collecting async-lru>=1.0.0 (from jupyterlab->jupyter)
  Using cached async_lru-2.0.5-py3-none-any.whl.metadata (4.5 kB)
Collecting httpx>=0.25.0 (fr

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from tqdm.notebook import tqdm # For progress bars
import os # For device selection
from torch.optim import AdamW 

## 🛠️ Preprocessing
### Downloading the dataset

If not already done use this [notebook](../../preprocessing/preprocessing.ipynb) to download the dataset in your local directory.

In [2]:
data = pd.read_csv('../../../data/processed/preprocessed_recipe.csv')

In [3]:
data.head()

Unnamed: 0,index,id,name,minutes,n_steps,description,n_ingredients,steps_string_standardize,ingredients_text,tags_text,calories,total_fat,sugar,sodium,protein,saturated_fat,carbohydrates
0,0,137739,arriba baked squash mexican,55,11,autumn is my favorite time of year to cook! th...,7,make a choic and proceed with recip depend on ...,"['winter squash', 'mexican seasoning', 'mixed ...","['60-minutes-or-less', 'time-to-make', 'course...",51.5,0.0,13.0,0.0,2.0,0.0,4.0
1,1,31490,breakfast pizza,30,9,this recipe calls for the crust to be prebaked...,6,preheat oven to 103.33 celsius °c press dough ...,"['prepared pizza crust', 'sausage patty', 'egg...","['30-minutes-or-less', 'time-to-make', 'course...",173.4,18.0,0.0,17.0,22.0,35.0,1.0
2,2,112140,chili,130,6,this modified version of 'mom's' chili was a h...,13,brown ground beef in larg pot add chop onion t...,"['ground beef', 'yellow onions', 'diced tomato...","['time-to-make', 'course', 'preparation', 'mai...",269.8,22.0,32.0,48.0,39.0,27.0,5.0
3,3,59389,alouette potato,45,11,"this is a super easy, great tasting, make ahea...",11,place potato in a larg pot of lightli salt wat...,"['spreadable cheese with garlic and herbs', 'n...","['60-minutes-or-less', 'time-to-make', 'course...",368.1,17.0,10.0,2.0,14.0,8.0,20.0
4,4,44061,amish tomato ketchup canning,190,5,my dh's amish mother raised him on this recipe...,8,"mix all ingredients& boil for 2 30.0 minute , ...","['tomato juice', 'apple cider vinegar', 'sugar...","['weeknight', 'time-to-make', 'course', 'main-...",352.9,1.0,337.0,23.0,3.0,0.0,28.0


In [104]:
""" seems to have no effect
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser", "tagger", "lemmatizer"])
texts = data["steps_string_standardize"].astype(str).tolist()
num_cpus = os.cpu_count()
n_process = max(1, int(num_cpus * 0.7)) if num_cpus else 1
docs = list(nlp.pipe(texts, batch_size=50, n_process=n_process))
data["steps_tokens"] = [[token.text for token in doc] for doc in docs]
"""

' seems to have no effect\nnlp = spacy.load("en_core_web_sm", disable=["ner", "parser", "tagger", "lemmatizer"])\ntexts = data["steps_string_standardize"].astype(str).tolist()\nnum_cpus = os.cpu_count()\nn_process = max(1, int(num_cpus * 0.7)) if num_cpus else 1\ndocs = list(nlp.pipe(texts, batch_size=50, n_process=n_process))\ndata["steps_tokens"] = [[token.text for token in doc] for doc in docs]\n'

In [None]:
from recipe_dataset import RecipeDataset # Add this import

MODEL_NAME = 'bert-base-uncased' # You can choose other models like 'distilbert-base-uncased' for faster training
MAX_LEN = 256 # Maximum sequence length for the transformer
BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 2e-5

# Select device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

data['minutes'] = data['minutes'].astype(float)

texts = data['steps_string_standardize'].tolist()
labels = data['minutes'].tolist()

# Split data (using indices is good practice if you need to refer back to original data)
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# --- Tokenizer ---
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# --- Create DataLoaders ---
def create_data_loader(texts, labels, tokenizer, max_len, batch_size):
    ds = RecipeDataset(
        texts=texts,
        labels=labels,
        tokenizer=tokenizer,
        max_len=max_len
    )
    return DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=min(4, os.cpu_count()) # Adjust num_workers based on your system
    )

train_data_loader = create_data_loader(train_texts, train_labels, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(val_texts, val_labels, tokenizer, MAX_LEN, BATCH_SIZE)

# --- Load Model ---
# Use AutoModelForSequenceClassification with num_labels=1 for regression
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=1)
model = model.to(device)

# --- Optimizer and Scheduler ---
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0, # You can adjust warmup steps if needed
    num_training_steps=total_steps
)

# --- Loss Function ---
# Use Mean Squared Error Loss for regression
loss_fn = torch.nn.MSELoss().to(device)

# --- Training Function ---
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    model = model.train()
    losses = []
    correct_predictions = 0 # Not applicable for regression in the same way as classification

    progress_bar = tqdm(data_loader, desc="Training", leave=False)
    for d in progress_bar:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        labels = d["labels"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        # Squeeze outputs and labels to match dimensions for MSELoss
        preds = outputs.logits.squeeze()
        labels = labels.squeeze()

        loss = loss_fn(preds, labels)
        losses.append(loss.item())

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0) # Gradient clipping
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        progress_bar.set_postfix({'loss': loss.item()})


    return np.mean(losses) # Return average loss for the epoch

# --- Evaluation Function ---
def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()
    losses = []
    # Store predictions and actual labels for metrics calculation
    all_preds = []
    all_labels = []

    with torch.no_grad():
        progress_bar = tqdm(data_loader, desc="Evaluating", leave=False)
        for d in progress_bar:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            labels = d["labels"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            preds = outputs.logits.squeeze()
            labels = labels.squeeze()

            loss = loss_fn(preds, labels)
            losses.append(loss.item())

            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            progress_bar.set_postfix({'loss': loss.item()})


    # Calculate Regression Metrics
    from sklearn.metrics import mean_absolute_error, r2_score
    mae = mean_absolute_error(all_labels, all_preds)
    r2 = r2_score(all_labels, all_preds)

    return np.mean(losses), mae, r2

# --- Training Loop ---
history = {'train_loss': [], 'val_loss': [], 'val_mae': [], 'val_r2': []}

for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(train_texts)
    )
    print(f'Train loss {train_loss}')
    history['train_loss'].append(train_loss)

    val_loss, val_mae, val_r2 = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device,
        len(val_texts)
    )
    print(f'Val loss {val_loss}, Val MAE {val_mae}, Val R2 {val_r2}')
    history['val_loss'].append(val_loss)
    history['val_mae'].append(val_mae)
    history['val_r2'].append(val_r2)

print("Training complete.")

# --- Optional: Plot training history ---
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 5))

plt.subplot(1, 3, 1)
plt.plot(history['train_loss'], label='Train Loss')
plt.plot(history['val_loss'], label='Validation Loss')
plt.title('Loss')
plt.legend()

plt.subplot(1, 3, 2)
plt.plot(history['val_mae'], label='Validation MAE')
plt.title('Mean Absolute Error')
plt.legend()

plt.subplot(1, 3, 3)
plt.plot(history['val_r2'], label='Validation R^2')
plt.title('R^2 Score')
plt.legend()

plt.tight_layout()
plt.show()

# --- Optional: Make predictions on test set (if needed) ---
# You can adapt the eval_model function or create a new one
# to get predictions for specific examples.

Using device: cpu


2025-05-05 16:19:44.987842: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746454785.037232  247500 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746454785.053231  247500 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1746454785.085481  247500 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746454785.085510  247500 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1746454785.085513  247500 computation_placer.cc:177] computation placer alr

Epoch 1/3
----------


Training:   0%|          | 0/10835 [00:00<?, ?it/s]

# Small tests

In [None]:
original_row = data.loc[X_test_indices[33]]
display(original_row)

print("y_test[33]:", y_test[33])
print("y_pred_lr[33]:", y_pred_lr[33])

id                                                                     322833
name                                            cucumber cilantro pasta salad
minutes                                                                    30
n_steps                                                                     4
description                 so cool and refreshing. if you like spicy, you...
n_ingredients                                                               9
steps_string_standardize    cook pasta , drain combin lime juic , cilantro...
ingredients_text            ['cucumber', 'roma tomato', 'red onion', 'lime...
tags_text                   ['30-minutes-or-less', 'time-to-make', 'course...
calories                                                                 34.8
total_fat                                                                 0.0
sugar                                                                    18.0
sodium                                                          

y_test[33]: 30
y_pred_lr[33]: 26.619497648857614


In [25]:
data.head()

Unnamed: 0,id,name,minutes,n_steps,description,n_ingredients,steps_string_standardize,ingredients_text,tags_text,calories,total_fat,sugar,sodium,protein,saturated_fat,carbohydrates
0,137739,arriba baked squash mexican,55,11,autumn is my favorite time of year to cook! th...,7,make a choic and proceed with recip depend on ...,"['winter squash', 'mexican seasoning', 'mixed ...","['60-minutes-or-less', 'time-to-make', 'course...",51.5,0.0,13.0,0.0,2.0,0.0,4.0
1,31490,breakfast pizza,30,9,this recipe calls for the crust to be prebaked...,6,preheat oven to 103.33 celsius °c press dough ...,"['prepared pizza crust', 'sausage patty', 'egg...","['30-minutes-or-less', 'time-to-make', 'course...",173.4,18.0,0.0,17.0,22.0,35.0,1.0
2,112140,chili,130,6,this modified version of 'mom's' chili was a h...,13,brown ground beef in larg pot add chop onion t...,"['ground beef', 'yellow onions', 'diced tomato...","['time-to-make', 'course', 'preparation', 'mai...",269.8,22.0,32.0,48.0,39.0,27.0,5.0
3,59389,alouette potato,45,11,"this is a super easy, great tasting, make ahea...",11,place potato in a larg pot of lightli salt wat...,"['spreadable cheese with garlic and herbs', 'n...","['60-minutes-or-less', 'time-to-make', 'course...",368.1,17.0,10.0,2.0,14.0,8.0,20.0
4,44061,amish tomato ketchup canning,190,5,my dh's amish mother raised him on this recipe...,8,"mix all ingredients& boil for 2 30.0 minute , ...","['tomato juice', 'apple cider vinegar', 'sugar...","['weeknight', 'time-to-make', 'course', 'main-...",352.9,1.0,337.0,23.0,3.0,0.0,28.0
