## Libraries


In [1]:
!pip install huggingface_hub --quiet

from huggingface_hub import hf_hub_url
import requests
import os

# Folder to store model files
model_dir = "./base_model"
os.makedirs(model_dir, exist_ok=True)

# Hugging Face repo
repo_id = "flax-community/t5-recipe-generation"

# List of files to download
files = [
    "config.json",
    "model.safetensors",
    "special_tokens_map.json",
    "tokenizer.json",
    "tokenizer_config.json"
]

# Download each file directly into ./model/
for file_name in files:
    url = hf_hub_url(repo_id, filename=file_name)
    r = requests.get(url)
    with open(os.path.join(model_dir, file_name), "wb") as f:
        f.write(r.content)
    print(f"Downloaded {file_name} to {model_dir}")


Downloaded config.json to ./base_model
Downloaded model.safetensors to ./base_model
Downloaded special_tokens_map.json to ./base_model
Downloaded tokenizer.json to ./base_model
Downloaded tokenizer_config.json to ./base_model


In [2]:
import torch
import configparser
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import T5ForConditionalGeneration, T5TokenizerFast, get_scheduler
from torch.optim import AdamW
from tqdm import tqdm
from torch.utils.data import DataLoader


2025-10-06 10:52:49.698443: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1759747969.919829      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1759747969.983851      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


### Paths loading

In [3]:
config = configparser.ConfigParser()
config.read('/kaggle/input/config/config.ini')

prepared_dataset_path = config['PATHS']['prepared_dataset_path']
base_model_path = config['PATHS']['prepared_dataset_path']
trained_model_path = config['PATHS']['prepared_dataset_path']
model_dir = config['PATHS']['model_dir']

train_tokens_path = config['PATHS']['train_tokens_path']
val_tokens_path = config['PATHS']['val_tokens_path']


## Data loading


In [4]:
df = pd.read_csv("/kaggle/input/10k-prepared-reciepe-data/prepared_dataset.csv")
df.dropna(how='any', inplace=True)
df.isna().sum()
df.head()


Unnamed: 0,input_text,target_text
0,"1 1/2 lbs cube steaks, 1/4 cup self rising flo...",dredge steak pieces in flour. in a large skill...
1,"1 medium leek, (white portion only), halved an...","in a large saucepan, saute leek in butter unti..."
2,"1 whole chicken, 2 c. cream of chicken soup, s...","boil and bone chicken. mix chicken with soup, ..."
3,"1 crab (about 1 1/2 - 2 pounds), 2 inches ging...",mix the sauce and set aside. clean the crab a...
4,"2 1/2 cups flour, all-purpose, 1 1/2 teaspoons...",preheat oven to 375f (190c) (190c). grease bak...


In [5]:
df_train, df_val = train_test_split(df, test_size=0.1, random_state=0)

In [6]:
del df
df_train.shape, df_val.shape

((9000, 2), (1000, 2))

In [7]:
model_dir

'./base_model/'

In [8]:
tokenizer = T5TokenizerFast.from_pretrained(model_dir)

batch_size = 100000

def preprocess_in_batches(df, batch_size=512):
    input_ids, attention_masks, labels = [], [], []

    for i in tqdm(range(0, len(df), batch_size)):
        batch_df = df.iloc[i:i+batch_size]

        # Tokenize inputs
        inputs = tokenizer(
            batch_df["input_text"].tolist(),
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        # Tokenize targets
        targets = tokenizer(
            batch_df["target_text"].tolist(),
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )

        input_ids.append(inputs["input_ids"])
        attention_masks.append(inputs["attention_mask"])
        labels.append(targets["input_ids"])

        # Optionally free memory
        del inputs, targets

    # Concatenate all batches into single tensors
    return {
        "input_ids": torch.cat(input_ids, dim=0),
        "attention_mask": torch.cat(attention_masks, dim=0),
        "labels": torch.cat(labels, dim=0)
    }

# Use smaller batches if memory is tight
train_data = preprocess_in_batches(df_train, batch_size=batch_size)
del df_train

val_data = preprocess_in_batches(df_val, batch_size=batch_size)
del df_val

100%|██████████| 1/1 [00:06<00:00,  6.36s/it]
100%|██████████| 1/1 [00:00<00:00,  1.62it/s]


In [9]:
# ########### Saving token because it is costly to run multiple time

# torch.save(train_data, train_tokens_path)
# torch.save(val_data, val_tokens_path)


In [10]:
# ########### Saving token because it is costly to run multiple time

# train_data = torch.load(train_tokens_path)
# val_data = torch.load(val_tokens_path)

### Model loading

In [11]:
# Load the model safely
model = T5ForConditionalGeneration.from_pretrained(
    model_dir,
    device_map="auto"            # automatically put on GPU if available
)

In [12]:
device = next(model.parameters()).device
print("Model loaded on device:", device)


Model loaded on device: cuda:0


In [13]:
batch_size = 16
num_epochs = 3
lr = 5e-5

train_loader = DataLoader(
    list(zip(train_data["input_ids"], train_data["attention_mask"], train_data["labels"])),
    batch_size=batch_size,
    shuffle=True
)
val_loader = DataLoader(
    list(zip(val_data["input_ids"].to(device), val_data["attention_mask"].to(device), val_data["labels"].to(device))),
    batch_size=batch_size
)

In [14]:
optimizer = AdamW(model.parameters(), lr=lr)
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)


### Evaluating model performance before finetunning

In [15]:
model.eval()  # no gradient updates
val_loss = 0
all_preds = []
all_labels = []

with torch.no_grad():
    total_loss = 0
    for batch in tqdm(val_loader, desc="Evaluating initial loss"):
        input_ids, attention_mask, labels = batch  # unpack the tuple
        # input_ids = input_ids.to(device)
        # attention_mask = attention_mask.to(device)
        # labels = labels.to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        total_loss += outputs.loss.item()

        generated_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=labels.shape[1],
            num_beams=4
        )

        preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
        refs = tokenizer.batch_decode(labels, skip_special_tokens=True)

        all_preds.extend(preds)
        all_labels.extend(refs)

        del input_ids
        del attention_mask
        del labels

initial_loss = total_loss / len(val_loader)
print(f"Average loss before fine-tuning: {initial_loss:.4f}")


Evaluating initial loss:   0%|          | 0/63 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
Evaluating initial loss: 100%|██████████| 63/63 [36:45<00:00, 35.01s/it]

Average loss before fine-tuning: 23.5893





In [18]:
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

In [19]:
# Compute BLEU
smooth_fn = SmoothingFunction().method1
bleu_scores = [sentence_bleu([ref.split()], pred.split(), smoothing_function=smooth_fn) 
               for pred, ref in zip(all_preds, all_labels)]
average_bleu = sum(bleu_scores) / len(bleu_scores)
print(f"Average BLEU Score: {average_bleu:.4f}")

Average BLEU Score: 0.0108


In [20]:
# Compute ROUGE
rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge1, rouge2, rougeL = 0, 0, 0
for pred, ref in zip(all_preds, all_labels):
    scores = rouge.score(ref, pred)
    rouge1 += scores['rouge1'].fmeasure
    rouge2 += scores['rouge2'].fmeasure
    rougeL += scores['rougeL'].fmeasure

n = len(all_preds)
print(f"Average ROUGE-1: {rouge1/n:.4f}")
print(f"Average ROUGE-2: {rouge2/n:.4f}")
print(f"Average ROUGE-L: {rougeL/n:.4f}")

Average ROUGE-1: 0.1235
Average ROUGE-2: 0.0350
Average ROUGE-L: 0.0840


In [21]:
import math

average_loss = total_loss / len(val_loader)
perplexity = math.exp(average_loss)
print(f"Average Loss: {average_loss:.4f}")
print(f"Perplexity (PPL): {perplexity:.4f}")


Average Loss: 23.5893
Perplexity (PPL): 17567828210.8236


In [26]:
from evaluate import load

bertscore = load("bertscore")
results = bertscore.compute(predictions=all_preds, references=all_labels, lang="en")

print(f"Average BERTScore (Precision): {sum(results['precision']) / len(results['precision']):.4f}")
print(f"Average BERTScore (Recall): {sum(results['recall']) / len(results['recall']):.4f}")
print(f"Average BERTScore (F1): {sum(results['f1']) / len(results['f1']):.4f}")


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average BERTScore (Precision): 0.7360
Average BERTScore (Recall): 0.8178
Average BERTScore (F1): 0.7742


In [27]:
from sentence_transformers import SentenceTransformer, util

# Compute cosine similarity
model_emb = SentenceTransformer("all-MiniLM-L6-v2")
pred_emb = model_emb.encode(all_preds, convert_to_tensor=True)
ref_emb = model_emb.encode(all_labels, convert_to_tensor=True)
cosine_scores = util.cos_sim(pred_emb, ref_emb)
print(f"Average Semantic Cosine Similarity: {cosine_scores.diag().mean().item():.4f}")


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Average Semantic Cosine Similarity: 0.5326
