In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data and output paths used below live under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Location on the mounted Drive where the dataset zip archive is stored.
data_path = "/content/drive/MyDrive/DL/Project/data/food-ingredients-and-recipe-dataset-with-images.zip"
# Kaggle API endpoint the dataset archive is downloaded from.
url = "https://www.kaggle.com/api/v1/datasets/download/pes12017000148/food-ingredients-and-recipe-dataset-with-images"

In [None]:
# download data
import os
import subprocess

def download_data_if_not_exists(data_path, url):
  """Downloads data using curl if it doesn't already exist.

  The archive is first written to ``data_path + ".part"`` and only moved to
  ``data_path`` once curl has exited successfully. A failed or interrupted
  download therefore never leaves a truncated file behind that a later run
  would mistake for a complete one.

  Args:
    data_path: Destination file path of the downloaded archive.
    url: URL to fetch; redirects are followed.

  Raises:
    subprocess.CalledProcessError: If curl exits with a non-zero status
      (including HTTP error responses, because ``--fail`` is passed).
    FileNotFoundError: If the ``curl`` executable is not available.
  """
  if os.path.exists(data_path):
    print("Data already downloaded.")
    return

  print("Data not found. Downloading...")

  # curl does not create missing directories for its -o target.
  parent_dir = os.path.dirname(data_path)
  if parent_dir:
    os.makedirs(parent_dir, exist_ok=True)

  partial_path = data_path + ".part"
  try:
    # --fail makes curl exit non-zero on HTTP errors instead of saving the
    # error page as if it were the archive.
    subprocess.run(["curl", "--fail", "-L", "-o", partial_path, url], check=True)
    os.replace(partial_path, data_path)
  finally:
    # Only present here if the download or the rename failed.
    if os.path.exists(partial_path):
      os.remove(partial_path)
  print("Download complete.")

# Fetch the dataset archive unless a copy is already present at data_path
download_data_if_not_exists(data_path=data_path, url=url)

Data not found. Downloading...
Download complete.


In [None]:
# unzip data
import os
import zipfile

def unzip_data(zip_path, extract_path):
  """Extracts every member of the archive at zip_path into extract_path.

  Prints a hint and returns without extracting when zip_path does not exist.
  """
  if not os.path.exists(zip_path):
    print("Zip file not found. Please download the data first.")
    return

  print("Unzipping data...")
  with zipfile.ZipFile(zip_path, 'r') as archive:
    archive.extractall(extract_path)
  print("Unzipping complete.")

# Extract the downloaded archive into the project data directory
unzip_data(zip_path=data_path, extract_path="/content/drive/MyDrive/DL/Project/data/")

Unzipping data...
Unzipping complete.


In [None]:
# set device
import torch

# Prefer Apple's MPS backend, then CUDA, and fall back to the CPU.
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

print("Using device = " + device)
if device == 'cpu':
    print("WARNING: Using CPU will cause slower train times")

Using device = cpu


In [None]:
# read data and create train and test text files
import pandas as pd
import re
from sklearn.model_selection import train_test_split

# Load the recipe table; its 'Ingredients' and 'Instructions' columns are read when the text files are built.
df = pd.read_csv('/content/drive/MyDrive/DL/Project/data/Food Ingredients and Recipe Dataset with Image Name Mapping.csv')

def build_text_files(data_csv, dest_path):
    """Write one plain-text training record per row of ``data_csv``.

    Each row becomes two lines::

        Ingredients: <ingredients>
        Instructions: <instructions>

    Every whitespace character inside a field (including newlines and tabs)
    is replaced by a single space so that a record never spans more than
    those two lines.

    Args:
        data_csv: DataFrame with 'Ingredients' and 'Instructions' columns.
        dest_path: Path of the text file to create (overwritten if present).
    """
    records = []
    for _, row in data_csv.iterrows():
        # [1:-1] drops the first and last character of the stringified value,
        # presumably the brackets of a list literal -- verify against the CSV.
        ingredients = str(row['Ingredients']).strip()[1:-1]
        instructions = str(row['Instructions']).strip()

        ingredients = re.sub(r"\s", " ", ingredients)
        instructions = re.sub(r"\s", " ", instructions)

        records.append('Ingredients: ' + ingredients + '\nInstructions: ' + instructions + "\n")

    # The context manager guarantees the file is flushed and closed before the
    # function returns; UTF-8 is explicit so the output does not depend on the
    # platform's default encoding.
    with open(dest_path, 'w', encoding='utf-8') as f:
        f.write(''.join(records))

# A fixed random_state keeps the train/test partition identical across runs,
# so the text files and any evaluation numbers derived from them are reproducible.
train, test = train_test_split(df, test_size=0.15, random_state=42)

build_text_files(train,'/content/drive/MyDrive/DL/Project/data/train_dataset.txt')
build_text_files(test,'/content/drive/MyDrive/DL/Project/data/test_dataset.txt')

print("Train dataset length: "+str(len(train)))
print("Test dataset length: "+ str(len(test)))

# Train dataset length: 11475
# Test dataset length: 2026

Train dataset length: 11475
Test dataset length: 2026


In [None]:
# load the data into huggingface datasets and create model and tokenizer
from transformers import TextDataset,DataCollatorForLanguageModeling
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForCausalLM

# Text files to tokenize for training and evaluation.
train_path = '/content/drive/MyDrive/DL/Project/data/train_dataset.txt'
test_path = '/content/drive/MyDrive/DL/Project/data/test_dataset.txt'

# Tokenizer and model weights are loaded from the same pretrained checkpoint.
gpt2_checkpoint = "openai-community/gpt2"
tokenizer = AutoTokenizer.from_pretrained(gpt2_checkpoint)
model = AutoModelForCausalLM.from_pretrained(gpt2_checkpoint)

def load_dataset(train_path,test_path,tokenizer,block_size=128):
    """Build the train/test datasets and the collator for causal LM training.

    Args:
        train_path: Path of the training text file.
        test_path: Path of the evaluation text file.
        tokenizer: Tokenizer passed to both datasets and to the collator.
        block_size: Value forwarded as ``block_size`` to each ``TextDataset``.
            Defaults to 128, the value that was previously hard-coded.

    Returns:
        Tuple ``(train_dataset, test_dataset, data_collator)``.
    """
    # NOTE(review): TextDataset appears to be deprecated in recent transformers
    # releases in favour of the `datasets` library -- confirm against the
    # installed version before upgrading.
    def _text_dataset(file_path):
        # Both splits must be tokenized and chunked with identical settings.
        return TextDataset(
            tokenizer=tokenizer,
            file_path=file_path,
            block_size=block_size)

    train_dataset = _text_dataset(train_path)
    test_dataset = _text_dataset(test_path)

    # mlm=False selects the non-masked (causal) language-modeling collation.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=False,
    )
    return train_dataset,test_dataset,data_collator

train_dataset, test_dataset, data_collator = load_dataset(
    train_path=train_path,
    test_path=test_path,
    tokenizer=tokenizer,
)



In [None]:
# training step
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/DL/Project/gpt2-recipe",  # Directory where checkpoints are written
    num_train_epochs=10,                   # Number of training epochs
    per_device_train_batch_size=128,       # Training batch size per device
    per_device_eval_batch_size=128,        # Evaluation batch size per device
    logging_dir='/content/drive/MyDrive/DL/Project/logs',        # Directory for logging
    save_steps=500,                        # Save a checkpoint every 500 steps
    save_total_limit=2,                    # Keep only the newest checkpoints so Drive storage does not fill up
    eval_strategy="epoch",                 # Evaluate at the end of every epoch
    logging_steps=100,                     # Log every 100 steps
    warmup_steps=200,                      # Warm-up steps for learning rate scheduler
    weight_decay=0.01,                     # Weight decay
    logging_first_step=True,               # Also log the very first step
    report_to="none"                       # Do not report to any external tracking integration
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,  # Needed here: eval_strategy="epoch" runs an evaluation pass each epoch
    data_collator=data_collator,
)

trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 