# Baseline Notebook

In [1]:
!pip install -U accelerate




In [2]:
!pip install -U transformers



In [3]:
pip install torch



In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
import torch
import torch.nn as nn
from transformers import GPT2TokenizerFast, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
from tqdm.auto import tqdm
import pandas as pd
import numpy as np
import accelerate

In [7]:
model_name = 'gpt2'

In [8]:
model_save_path = '/content/drive/MyDrive/major project'

In [9]:
tokenizer = GPT2TokenizerFast.from_pretrained(model_name,
                                              bos_token='<|startoftext|>',
                                              eos_token='<|endoftext|>',
                                              unk_token='<|unknown|>',
                                              pad_token='<|pad|>'
                                             )
model = GPT2LMHeadModel.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Embedding(50260, 768)

In [10]:
tokenizer.save_pretrained(model_save_path)

('/content/drive/MyDrive/major project/tokenizer_config.json',
 '/content/drive/MyDrive/major project/special_tokens_map.json',
 '/content/drive/MyDrive/major project/vocab.json',
 '/content/drive/MyDrive/major project/merges.txt',
 '/content/drive/MyDrive/major project/added_tokens.json',
 '/content/drive/MyDrive/major project/tokenizer.json')

In [11]:
tokenizer.convert_tokens_to_ids(['<|pad|>'])

[50259]

In [12]:
def generate(prompt):
    inputs = tokenizer.encode_plus(prompt, return_tensors='pt')
    output = model.generate(**inputs,max_length=256,do_sample=True,pad_token_id=50259)
    print(tokenizer.decode(output[0]))

In [13]:
tokenizer.special_tokens_map

{'bos_token': '<|startoftext|>',
 'eos_token': '<|endoftext|>',
 'unk_token': '<|unknown|>',
 'pad_token': '<|pad|>'}

In [14]:
tokenizer.convert_tokens_to_ids(['<|startoftext|>'],)

[50257]

In [15]:
clean = pd.read_csv('/content/drive/MyDrive/dataset/Food_Recipe_Dataset.csv')
clean = clean.sample(frac=1)
clean.reset_index(drop=True,inplace=True)

In [16]:
print(clean['Cuisine'].unique())

['North Indian Recipes' 'Fusion' 'Andhra' 'Continental' 'Rajasthani'
 'Indian' 'Himachal' 'Italian Recipes' 'Punjabi' 'European' 'Karnataka'
 'South Indian Recipes' 'Maharashtrian Recipes' 'Bengali Recipes'
 'Tamil Nadu' 'Malabar' 'Indo Chinese' 'Uttar Pradesh' 'Japanese'
 'Mexican' 'Awadhi' 'Mediterranean' 'African' 'North Karnataka' 'Konkan'
 'Chettinad' 'Kerala Recipes' 'Asian' 'Goan Recipes' 'Malaysian'
 'Kashmiri' 'Chinese' 'Pakistani' 'Nagaland' 'Mangalorean' 'Mughlai'
 'French' 'Thai' 'Udupi' 'Middle Eastern' 'Parsi Recipes'
 'Gujarati Recipes\ufeff' 'American' 'Sindhi' 'Malvani' 'Bihari'
 'South Karnataka' 'Hyderabadi' 'Assamese' 'Indonesian' 'Arab' 'Jewish'
 'Caribbean' 'North East India Recipes' 'Appetizer' 'Oriya Recipes'
 'Jharkhand' 'Greek' 'Coastal Karnataka' 'Coorg' 'Vietnamese'
 'World Breakfast' 'Sri Lankan' 'British' 'Haryana' 'Lucknowi' 'Afghan'
 'Sichuan' 'Cantonese' 'Nepalese' 'Side Dish' 'Uttarakhand-North Kumaon'
 'Lunch' 'Shandong' 'Snack' 'Korean' 'Dinner' 'Hun

In [17]:
def print_recipe(idx):
    print(f"{clean['ingredients'][idx]}\n\n{clean['instructions'][idx]}")

In [18]:
def form_string(ingredient,instruction):
    s = f"<|startoftext|>Ingredients:\n{ingredient.strip()}\n\nInstructions:\n{instruction.strip()}<|endoftext|>"
    return s

In [19]:
data = clean.apply(lambda x:form_string(x['TranslatedIngredients'],x['TranslatedInstructions']),axis=1).to_list()

https://towardsdatascience.com/guide-to-fine-tuning-text-generation-models-gpt-2-gpt-neo-and-t5-dc5de6b3bc5e

In [20]:
train_size = 0.85
train_len = int(train_size * len(data))
train_data = data[:train_len]
val_data = data[train_len:]

In [21]:
class RecipeDataset:
    def __init__(self,data):
        self.data = data
        self.input_ids = []
        self.attn_masks = []

        for item in tqdm(data):
            encodings = tokenizer.encode_plus(item,
                                              truncation=True,
                                              padding='max_length',
                                              max_length=1024,
                                              return_tensors='pt'
                                             )
            self.input_ids.append(torch.squeeze(encodings['input_ids'],0))
            self.attn_masks.append(torch.squeeze(encodings['attention_mask'],0))

    def __len__(self):
        return len(self.data)

    def __getitem__(self,idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [22]:
def collate_fn(batch):
    return {
        'input_ids': torch.stack([item[0] for item in batch]),
        'attention_mask': torch.stack([item[1] for item in batch]),
        'labels': torch.stack([item[0] for item in batch])
    }

In [23]:
train_ds = RecipeDataset(train_data)
val_ds = RecipeDataset(val_data)

  0%|          | 0/5047 [00:00<?, ?it/s]

  0%|          | 0/891 [00:00<?, ?it/s]

In [24]:
args = TrainingArguments(output_dir=model_save_path,
                         per_device_train_batch_size=2,
                         per_device_eval_batch_size=2,
                         gradient_accumulation_steps=2,
                         report_to='none',
                         num_train_epochs=3,
                         save_strategy='no'
                        )

In [25]:
optim = torch.optim.AdamW(model.parameters(),lr=5e-5)
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optim,20,eta_min=1e-7)

In [26]:
trainer = Trainer(model,
                  args,
                  train_dataset=train_ds,
                  eval_dataset=val_ds,
                  data_collator=collate_fn,
                  optimizers=(optim,scheduler)
                 )

In [None]:
trainer.train()

Step,Training Loss
500,1.1299
1000,0.8162
1500,0.7737
2000,0.7298
2500,0.723
3000,0.7032


In [5]:
trainer.save_model()

NameError: name 'trainer' is not defined

In [None]:
from transformers import pipeline

In [None]:
pl = pipeline(task='text-generation',model='/content/drive/MyDrive/major project')

In [None]:
def create_prompt(ingredients):
    ingredients = ','.join([x.strip().lower() for x in ingredients.split(',')])
    ingredients = ingredients.strip().replace(',','\n')
    s = f"<|startoftext|>Ingredients:\n{ingredients}\n"
    return s

In [None]:
ingredients = ['Rice,Potatoes,Tomatoes,Spinach,red bell peppers','chicken,tomatoes,aloo,jeera,curry powder']

In [None]:

for ing in ingredients:
    prompt = create_prompt(ing)
    print(pl(prompt,
         max_new_tokens=512,
         penalty_alpha=0.6,
         top_k=4,
         pad_token_id=50259
        )[0]['generated_text'])