In [1]:
import torch
import torch.nn as nn
from transformers import GPT2TokenizerFast, GPT2LMHeadModel

from transformers import Trainer, TrainingArguments

from tqdm.auto import tqdm

import pandas as pd
import numpy as np

In [2]:
# Hugging Face model identifier passed to from_pretrained() for both the
# tokenizer and the language model.
model_name = 'gpt2'

In [3]:
# Directory the tokenizer is saved to; also used as the Trainer's output_dir.
model_save_path = './khaanaGPT'

In [4]:
# Load the fast GPT-2 tokenizer and register the special tokens used by the
# recipe template. Keyword order is kept as-is because hard-coded token ids
# further down the notebook depend on the ids assigned here.
tokenizer = GPT2TokenizerFast.from_pretrained(
    model_name,
    bos_token='<|startoftext|>',
    eos_token='<|endoftext|>',
    unk_token='<|unknown|>',
    pad_token='<|pad|>',
)

model = GPT2LMHeadModel.from_pretrained(model_name)

# Resize the embedding matrix to the tokenizer's vocabulary size, which now
# includes the extra special tokens. Left as the last expression so the
# resulting Embedding is displayed.
model.resize_token_embeddings(len(tokenizer))

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Embedding(50260, 768)

In [5]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): no visible cell reads from /content/drive (the CSV is loaded
# from /content/) -- confirm whether this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# Persist the tokenizer, including the added special tokens, to the model
# directory; the returned tuple lists the files written.
tokenizer.save_pretrained(model_save_path)

('./khaanaGPT/tokenizer_config.json',
 './khaanaGPT/special_tokens_map.json',
 './khaanaGPT/vocab.json',
 './khaanaGPT/merges.txt',
 './khaanaGPT/added_tokens.json',
 './khaanaGPT/tokenizer.json')

In [7]:
# Inspect the integer id assigned to the pad token.
tokenizer.convert_tokens_to_ids(['<|pad|>'])

[50259]

In [8]:
def generate(prompt, max_length=256):
    """Sample a continuation of ``prompt`` from the global ``model`` and print it.

    Args:
        prompt: Text the generation is conditioned on.
        max_length: Value forwarded to ``model.generate`` as ``max_length``
            (defaults to the previously hard-coded 256).

    Returns:
        None. The decoded first returned sequence is printed, special tokens
        included.
    """
    inputs = tokenizer(prompt, return_tensors='pt')
    # After training, the Trainer may have moved the model to an accelerator.
    # The inputs must be on the same device or generate() fails with a
    # device-mismatch error.
    inputs = inputs.to(model.device)
    output = model.generate(
        **inputs,
        max_length=max_length,
        do_sample=True,
        # Ask the tokenizer for the pad id instead of hard-coding 50259, so the
        # call stays correct if the vocabulary changes.
        pad_token_id=tokenizer.pad_token_id,
    )
    print(tokenizer.decode(output[0]))

In [9]:
# Show which special-token roles are mapped to which token strings.
tokenizer.special_tokens_map

{'bos_token': '<|startoftext|>',
 'eos_token': '<|endoftext|>',
 'unk_token': '<|unknown|>',
 'pad_token': '<|pad|>'}

In [10]:
# Inspect the integer id assigned to the beginning-of-text token.
tokenizer.convert_tokens_to_ids(['<|startoftext|>'],)

[50257]

In [11]:
# Load the recipe table and shuffle its rows, so the positional train/validation
# split made later is a random split. A fixed random_state makes the shuffle,
# and therefore the split, reproducible across kernel restarts. Building the
# frame in one chain avoids mutating it in place.
clean = (
    pd.read_csv('/content/Cleaned_Indian_Food_Dataset.csv')
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [12]:
def print_recipe(idx):
    """Print the ingredients and instructions stored at ``idx`` in ``clean``.

    The two fields are separated by a blank line.
    """
    ingredients = clean['Cleaned-Ingredients'][idx]
    instructions = clean['TranslatedInstructions'][idx]
    print(f"{ingredients}\n\n{instructions}")

In [13]:
def form_string(ingredient,instruction):
    """Render one recipe as a single training string.

    Both fields are stripped of surrounding whitespace and placed under their
    section headers, wrapped in the start-of-text and end-of-text markers.

    Args:
        ingredient: Ingredient text of the recipe.
        instruction: Instruction text of the recipe.

    Returns:
        The formatted training example as a ``str``.
    """
    ingredient_text = ingredient.strip()
    instruction_text = instruction.strip()
    return (
        "<|startoftext|>"
        f"Cleaned-Ingredients:\n{ingredient_text}\n\n"
        f"TranslatedInstructions:\n{instruction_text}"
        "<|endoftext|>"
    )

In [14]:
# One formatted training string per recipe row, in row order.
data = [
    form_string(ingredient, instruction)
    for ingredient, instruction in zip(clean['Cleaned-Ingredients'],
                                       clean['TranslatedInstructions'])
]

In [15]:
# Positional split: the first 90% of `data` is used for training, the
# remainder for validation.
train_size = 0.90
train_len = int(len(data) * train_size)
train_data, val_data = data[:train_len], data[train_len:]

In [16]:
class RecipeDataset:
    """Pre-tokenized recipe strings exposed through ``len()`` and indexing.

    Every string is encoded once, in the constructor, with the module-level
    ``tokenizer``; sequences are truncated or padded to 1024 tokens.
    """

    def __init__(self,data):
        """Tokenize every item of ``data`` up front.

        Args:
            data: Iterable of recipe strings with a length; it is kept on the
                instance and used to report the dataset size.
        """
        self.data = data
        self.input_ids = []
        self.attn_masks = []

        for text in tqdm(data):
            encoded = tokenizer.encode_plus(
                text,
                max_length=1024,
                padding='max_length',
                truncation=True,
                return_tensors='pt',
            )
            # Drop the size-1 leading axis that return_tensors='pt' adds, so
            # each stored tensor is a single sequence.
            self.input_ids.append(encoded['input_ids'].squeeze(0))
            self.attn_masks.append(encoded['attention_mask'].squeeze(0))

    def __len__(self):
        """Return the number of items in the original ``data``."""
        return len(self.data)

    def __getitem__(self,idx):
        """Return the ``(input_ids, attention_mask)`` pair stored at ``idx``."""
        ids, mask = self.input_ids[idx], self.attn_masks[idx]
        return ids, mask

In [17]:
def collate_fn(batch):
    """Stack per-example tensors into one batch for causal-LM training.

    Args:
        batch: Sequence of ``(input_ids, attention_mask)`` tensor pairs, all of
            the same length.

    Returns:
        dict with:
            'input_ids': stacked token ids.
            'attention_mask': stacked masks.
            'labels': a copy of ``input_ids`` with every position whose
                attention mask is 0 replaced by -100.
    """
    input_ids = torch.stack([item[0] for item in batch])
    attention_mask = torch.stack([item[1] for item in batch])
    # -100 is the ignore index of the cross-entropy loss used by the model.
    # Without this masking, padded positions would contribute to the loss and
    # the model would be trained to emit pad tokens.
    labels = input_ids.masked_fill(attention_mask == 0, -100)
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels,
    }

In [18]:
# Tokenize both splits up front; each constructor call iterates its data
# under a tqdm progress bar.
train_ds = RecipeDataset(train_data)
val_ds = RecipeDataset(val_data)

  0%|          | 0/5344 [00:00<?, ?it/s]

  0%|          | 0/594 [00:00<?, ?it/s]

In [19]:
from transformers import TrainingArguments


In [20]:
# Training configuration. Per-device batches of 2 with 2 gradient-accumulation
# steps give 4 examples per optimizer update on each device.
args = TrainingArguments(
    output_dir=model_save_path,
    num_train_epochs=5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    save_strategy='no',   # no intermediate checkpoints
    report_to='none',     # no external experiment loggers
)

In [21]:
# Optimizer and LR schedule handed to the Trainer explicitly: AdamW at 5e-5,
# with cosine annealing that restarts every 20 scheduler steps and decays
# towards 1e-7.
optim = torch.optim.AdamW(params=model.parameters(), lr=5e-5)
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer=optim,
    T_0=20,
    eta_min=1e-7,
)

In [22]:
from transformers import Trainer


In [25]:
!pip install accelerate



In [None]:
# Wire the model, datasets, collator and the hand-built optimizer/scheduler
# pair into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=collate_fn,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    optimizers=(optim, scheduler),
)

In [None]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()

Step,Training Loss


In [None]:
# Save the fine-tuned model. Called without a path, so the Trainer's
# configured output_dir (model_save_path) is presumably the destination --
# confirm against the Trainer.save_model documentation.
trainer.save_model()

In [None]:
from transformers import pipeline
# Load from the directory the tokenizer and Trainer output were written to.
# The previous hard-coded '/kaggle/working/khaanaGPT' path does not match
# model_save_path ('./khaanaGPT'), so it only resolves when the working
# directory happens to be /kaggle/working.
pl = pipeline(task='text-generation',model=model_save_path)


In [None]:
def create_prompt(ingredients):
    """Build a generation prompt from a comma-separated ingredient string.

    Each ingredient is stripped and lower-cased, empty entries (from stray or
    trailing commas) are dropped, and the remaining ones are placed one per
    line under the same ``Cleaned-Ingredients:`` header that the training
    template in ``form_string`` uses. The header previously read
    ``Ingredients:``, which never appears in the training strings.

    Args:
        ingredients: Ingredient names separated by commas.

    Returns:
        The prompt string, beginning with the start-of-text marker and ending
        with a newline after the last ingredient.
    """
    # NOTE(review): ingredients are emitted one per line here; confirm this
    # matches how the 'Cleaned-Ingredients' column separates items in the
    # training data.
    items = [part.strip().lower() for part in ingredients.split(',')]
    body = '\n'.join(item for item in items if item)
    return f"<|startoftext|>Cleaned-Ingredients:\n{body}\n"

In [None]:
# Sample inputs: each entry is one comma-separated ingredient list that is
# turned into a separate prompt.
ingredients = ['Rice,Potatoes,Tomatoes,Spinach,red bell peppers','chicken,tomatoes,aloo,jeera,curry powder']

In [None]:
# Generate and print one recipe per sample ingredient list.
for ing in ingredients:
    prompt = create_prompt(ing)
    outputs = pl(
        prompt,
        max_new_tokens=512,
        penalty_alpha=0.6,
        top_k=4,
        # Take the pad id from the pipeline's own tokenizer instead of the
        # hard-coded 50259, so it always matches the loaded vocabulary.
        pad_token_id=pl.tokenizer.pad_token_id,
    )
    print(outputs[0]['generated_text'])


In [None]:
!zip -r file.zip /kaggle/working/