### **Install the required packages**

In [None]:
!pip install transformers
!pip install torch
!pip install tqdm
! pip install accelerate -U
! pip install transformers -U

Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m53.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m32.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m96.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m66.6 MB/s[0m eta [36m0:00:00[0m
Col

### **Import the necessary libraries**

In [None]:
import torch
import torch.nn as nn
import accelerate
import transformers
transformers.__version__, accelerate.__version__

from transformers import GPT2TokenizerFast, GPT2LMHeadModel

from transformers import Trainer, TrainingArguments

from tqdm.auto import tqdm

import pandas as pd
import numpy as np

### **Tokenization and Model Initialization**

In [None]:
# Identifier of the pretrained checkpoint that both the tokenizer and the model are loaded from.
model_name = 'gpt2'

In [None]:
# Directory used for the saved tokenizer files and as the Trainer's output_dir.
model_save_path = './khaanaGPT'

In [None]:
# Load the pretrained tokenizer and register the start/end/unknown/pad markers
# used by the recipe template.
# NOTE(review): later cells hard-code the pad id 50259, so keep these keyword
# arguments unchanged unless those cells are updated too.
tokenizer = GPT2TokenizerFast.from_pretrained(model_name,
                                              bos_token='<|startoftext|>',
                                              eos_token='<|endoftext|>',
                                              unk_token='<|unknown|>',
                                              pad_token='<|pad|>'
                                             )
model = GPT2LMHeadModel.from_pretrained(model_name)
# Resize the embedding matrix to one row per tokenizer entry so the newly
# added special tokens have embeddings.
model.resize_token_embeddings(len(tokenizer))

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Embedding(50260, 768)

In [None]:
# Write the tokenizer files (including the added special tokens) into the save directory.
tokenizer.save_pretrained(model_save_path)

('./khaanaGPT/tokenizer_config.json',
 './khaanaGPT/special_tokens_map.json',
 './khaanaGPT/vocab.json',
 './khaanaGPT/merges.txt',
 './khaanaGPT/added_tokens.json',
 './khaanaGPT/tokenizer.json')

In [None]:
# Look up the id assigned to the '<|pad|>' token.
tokenizer.convert_tokens_to_ids(['<|pad|>'])

[50259]

In [None]:
def generate(prompt, max_length=256):
    """Sample a continuation of ``prompt`` from ``model`` and print the decoded text.

    Args:
        prompt: Text the generation is conditioned on.
        max_length: Value forwarded to ``model.generate`` as ``max_length``.
            Defaults to 256, the value previously hard-coded here.

    Returns:
        None. The decoded sequence (special tokens included) is printed.
    """
    # Move the encoded prompt to the model's device so this also works when
    # the model lives on a GPU.
    inputs = tokenizer.encode_plus(prompt, return_tensors='pt').to(model.device)
    output = model.generate(**inputs,
                            max_length=max_length,
                            do_sample=True,
                            # Ask the tokenizer for the pad id instead of hard-coding 50259.
                            pad_token_id=tokenizer.pad_token_id)
    print(tokenizer.decode(output[0]))

In [None]:
# Display the special tokens currently configured on the tokenizer.
tokenizer.special_tokens_map

Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.


{'bos_token': '<|startoftext|>',
 'eos_token': '<|endoftext|>',
 'unk_token': '<|unknown|>',
 'pad_token': '<|pad|>'}

In [None]:
# Look up the id assigned to the '<|startoftext|>' token.
tokenizer.convert_tokens_to_ids(['<|startoftext|>'])

[50257]

### **Loading the Dataset and Preprocessing**

In [None]:
# Load the recipes and shuffle the rows so the positional train/validation
# split further down is random. The fixed seed makes the shuffle (and thus the
# split) reproducible across kernel restarts.
clean = (
    pd.read_csv('Cleaned_Indian_Food_Dataset.csv')
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

### **Recipe Formatting and Concatenation**

In [None]:
def print_recipe(idx):
    """Print the ingredients and instructions of the dataset recipe at position ``idx``.

    Args:
        idx: Zero-based row position in the ``clean`` DataFrame.

    Raises:
        IndexError: If ``idx`` is outside the bounds of ``clean``.
    """
    recipe = clean.iloc[idx]
    print(f"Ingredients:\n{recipe['TranslatedIngredients']}\n\nInstructions:\n{recipe['TranslatedInstructions']}\n")

In [None]:
def form_string(ingredient, instruction):
    """Render one recipe in the training template.

    Both arguments are stripped of surrounding whitespace and placed under the
    "Ingredients:" and "Instructions:" headers, wrapped in the start-of-text
    and end-of-text markers.

    Args:
        ingredient: Ingredient text of the recipe.
        instruction: Instruction text of the recipe.

    Returns:
        The formatted recipe string.
    """
    ingredient_text = ingredient.strip()
    instruction_text = instruction.strip()
    return f"<|startoftext|>Ingredients:\n{ingredient_text}\n\nInstructions:\n{instruction_text}<|endoftext|>"

In [None]:
# Build one training string per recipe. The columns are passed by keyword so
# the ingredients column fills the "Ingredients:" section and the instructions
# column fills the "Instructions:" section (they were previously swapped).
data = clean.apply(
    lambda row: form_string(ingredient=row['TranslatedIngredients'],
                            instruction=row['TranslatedInstructions']),
    axis=1,
).to_list()

### **Dataset split for training and validation**

In [None]:
# Positional split: the leading 85% of the samples are used for training and
# the remainder for validation.
train_size = 0.85
train_len = int(len(data) * train_size)
train_data, val_data = data[:train_len], data[train_len:]

### **Custom Dataset Creation for training**

In [None]:
class RecipeDataset:
    """Pre-tokenized recipe strings exposed through ``len()`` and indexing.

    Every string is tokenized eagerly in the constructor with the module-level
    ``tokenizer`` and truncated/padded to exactly ``max_length`` tokens.

    Args:
        data: Sequence of formatted recipe strings.
        max_length: Fixed token length of every encoded sample. Defaults to
            1024, the value previously hard-coded here.
    """

    def __init__(self, data, max_length=1024):
        self.data = data
        self.max_length = max_length
        self.input_ids = []
        self.attn_masks = []

        for item in tqdm(data):
            encodings = tokenizer.encode_plus(item,
                                              truncation=True,
                                              padding='max_length',
                                              max_length=max_length,
                                              return_tensors='pt'
                                             )
            # return_tensors='pt' adds a leading axis of size 1; drop it so
            # each stored sample is a 1-D tensor.
            self.input_ids.append(torch.squeeze(encodings['input_ids'], 0))
            self.attn_masks.append(torch.squeeze(encodings['attention_mask'], 0))

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(input_ids, attention_mask)`` pair for sample ``idx``."""
        return self.input_ids[idx], self.attn_masks[idx]

In [None]:
def collate_fn(batch):
    """Stack ``(input_ids, attention_mask)`` pairs into a model-ready batch.

    Args:
        batch: Sequence of ``(input_ids, attention_mask)`` tensor pairs of
            equal length, as returned by ``RecipeDataset.__getitem__``.

    Returns:
        Dict with ``input_ids``, ``attention_mask`` and ``labels``. ``labels``
        is a copy of ``input_ids`` in which every position whose attention
        mask is 0 (padding) is replaced by -100, so that padding does not
        contribute to the language-modeling loss.
    """
    input_ids = torch.stack([item[0] for item in batch])
    attention_mask = torch.stack([item[1] for item in batch])
    # masked_fill returns a new tensor, so input_ids itself is left untouched.
    labels = input_ids.masked_fill(attention_mask == 0, -100)
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels
    }

In [None]:
# Tokenize both splits up front; each constructor call iterates its data under a tqdm progress bar.
train_ds = RecipeDataset(train_data)
val_ds = RecipeDataset(val_data)

  0%|          | 0/339 [00:00<?, ?it/s]

  0%|          | 0/60 [00:00<?, ?it/s]

### **Training Configuration**

In [None]:
# Fine-tuning settings: 3 epochs, batches of 2 per device with gradients
# accumulated over 2 steps, no external reporting and no checkpoint saving.
args = TrainingArguments(
    output_dir=model_save_path,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=2,
    save_strategy='no',
    report_to='none',
)

In [None]:
# Optimizer and learning-rate scheduler that are passed to the Trainer explicitly.
optim = torch.optim.AdamW(params=model.parameters(), lr=5e-5)
scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer=optim,
    T_0=20,
    eta_min=1e-7,
)

In [None]:
# Placeholder guard: length of my_variable, with None treated as length 0.
my_variable = None
length = 0 if my_variable is None else len(my_variable)


### **Model Training**

In [None]:
# Wire the model, training arguments, datasets, collator and the custom
# optimizer/scheduler pair into a Trainer.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=collate_fn,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    optimizers=(optim, scheduler),
)

In [None]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Step,Training Loss


TrainOutput(global_step=255, training_loss=1.3587970808440564, metrics={'train_runtime': 343.0826, 'train_samples_per_second': 2.964, 'train_steps_per_second': 0.743, 'total_flos': 531467993088000.0, 'train_loss': 1.3587970808440564, 'epoch': 3.0})

In [None]:
# Persist the fine-tuned model; no path is given here.
# NOTE(review): presumably this writes to args.output_dir ('./khaanaGPT'), which the pipeline cell loads from — confirm.
trainer.save_model()

In [None]:
from transformers import pipeline

In [None]:
# Load the fine-tuned model and its tokenizer back from the save directory.
# model_save_path is reused so the location is defined in exactly one place.
generator = pipeline(task='text-generation', model=model_save_path)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### **Text Generation**

In [None]:
def create_prompt(ingredients):
    """Turn a comma-separated ingredient string into a generation prompt.

    Each comma-separated entry is stripped and lower-cased, and the entries
    are written one per line under the "Ingredients:" header, preceded by the
    start-of-text marker.

    Args:
        ingredients: Comma-separated ingredient string.

    Returns:
        The prompt string, ending with a newline after the last ingredient.
    """
    entries = [piece.strip().lower() for piece in ingredients.split(',')]
    listing = '\n'.join(entries)
    return f"<|startoftext|>Ingredients:\n{listing}\n"

In [None]:
# Each list element is one comma-separated ingredient string; the generation
# loop builds one prompt per element.
ingredients = ['1/2 teaspoon Turmeric powder (Haldi),1 tablespoon Coriander (Dhania) Seeds,4 Dry Red Chillies,1 teaspoon Sesame (Gingelly) Oil,4 cloves Garlic,1 teaspoon Garam masala powder,Salt - to taste,2 tablespoon Sesame (Gingelly) Oil,4 Green Chillies - slit,500 grams Chicken,6 cloves Garlic - finely chopped,1 teaspoon Fennel seeds (Saunf),2 Onion - chopped,2 cups Sorrel Leaves (Gongura) - picked and chopped,1/4 teaspoon Methi Seeds (Fenugreek Seeds),1 inch Ginger - finely chopped,1 Tomato - chopped']

### **Generating Recipes**

In [None]:
# Generate and print one recipe per ingredient string.
for ing in ingredients:
    prompt = create_prompt(ing)
    result = generator(prompt,
                       max_new_tokens=512,
                       penalty_alpha=0.6,
                       top_k=4,
                       # Take the pad id from the pipeline's own tokenizer
                       # instead of hard-coding 50259.
                       pad_token_id=generator.tokenizer.pad_token_id)
    print(result[0]['generated_text'])
def generate_instructions(ingredients):
    """Generate cooking instructions for a list of ingredients.

    Args:
        ingredients: Iterable of ingredient strings; they are joined with
            ", " to build the prompt.

    Returns:
        The generated text that follows the first "Instructions:" marker,
        stripped of surrounding whitespace.
    """
    # Create a prompt with ingredients
    prompt = f"Ingredients:\n{', '.join(ingredients)}\n\nInstructions:"
    # Run the fine-tuned pipeline with the same decoding settings as the
    # recipe-generation loop; this call was missing, which left
    # generated_text undefined.
    generated_text = generator(prompt,
                               max_new_tokens=512,
                               penalty_alpha=0.6,
                               top_k=4,
                               pad_token_id=generator.tokenizer.pad_token_id)
    generated_instructions = generated_text[0]['generated_text'].split('Instructions:', 1)[-1].strip()
    return generated_instructions


<|startoftext|>Ingredients:
1/2 teaspoon turmeric powder (haldi)
1 tablespoon coriander (dhania) seeds
4 dry red chillies
1 teaspoon sesame (gingelly) oil
4 cloves garlic
1 teaspoon garam masala powder
salt - to taste
2 tablespoon sesame (gingelly) oil
4 green chillies - slit
500 grams chicken
6 cloves garlic - finely chopped
1 teaspoon fennel seeds (saunf)
2 onion - chopped
2 cups sorrel leaves (gongura) - picked and chopped
1/4 teaspoon methi seeds (fenugreek seeds)
1 inch ginger - finely chopped
1 tomato - chopped
Salt - to taste
To make the Masala Powder, wash the chicken in a pan.
In a small saucepan, add the turmeric powder and coriander and cook until the turmeric powder starts to boil.
Once the turmeric powder is boiling, remove from the heat and let it cook for a minute.
Once the turmeric powder has boiled, add the chicken and cook until it turns translucent from the chicken.
Add the sesame oil and cook till the sesame turns translucent and soft.
Once cooked, drain off excess 