In [1]:
!pip install transformers==4.5.1
from transformers import AutoTokenizer 

Collecting transformers==4.5.1
  Downloading transformers-4.5.1-py3-none-any.whl (2.1 MB)
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.12.5
    Uninstalling transformers-4.12.5:
      Successfully uninstalled transformers-4.12.5
Successfully installed transformers-4.5.1


In [2]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns


## Understanding the data


In [3]:
data= json.load(open('recipes_raw_nosource_ar.json','r'))

In [4]:
keys = list(data.keys()) 
data[keys[0]]

{'title': 'Slow Cooker Chicken and Dumplings',
 'ingredients': ['4 skinless, boneless chicken breast halves ADVERTISEMENT',
  '2 tablespoons butter ADVERTISEMENT',
  '2 (10.75 ounce) cans condensed cream of chicken soup ADVERTISEMENT',
  '1 onion, finely diced ADVERTISEMENT',
  '2 (10 ounce) packages refrigerated biscuit dough, torn into pieces ADVERTISEMENT',
  'ADVERTISEMENT'],
 'instructions': 'Place the chicken, butter, soup, and onion in a slow cooker, and fill with enough water to cover.\nCover, and cook for 5 to 6 hours on High. About 30 minutes before serving, place the torn biscuit dough in the slow cooker. Cook until the dough is no longer raw in the center.\n',
 'picture_link': '55lznCYBbs2mT8BTx6BTkLhynGHzM.S'}

In [5]:
recipe_sources = ['ar','epi','fn']

In [6]:
df = pd.DataFrame()
sources,titles,ingredients,instructions=[],[],[],[]
for recipe_source in recipe_sources:
  data= json.load(
        open(f'recipes_raw_nosource_{recipe_source}.json','r'))
  for _,recipe in data.items():
    if ('title' in recipe) and ('ingredients' in recipe) and ('instructions' in recipe):
      sources.append(recipe_source)
      titles.append(recipe['title'])
      ingredients.append([ingredient.replace
                        ('ADVERTISEMENT','') for ingredient in recipe['ingredients']])
      instructions.append(str(recipe['instructions']).replace('ADVERTISEMENT','').replace('\n',''))
df['source']= sources  
df['title']=titles
df['ingredients']=ingredients
df['instructions']=instructions

In [8]:
import pickle
df.to_pickle('temp.csv')

In [9]:

df['ingredient_count']= df['ingredients'].str.len().fillna(0).astype(int)
df['instruction_length']= df['instructions'].str.split().str.len().fillna(0).astype(int)




In [10]:
df.head()

Unnamed: 0,source,title,ingredients,instructions,ingredient_count,instruction_length
0,ar,Slow Cooker Chicken and Dumplings,"[4 skinless, boneless chicken breast halves , ...","Place the chicken, butter, soup, and onion in ...",6,52
1,ar,Awesome Slow Cooker Pot Roast,[2 (10.75 ounce) cans condensed cream of mushr...,"In a slow cooker, mix cream of mushroom soup, ...",5,43
2,ar,Brown Sugar Meatloaf,"[1/2 cup packed brown sugar , 1/2 cup ketchup ...",Preheat oven to 350 degrees F (175 degrees C)....,11,64
3,ar,Best Chocolate Chip Cookies,"[1 cup butter, softened , 1 cup white sugar , ...",Preheat oven to 350 degrees F (175 degrees C)....,12,72
4,ar,Homemade Mac and Cheese Casserole,"[8 ounces whole wheat rotini pasta , 3 cups fr...",Preheat oven to 350 degrees F. Line a 2-quart ...,14,171


In [11]:
df.to_pickle('df_recipes_raw.pkl')

## Preprocessing

In [12]:
import pandas as pd

In [13]:
df= pd.read_pickle('df_recipes_raw.pkl')
special_token = '<|endoftext|>'


In [14]:
print(df.shape)

(124647, 6)


In [15]:

df =df[(df.ingredient_count>=3) & (df.instruction_length>=10)]
print(df.shape)

(121774, 6)


In [16]:
# ingredients:/n Instructions : /n special_token
df['combined']=' \n Ingredients: \n' + df.ingredients.str.join('\n') + ' \n Instructions: \n' + df.instructions + special_token 

In [17]:
df.iloc[100].ingredients

['1/2 cup butter, melted ',
 '2 eggs, beaten ',
 '1 (8.5 ounce) package dry corn bread mix ',
 '1 (15 ounce) can whole kernel corn, drained ',
 '1 (14.75 ounce) can creamed corn ',
 '1 cup sour cream ',
 '']

In [18]:
df.iloc[100].instructions

'Preheat oven to 350 degrees F (175 degrees C), and lightly grease a 9x9 inch baking dish.In a medium bowl, combine butter, eggs, corn bread mix, whole and creamed corn and sour cream. Spoon mixture into prepared dish.Bake for 45 minutes in the preheated oven, or until the top is golden brown.'

In [19]:
print(df.iloc[100].combined)

 
 Ingredients: 
1/2 cup butter, melted 
2 eggs, beaten 
1 (8.5 ounce) package dry corn bread mix 
1 (15 ounce) can whole kernel corn, drained 
1 (14.75 ounce) can creamed corn 
1 cup sour cream 
 
 Instructions: 
Preheat oven to 350 degrees F (175 degrees C), and lightly grease a 9x9 inch baking dish.In a medium bowl, combine butter, eggs, corn bread mix, whole and creamed corn and sour cream. Spoon mixture into prepared dish.Bake for 45 minutes in the preheated oven, or until the top is golden brown.<|endoftext|>


In [20]:
df.to_pickle('df_recipes_preprocessed.pkl')

In [21]:
dataset_train = df[:120000].combined.values
dataset_val = df[120000:].combined.values

In [23]:
with open('dataset_train.txt','w', encoding="utf-8") as f:
  f.write(' \n'.join(dataset_train))

with open('dataset_val.txt','w', encoding="utf-8") as f:
  f.write(' \n'.join(dataset_val))



In [24]:
from transformers import AutoModel

In [25]:
tokenizer = AutoTokenizer.from_pretrained('distilgpt2')

Downloading:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [26]:
from transformers import AutoModelForCausalLM


In [27]:
model = AutoModelForCausalLM.from_pretrained('distilgpt2')

Downloading:   0%|          | 0.00/353M [00:00<?, ?B/s]

In [28]:
prompt_text= 'yesterday I went to the shop and '
encoded_prompt=tokenizer.encode(
    prompt_text,
    add_special_tokens=False,
    return_tensors='pt'

)
encoded_prompt

tensor([[8505, 6432,  314, 1816,  284,  262, 6128,  290,  220]])

In [29]:
output_sequences = model.generate(
    input_ids=encoded_prompt,
    max_length=700,
    temperature=0.9,
    top_k=20,
    top_p=0.9,
    repetition_penalty=1,
    do_sample=True,
    num_return_sequence=1,
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [30]:
tokenizer.decode(output_sequences[0])

"yesterday I went to the shop and iced it with a bunch of coffee. I didn't know where to find it, but I didn't know where to get it. I had to find my home and I was so excited.\nThis recipe was made with the help of the friends who were there. I hope that you will too! I am so happy with it and am so happy to share this recipe with you!\nIngredients 1/2 cup butter\n4 cups cocoa powder\n1/4 cup sugar\n2 tablespoons butter\n2 cups sugar\n2 tablespoons coconut milk\n2 tablespoons butter\n2 cups baking powder\n1/4 cup milk\n1/4 cup flour\n1/4 cup unsweetened cocoa powder\n1/4 cup unsweetened cocoa powder\n1/4 cup milk\n1/4 cup baking powder\n1/4 cup butter\n1/4 cup cocoa powder\n2 tablespoons sugar\n1/4 cup milk\n1/4 cup milk\n1/4 cup butter\n1/4 cup milk\n1/4 cup milk\n1/4 cup milk\n1/4 cup milk\n1/4 cup milk\n1/4 cup milk\n1/4 cup milk\n1/4 cup milk\n1/4 cup milk\n1/4 cup milk\n1/4 cup milk\n1/4 cup milk\n1/4 cup milk\n1/4 cup milk\n1/4 cup milk\n1/4 cup milk\n1/4 cup milk\n1/4 cup milk\

In [31]:
!pip install datasets



In [32]:

import torch 
print(torch.__version__)

1.10.0


In [33]:
!nvidia-smi

Wed Dec 15 12:03:13 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 496.49       Driver Version: 496.49       CUDA Version: 11.5     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   54C    P8    12W /  N/A |    161MiB /  8192MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [34]:
import os
import time
import datetime
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler

from transformers import GPT2LMHeadModel,  GPT2Tokenizer, GPT2Config, GPT2LMHeadModel
from transformers import AdamW, get_linear_schedule_with_warmup

In [35]:
# Load the GPT tokenizer.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', bos_token='<|startofrecipe|>', eos_token='<|endofrecipe|>', pad_token='<|pad|>')
# add special tokens for title, ingredients and instruction seperator
special_tokens_dict = {'additional_special_tokens': ['<|startofingre|>', '<|startofinstruc|>']}
# check the number of special tokens
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print('We have added', num_added_toks, 'tokens')

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embedding are fine-tuned or trained.


We have added 2 tokens


In [36]:
class Tokenization(Dataset):

  def __init__(self, txt_list, tokenizer, gpt2_type="gpt2", max_length=768):

    self.tokenizer = tokenizer
    self.input_ids = []
    self.attn_masks = []

    for txt in txt_list:

      encodings_dict = tokenizer(txt, truncation=True, max_length=max_length, padding="max_length")

      self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
      self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))
    
  def __len__(self):
    return len(self.input_ids)

  def __getitem__(self, idx):
    return self.input_ids[idx], self.attn_masks[idx]
 


In [37]:
dataset_train = Tokenization(dataset_train, tokenizer, max_length=200)

In [38]:
# Split into training and validation sets
train_size = int(0.9 * len(dataset_train))
val_size = len(dataset_train) - train_size

train_dataset, val_dataset = random_split(dataset_train, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

108,000 training samples
12,000 validation samples


In [39]:
batch_size = 3

# Create the DataLoaders for our training and validation datasets.
# We'll take training samples in random order. 
train_dataloader = DataLoader(
            dataset_train,  # The training samples.
            sampler = RandomSampler(dataset_train), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            dataset_val, # The validation samples.
            sampler = SequentialSampler(dataset_val), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )



In [40]:
import random
import numpy as np
# I'm not really doing anything with the config buheret
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

# instantiate the model
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)

# this step is necessary because I've added some tokens (bos_token, etc) to the embeddings
# otherwise the tokenizer and model tensors won't match up
model.resize_token_embeddings(len(tokenizer))

# Tell pytorch to run this model on the GPU.
device = torch.device("cuda")

# Set the seed value all over the place to make this reproducible.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)


Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [41]:
epochs = 3
learning_rate = 5e-4
warmup_steps = 1e2
epsilon = 1e-8

# this produces sample output every 100 steps
sample_every = 1000
# I save the model every 5000 step
save_every = 5000
# save the model to this file name
save_file = 'trial_2'


In [42]:
# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
optimizer = AdamW(model.parameters(),
                  lr = learning_rate,
                  eps = epsilon
                )


In [43]:
# Total number of training steps is [number of batches] x [number of epochs]. 
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs
print('Total number of steps: ', total_steps)
# Create the learning rate scheduler.
# This changes the learning rate as the training loop progresses
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = warmup_steps, 
                                            num_training_steps = total_steps)

Total number of steps:  120000


In [44]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cuda


In [45]:
training_stats = []
print("Currently using device type: ", device)

model = model.to(device)

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================

    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    losses = []

    total_train_loss = 0

    model.train()

    for step, batch in enumerate(train_dataloader):

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        model.zero_grad()        

        outputs = model(  b_input_ids,
                          labels=b_labels, 
                          attention_mask =b_masks,
                          token_type_ids=None
                        )

        loss = outputs[0]  

        batch_loss = loss.item()
        total_train_loss += batch_loss
        losses.append(batch_loss)
        
    
    # Calculate perplexity.
    losses = torch.tensor(losses)
    #val_perplexity = math.exp(torch.mean(losses))

    #print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    #print("  Validation perplexity: {0:.2f}".format(val_perplexity))
    # Record all statistics from this epoch.
    #training_stats.append(
     #   {
      #      'epoch': epoch_i + 1,
       #     'Training Loss': avg_train_loss,
        #    'Valid. Loss': avg_val_loss,
         #   'Training Perplexity': train_perplexity,
          #  'Valid. Perplexity': val_perplexity,
     #   }
    #)

#print("")
print("Training complete!")


Currently using device type:  cuda

Training...

Training...

Training...
Training complete!


In [46]:
model.save_pretrained(save_file)

In [47]:
# Evaluate the test data

# prepare datasets for dev_list and test_list
dataset_test = Tokenization(dataset_val, tokenizer, max_length=768)

# load the datasets
test_dataloader = DataLoader(
            dataset_test, # The validation samples.
            sampler = SequentialSampler(dataset_test), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

In [48]:
import math
def evaluate_model(model, dataloaded):
    model = model.to(device)
    model.eval()

    losses = []
    perplexity = []
    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in dataloaded:

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        with torch.no_grad():        

            outputs  = model(b_input_ids, 
    #                            token_type_ids=None, 
                            attention_mask = b_masks,
                            labels=b_labels)

            loss = outputs[0]  

        batch_loss = loss.item()
        losses.append(batch_loss)
        total_eval_loss += batch_loss        

    avg_val_loss = total_eval_loss / len(dataloaded)

    # Calculate perplexity.
    losses = torch.tensor(losses)
    val_perplexity = math.exp(torch.mean(losses))
    perplexity.append(val_perplexity)

    print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    print("  Validation perplexity: {0:.2f}".format(val_perplexity))
    return avg_val_loss, val_perplexity


In [49]:
import math
print('Testing...')
test_loss, test_perplexity = evaluate_model(model, test_dataloader)
test_eval_df = pd.DataFrame(columns = ["test_loss", "test_perplexity"])
test_eval_df['test_loss'] = test_loss
test_eval_df['test_perplexity'] = test_perplexity
test_eval_df.to_csv("test_eval.csv")


Testing...
  Validation Loss: 43.90
  Validation perplexity: 11633862030592804864.00


In [56]:
# Load pre-trained model

model = GPT2LMHeadModel.from_pretrained("trial_2", config=configuration)


# create initial words for generating recipes

df = pd.read_pickle('df_recipes_preprocessed.pkl')
df.head()

initial_words = []
for text in df['combined']:
    initial_words.append(text.split(" ")[1])
    


In [57]:
generated_recipes = []
for i in range(len(initial_words[:20])):    
    input_ids = tokenizer(initial_words[i], return_tensors='pt').input_ids
    model.to(input_ids.device)
    sample_outputs = model.generate(
                                        input_ids,
                                        num_beams=5, 
                                        no_repeat_ngram_size=2, 
                                        max_length = 200,
                                        num_return_sequences=1,
                                        eos_token_id=tokenizer.eos_token_id
                                    )
    generated_recipes.append(tokenizer.decode(sample_outputs[0]))












Setting `pad_token_id` to `eos_token_id`:50258 for open-end generation.


RuntimeError: cannot reshape tensor of 0 elements into shape [-1, 0] because the unspecified dimension size -1 can be any value and is ambiguous

In [None]:
generated_recipes[:5]