# Generating recipies with character-level RNN

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
import pandas as pd
import numpy as np
from pathlib import Path
import torch
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
import random

## Dataset

Get the data from here http://www.ffts.com/recipes/lg/lg32965.zip and unzip it into data directory.

In [265]:
PATH = Path("data")
files = [f for f in PATH.iterdir() if str(f).endswith('.mmf')]
files[:5]

[PosixPath('data/1000.mmf'),
 PosixPath('data/18000.mmf'),
 PosixPath('data/14000.mmf'),
 PosixPath('data/22000.mmf'),
 PosixPath('data/30000.mmf')]

In [24]:
len(files)

33

In [3]:
! head data/1000.mmf

MMMMM----- Recipe via Meal-Master (tm) v8.05
 
      Title: "BE MINE" LOLLIPOPS
 Categories: Candies, Valentine
      Yield: 8 Servings
 
           Text only
 
  Source: Better Homes and Gardens, Febuary 1998 Prep time: 20 minutes
  Cook: 6 to 8 minutes


### Processing data

In [262]:
import unidecode
def read_file(path):
    with open(path, encoding="utf8", errors='ignore') as f:
        content = unidecode.unidecode(f.read())
    return content

In [263]:
content = read_file("data/1000.mmf")
content2 = read_file("data/18000.mmf")

In [261]:
n + n2

2330591

In [260]:
cc = content2 + content
len(cc)

2330591

In [72]:
content[:1000]

'MMMMM----- Recipe via Meal-Master (tm) v8.05\n \n      Title: "BE MINE" LOLLIPOPS\n Categories: Candies, Valentine\n      Yield: 8 Servings\n \n           Text only\n \n  Source: Better Homes and Gardens, Febuary 1998 Prep time: 20 minutes\n  Cook: 6 to 8 minutes\n  \n  2 1/2 to 3 1/2-inch round or heart-shaped metal cookie cutters 8 oz.\n  assorted red, pink, and/or clearhard candies 35 to 60 (2 to 3 oz.)\n  assorted small decorative candies, such as red cinnamon candies,\n  small nonpareils, colored candy hearts, spice drops, and gumdrops\n  Edible rose petals or other flower petals (optional) Lollipop sticks\n  \n  Place unwrapped hard candies in a heavy plastic bag, then place bag\n  on top of folded towel and crush candies into small chunks wiht meat\n  mallet or small hammer.\n  \n  Make only three or four lollipops at one time.  Line a baking sheet\n  with foil.  Place desired cookie cutters on foil, at least 2 inches\n  apart. Divide crushed candies evenly among cutters, appro

In [73]:
n

1226806

In [74]:
import string
all_characters = string.printable

In [75]:
all_characters

'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~ \t\n\r\x0b\x0c'

In [204]:
n_characters = len(all_characters)
n_characters

100

In [77]:
# easy way to encode characters
np.array([all_characters.index(c) for c in content[:100]])

array([48, 48, 48, 48, 48, 74, 74, 74, 74, 74, 94, 53, 14, 12, 18, 25, 14,
       94, 31, 18, 10, 94, 48, 14, 10, 21, 74, 48, 10, 28, 29, 14, 27, 94,
       69, 29, 22, 70, 94, 31,  8, 75,  0,  5, 96, 94, 96, 94, 94, 94, 94,
       94, 94, 55, 18, 29, 21, 14, 77, 94, 63, 37, 40, 94, 48, 44, 49, 40,
       63, 94, 47, 50, 47, 47, 44, 51, 50, 51, 54, 96, 94, 38, 10, 29, 14,
       16, 24, 27, 18, 14, 28, 77, 94, 38, 10, 23, 13, 18, 14, 28])

In [78]:
seq_len=1000
contents = [content[i*seq_len:(i+1)*seq_len] for i in range(n//seq_len)]

In [79]:
len(contents)

1226

In [80]:
len(contents)*seq_len

1226000

# Dataset

In [97]:
all_characters.index(" ")

94

In [98]:
def char_encoding(c):
    if c in all_characters:
        return all_characters.index(c)
    else:
        return all_characters.index(" ")

In [264]:
 class RecipeDataset(Dataset):
    def __init__(self, path="data/1000.mmf", seq_len=1000):
        content = read_file(path) 
        n = len(content)
        self.contents = [content[i*seq_len:(i+1)*seq_len] for i in range(n//seq_len)]
        # shift by 1
        self.ys = [content[i*seq_len + 1 :(i+1)*seq_len +1] for i in range(n//seq_len)]
        
    def __len__(self):
        return len(self.contents)
    
    def __getitem__(self, idx):
        x = self.contents[idx]
        y = self.ys[idx]
        x = np.array([char_encoding(c) for c in x])
        y = np.array([char_encoding(c) for c in y])
        return x, y

In [266]:
train_ds = RecipeDataset(files, seq_len=30)

In [267]:
x, y = train_ds[0]

In [268]:
x, y

(array([48, 48, 48, 48, 48, 74, 74, 74, 74, 74, 94, 53, 14, 12, 18, 25, 14,
        94, 31, 18, 10, 94, 48, 14, 10, 21, 74, 48, 10, 28]),
 array([48, 48, 48, 48, 74, 74, 74, 74, 74, 94, 53, 14, 12, 18, 25, 14, 94,
        31, 18, 10, 94, 48, 14, 10, 21, 74, 48, 10, 28, 29]))

In [269]:
# you can start by just sending one file
train_ds = RecipeDataset(files, seq_len=10000)

In [279]:
len(train_ds)

3810

In [281]:
batch_size = 100
train_dl = DataLoader(train_ds, batch_size=batch_size)

## Model with one-hot encoding input

In [271]:
class RecipeRNN(nn.Module):
    def __init__(self, emb_size, hidden_size, vocab_size):
        super(RecipeRNN, self).__init__()

        self.hidden_size = hidden_size
        # code here

    def forward(self, x, hidden):
        #code here
        return output, hidden

    def initHidden(self, bash_size):
        return torch.zeros(bash_size, self.hidden_size)

## Debugging model

In [272]:
vocab_size = len(all_characters)
emb_size = 50
hidden_size = 150
model = RecipeRNN(emb_size, hidden_size, vocab_size)

In [282]:
x, y = next(iter(train_dl))

In [283]:
x.shape, y.shape

(torch.Size([100, 10000]), torch.Size([100, 10000]))

In [284]:
h = model.initHidden(100)

In [285]:
loss = 0
for i in range(x.shape[1]):
    y_t, h = model(x[:,i].long(), h)
    loss += F.cross_entropy(y_t, y[:, i].long())
loss /= x.shape[1]

In [286]:
# note that just the last x_t is used in the loss
# update
loss.item()

4.600766658782959

## Training

In [287]:
vocab_size = len(all_characters)
emb_size = 50
hidden_size = 150
model = RecipeRNN(emb_size, hidden_size, vocab_size)

In [288]:
def get_optimizer(model, lr = 0.01, wd = 0.00001):
    optim = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=wd)
    return optim

In [289]:
def train(model, optim, train_dl):
    model.train()
    total = 0
    sum_loss = 0
    for x, y in train_dl:
        batch = x.shape[0]
        h = model.initHidden(batch)
        loss = 0
        x = x.long()
        y = y.long()
        for i in range(x.shape[1]):
            out, h = model(x[:,i], h)
            loss += F.cross_entropy(out, y[:, i])
        loss /= x.shape[1]
        
        optim.zero_grad()
        loss.backward()
        optim.step()
        total += batch
        sum_loss += batch*(loss.item())
    return sum_loss/total

In [290]:
def train_loop(model, lr, train_dl, epochs=20):
    optim = get_optimizer(model, lr =lr, wd = 0.0)
    for i in range(epochs):
        loss = train(model, optim, train_dl)
        if i%5 == 1: print("train loss %.3f" % (loss))

In [293]:
vocab_size = len(all_characters)
emb_size = 50
hidden_size = 150
model = RecipeRNN(emb_size, hidden_size, vocab_size)

In [None]:
train_loop(model, 0.01, train_dl,  epochs=20)

train loss 1.790
train loss 1.297


In [252]:
train_loop(model, 0.001, train_dl, epochs=40)

train loss 1.206
train loss 1.202
train loss 1.201
train loss 1.200
train loss 1.199
train loss 1.198
train loss 1.197
train loss 1.197


## Generating recipies

In [240]:
hidden = model.initHidden(1)
inp = torch.LongTensor([all_characters.index("M")])

In [241]:
inp.shape, hidden.shape

(torch.Size([1]), torch.Size([1, 150]))

In [242]:
model.emb(inp).shape

torch.Size([1, 50])

In [243]:
output, hidden = model(inp, hidden)
output.shape

torch.Size([1, 100])

In [244]:
temperature=0.8
output_dist = output.data.view(-1).div(temperature).exp()
output_dist

tensor([4.5325e-03, 5.4097e+01, 3.6071e+00, 4.1176e+00, 1.2773e+01, 1.2075e-01,
        9.5394e+01, 5.4767e+00, 1.8936e+00, 3.1331e+00, 1.0619e+02, 1.5974e+01,
        3.9435e-04, 4.1891e-03, 2.9012e+02, 6.6120e-02, 3.2566e+00, 2.0937e-04,
        8.7681e-01, 2.4478e-04, 9.8413e-06, 9.7300e-03, 1.1631e+00, 8.5289e-04,
        2.1804e+02, 1.2005e-03, 1.4019e-02, 4.6674e-02, 1.3489e-01, 9.1231e-07,
        5.2851e-01, 4.6521e-03, 4.0609e-02, 8.0901e-02, 6.2744e-02, 9.8108e-03,
        1.3753e+02, 7.2103e+00, 1.2496e+01, 2.2803e+01, 2.0441e+02, 2.1760e+00,
        1.7172e+00, 4.8473e+00, 8.6792e+01, 2.7507e+00, 1.7293e+00, 1.3518e-01,
        9.7000e+04, 2.3831e+00, 5.6466e+03, 2.5048e+01, 1.7197e+00, 1.2665e+00,
        1.0054e+00, 1.4442e+00, 1.7010e+02, 2.3135e+00, 5.6544e+01, 1.9075e-01,
        5.3645e+00, 1.4553e-02, 5.1243e+00, 7.8919e-01, 9.4639e-01, 3.0532e-01,
        1.5756e-01, 1.3458e+00, 4.4846e-01, 3.6145e-02, 1.0283e+02, 4.7754e-01,
        3.0641e-01, 8.7199e+00, 4.7178e+

In [245]:
top_i = torch.multinomial(output_dist, 1)
top_i

tensor([48])

In [246]:
predicted_char = all_characters[top_i]
predicted_char

'M'

In [247]:
def generate(model, predict_len=1000, temperature=0.8):
    hidden = model.initHidden(1)
    inp = torch.LongTensor([all_characters.index("M")])
    predicted = "M"
    for p in range(predict_len):
        output, hidden = model(inp, hidden)
        
        # Sample from the network as a multinomial distribution
        output_dist = output.data.view(-1).div(temperature).exp()
        top_i = torch.multinomial(output_dist, 1)

        # Add predicted character to string and use as next input
        predicted_char = all_characters[top_i[0]]
        predicted += predicted_char
        inp = top_i
    return predicted

In [253]:
predicted = generate(model, predict_len=1000, temperature=0.8)
print(predicted)

MMMMCOLLIGATE CAKE
 Categories: Meats, December or for the bar (3) desired format egg and simmer 2 minutes oa the in the powder
      1 ts Butter, sliced
      1 c  Straw
      1 c  Water
      1    Lemon chopped
           Chopped
      1 tb Salt
  cover container wine of heat, peppers, Indiesy. Time on the
  spices
      1 c  Fine
      1 c  Milk
    3/4 c  Sugar
    1/2 c  Floruce cheese, minced
      1 ts Salt
        2 quse 1 brots to the to dough in egg bowl, way beas mixture in water in the dough is frying
  serving seeds, Sauces to think
      1    Egg
      2 tb Hoy tablespoon and remaining whipped almond cheese the mago=; Mix to eashrone; garlic
  or one quarted toie vegetables, cool balls are skillet cheese
      2 tb Soy suttie and peel ground and stepping until with the boil. Heat the eggs each.  Makes of a can necessary sauce, turnuts & 4 sarbogeter soak
           - juice, heaviamin
           - "Demoni be unbagreming that that the
  lemon juice
    1/4 c  Butter
      1

In [254]:
predicted = generate(model, predict_len=1000, temperature=0.8)
print(predicted)

MMMMM.7ASSER TRED ON THE SALAD
 Categories: Chili peeles into desired punt out granulaned holiday
  from this corn it dish. Rollet. **TOma. [Cook and tossive out the boiling water incorpos of day almond extract
  1 1/2 ts Sodium-Mast,
        wittom and baking sheets with microwaves
    1/4 c  Crushed and about 2 minutes. Serve.  Saute. Bake over mucksent almond combined Vinegar
      1 ts Peel
           Seasone letts a pasta
      2 c  An the pan. Mix to boiler and seal onions and spices, green notes
  cookies brown,
  skin. Top will pans
           Garlic sue befots per gring mixture.
  
  Cook and garlic, Indian center hands of the ingredients in for the 17 dboped
  or raw adding.
  
  Diced and pan and casted ex3, or until ozzlers are sanda sliced sauce in more the paste very spread beef beef some the cheese and almond extract
        2    Egg Nancepted and chopped hot soy.  Mix the dough of a cookie sheet of the remaining conbeding your shake blumbles
      3 tb Chili
      Yield

## Lab
** Write the model
** Train and generate recipes
** Modify the dataset to include more data 

# References
The generator is modified from this https://github.com/spro/char-rnn.pytorch/blob/282bcb6b15ab3929d6a588b455cfc0f19f32add4/generate.py