In [1]:
import pandas as pd
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Masking, Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint
import random
import re

In [2]:
df_loaded = pd.read_csv('./clean_data/clean_data.csv')
df_loaded.drop('Unnamed: 0', axis=1, inplace=True)

**Preparation of dataset for ML training**

In [3]:
df_loaded.head()

Unnamed: 0,strDrink,strCategory,strGlass,strIngredients,Alc_type,Basic_taste,strInstructions,strMeasures,Value,MeasureName,Value_numeric,Value_ml,Value_gr,Garnish_amount,Garnish_type,MeasureName_copy
0,'57 Chevy with a White License Plate,Cocktail,Highball glass,Creme De Cacao White,Creamy Liqueur,,1. Fill a rocks glass with ice 2.add white cre...,1 oz white,1,oz,1.0,30.0,,,,white
1,1-900-FUK-MEUP,Shot,Old-fashioned glass,Absolut Kurant,Vodka,,Shake ingredients in a mixing tin filled with ...,1/2 oz,1/2,oz,0.5,15.0,,,,oz
2,110 in the shade,Beer,Beer Glass,Lager,Beer,,Drop shooter in glass. Fill with beer,16 oz,16,oz,16.0,480.0,,,,oz
3,151 Florida Bushwacker,Milk / Float / Shake,Beer mug,Malibu Rum,Rum,,Combine all ingredients. Blend until smooth. G...,1/2 oz,1/2,oz,0.5,15.0,,,,oz
4,155 Belmont,Cocktail,White wine glass,Dark Rum,Rum,,Blend with ice. Serve in a wine glass. Garnish...,1 shot,1,shot,1.0,25.0,,,,shot


In [4]:
df = df_loaded.drop('MeasureName_copy', axis=1)

In [5]:
df.drop(df.loc[(df['Value_ml'].isnull()) & (df['Value_gr'].isnull()) & (df['Garnish_amount'].isnull())].index,
                                                                                                           axis=0, inplace=True)

In [6]:
# Create and fill new column with correct measure names

df['Measure'] = np.nan
df['Measure'] = df['Measure'].astype('object')
df['Measure'] = np.where((~df['Value_ml'].isnull()), 'ml', 'gr')
df['Measure'] = np.where(~df['Garnish_amount'].isnull(), 'garnish', df['Measure'])
df['Measure'].unique()

array(['ml', 'gr', 'garnish'], dtype=object)

In [7]:
# Create and fill new column with correct amounts of each ingredient

df['Volume'] = np.nan
df['Volume'] = df['Volume'].astype('object')
df['Volume'] = np.where(df['Measure'] == 'ml', df['Value_ml'], df['Value_gr'])
df['Volume'] = np.where(df['Measure'] == 'garnish', (df['Garnish_amount']+' '+df['Garnish_type']), df['Volume'])
df.head()

Unnamed: 0,strDrink,strCategory,strGlass,strIngredients,Alc_type,Basic_taste,strInstructions,strMeasures,Value,MeasureName,Value_numeric,Value_ml,Value_gr,Garnish_amount,Garnish_type,Measure,Volume
0,'57 Chevy with a White License Plate,Cocktail,Highball glass,Creme De Cacao White,Creamy Liqueur,,1. Fill a rocks glass with ice 2.add white cre...,1 oz white,1,oz,1.0,30.0,,,,ml,30
1,1-900-FUK-MEUP,Shot,Old-fashioned glass,Absolut Kurant,Vodka,,Shake ingredients in a mixing tin filled with ...,1/2 oz,1/2,oz,0.5,15.0,,,,ml,15
2,110 in the shade,Beer,Beer Glass,Lager,Beer,,Drop shooter in glass. Fill with beer,16 oz,16,oz,16.0,480.0,,,,ml,480
3,151 Florida Bushwacker,Milk / Float / Shake,Beer mug,Malibu Rum,Rum,,Combine all ingredients. Blend until smooth. G...,1/2 oz,1/2,oz,0.5,15.0,,,,ml,15
4,155 Belmont,Cocktail,White wine glass,Dark Rum,Rum,,Blend with ice. Serve in a wine glass. Garnish...,1 shot,1,shot,1.0,25.0,,,,ml,25


In [8]:
# Combine ingredient, volume and measure in one sentence

def combine(line, ingr_col1, ingr_col2=None):
    if ingr_col2:
        lst1 = [line[ingr_col1], line[ingr_col2], line['Volume'], line['Measure']]
    else:
        lst1 = [line[ingr_col1], line['Volume'], line['Measure']]
    lst2 = [str(i) for i in lst1 if str(i) != 'nan']
    return ' '.join(lst2)

    
# combine(df.iloc[1], 'Alc_type')      # test

In [9]:
# One column - alcohol type and basic taste
# Another column - ingredients as they are
# Third column - alcohol type and non-alcoholic ingredient

df['Sentence_type'] = df.apply(lambda x: combine(x, 'Alc_type', 'Basic_taste'), axis=1)
df['Sentence_ingr'] = df.apply(lambda x: combine(x, 'strIngredients'), axis=1)
df['Sentence_type_ingr'] = np.where(df['Alc_type'].isnull(),
                                    df.apply(lambda x: combine(x, 'strIngredients'), axis=1),
                                    df.apply(lambda x: combine(x, 'Alc_type'), axis=1))

In [10]:
df.head()

Unnamed: 0,strDrink,strCategory,strGlass,strIngredients,Alc_type,Basic_taste,strInstructions,strMeasures,Value,MeasureName,Value_numeric,Value_ml,Value_gr,Garnish_amount,Garnish_type,Measure,Volume,Sentence_type,Sentence_ingr,Sentence_type_ingr
0,'57 Chevy with a White License Plate,Cocktail,Highball glass,Creme De Cacao White,Creamy Liqueur,,1. Fill a rocks glass with ice 2.add white cre...,1 oz white,1,oz,1.0,30.0,,,,ml,30,Creamy Liqueur 30.0 ml,Creme De Cacao White 30.0 ml,Creamy Liqueur 30.0 ml
1,1-900-FUK-MEUP,Shot,Old-fashioned glass,Absolut Kurant,Vodka,,Shake ingredients in a mixing tin filled with ...,1/2 oz,1/2,oz,0.5,15.0,,,,ml,15,Vodka 15.0 ml,Absolut Kurant 15.0 ml,Vodka 15.0 ml
2,110 in the shade,Beer,Beer Glass,Lager,Beer,,Drop shooter in glass. Fill with beer,16 oz,16,oz,16.0,480.0,,,,ml,480,Beer 480.0 ml,Lager 480.0 ml,Beer 480.0 ml
3,151 Florida Bushwacker,Milk / Float / Shake,Beer mug,Malibu Rum,Rum,,Combine all ingredients. Blend until smooth. G...,1/2 oz,1/2,oz,0.5,15.0,,,,ml,15,Rum 15.0 ml,Malibu Rum 15.0 ml,Rum 15.0 ml
4,155 Belmont,Cocktail,White wine glass,Dark Rum,Rum,,Blend with ice. Serve in a wine glass. Garnish...,1 shot,1,shot,1.0,25.0,,,,ml,25,Rum 25.0 ml,Dark Rum 25.0 ml,Rum 25.0 ml


In [11]:
# Reallocate ingredients as columns so each cocktail takes only one row

df_cut = df[['strDrink', 'Sentence_type', 'Sentence_ingr', 'Sentence_type_ingr']]
s =  df_cut.groupby('strDrink').cumcount().add(1)
df_cut = (df_cut.set_index(['strDrink',s]).unstack().sort_index(axis=1, level=1))
df_cut.columns = ['{}_{}'.format(a, b) for a,b in df_cut.columns]

df_cut = df_cut.reset_index()
df_cut.head()

Unnamed: 0,strDrink,Sentence_ingr_1,Sentence_type_1,Sentence_type_ingr_1,Sentence_ingr_2,Sentence_type_2,Sentence_type_ingr_2,Sentence_ingr_3,Sentence_type_3,Sentence_type_ingr_3,...,Sentence_type_ingr_8,Sentence_ingr_9,Sentence_type_9,Sentence_type_ingr_9,Sentence_ingr_10,Sentence_type_10,Sentence_type_ingr_10,Sentence_ingr_11,Sentence_type_11,Sentence_type_ingr_11
0,'57 Chevy with a White License Plate,Creme De Cacao White 30.0 ml,Creamy Liqueur 30.0 ml,Creamy Liqueur 30.0 ml,Vodka 30.0 ml,Vodka 30.0 ml,Vodka 30.0 ml,,,,...,,,,,,,,,,
1,1-900-FUK-MEUP,Absolut Kurant 15.0 ml,Vodka 15.0 ml,Vodka 15.0 ml,Grand Marnier 7.5 ml,Triple Sec 7.5 ml,Triple Sec 7.5 ml,Chambord Raspberry Liqueur 7.5 ml,Sweet Liqueur 7.5 ml,Sweet Liqueur 7.5 ml,...,Pineapple Juice 7.5 ml,,,,,,,,,
2,110 in the shade,Lager 480.0 ml,Beer 480.0 ml,Beer 480.0 ml,Tequila 45.0 ml,Tequila 45.0 ml,Tequila 45.0 ml,,,,...,,,,,,,,,,
3,151 Florida Bushwacker,Malibu Rum 15.0 ml,Rum 15.0 ml,Rum 15.0 ml,Light Rum 15.0 ml,Rum 15.0 ml,Rum 15.0 ml,151 Proof Rum 15.0 ml,Rum 15.0 ml,Rum 15.0 ml,...,Vanilla Ice-Cream 128.0 gr,,,,,,,,,
4,155 Belmont,Dark Rum 25.0 ml,Rum 25.0 ml,Rum 25.0 ml,Light Rum 50.0 ml,Rum 50.0 ml,Rum 50.0 ml,Vodka 25.0 ml,Vodka 25.0 ml,Vodka 25.0 ml,...,,,,,,,,,,


In [12]:
def combine(line, col):
    lst1 = [line[col+'_'+str(i)] for i in range(1, 12)]
    lst2 = [str(i) for i in lst1 if str(i) != 'nan']
    return ' '.join(lst2)

        
# combine(df_cut.iloc[1], 'Sentence_ingr')      # test

In [13]:
cols = ['Sentence_ingr', 'Sentence_type', 'Sentence_type_ingr']
for i in cols:
    df_cut[i] = df_cut.apply(lambda x: combine(x, i), axis=1)

In [14]:
df_cut = df_cut[['strDrink', 'Sentence_ingr', 'Sentence_type', 'Sentence_type_ingr']]
df_cut.head()

Unnamed: 0,strDrink,Sentence_ingr,Sentence_type,Sentence_type_ingr
0,'57 Chevy with a White License Plate,Creme De Cacao White 30.0 ml Vodka 30.0 ml,Creamy Liqueur 30.0 ml Vodka 30.0 ml,Creamy Liqueur 30.0 ml Vodka 30.0 ml
1,1-900-FUK-MEUP,Absolut Kurant 15.0 ml Grand Marnier 7.5 ml Ch...,Vodka 15.0 ml Triple Sec 7.5 ml Sweet Liqueur ...,Vodka 15.0 ml Triple Sec 7.5 ml Sweet Liqueur ...
2,110 in the shade,Lager 480.0 ml Tequila 45.0 ml,Beer 480.0 ml Tequila 45.0 ml,Beer 480.0 ml Tequila 45.0 ml
3,151 Florida Bushwacker,Malibu Rum 15.0 ml Light Rum 15.0 ml 151 Proof...,Rum 15.0 ml Rum 15.0 ml Rum 15.0 ml Creamy Liq...,Rum 15.0 ml Rum 15.0 ml Rum 15.0 ml Creamy Liq...
4,155 Belmont,Dark Rum 25.0 ml Light Rum 50.0 ml Vodka 25.0 ...,Rum 25.0 ml Rum 50.0 ml Vodka 25.0 ml sweet 25...,Rum 25.0 ml Rum 50.0 ml Vodka 25.0 ml Orange J...


**Building a model**

In [143]:
# Transform the column we want to use for training into list of strings

text = df_cut['Sentence_ingr'].tolist()
text[:5]

['Creme De Cacao White 30.0 ml Vodka 30.0 ml',
 'Absolut Kurant 15.0 ml Grand Marnier 7.5 ml Chambord Raspberry Liqueur 7.5 ml Midori Melon Liqueur 7.5 ml Malibu Rum 7.5 ml Amaretto 7.5 ml Cranberry Juice 15.0 ml Pineapple Juice 7.5 ml',
 'Lager 480.0 ml Tequila 45.0 ml',
 'Malibu Rum 15.0 ml Light Rum 15.0 ml 151 Proof Rum 15.0 ml Dark Creme De Cacao 30.0 ml Cointreau 30.0 ml Milk 90.0 ml Coconut Liqueur 30.0 ml Vanilla Ice-Cream 128.0 gr',
 'Dark Rum 25.0 ml Light Rum 50.0 ml Vodka 25.0 ml Orange Juice 25.0 ml']

In [144]:
# Test - remove all measures

# text = df_cut['Sentence_ingr'].apply(lambda x: re.sub('gr', '', re.sub('ml', '', re.sub(r'\d', '', x))))
# text[:5]

In [145]:
# Create Tokenizer object
tokenizer = Tokenizer(num_words=None,      # Define how many most common words to keep. If none, all will be kept
                      filters='',          # Includes punctuation by default but we need to keep dots and we don't have anything
#                       filters='.#$%&()*+-<=>@[\\]^_`{|}~\t\n',
                                           # to clean
                      lower=False,         # We'd like to keep uppercase words
                      split=' ')           # Words in strings are split by whitespace

# Train the tokenizer to the texts
tokenizer.fit_on_texts(text)

# Convert list of strings into list of lists of integers
sequences = tokenizer.texts_to_sequences(text)

sequences[:2]

[[39, 40, 82, 35, 2, 1, 12, 2, 1],
 [68,
  196,
  3,
  1,
  100,
  101,
  71,
  1,
  116,
  91,
  24,
  71,
  1,
  107,
  102,
  24,
  71,
  1,
  108,
  10,
  71,
  1,
  30,
  71,
  1,
  54,
  4,
  3,
  1,
  50,
  4,
  71,
  1]]

In [148]:
# The way to convert it back to words

idx_word = tokenizer.index_word
idx_word

{1: 'ml',
 2: '30.0',
 3: '15.0',
 4: 'Juice',
 5: 'gr',
 6: 'garnish',
 7: '60.0',
 8: '45.0',
 9: 'Lemon',
 10: 'Rum',
 11: '1',
 12: 'Vodka',
 13: 'Gin',
 14: 'Orange',
 15: 'Sugar',
 16: '22.5',
 17: 'Cream',
 18: 'Light',
 19: '4.0',
 20: '25.0',
 21: 'Lime',
 22: 'Brandy',
 23: '1.0',
 24: 'Liqueur',
 25: 'Vermouth',
 26: 'Cherry',
 27: '10.0',
 28: 'Triple',
 29: 'Sec',
 30: 'Amaretto',
 31: '2.0',
 32: '90.0',
 33: 'berry',
 34: 'Bitters',
 35: 'White',
 36: 'Sweet',
 37: '120.0',
 38: 'Grenadine',
 39: 'Creme',
 40: 'De',
 41: 'Peel',
 42: 'slice',
 43: 'Water',
 44: 'Irish',
 45: 'Powdered',
 46: 'Tequila',
 47: 'Kahlua',
 48: 'twist',
 49: 'Egg',
 50: 'Pineapple',
 51: 'Dry',
 52: 'Soda',
 53: 'Maraschino',
 54: 'Cranberry',
 55: '6.0',
 56: '0',
 57: 'top',
 58: 'up',
 59: "Bailey'S",
 60: 'Schnapps',
 61: 'Dark',
 62: 'Sour',
 63: '150.0',
 64: '14.0',
 65: 'Milk',
 66: '20.0',
 67: 'Syrup',
 68: 'Absolut',
 69: 'Peach',
 70: 'Coffee',
 71: '7.5',
 72: '50.0',
 73: '8.0',


Set features and labels. In our case features will be pair of words and a label will be the third word following defined pair. We will repeat this labeling with step 1 (i.e. every time second word becomes first word etc) for every sentence we have.

In [149]:
features = []
labels = []

training_length = 10

# Iterate through the sequences of tokens:
for seq in sequences:
    
    # Create a multiple training examples from each sequence
    for i in range(training_length, len(seq)):
        
        # Extract the features and label
        extract = seq[(i - training_length):(i + 1)]
        
        # Set the features and label
        features.append(extract[:-1])
        labels.append(extract[-1])
        
features = np.array(features)
features.shape

(1938, 10)

Transform labels to one-hot encoded, this way neural network trains the most effectively.

In [150]:
# Number of words in vocabulary
num_words = len(idx_word) + 1

# Empty array to hold labels
label_array = np.zeros((len(features), num_words), dtype = np.int32)

# One hot encode for labels
for example_index, word_index in enumerate(labels):
    label_array[example_index, word_index] = 1

label_array.shape

(1938, 397)

In [151]:
label_array

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]])

In [152]:
# Find word corresponding to encoding
idx_word[np.argmax(label_array[100])]

'Cranberry'

In [153]:
# Create callbacks

callbacks = [EarlyStopping(monitor='val_loss', patience=5),
            ModelCheckpoint('../models/model.h5')]

In [154]:
model = Sequential()
# Embedding layer

model.add(
    Embedding(input_dim=num_words,
              input_length = training_length,
              output_dim=20,
#               weights=[embedding_matrix],
              trainable=True,
              mask_zero=True))

# Masking layer for pre-trained embeddings
model.add(Masking(mask_value=0.0))

# Recurrent payer
model.add(LSTM(64, return_sequences=False, dropout=0.1, recurrent_dropout=0.1))

# Fully connected payer
model.add(Dense(64, activation='relu'))

# Dropout for regularization
model.add(Dropout(0.5))

# Output layer
model.add(Dense(num_words, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [155]:
model.fit(features, label_array, epochs=100, callbacks=callbacks)
# model.fit(features, label_array, epochs=10)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100


Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100


Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100


Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<tensorflow.python.keras.callbacks.History at 0x1a9bea45700>

In [156]:
def generate_output(model,
                    first_ingredient,
                    idx_word,
                    seed_length=4,
                    new_words=50,
                    diversity=1,
                    return_output=False,
                    n_gen=1):       # Number of generations?
    """Generate `new_words` words of output from a trained model and format into HTML."""

    # Identify code of the ingredient given by user:
    for key, value in idx_word.items():
        if value == first_ingredient:
            first_ingr_code = key
#     print(first_ingr_code)
    
    
    # Identify index of ingredient given by user based on its code:    
    a = []                              # List of all sequences where given ingredient appears (index of sequence + index of
                                                                                                                  # ingredient)
    for seq in sequences:
        for word in seq:
            if word == 10:
                a.append((sequences.index(seq), seq.index(word)))
    
    b = random.choice(a)                # Randomly selected sequence (i.e. cocktail)
    
    len1 = len(sequences[b[0]])         # Lengh of randomly selected sequence where given ingredient is presented
    len2 = b[1] + seed_length + 1       # Lengh of seed based on index of given ingredient in randomly selected 

    while len1 < len2:
        b = random.choice(a)
        len1 = len(sequences[b[0]])
        len2 = b[1] + seed_length + 1

    gen_list = []
    
    for n in range(n_gen):
        # Extract the seed sequence
        seed = sequences[b[0]][b[1]:(seed_length+1)]
#         print(seed)
        generated = seed          # Transform words to code for further decoding toghether with new added words
#         print(generated)
        
        # Keep adding new words
        for i in range(new_words):

            # Make a prediction from the seed
            preds = model.predict(np.array(seed).reshape(1, -1))[0].astype(np.float64)
#             print(preds[0:10])
            
#             Diversify
            preds = np.log(preds) / diversity
            exp_preds = np.exp(preds)
#             print(preds)

            # Softmax
            preds = exp_preds / sum(exp_preds)
#             print(preds[:10])

            # Choose the next word
            probas = np.random.multinomial(1, preds, 1)[0]
#             print(probas)

            next_idx = np.argmax(probas)
#             print(next_idx)

            # New seed adds on old word
            seed += [next_idx]
            generated.append(next_idx)
#         print(generated)
        
#         Showing generated and actual abstract
        n = []

        for i in generated:
            n.append(idx_word.get(i))

        gen_list.append(n)
    return(gen_list)

In [157]:
generate_output(model, 'Rum', idx_word, new_words=10)



  preds = np.log(preds) / diversity


[['Rum',
  '60.0',
  'ml',
  'Cherry',
  '1',
  '1',
  'slice',
  'slice',
  'garnish',
  'garnish',
  'Rum',
  'Rum',
  '14.0',
  '14.0',
  '1',
  '1',
  'twist',
  'twist',
  'garnish',
  'garnish',
  'Rum',
  'Rum',
  '45.0',
  '45.0']]

In [102]:
idx_word

{1: 'ml',
 2: '30.0',
 3: '15.0',
 4: 'Juice',
 5: 'gr',
 6: 'garnish',
 7: '60.0',
 8: '45.0',
 9: 'Lemon',
 10: 'Rum',
 11: '1',
 12: 'Vodka',
 13: 'Gin',
 14: 'Orange',
 15: 'Sugar',
 16: '22.5',
 17: 'Cream',
 18: 'Light',
 19: '4.0',
 20: '25.0',
 21: 'Lime',
 22: 'Brandy',
 23: '1.0',
 24: 'Liqueur',
 25: 'Vermouth',
 26: 'Cherry',
 27: '10.0',
 28: 'Triple',
 29: 'Sec',
 30: 'Amaretto',
 31: '2.0',
 32: '90.0',
 33: 'berry',
 34: 'Bitters',
 35: 'White',
 36: 'Sweet',
 37: '120.0',
 38: 'Grenadine',
 39: 'Creme',
 40: 'De',
 41: 'Peel',
 42: 'slice',
 43: 'Water',
 44: 'Irish',
 45: 'Powdered',
 46: 'Tequila',
 47: 'Kahlua',
 48: 'twist',
 49: 'Egg',
 50: 'Pineapple',
 51: 'Dry',
 52: 'Soda',
 53: 'Maraschino',
 54: 'Cranberry',
 55: '6.0',
 56: '0',
 57: 'top',
 58: 'up',
 59: "Bailey'S",
 60: 'Schnapps',
 61: 'Dark',
 62: 'Sour',
 63: '150.0',
 64: '14.0',
 65: 'Milk',
 66: '20.0',
 67: 'Syrup',
 68: 'Absolut',
 69: 'Peach',
 70: 'Coffee',
 71: '7.5',
 72: '50.0',
 73: '8.0',


Functions below have been copied from https://github.com/WillKoehrsen/recurrent-neural-networks/blob/master/notebooks/Exploring%20Model%20Results.ipynb

In [52]:
def generate_output(model,
                    sequences,
                    idx_word,
                    seed_length=50,
                    new_words=50,
                    diversity=1,
                    return_output=False,
                    n_gen=1):
    """Generate `new_words` words of output from a trained model and format into HTML."""

    # Choose a random sequence
    seq = random.choice(sequences)
#     print(seq)
    # Choose a random starting point
#     seed_idx = random.randint(0, len(seq) - seed_length - 10)
    seed_idx = random.randint(0, len(seq))
#     print(seed_idx)
    # Ending index for seed
    end_idx = seed_idx + seed_length
#     print(end_idx)
    gen_list = []

    for n in range(n_gen):
        # Extract the seed sequence
        seed = seq[seed_idx:end_idx]
#         print(seed)
        original_sequence = [idx_word[i] for i in seed]
#         print(original_sequence)
        generated = seed[:] + ['#']
#         print(generated)

        # Find the actual entire sequence
        actual = generated[:] + seq[end_idx:end_idx + new_words]
#         print(actual)
        # Keep adding new words
        for i in range(new_words):
            print(i)

            # Make a prediction from the seed
            preds1 = model.predict(np.array(seed).reshape(1, -1))[0].astype(np.float64)
#             print(preds1)

            # Diversify
            preds = np.log(preds1) / diversity
            exp_preds = np.exp(preds)
            print(type(exp_preds[:1]))

            # Softmax
            preds = exp_preds / sum(exp_preds)
            print(preds1 == preds)

            # Choose the next word
            probas = np.random.multinomial(1, preds, 1)[0]
#             print(probas)

            next_idx = np.argmax(probas)
#             print(next_idx)

            # New seed adds on old word
            #             seed = seed[1:] + [next_idx]
            seed += [next_idx]
            generated.append(next_idx)
        print(generated)

        # Showing generated and actual abstract
        n = []

        for i in generated:
#             n.append(idx_word.get(i, '< --- >'))
            n.append(idx_word.get(i))

        gen_list.append(n)
    return(gen_list)

#     a = []

#     for i in actual:
#         a.append(idx_word.get(i, '< --- >'))

#     a = a[seed_length:]

#     gen_list = [gen[seed_length:seed_length + len(a)] for gen in gen_list]

#     if return_output:
#         return original_sequence, gen_list, a

#     # HTML formatting
#     seed_html = ''
#     seed_html = addContent(seed_html, header(
#         'Seed Sequence', color='darkblue'))
#     seed_html = addContent(seed_html,
#                            box(remove_spaces(' '.join(original_sequence))))

#     gen_html = ''
#     gen_html = addContent(gen_html, header('RNN Generated', color='darkred'))
#     gen_html = addContent(gen_html, box(remove_spaces(' '.join(gen_list[0]))))

#     a_html = ''
#     a_html = addContent(a_html, header('Actual', color='darkgreen'))
#     a_html = addContent(a_html, box(remove_spaces(' '.join(a))))

#     return seed_html, gen_html, a_html

In [53]:
generate_output(model, sequences, idx_word, seed_length=5, new_words=10)

0
<class 'numpy.ndarray'>
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False 

<class 'numpy.ndarray'>
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False Fa

[['60.0',
  'ml',
  'Coffee',
  'Liqueur',
  '30.0',
  None,
  'ml',
  '3785.0',
  'gr',
  'Gin',
  'gr',
  'Dry',
  'Orange',
  'slice',
  'garnish',
  'garnish']]

In [80]:
def addContent(old_html, raw_html):
    old_html += raw_html
    return old_html

In [82]:
def header(text, color = 'black', gen_text = None):
    if gen_text:
        raw_html = f'<h1 style="color: {color};"><p><center>' + str(
        text) + '<span style="color: red">' + str(gen_text) + '</center></p></h1>'
    else:
        raw_html = f'<h1 style="color: {color};"><center>' + str(
            text) + '</center></h1>'
    return raw_html

In [83]:
def box(text, gen_text=None):
    if gen_text:
        raw_html = '<div style="border:1px inset black;padding:1em;font-size: 20px;"> <p>' + str(
            text) +'<span style="color: red">' + str(gen_text) + '</p></div>'

    else:
        raw_html = '<div style="border:1px inset black;padding:1em;font-size: 20px;">' + str(
            text) + '</div>'
    return raw_html

In [84]:
def seed_sequence(model, s, word_idx, idx_word, 
                  diversity = 0.75, num_words = 50):
    """Generate output starting from a seed sequence."""
    # Original formated text
    start = format_sequence(s).split()
    gen = []
    s = start[:]
    # Generate output
    for _ in range(num_words):
        # Conver to arry
        x = np.array([word_idx.get(word, 0) for word in s]).reshape((1, -1))

        # Make predictions
        preds = model.predict(x)[0].astype(float)

        # Diversify
        preds = np.log(preds) / diversity
        exp_preds = np.exp(preds)
        # Softmax
        preds = exp_preds / np.sum(exp_preds)
        # Pick next index
        next_idx = np.argmax(np.random.multinomial(1, preds, size = 1))
        s.append(idx_word[next_idx])
        gen.append(idx_word[next_idx])
    
    # Formatting in html
    start = remove_spaces(' '.join(start)) + ' '
    gen = remove_spaces(' '.join(gen)) 
    html = ''
    html = addContent(html, header('Input Seed ', color = 'black', gen_text = 'Network Output'))
    html = addContent(html, box(start, gen))
    return html

In [86]:
def remove_spaces(s):
    """Remove spaces around punctuation"""
    s = re.sub(r'\s+([.,;?])', r'\1', s)
    
    return s

In [None]:
seed_html, gen_html, a_html = generate_output(model, sequences, idx_word)
HTML(seed_html)
HTML(gen_html)
HTML(a_html)