In [1]:
import pandas as pd
import plotly.express as px
import tensorflow as tf
import numpy as np
from tensorflow.keras.layers import Input, Dense, LSTM, Dropout, TextVectorization, Embedding
from tensorflow.keras.models import Model

In [None]:
def data_processing(data_path: str) -> pd.DataFrame:
    """
    Read the input data and transform it into the structure needed for training a model.

    NOTE(review): this function is currently a stub. It has no body beyond this
    docstring, so calling it returns None rather than the annotated DataFrame.
    TODO: implement it, or remove it, before relying on the return value.

    Inputs:
        data_path:
            A string path to the input Mr Boston Cocktail Dataset

    Outputs:
        data:
            A pandas dataframe with all of the needed columns for training
    """
    
    

In [2]:
# Load the flattened Mr. Boston recipes; each row holds a cocktail name plus
# up to six measurement/ingredient pairs, the instructions and the glass.
df = pd.read_csv("mr-boston-flattened.csv")
df

Unnamed: 0,name,category,measurement-1,ingredient-1,measurement-2,ingredient-2,measurement-3,ingredient-3,measurement-4,ingredient-4,measurement-5,ingredient-5,measurement-6,ingredient-6,instructions,glass,glass-size
0,Gauguin,Cocktail Classics,2 oz,Light Rum,1 oz,Passion Fruit Syrup,1 oz,Lemon Juice,1 oz,Lime Juice,,,,,Combine ingredients with a cup of crushed ice ...,Old-Fashioned Glass,6 to 8 ounces
1,Fort Lauderdale,Cocktail Classics,1 1/2 oz,Light Rum,1/2 oz,Sweet Vermouth,1/4 oz,Juice of Orange,1/4 oz,Juice of a Lime,,,,,Shake with ice and strain into old-fashioned g...,Old-Fashioned Glass,6 to 8 ounces
2,Apple Pie,Cordials and Liqueurs,3 oz,Apple schnapps,1 oz,Cinnamon schnapps,,Apple slice,,,,,,,Pour into ice-filled old-fashioned glass. Garn...,Old-Fashioned Glass,6 to 8 ounces
3,Cuban Cocktail No. 1,Cocktail Classics,1/2 oz,Juice of a Lime,1/2 oz,Powdered Sugar,2 oz,Light Rum,,,,,,,Shake with ice and strain into cocktail glass.,Cocktail Glass,6 or more ounces
4,Cool Carlos,Cocktail Classics,1 1/2 oz,Dark rum,2 oz,Cranberry Juice,2 oz,Pineapple Juice,1 oz,Orange curacao,1 oz,Sour Mix,,,"Mix all ingredients except curacao with ice, s...",Collins Glass,14 to 16 ounces
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
985,Wallis Blue Cocktail,Gin,1,"Lime wedge, superfine sugar",1 oz,Gin,1 oz,Triple Sec,1 oz,Fresh Lime Juice,,,,,Rim old-fashioned glass with lime and sugar. F...,Old-Fashioned Glass,6 to 8 ounces
986,Minnehaha Cocktail,Cocktail Classics,1/4 oz,Juice of Orange,1/2 oz,Dry Vermouth,1/2 oz,Sweet Vermouth,1 oz,Old Mr. Boston Dry Gin,,,,,Shake well with cracked ice and strain into 4 ...,Cocktail Glass,6 or more ounces
987,Wallick Cocktail,Gin,1 1/2 oz,Gin,1 1/2 oz,Dry Vermouth,1 oz,Triple Sec,,,,,,,Stir with ice and strain into chilled cocktail...,Cocktail Glass,6 or more ounces
988,Waikiki Beachcomber,Gin,3/4 oz,Gin,3/4 oz,Triple Sec,1/2 oz,Pineapple Juice,,,,,,,Shake with ice and strain into chilled cocktai...,Cocktail Glass,6 or more ounces


In [3]:
# Vectoriser that maps a cocktail name to a fixed-length sequence of 5 token ids.
textvec = TextVectorization(
    max_tokens=10000,
    output_sequence_length=5,
    pad_to_max_tokens=True,
)

# Learn the vocabulary from the cocktail names.
textvec.adapt(df["name"])

# Vocabulary list, plus a token -> position lookup into that list.
voc = textvec.get_vocabulary()
word_index = {token: position for position, token in enumerate(voc)}

In [32]:
# Inspect the learned vocabulary; the first two entries are '' and '[UNK]'.
voc

['',
 '[UNK]',
 'cocktail',
 'the',
 'fizz',
 'no',
 'rum',
 'martini',
 'gin',
 'tequila',
 'sour',
 'vodka',
 'flip',
 'and',
 'punch',
 'highball',
 'collins',
 '1',
 'rose',
 'cooler',
 'brandy',
 'white',
 'pink',
 'golden',
 'cherry',
 'whiskey',
 'special',
 'orange',
 'bourbon',
 'apple',
 'milk',
 'margarita',
 'lady',
 '2',
 'purple',
 'of',
 'la',
 'irish',
 'dry',
 'daisy',
 'tea',
 'saronno',
 'rickey',
 'red',
 'green',
 'club',
 'chocolate',
 'blue',
 'black',
 'velvet',
 'sling',
 'silver',
 'peach',
 'mint',
 'lemonade',
 'grasshopper',
 'el',
 'coffee',
 'bronx',
 'boston',
 'blossom',
 'amaretto',
 'whisky',
 'tnt',
 'sunrise',
 'street',
 'sherry',
 'screwdriver',
 'royal',
 'pineapple',
 'passion',
 'oldfashioned',
 'kiss',
 'julep',
 'jamaican',
 'jack',
 'grapefruit',
 'gimlet',
 'flying',
 'dubonnet',
 'dream',
 'de',
 'cuban',
 'canadian',
 'bull',
 'bloody',
 'apricot',
 'a',
 'wine',
 'widows',
 'vermouth',
 'thunder',
 'taylor',
 'swizzle',
 'stone',
 'sting

In [4]:
# Pre-trained GloVe vectors; each line is parsed as a token followed by
# space-separated float components.
path_to_glove_file = "glove.6B.300d.txt"

# token -> float32 vector
embeddings_index = {}
with open(path_to_glove_file, encoding="utf8") as glove_file:
    for raw_line in glove_file:
        # Split off the leading token; the remainder is the vector text.
        token, vector_text = raw_line.split(maxsplit=1)
        embeddings_index[token] = np.fromstring(vector_text, "f", sep=" ")

print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


In [5]:
# Number of rows in the embedding table: two more than the vocabulary length.
num_tokens = len(voc) + 2
embedding_dim = 300
hits = 0
misses = 0

# Prepare embedding matrix: every row starts as zeros and is overwritten
# for each vocabulary token that has a GloVe vector.
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for token, row_index in word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        misses += 1
        continue
    embedding_matrix[row_index] = glove_vector
    hits += 1
print("Converted %d words (%d misses)" % (hits, misses))


Converted 985 words (90 misses)


In [6]:
from sklearn.preprocessing import LabelEncoder

# Pool the six ingredient columns so a single encoder covers every recipe slot.
ingredients = pd.concat(
    [df[f"ingredient-{slot}"] for slot in range(1, 7)]
).unique()

# fit() returns the encoder itself, so the fitted instance is bound directly;
# the trailing expression keeps the cell's displayed value.
ingredient_encoder = LabelEncoder().fit(ingredients)
ingredient_encoder

LabelEncoder()

In [7]:
# Replace the labels in each of the six ingredient columns with their integer
# class ids from the shared ingredient encoder (overwrites df in place).
for slot in range(1, 7):
    column = f"ingredient-{slot}"
    df[column] = ingredient_encoder.transform(df[column])

In [8]:
# Preview the frame after encoding: the ingredient columns now hold integer ids.
df

Unnamed: 0,name,category,measurement-1,ingredient-1,measurement-2,ingredient-2,measurement-3,ingredient-3,measurement-4,ingredient-4,measurement-5,ingredient-5,measurement-6,ingredient-6,instructions,glass,glass-size
0,Gauguin,Cocktail Classics,2 oz,325,1 oz,445,1 oz,297,1 oz,334,,687,,687,Combine ingredients with a cup of crushed ice ...,Old-Fashioned Glass,6 to 8 ounces
1,Fort Lauderdale,Cocktail Classics,1 1/2 oz,325,1/2 oz,551,1/4 oz,289,1/4 oz,292,,687,,687,Shake with ice and strain into old-fashioned g...,Old-Fashioned Glass,6 to 8 ounces
2,Apple Pie,Cordials and Liqueurs,3 oz,35,1 oz,113,,37,,687,,687,,687,Pour into ice-filled old-fashioned glass. Garn...,Old-Fashioned Glass,6 to 8 ounces
3,Cuban Cocktail No. 1,Cocktail Classics,1/2 oz,292,1/2 oz,479,2 oz,325,,687,,687,,687,Shake with ice and strain into cocktail glass.,Cocktail Glass,6 or more ounces
4,Cool Carlos,Cocktail Classics,1 1/2 oz,169,2 oz,142,2 oz,469,1 oz,428,1 oz,530,,687,"Mix all ingredients except curacao with ice, s...",Collins Glass,14 to 16 ounces
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
985,Wallis Blue Cocktail,Gin,1,660,1 oz,234,1 oz,568,1 oz,193,,687,,687,Rim old-fashioned glass with lime and sugar. F...,Old-Fashioned Glass,6 to 8 ounces
986,Minnehaha Cocktail,Cocktail Classics,1/4 oz,289,1/2 oz,178,1/2 oz,551,1 oz,396,,687,,687,Shake well with cracked ice and strain into 4 ...,Cocktail Glass,6 or more ounces
987,Wallick Cocktail,Gin,1 1/2 oz,234,1 1/2 oz,178,1 oz,568,,687,,687,,687,Stir with ice and strain into chilled cocktail...,Cocktail Glass,6 or more ounces
988,Waikiki Beachcomber,Gin,3/4 oz,234,3/4 oz,568,1/2 oz,469,,687,,687,,687,Shake with ice and strain into chilled cocktai...,Cocktail Glass,6 or more ounces


In [9]:
# Pool the six measurement columns so a single encoder covers every recipe slot.
measurements = pd.concat(
    [df[f"measurement-{slot}"] for slot in range(1, 7)]
).unique()

# fit() returns the encoder itself, so the fitted instance is bound directly;
# the trailing expression keeps the cell's displayed value.
measurement_encoder = LabelEncoder().fit(measurements)
measurement_encoder

LabelEncoder()

In [10]:
# Replace the labels in each of the six measurement columns with their integer
# class ids from the shared measurement encoder (overwrites df in place).
for slot in range(1, 7):
    column = f"measurement-{slot}"
    df[column] = measurement_encoder.transform(df[column])

In [11]:
# Build the training frame: the cocktail name plus, for each of the six recipe
# slots, a one-hot block for the ingredient and a one-hot block for the measurement.
pieces = []
pieces.append(df[["name"]])

for i in range(6):
    # num_classes is passed explicitly: without it to_categorical sizes the
    # one-hot width from the largest id present in *this* column, which can be
    # smaller than the encoder's class count and then no longer matches the
    # column labels. index=df.index keeps every block row-aligned with the
    # "name" column when the pieces are concatenated.
    temp = pd.DataFrame(
        data=tf.keras.utils.to_categorical(
            df[f"ingredient-{i+1}"],
            num_classes=len(ingredient_encoder.classes_),
        ),
        columns=[f"I{i+1} - {c}" for c in ingredient_encoder.classes_],
        index=df.index,
    )
    pieces.append(temp)

    temp = pd.DataFrame(
        data=tf.keras.utils.to_categorical(
            df[f"measurement-{i+1}"],
            num_classes=len(measurement_encoder.classes_),
        ),
        columns=[f"M{i+1} - {c}" for c in measurement_encoder.classes_],
        index=df.index,
    )
    pieces.append(temp)

data = pd.concat(pieces, axis=1)
data

Unnamed: 0,name,I1 - Old Mr. Boston Triple Sec,I1 - 100-proof Vodka,I1 - 151-Proof Rum,I1 - 17-year-old J. Wray and Nephew Ltd. Rum,I1 - 7-Up,I1 - Absinthe,I1 - Absinthe Substitute,I1 - Absinthe or pastis,I1 - Acai berry flavored vodka,...,M6 - 64 Ginger ale,M6 - 7,M6 - 8 Milk,M6 - For glass,M6 - champagne,M6 - fruits in season,M6 - spirals of lemon and orange peel,M6 - splash,M6 - twist of lemon,M6 - nan
0,Gauguin,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,Fort Lauderdale,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,Apple Pie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,Cuban Cocktail No. 1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,Cool Carlos,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
985,Wallis Blue Cocktail,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
986,Minnehaha Cocktail,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
987,Wallick Cocktail,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
988,Waikiki Beachcomber,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [12]:
# Encoder: raw name string -> token ids -> frozen pre-trained embeddings -> LSTM vector.
encoder_inputs = Input(shape=(1,), dtype=tf.string)

e = textvec(encoder_inputs)

e = Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False,
)(e)

encoder_outputs = LSTM(128)(e)

# Shared trunk that feeds every output head.
# NOTE(review): neither Dense layer specifies an activation, so both are
# linear; confirm that this is intended.
d = Dense(64)(encoder_outputs)
d = Dropout(0.25)(d)
d = Dense(64)(d)
d = Dropout(0.25)(d)

# One classification head per recipe slot. The targets are one-hot (exactly one
# class per slot) and the heads are trained with categorical cross-entropy, so
# each head uses softmax to emit a normalised distribution over its classes
# (sigmoid would score every class independently).
i1, i2, i3, i4, i5, i6 = (
    Dense(len(ingredient_encoder.classes_), activation="softmax", name=f"i{slot}_output")(d)
    for slot in range(1, 7)
)

m1, m2, m3, m4, m5, m6 = (
    Dense(len(measurement_encoder.classes_), activation="softmax", name=f"m{slot}_output")(d)
    for slot in range(1, 7)
)

# Outputs are interleaved ingredient/measurement per slot; code that indexes
# the prediction list by position relies on this order.
decoder_outputs = [i1, m1, i2, m2, i3, m3, i4, m4, i5, m5, i6, m6]
model = Model(encoder_inputs, decoder_outputs)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 text_vectorization (TextVector  (None, 5)           0           ['input_1[0][0]']                
 ization)                                                                                         
                                                                                                  
 embedding (Embedding)          (None, 5, 300)       323100      ['text_vectorization[0][0]']     
                                                                                                  
 lstm (LSTM)                    (None, 128)          219648      ['embedding[0][0]']          

In [13]:
# Names of the twelve output heads, ingredient heads first and then measurement
# heads (i1..i6, m1..m6).
output_names = [
    f"{kind}{slot}_output" for kind in ("i", "m") for slot in range(1, 7)
]

# Every head shares the same loss.
losses = {head: "categorical_crossentropy" for head in output_names}

optimizer = 'adam'

# Each head gets its own accuracy metric instance.
metrics = {
    head: tf.metrics.CategoricalAccuracy(name='acc') for head in output_names
}

model.compile(loss=losses, metrics=metrics, optimizer=optimizer)

In [14]:
# Model input: the raw cocktail names as a single-column array.
X = data[["name"]].values

# Targets keyed by output-head name, one one-hot block per head.
Ys = {}

for i in range(6):
    # Select by prefix rather than substring so a class label that merely
    # contains e.g. "I1 - " cannot pull a column into the wrong block.
    temp = data[[col for col in data.columns if col.startswith(f"I{i+1} - ")]]
    # Validate before storing so a malformed block never reaches Ys.
    if temp.values.shape[1] != len(ingredient_encoder.classes_):
        raise RuntimeError(
            f"Wrong number of columns pulled! Problem on I{i+1}: "
            f"got {temp.values.shape[1]}, expected {len(ingredient_encoder.classes_)}"
        )
    Ys[f"i{i+1}_output"] = temp.values

    temp = data[[col for col in data.columns if col.startswith(f"M{i+1} - ")]]
    if temp.values.shape[1] != len(measurement_encoder.classes_):
        # Reports the measurement block (M); the previous message said "I" here.
        raise RuntimeError(
            f"Wrong number of columns pulled! Problem on M{i+1}: "
            f"got {temp.values.shape[1]}, expected {len(measurement_encoder.classes_)}"
        )
    Ys[f"m{i+1}_output"] = temp.values

model.fit(X, Ys, epochs=100, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100


Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100


Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100


Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100


Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100


Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100


Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100


Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100


Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x1bab9c6d6d0>

In [17]:
# Sanity check: the model returns one array per output head.
# predicted_ingredients is computed here so the cell also runs on a fresh
# kernel; no other notebook-level cell defines it. The example name is
# arbitrary, since the number of returned arrays does not depend on the input.
predicted_ingredients = model.predict(["Lemon Sour"])
len(predicted_ingredients)

12

In [37]:
def create_recipe(name: str) -> list:
    """
    Predict a recipe for a cocktail name using the trained model.

    Inputs:
        name:
            The cocktail name to generate a recipe for

    Outputs:
        recipe:
            A list of strings, one per predicted recipe slot, formatted as
            "<measurement> of <ingredient>" (or just "<ingredient>" when the
            predicted measurement is the missing-value class). Slots whose
            predicted ingredient is the missing-value class are omitted.
    """

    predicted_ingredients = model.predict([name])

    recipe = []
    for i in range(6):
        # The model's outputs are interleaved per slot: [i1, m1, i2, m2, ...].
        # A single name is predicted, so argmax over the whole array is the
        # index of the most probable class for that head.
        ingredient_idx = np.argmax(predicted_ingredients[2*i])
        measurement_idx = np.argmax(predicted_ingredients[2*i + 1])

        # Class labels may carry stray surrounding whitespace, and the
        # missing-value class stringifies as "nan"; normalise both up front.
        ingredient = str(ingredient_encoder.classes_[ingredient_idx]).strip()
        measurement = str(measurement_encoder.classes_[measurement_idx]).strip()

        if ingredient.lower() == "nan":
            # No ingredient predicted for this slot.
            continue
        if measurement.lower() == "nan":
            # Ingredient without a quantity: avoid emitting "nan of ...".
            recipe.append(ingredient)
        else:
            recipe.append(f"{measurement} of {ingredient}")

    return recipe

create_recipe("Lemon Sour")

['1 1/2 oz of  Lemon wedge, superfine sugar',
 '2 oz of  Gin',
 '1/4 oz of  Grenadine',
 '1/4 oz of  Simple Syrup']