In [1]:
import warnings
import pandas as pd
import numpy as np
import nltk
import collections
import random
# import matplotlib.pyplot as plt
# import seaborn as sns
# import os
# import re
# import math
# from sqlalchemy import create_engine
# import pymysql
# import requests
# from bs4 import BeautifulSoup
# from sklearn.linear_model import LinearRegression
# from statsmodels.tsa.ar_model import AR
# from statsmodels.tools.eval_measures import rmse
# from scipy import stats

In [2]:
pd.set_option('display.max_columns', 100)
warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('./clean_data/clean_data.csv')
df.drop('Unnamed: 0', axis=1, inplace=True)

### Step 1: transform dataset:
- each cocktail = one row
- all ingredients generalized and combined in one column

In [4]:
# Add non-alcoholic ingredients in a separate column

df['non_alc_ingredients'] = np.where(~df['Basic_taste'].isnull(), df['strIngredients'], np.nan)

In [5]:
# Reduce dataset to only the columns required for current step

df_ingredients = df[['strDrink', 'strCategory', 'Alc_type', 'non_alc_ingredients', 'Garnish_type']]

In [6]:
# Drop rows with garnish, this part of the cocktail will be processed on the later stages

df_ingredients.drop(df_ingredients.loc[~df_ingredients['Garnish_type'].isnull()].index, axis=0, inplace=True)

In [7]:
# Drop rows with related to punches to avoid scaling issue in next steps

df_ingredients.drop(df_ingredients.loc[df_ingredients['strCategory'].str.contains('Punch')].index, axis=0, inplace=True)

In [8]:
# Replace nan values with empty string to align data type for every value

df_ingredients.fillna('', inplace=True)

In [9]:
# Replace ' ' with '_' to keep adjectives with nouns

df_ingredients['Alc_type'] = df_ingredients['Alc_type'].apply(lambda x: x.replace(' ', '_'))
df_ingredients['non_alc_ingredients'] = df_ingredients['non_alc_ingredients'].apply(lambda x: x.replace(' ', '_'))

In [10]:
# Reallocate ingredients as columns so each column takes only one row

df_ingredients = df_ingredients[df_ingredients['Alc_type'] != 'Alc_type']
s =  df_ingredients.groupby('strDrink').cumcount().add(1)
df_ingredients = (df_ingredients.set_index(['strDrink',s])
        .unstack()
        .sort_index(axis=1, level=1)
       )
df_ingredients.columns = ['{}_{}'.format(a, b) for a,b in df_ingredients.columns]

df_ingredients = df_ingredients.reset_index()
df_ingredients.head()

Unnamed: 0,strDrink,Alc_type_1,Garnish_type_1,non_alc_ingredients_1,strCategory_1,Alc_type_2,Garnish_type_2,non_alc_ingredients_2,strCategory_2,Alc_type_3,Garnish_type_3,non_alc_ingredients_3,strCategory_3,Alc_type_4,Garnish_type_4,non_alc_ingredients_4,strCategory_4,Alc_type_5,Garnish_type_5,non_alc_ingredients_5,strCategory_5,Alc_type_6,Garnish_type_6,non_alc_ingredients_6,strCategory_6,Alc_type_7,Garnish_type_7,non_alc_ingredients_7,strCategory_7,Alc_type_8,Garnish_type_8,non_alc_ingredients_8,strCategory_8
0,'57 Chevy with a White License Plate,Creamy_Liqueur,,,Cocktail,Vodka,,,Cocktail,,,,,,,,,,,,,,,,,,,,,,,,
1,1-900-FUK-MEUP,Vodka,,,Shot,Triple_Sec,,,Shot,Sweet_Liqueur,,,Shot,Sweet_Liqueur,,,Shot,Rum,,,Shot,Sweet_Liqueur,,,Shot,,,Cranberry_Juice,Shot,,,Pineapple_Juice,Shot
2,110 in the shade,Beer,,,Beer,Tequila,,,Beer,,,,,,,,,,,,,,,,,,,,,,,,
3,151 Florida Bushwacker,Rum,,,Milk / Float / Shake,Rum,,,Milk / Float / Shake,Rum,,,Milk / Float / Shake,Creamy_Liqueur,,,Milk / Float / Shake,Triple_Sec,,,Milk / Float / Shake,,,Milk,Milk / Float / Shake,Sweet_Liqueur,,,Milk / Float / Shake,,,Vanilla_Ice-Cream,Milk / Float / Shake
4,155 Belmont,Rum,,,Cocktail,Rum,,,Cocktail,Vodka,,,Cocktail,,,Orange_Juice,Cocktail,,,,,,,,,,,,,,,,


In [11]:
df_ingredients['Ingredients'] = df_ingredients.drop(['strDrink'], axis=1).fillna('').apply(lambda x:
                                                                          ' '.join(x), axis=1).apply(lambda x: x.lower())
df_ingredients = df_ingredients[['strDrink', 'Ingredients']]

In [12]:
df_ingredients.head()

Unnamed: 0,strDrink,Ingredients
0,'57 Chevy with a White License Plate,creamy_liqueur cocktail vodka cocktail ...
1,1-900-FUK-MEUP,vodka shot triple_sec shot sweet_liqueur ...
2,110 in the shade,beer beer tequila beer ...
3,151 Florida Bushwacker,rum milk / float / shake rum milk / float ...
4,155 Belmont,rum cocktail rum cocktail vodka cocktail...


### Step 2:
- split ingerients by pairs
- compute most common pairs (N - to be defined depending on results)

In [13]:
# Generate pairs from ingredients of each cocktail and combine them in one list

bigram = [list(nltk.bigrams(nltk.word_tokenize(i))) for i in df_ingredients['Ingredients']]
pairs_list = [j for i in bigram for j in i]
print(len(pairs_list))
pairs_list[:10]

3917


[('creamy_liqueur', 'cocktail'),
 ('cocktail', 'vodka'),
 ('vodka', 'cocktail'),
 ('vodka', 'shot'),
 ('shot', 'triple_sec'),
 ('triple_sec', 'shot'),
 ('shot', 'sweet_liqueur'),
 ('sweet_liqueur', 'shot'),
 ('shot', 'sweet_liqueur'),
 ('sweet_liqueur', 'shot')]

There are some tuples where elements are swopped but essential taste of such combination is not unique. Such tuples need to be alighned.

In [14]:
# First, define a list of tuples to be amended

to_aligh = []
for a in pairs_list:
    for b in pairs_list:
        if a != b:
            if b[1] == a[0]:
                if b[0] == a[1]:            # If both elements of tuple are equal to the swopped tuple under the check
                    if a not in to_aligh and b not in to_aligh:
                        to_aligh.append(b)

In [15]:
# Second, amend them

pairs_list_aligned = []
for i in pairs_list:
    if i in to_aligh:
        pairs_list_aligned.append((i[1], i[0]))
    else:
        pairs_list_aligned.append(i)

In [16]:
# Check that total number of tuples hasn't changed, only content should be amended

len(pairs_list_aligned) == len(pairs_list)

True

In [17]:
# Define 25% of the most common pairs as a separate list

counter=collections.Counter(pairs_list_aligned)
print(len(counter))
common_pairs = counter.most_common(int(len(counter)*0.25))
common_pairs

422


[(('ordinary', 'drink'), 968),
 (('/', 'float'), 100),
 (('gin', 'ordinary'), 72),
 (('sweet_liqueur', 'ordinary'), 65),
 (('rum', 'ordinary'), 63),
 (('milk', '/'), 50),
 (('/', 'shake'), 50),
 (('vodka', 'ordinary'), 50),
 (('drink', 'sweet_liqueur'), 47),
 (('brandy', 'ordinary'), 47),
 (('coffee', '/'), 46),
 (('/', 'tea'), 46),
 (('triple_sec', 'ordinary'), 43),
 (('shot', 'sweet_liqueur'), 41),
 (('drink', 'triple_sec'), 39),
 (('drink', 'lemon_juice'), 39),
 (('lemon_juice', 'ordinary'), 39),
 (('whisky', 'ordinary'), 39),
 (('cocktail', 'vodka'), 35),
 (('vermouth', 'ordinary'), 35),
 (('creamy_liqueur', 'shot'), 34),
 (('bitter', 'ordinary'), 32),
 (('drink', 'gin'), 29),
 (('drink', 'powdered_sugar'), 29),
 (('powdered_sugar', 'ordinary'), 29),
 (('drink', 'bitter'), 28),
 (('drink', 'orange_juice'), 25),
 (('orange_juice', 'ordinary'), 25),
 (('drink', 'lemon'), 25),
 (('lemon', 'ordinary'), 25),
 (('grenadine', 'ordinary'), 24),
 (('creamy_liqueur', 'ordinary'), 24),
 (('co

In [18]:
common_ingredients = []

for n in common_pairs:
    common_ingredients.append(n[0][0])
    common_ingredients.append(n[0][1])
common_ingredients = list(set(common_ingredients))
common_ingredients

['schnapps',
 'lemon_juice',
 'grenadine',
 'tea',
 'carbonated_water',
 'creamy_liqueur',
 'float',
 'cranberry_juice',
 'club_soda',
 'egg_white',
 'beer',
 'soft',
 'vodka',
 'wine',
 'maraschino_cherry',
 'rum',
 'sugar',
 'champagne',
 'other/unknown',
 'shot',
 'vermouth',
 'sweet_and_sour',
 'shake',
 'water',
 'sour_mix',
 'tequila',
 'gin',
 'ordinary',
 'lime',
 'lime_juice',
 'orange_juice',
 'coffee',
 'sweet_liqueur',
 'sugar_superfine',
 'sambuca',
 'soda',
 'brandy',
 'light_cream',
 'lemon',
 'triple_sec',
 '/',
 'powdered_sugar',
 'orange',
 'drink',
 'bitter',
 'pineapple_juice',
 'cocktail',
 'milk',
 'whisky']

### Step 3: generate a frame for new combinations:
- define total number of ingerients (random choice from a range 3-6))
- from ingredients included in top pairs pick one randomly
- find suitable pair for this ingredient (from all pairs, but give top pairs bigger weight)
- do the same for the next ingredient but check that it's not included already
- etc until limit is reached

In [19]:
# Define total number of ingredients for a new cocktail

n_ingr = random.randint(3,7)

In [20]:
new_cocktail = []

In [21]:
# Pick the first ingredient randomly from the top popular

first_ingr = random.choice(common_ingredients)
new_cocktail.append(first_ingr)

In [22]:
# Identify remaning ingredients

counter = 1          # Not zero because the first ingredient is defined separatelly above
while counter < n_ingr:
    temp_list = []
    for i in set(pairs_list_aligned):
        if new_cocktail[-1] in i:
            temp_list.append(i)
    random_pair = random.choice(temp_list)
    if random_pair[0] == new_cocktail[-1]:
        next_ingr = random_pair[1]
    else:
        next_ingr = random_pair[0]
    new_cocktail.append(next_ingr)
    counter += 1

In [23]:
new_cocktail

['sugar', 'shot', 'sprite', 'ordinary']

### Step 4: define volume of each ingredient

In [24]:
# Create new dataframe with relevant columns only

df_measures = df[['strDrink', 'strCategory', 'Alc_type', 'non_alc_ingredients', 'Value_ml', 'Value_gr', 'Garnish_type']]

In [25]:
# Drop rows with garnish, this part of the cocktail will be processed on the later stages

df_measures.drop(df_measures.loc[~df_measures['Garnish_type'].isnull()].index, axis=0, inplace=True)

In [26]:
# Remove cocktails given in big volumes to align scale

df_measures.drop(df_measures.loc[df_measures['strCategory'].str.contains('Punch')].index, axis=0, inplace=True)

In [27]:
# Combine values in one column and define measure for each in a separate column

df_measures['Measure'] = np.where(df_measures['Value_ml'].isnull(), 'gr', 'ml')
df_measures['Value'] = df_measures['Value_ml'].fillna(0) + df_measures['Value_gr'].fillna(0)
df_measures['Ingredients'] = df_measures['Alc_type'].fillna('') + df_measures['non_alc_ingredients'].fillna('')
df_measures['Ingredients'] = df_measures['Ingredients'].apply(lambda x: x.lower()).apply(lambda x: x.replace(' ', '_'))
df_measures.head()

Unnamed: 0,strDrink,strCategory,Alc_type,non_alc_ingredients,Value_ml,Value_gr,Garnish_type,Measure,Value,Ingredients
0,'57 Chevy with a White License Plate,Cocktail,Creamy Liqueur,,30.0,,,ml,30.0,creamy_liqueur
1,1-900-FUK-MEUP,Shot,Vodka,,15.0,,,ml,15.0,vodka
2,110 in the shade,Beer,Beer,,480.0,,,ml,480.0,beer
3,151 Florida Bushwacker,Milk / Float / Shake,Rum,,15.0,,,ml,15.0,rum
4,155 Belmont,Cocktail,Rum,,25.0,,,ml,25.0,rum


In [28]:
# Combine value and measure in one string, this way it will be easier to pick up random value together with correct measure

df_measures['Value'] = df_measures['Value'].astype('object').apply(lambda x: str(x))
df_measures['Value_Measure'] = df_measures[['Value', 'Measure']].apply(lambda x: ' '.join(x), axis=1)
df_measures.head()

Unnamed: 0,strDrink,strCategory,Alc_type,non_alc_ingredients,Value_ml,Value_gr,Garnish_type,Measure,Value,Ingredients,Value_Measure
0,'57 Chevy with a White License Plate,Cocktail,Creamy Liqueur,,30.0,,,ml,30.0,creamy_liqueur,30.0 ml
1,1-900-FUK-MEUP,Shot,Vodka,,15.0,,,ml,15.0,vodka,15.0 ml
2,110 in the shade,Beer,Beer,,480.0,,,ml,480.0,beer,480.0 ml
3,151 Florida Bushwacker,Milk / Float / Shake,Rum,,15.0,,,ml,15.0,rum,15.0 ml
4,155 Belmont,Cocktail,Rum,,25.0,,,ml,25.0,rum,25.0 ml


In [29]:
# Pick up random volume of each ingredient

for i in new_cocktail:
    volume = [random.choice(df_measures.loc[df_measures['Ingredients'] == i]['Value_Measure'].tolist()) for i in new_cocktail]
    new_cocktail_final = pd.DataFrame({'Ingredient': new_cocktail, 'Volume': volume})
    new_cocktail_final

IndexError: Cannot choose from an empty sequence

### Step 5: define garnish

***1) Prepare list of main ingredients with garnish for each cocktail in dataset***

In [None]:
# Define reduced dataframe which contains only relevant fields

df_garnish = df[['strDrink', 'strIngredients', 'Alc_type', 'Value_ml', 'Garnish_amount', 'Garnish_type']]

In [None]:
df_garnish['Ingredients'] = np.where(df_garnish['Alc_type'].isnull(), df_garnish['strIngredients'], df_garnish['Alc_type'])

In [None]:
# Combine value and measure of garnish in one column so it will be easier to pick it up later in the code

df_garnish['Garnish_ingr'] = np.where(~df_garnish['Garnish_amount'].isnull(), df_garnish['strIngredients'], np.nan)
df_garnish['Garnish'] = df_garnish[['Garnish_ingr', 'Garnish_amount', 'Garnish_type']].fillna('').apply(lambda x:
                                                                                                       ' '.join(x), axis=1)
df_garnish['Garnish'] = df_garnish['Garnish'].apply(lambda x: x.replace('0 top', 'top'))
df_garnish['Garnish'] = np.where(df_garnish['Garnish'] == '  ', np.nan, df_garnish['Garnish'])

In [None]:
# Rearrange dataframe that way that it's possible to identify garnish per cocktail per dominant ingredient

drink_name_list = []
garnish_list = []
ingredient_list = []

for drink in df_garnish['strDrink'].unique():      # For each cocktial
    df_selected = df_garnish.loc[df_garnish['strDrink'] == drink]
    max_value = df_selected['Value_ml'].max()

    for ingr in df_selected.loc[df_selected['Value_ml'] == max_value]['Ingredients']:
        for garnish in df_selected['Garnish'].unique():
            drink_name_list.append(drink)
            garnish_list.append(garnish)           # Include all garnishes for the drink
            ingredient_list.append(ingr)           # Include an ingredient taking the biggest part of the drink

df_garnish_final = pd.DataFrame({'Drink':drink_name_list, 'Ingredient':ingredient_list, 'Garnish':garnish_list})
df_garnish_final.drop(df_garnish_final.loc[df_garnish_final['Garnish'].isnull()].index, axis=0, inplace = True)

In [None]:
df_garnish_final.head()

***2) Identify a main ingredient of a generated cocktail***

In [None]:
new_cocktail_final

In [None]:
new_cocktail_final = new_cocktail_final.join(new_cocktail_final['Volume'].str.rsplit(n=1, expand=True).rename(columns={
                                                                                                    0: 'Value', 1: 'Measure'}))
new_cocktail_final['Value'] = new_cocktail_final['Value'].astype('float')

In [None]:
main_ingr = random.choice(new_cocktail_final.loc[new_cocktail_final['Value'] ==
                                                                     new_cocktail_final['Value'].max()]['Ingredient'].tolist())

In [None]:
new_cocktail_final.drop(['Value', 'Measure'], axis=1, inplace=True)

***3) Find suitable garnish and add to the recipe***

In [None]:
garnish_to_add = df_garnish_final.loc[df_garnish_final['Ingredient'] == main_ingr]['Garnish'].tolist()

if garnish_to_add:
    new_cocktail_final.loc[len(new_cocktail_final)] = ['garnish', garnish_to_add]

In [None]:
new_cocktail_final

### List alcoholic ingredients

In [None]:
df.head()

In [None]:
df['Alc_type'].unique()

In [None]:
for i in df['Alc_type'].unique():
    print(i)
    print(df.loc[df['Alc_type'] == i]['strIngredients'].unique())
    print('*'*100)

In [None]:
# for i in new_cocktail:
#     random.choice(df.loc[df['Alc_type'] == i]['strIngredients'].unique().tolist())

In [None]:
new_cocktail_final

In [None]:
a = df_measures.loc[df_measures['Ingredients'] == 'gin']
a.loc[a['Measure'] == 'gr']

In [None]:
df_measures.loc[df_measures['Value'] == '0.0']

In [None]:
len(df_measures)

In [None]:
df.loc[df['strIngredients'] == "Lavender"]

In [None]:
# Try to set up lists of liqueurs and maybe other alcohol ingredients.

In [None]:
# Refactoring might be done to rearrange the code that way that to generate new cocktail you don't have to run whole file