In [6]:
# Chrome version 125
chrome_driver_path = "../../bin/chromedriver-win64/chromedriver.exe"

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import pickle
import os

def get_cocktail_links(url):
    """Collect the cocktail page links exposed by a listing page.

    Opens ``url`` in headless Chrome, clicks the confirmation button
    (``rcc-confirm-button``) if it becomes clickable, keeps clicking the
    "Load More" button until it can no longer be found or clicked, and then
    gathers the ``href`` of every ``<a>`` element whose URL contains
    ``/cocktails/``.

    Args:
        url: Address of the listing page to open.

    Returns:
        list[str]: The matching hrefs in the order Selenium returned the
        anchors. Duplicates are kept; callers de-duplicate if needed.

    Note:
        Reads the module-level ``chrome_driver_path``. The browser is always
        closed, even when scraping raises.
    """
    chrome_options = Options()
    # The ``Options.headless`` property was deprecated and later removed in
    # Selenium 4, after which assigning it silently does nothing. Passing the
    # command-line switch works regardless of the Selenium 4 release in use.
    chrome_options.add_argument("--headless=new")

    # webdriver.Chrome starts the driver service itself; starting it here as
    # well would launch chromedriver twice.
    service = Service(chrome_driver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)

    try:
        driver.get(url)

        driver.implicitly_wait(10)

        # Dismiss the confirmation banner; it is fine if it never appears.
        try:
            confirm_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.ID, 'rcc-confirm-button'))
            )
            confirm_button.click()
            print("Clicked on 'YES' button")
        except Exception:
            # ``Exception`` (not a bare except) so that interrupting the
            # kernel is not mistaken for a missing button.
            print("Failed to find or click 'YES' button")

        # Expand the listing until the "Load More" button stops being
        # clickable, which is taken to mean that everything is loaded.
        while True:
            try:
                load_more_button = WebDriverWait(driver, 10).until(
                    EC.element_to_be_clickable((By.CLASS_NAME, 'Button.Primary.Cyan.block.mt24.m__auto'))
                )
                load_more_button.click()
                print("Clicked on 'Load More' button")
            except Exception:
                print("Failed to find or click 'Load More' button")
                break

        # Keep only anchors that point at a cocktail page.
        cocktail_links = []
        for anchor in driver.find_elements(By.TAG_NAME, 'a'):
            href = anchor.get_attribute('href')
            if href and '/cocktails/' in href:
                cocktail_links.append(href)

        return cocktail_links
    finally:
        # Runs on success and on failure so no browser process is left behind.
        driver.quit()

# Load the links collected by earlier runs (if any) so that repeated runs
# accumulate results instead of starting over.
os.makedirs('../../data/raw/', exist_ok=True)
try:
    with open('../../data/raw/cocktail_links_list.pkl', 'rb') as f:
        # NOTE: unpickling can execute arbitrary code; only load a cache file
        # that this notebook produced itself.
        cocktail_links_uique = pickle.load(f)
except (FileNotFoundError, EOFError, pickle.UnpicklingError):
    # No cache yet, or an empty/unreadable one: start from an empty set.
    # Anything else (including a kernel interrupt) is allowed to propagate.
    cocktail_links_uique = set()

url = 'https://cocktailclub.com/cocktails'

cocktail_links = get_cocktail_links(url)

# Report this run's raw count, a sample, and the size of the merged set.
print(len(cocktail_links))
print(cocktail_links[-5:])
cocktail_links_uique.update(cocktail_links)
print(len(cocktail_links_uique))

# Save the merged, de-duplicated links back to the pickle file.

with open('../../data/raw/cocktail_links_list.pkl', 'wb') as f:
    pickle.dump(cocktail_links_uique, f)

Clicked on 'YES' button
Clicked on 'Load More' button
Clicked on 'Load More' button
Clicked on 'Load More' button
Clicked on 'Load More' button
Clicked on 'Load More' button
Clicked on 'Load More' button
Clicked on 'Load More' button
Clicked on 'Load More' button
Clicked on 'Load More' button
Clicked on 'Load More' button
Clicked on 'Load More' button
Clicked on 'Load More' button
Clicked on 'Load More' button
Failed to find or click 'Load More' button
199
['https://cocktailclub.com/cocktails/gin-tonic', 'https://cocktailclub.com/cocktails/pink-honey-grapefruit', 'https://cocktailclub.com/cocktails/aviation-tanqueray-10', 'https://cocktailclub.com/cocktails/white-lady-supreme', 'https://cocktailclub.com/cocktails/lynchburg-lemonade-long-drink-style']
306


In [2]:
import pandas as pd
from datasets import load_dataset

# Load the 'train' split of the cocktail recipe dataset into a DataFrame.
dataset = load_dataset("brianarbuckle/cocktail_recipes", split='train')
df = pd.DataFrame(dataset)

# Merge the entities of every row of the 'ner' column into one set.
ner_column = df['ner']
ner_set = set()
for entities in ner_column:
    ner_set.update(entities)

print(ner_set)

{'kahlua', 'cocchi americano', 'curaçao', 'cherry heering', 'sake', 'orange slice', 'benedictine', 'red wine', 'amaro', 'vermouth', 'whiskey', 'champagne', 'brandy', 'rum', 'absinthe', 'orange juice', 'grand marnier', 'midori', 'cognac', 'vodka', 'sloe gin', 'gin', 'sambuca', 'drambuie', 'schnapps', 'pernod', 'chambord', 'cointreau', 'wine', 'pastis', 'chartreuse', 'irish cream', 'galliano', 'bitters', 'limoncello', 'campari', 'lillet', 'triple sec', 'pimm’s', 'tequila', 'white wine', 'scotch', 'beer', 'bourbon', 'aperol', 'sherry', 'amaretto', 'coffee liqueur', 'maraschino liqueur'}


In [7]:
# Write the dataset frame to disk as JSON, creating the output folder first.
os.makedirs(name='../../data/clean/', exist_ok=True)
df.to_json(path_or_buf='../../data/clean/cocktail_recipes.json', orient='records')

In [8]:
# import re
def extrait_cocktail_recipe(url):
    """Scrape one cocktail page into a recipe dictionary.

    Opens ``url`` in headless Chrome, clicks the confirmation button
    (``rcc-confirm-button``) if it becomes clickable, and reads the title,
    the ingredient list, the step-by-step directions (second tab of the tab
    list) and the description from the page.

    Args:
        url: Address of the cocktail page to scrape.

    Returns:
        dict: A dictionary with the keys

        * ``'title'`` - text of the element with class ``title``;
        * ``'ingredients'`` - one string per ingredient row (the texts of
          its ``<span>`` elements joined with spaces);
        * ``'directions'`` - one string per step, built as
          ``"<h3 text>: <p text>. <li texts joined by ','>"``;
        * ``'misc'`` - the description split on ``'Bartender Tip:'``;
        * ``'source'`` - always ``"cocktailclub"``;
        * ``'ner'`` - every entry of the module-level ``ner_set`` found as a
          substring of a lower-cased ingredient string.

    Raises:
        Whatever Selenium raises when a required element is missing; only the
        confirmation button and the step tab click are treated as optional.

    Note:
        Reads the module-level ``chrome_driver_path`` and ``ner_set``. The
        browser is always closed, even when scraping raises.
    """
    chrome_options = Options()
    # The ``Options.headless`` property was deprecated and later removed in
    # Selenium 4, after which assigning it silently does nothing. Passing the
    # command-line switch works regardless of the Selenium 4 release in use.
    chrome_options.add_argument("--headless=new")

    # The service wraps chromedriver and the options are Chrome options, so
    # the driver must be webdriver.Chrome (the Firefox class was a mismatch).
    # webdriver.Chrome starts the service itself; no explicit start() needed.
    service = Service(chrome_driver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)

    cocktail_recipe = {}

    try:
        driver.get(url)

        # Dismiss the confirmation banner; it is fine if it never appears.
        try:
            confirm_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.ID, 'rcc-confirm-button'))
            )
            confirm_button.click()
            print("Clicked on 'YES' button")
        except Exception:
            # ``Exception`` (not a bare except) so that interrupting the
            # kernel is not mistaken for a missing button.
            print("Failed to find or click 'YES' button")

        # find title
        cocktail_recipe['title'] = driver.find_element(By.CLASS_NAME, 'title').text

        # find ingredients: each row holds several <span>s whose texts are
        # joined into one string.
        ingredients_div = driver.find_element(By.CLASS_NAME, 'Cocktail_Ingredients__meg4Z')
        ingredient_divs = ingredients_div.find_elements(By.CLASS_NAME, 'Cocktail_ingredientListitem__1nxMw')
        ingredient_lists = [
            ' '.join(span.text for span in ingredient_div.find_elements(By.TAG_NAME, 'span'))
            for ingredient_div in ingredient_divs
        ]
        cocktail_recipe['ingredients'] = ingredient_lists

        # find steps: switch to the second tab before reading them.
        tab_list = driver.find_element(By.CLASS_NAME, 'react-tabs__tab-list')
        tabs = tab_list.find_elements(By.CLASS_NAME, 'react-tabs__tab')
        try:
            tabs[1].click()
            print("Clicked on 'Step-by-step'")
        except Exception:
            print("Failed to find or click 'Step-by-step'")

        steps_div = driver.find_element(By.CLASS_NAME, 'Cocktail_Steps__bNwrE')
        step_texts = []
        for step_div in steps_div.find_elements(By.TAG_NAME, 'div'):
            h3_text = step_div.find_element(By.TAG_NAME, 'h3').text.replace('\n', ' ')
            p_text = step_div.find_element(By.TAG_NAME, 'p').text
            li_texts = [li.text for li in step_div.find_elements(By.TAG_NAME, 'li')]
            step_texts.append(h3_text + ": " + p_text + ". " + ','.join(li_texts))
        cocktail_recipe['directions'] = step_texts

        # TODO: the tools tab (tabs[2], class 'row.Cocktail_Tools__Y0R_R') and
        # the tasting profile (class 'Cocktail_TasteProfile__5L4Fv', values
        # read from each child div's style attribute) are not scraped yet.

        # find description; the part after 'Bartender Tip:' (when present)
        # becomes a separate list entry.
        description_div = driver.find_element(By.CSS_SELECTOR, '[itemprop="description"]')
        description_text = description_div.text.replace('\n', ' ')
        cocktail_recipe['misc'] = description_text.split('Bartender Tip:')
    finally:
        # Runs on success and on failure so no browser process is left behind.
        driver.quit()

    cocktail_recipe['source'] = "cocktailclub"

    # Tag the recipe with every known entity that occurs in an ingredient.
    # This is plain substring matching, so an entity is listed once per
    # ingredient containing it and short entities can match inside longer
    # words.
    lowered_ingredients = [ingredient.lower() for ingredient in ingredient_lists]
    cocktail_recipe['ner'] = [
        ner_item
        for ingredient in lowered_ingredients
        for ner_item in ner_set
        if ner_item in ingredient
    ]

    return cocktail_recipe

# Try the scraper on a single page; the returned dict is shown as the cell output.
# Another page to try: 'https://cocktailclub.com/cocktails/gin-tonic'
extrait_cocktail_recipe(url='https://cocktailclub.com/cocktails/long-island-iced-tea')


Clicked on 'YES' button
Clicked on 'Step-by-step'


{'title': 'Long Island Iced Tea',
 'ingredients': ['10 ml Gin',
  '10 ml Vodka',
  '10 ml Tequila',
  '10 ml Triple Sec',
  '10 ml Light Rum',
  '20 ml Sugar syrup',
  '20 ml Lemon juice',
  ' Top up coca cola',
  '1  Lemon wedge'],
 'directions': ['1 Prepare: Cut a wedge of lemon ready for garnish and for juicing. ',
  '2 Ice: Fill your highball glass and cocktail shaker with cubed ice. ',
  '3 Add ingredients: Pour in all the 5 white spirits, the lemon juice and the sugars into your shaker. 10 ml Gin,10 ml Vodka,10 ml Tequila,10 ml Triple Sec,10 ml Light Rum,20 ml Sugar syrup,20 ml Lemon juice',
  '4 Shake: Close the shaker lid and shake really hard for 7 seconds. ',
  '5 Strain: Take off the small top off the shaker and strain out the ingredients into your chilled glass.. ',
  '6 Coke: Pour the coke in to fill up the glass. Top up coca cola',
  '7 Stir, garnish and serve: Using your barspoon, stir all the ingredients together and add in your lemon wedge as garnish. 1 Lemon wedge'],


In [11]:
import json
import pickle

# Read the saved link collection back from disk.
with open('../../data/raw/cocktail_links_list.pkl', 'rb') as links_file:
    cocktail_links = pickle.load(links_file)

# Scrape the pages one by one, appending as we go so that recipes scraped
# before a failure remain in the list.
cocktail_recipes = []
for recipe_url in list(cocktail_links):
    recipe = extrait_cocktail_recipe(recipe_url)
    cocktail_recipes.append(recipe)


Clicked on 'YES' button
Clicked on 'Step-by-step'
Clicked on 'YES' button
Clicked on 'Step-by-step'
Clicked on 'YES' button
Clicked on 'Step-by-step'
Clicked on 'YES' button
Clicked on 'Step-by-step'
Clicked on 'YES' button
Clicked on 'Step-by-step'
Clicked on 'YES' button
Clicked on 'Step-by-step'
Clicked on 'YES' button
Clicked on 'Step-by-step'
Clicked on 'YES' button
Clicked on 'Step-by-step'
Clicked on 'YES' button
Clicked on 'Step-by-step'
Clicked on 'YES' button
Clicked on 'Step-by-step'
Clicked on 'YES' button
Clicked on 'Step-by-step'
Clicked on 'YES' button
Clicked on 'Step-by-step'
Clicked on 'YES' button
Clicked on 'Step-by-step'
Clicked on 'YES' button
Clicked on 'Step-by-step'
Clicked on 'YES' button
Clicked on 'Step-by-step'
Clicked on 'YES' button
Clicked on 'Step-by-step'
Clicked on 'YES' button
Clicked on 'Step-by-step'
Clicked on 'YES' button
Clicked on 'Step-by-step'
Clicked on 'YES' button
Clicked on 'Step-by-step'
Clicked on 'YES' button
Clicked on 'Step-by-step'


In [12]:

# save to text
# The encoding is given explicitly: without it the platform default is used,
# which can fail on recipe text containing characters outside that code page.
with open('../../data/raw/cocktailclub_recipes.txt', 'w', encoding='utf-8') as f:
    f.write(str(cocktail_recipes))


In [13]:

# save to json
# Ctrl(Cmd)+Alt+M for JSON pretty.
try:
    # Serialize before opening the file so that a serialization error does
    # not leave a truncated JSON file behind.
    recipes_json = json.dumps(cocktail_recipes)
    with open('../../data/raw/cocktailclub_recipes.json', 'w') as f:
        f.write(recipes_json)
except (OSError, TypeError, ValueError) as exc:
    # Still best-effort, but report why the save failed.
    print(f"Failed to save to JSON: {exc}")