In [1]:
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
import re
from googletrans import Translator
# Single googletrans client shared by every trans() call in this notebook.
# NOTE(review): translate() is later called without an explicit target language,
# so the library default applies (presumably English -- confirm).
translator = Translator()

In [2]:
def trans(txt):
    """Translate ``txt`` with the shared ``translator`` and return the text.

    Any zero-width space (U+200B) in the translated text is removed before
    the string is returned.
    """
    translated = translator.translate(txt).text
    # str.replace leaves the string untouched when the character is absent,
    # so no membership test is needed first.
    return translated.replace('\u200b', '')

def get_keywords(ul):
    """Collect the de-duplicated ingredient names found in one ingredient list.

    Parameters
    ----------
    ul : object exposing BeautifulSoup's ``find_all``
        The list element whose ``<span class="ingredient_name">`` children
        each wrap the ingredient name in an ``<a>`` tag.

    Returns
    -------
    tuple of four lists
        ``(zh_keyword, en_keyword, zh_ingredient, en_ingredient)``.
        ``zh_keyword`` holds each name with Latin letters and dots removed;
        ``en_keyword`` holds ``trans(name).lower()`` for the same names.
        The two ``*_ingredient`` lists are independent copies with the same
        contents as the matching ``*_keyword`` lists.

    Spans without an ``<a>`` tag, without a single text string, whose cleaned
    name is blank, contains a full-width colon, or repeats an earlier name
    are skipped.
    """
    zh_keyword = []
    en_keyword = []
    seen = set()  # O(1) duplicate detection instead of scanning the list

    # attrs must be a dict; a set literal here would also match class="class".
    for span in ul.find_all('span', {'class': 'ingredient_name'}):
        anchor = span.a
        raw_name = anchor.string if anchor is not None else None
        # Check for a missing name *before* re.sub, which raises on None.
        if raw_name is None:
            continue

        zh_key = re.sub(r'[A-Za-z.]', '', raw_name)
        if not zh_key.strip():
            # Nothing left after stripping Latin letters: nothing to translate.
            continue
        if '：' in zh_key or zh_key in seen:
            continue

        seen.add(zh_key)
        zh_keyword.append(zh_key)
        en_keyword.append(trans(zh_key).lower())

    return zh_keyword, en_keyword, list(zh_keyword), list(en_keyword)

def get_seasoning(ul):
    """Collect the seasoning names found in one ingredient list.

    Parameters
    ----------
    ul : object exposing BeautifulSoup's ``find_all``
        The list element whose ``<span class="ingredient_name">`` children
        each wrap the seasoning name in an ``<a>`` tag.

    Returns
    -------
    tuple of two lists
        ``(zh_seasoning, en_seasoning)``: the original names in document
        order and ``trans(name).lower()`` for each of them. Duplicates are
        kept. Spans without an ``<a>`` tag or without a single text string
        are skipped.
    """
    zh_seasoning = []
    en_seasoning = []

    # attrs must be a dict; a set literal here would also match class="class".
    for span in ul.find_all('span', {'class': 'ingredient_name'}):
        anchor = span.a
        name = anchor.string if anchor is not None else None
        if name is None:
            continue
        zh_seasoning.append(name)
        en_seasoning.append(trans(name).lower())

    return zh_seasoning, en_seasoning

def get_marinade(ul):
    """Collect the marinade ingredient names found in one ingredient list.

    Parameters
    ----------
    ul : object exposing BeautifulSoup's ``find_all``
        The list element whose ``<span class="ingredient_name">`` children
        each wrap the marinade ingredient name in an ``<a>`` tag.

    Returns
    -------
    tuple of two lists
        ``(zh_marinade, en_marinade)``: the original names in document order
        and ``trans(name).lower()`` for each of them. Duplicates are kept.
        Spans without an ``<a>`` tag or without a single text string are
        skipped.
    """
    zh_marinade = []
    en_marinade = []

    # attrs must be a dict; a set literal here would also match class="class".
    for span in ul.find_all('span', {'class': 'ingredient_name'}):
        anchor = span.a
        name = anchor.string if anchor is not None else None
        if name is None:
            continue
        zh_marinade.append(name)
        en_marinade.append(trans(name).lower())

    return zh_marinade, en_marinade

In [3]:
# Search-result page to harvest recipe links from.
# NOTE(review): MAINFOOD looks like a percent-encoded Big5 search term
# (presumably "pork", matching the output file names used later) -- confirm.
url = 'https://www.ytower.com.tw/recipe/recipe-search.asp?MAINFOOD=%BD%DE%A6%D7'
recipe_links = []

# Give up after this many consecutive scrolls that reveal no new link, so the
# loop cannot spin forever on a page that has fewer links than requested.
MAX_STALLED_SCROLLS = 3

driver1 = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    driver1.get(url)

    stalled_scrolls = 0
    while len(recipe_links) <= 5 and stalled_scrolls < MAX_STALLED_SCROLLS:
        # Scroll to the bottom and wait so lazily loaded results can render.
        driver1.execute_script("window.scrollTo(0,document.body.scrollHeight)")
        time.sleep(3)

        soup = BeautifulSoup(driver1.page_source, 'html.parser')
        suggest_box = soup.find('div', {'id' : 'recipe_suggest'})
        # The container may be missing while the page is still loading.
        att_a = suggest_box.find_all('a') if suggest_box is not None else []

        links_before = len(recipe_links)
        for a in att_a:
            link = a.get('href')
            # Skip anchors without href, 'Note' links and already seen links.
            if not link or ('Note' in link) or (link in recipe_links):
                continue
            recipe_links.append(link)

        if len(recipe_links) == links_before:
            stalled_scrolls += 1
        else:
            stalled_scrolls = 0
finally:
    # quit() closes every window and ends the driver process, even on error.
    driver1.quit()

In [4]:
# Parallel result lists: entry k of every list describes the same recipe.
en_prompt = []
en_completion = []
zh_prompt = []
zh_completion = []
recipt = []  # URLs of the recipes that were actually exported

driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

try:
    # enumerate keeps the progress counter right even when a recipe is skipped.
    for i, link in enumerate(recipe_links, start=1):
        print('Processing:', i, '/', len(recipe_links))
        url = 'https://www.ytower.com.tw' + link

        driver.get(url)
        time.sleep(2.5)  # give the page time to render before reading it

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        heading = soup.find('h2')
        name = heading.a.string if heading is not None and heading.a is not None else None
        if name is None:
            # Without a title the page is not a usable recipe; skip it
            # instead of crashing the whole run.
            print('Skipping (no recipe title found):', url)
            continue
        # NOTE(review): this pattern is a character class, so it removes every
        # digit, '(', ')', '+' and backslash anywhere in the title, not only a
        # "(12)"-style suffix -- confirm that this is intended.
        cur_name = re.sub(r'[\\(\d+\\)]', '', name)

        all_ul = soup.find_all('ul', {'class': 'ingredient'})
        # Pages with more than three ingredient lists are not handled.
        if len(all_ul) > 3:
            continue

        zh_keyword, en_keyword, zh_ingredient, en_ingredient = [], [], [], []
        zh_seasoning, en_seasoning = [], []
        zh_marinade, en_marinade = [], []

        if len(all_ul) >= 1:  # ingredients
            zh_keyword, en_keyword, zh_ingredient, en_ingredient = get_keywords(all_ul[0])
        if len(all_ul) >= 2:  # ingredients + seasonings
            zh_seasoning, en_seasoning = get_seasoning(all_ul[1])
        if len(all_ul) == 3:  # ingredients + seasonings + marinade
            zh_marinade, en_marinade = get_marinade(all_ul[2])

        zh_step = []
        en_step = []
        for step in soup.find_all('li', {'class': 'step'}):
            # Reduce the <li> markup to its bare text (all spaces/newlines removed).
            step = str(step).replace('<li class="step">', '').replace('<br/>', '').replace('</li>', '').replace(' ','').replace('\n', '')
            zh_step.append(step)
            en_step.append(trans(step))

        zh_prompt.append('、'.join(zh_keyword))  # Chinese prompt
        en_prompt.append('、'.join(en_keyword))  # English prompt

        # filter(None, ...) drops None and empty strings before joining.
        zh_com = ''
        if zh_ingredient:
            zh_com += '{}\n食材：{}\n'.format(cur_name, '，'.join(filter(None, zh_ingredient)))
        if zh_seasoning:
            zh_com += '調味料：{}\n'.format('，'.join(filter(None, zh_seasoning)))
        if zh_marinade:
            zh_com += '醃料：{}\n'.format('，'.join(filter(None, zh_marinade)))
        if zh_step:
            zh_com += '作法：{}'.format(''.join(filter(None, zh_step)))

        en_com = ''
        if en_ingredient:
            en_com += '{}\nIngredients:{}\n'.format(trans(cur_name), ','.join(filter(None, en_ingredient)))
        if en_seasoning:
            en_com += 'Seasonings:{}\n'.format(','.join(filter(None, en_seasoning)))
        if en_marinade:
            en_com += 'Marinade:{}\n'.format(','.join(filter(None, en_marinade)))
        if en_step:
            en_com += 'Method:{}'.format(' '.join(filter(None, en_step)))

        zh_completion.append(zh_com)
        en_completion.append(en_com)
        # Record the URL only once the recipe is fully processed, so that
        # `recipt` always has the same length as the prompt/completion lists
        # (appending before a `continue` made pd.DataFrame raise ValueError).
        recipt.append(url)

        # Checkpoint after every recipe so partial results survive a failure.
        # The `encoding` argument was dropped: it was removed from to_excel in
        # pandas 2.0 and passing it raises TypeError there.
        zh_df = pd.DataFrame({'prompt':zh_prompt, 'completion':zh_completion, 'links':recipt})
        en_df = pd.DataFrame({'prompt':en_prompt, 'completion':en_completion, 'links':recipt})
        zh_df.to_excel('./data/new_zh_pork_1126.xlsx', index=False)
        en_df.to_excel('./data/new_en_pork_1126.xlsx', index=False)
finally:
    # quit() closes every window and ends the driver process, even on error.
    driver.quit()

Processing: 1 / 24
Processing: 2 / 24
Processing: 3 / 24
Processing: 4 / 24
Processing: 5 / 24
Processing: 6 / 24
Processing: 7 / 24
Processing: 8 / 24
Processing: 9 / 24
Processing: 10 / 24
Processing: 11 / 24
Processing: 12 / 24
Processing: 13 / 24
Processing: 14 / 24
Processing: 15 / 24
Processing: 16 / 24
Processing: 17 / 24
Processing: 18 / 24
Processing: 19 / 24
Processing: 20 / 24
Processing: 21 / 24
Processing: 22 / 24
Processing: 23 / 24
Processing: 24 / 24
