In [None]:
!pip install translate

In [538]:
from translate import Translator
translator= Translator(from_lang = 'ru', to_lang="en")

In [2]:
import numpy as np
import pandas as pd
import json

# Проект по анализу данных

## Сбор данных

### 1. Из датасета enriched_recipes

In [533]:
with open('enriched_recipes.json') as f:
    data = json.load(f)
    df = pd.json_normalize(
        data, 'parsed_ingredients',
        meta=['total_time_minutes',
              'url',
              'title'
             ],
        record_prefix='ingredient_', errors='ignore')

Чистим

In [534]:
df.drop("ingredient_comment", axis=1, inplace=True)
df.drop("ingredient_input", axis=1, inplace=True)
df.drop("ingredient_other", axis=1, inplace=True)
df.drop("ingredient_range_end", axis=1, inplace=True)
df = df.loc[:, ['url', 'title', 'total_time_minutes', 'ingredient_name', 'ingredient_qty', 'ingredient_unit']]
df['source'] = "allrecipes"
df.replace(np.nan, '', inplace=True)

Много уникальных ингредиентов. Это плохо. Попробуем упростить

In [535]:
len(df['ingredient_name'].value_counts().index.tolist())

38057

In [536]:
simple_ingredients = ['garlic', 'sugar', 'onion' ,'butter', 'milk', 'mustard', 'egg', 'olive oil', 'breadcrumbs', 'cumin', 'oregano', 'parsley', 
                      'thyme', 'parmesan', 'soy sauce', 'chicken broth', 'white wine', 'flour', 'cornstarch', 'ginger', 'yoghurt', 'cilantro', 
                      'basil', 'tumeric', 'salt', 'cinnamon', 'vanilla', 'chicken breast', 'rice', 'cauliflower', 'carrot', 'cabbage', 'cheese', 'chicken thigh', 
                      'avocado', 'curry', 'tomato', 'zucchini', 'paprika', 'coriander', 'vinegar', 'mince beef', 'black pepper', 'red wine', 'rosemary', 'kidney beans', 
                      'masala', 'pumpkin', 'lettuce']

Если в названии ингредиента есть одно из этих слов, то название ингредиента просто заменяется на слово

In [537]:
for simple in simple_ingredients:
    df.loc[df['ingredient_name'].str.contains(simple), 'ingredient_name'] = simple

Немного лучше

In [539]:
len(df['ingredient_name'].value_counts().index.tolist())

30534

### 2. Русский датасет из задания 2

In [540]:
russian_df = pd.read_csv('all_recipes.csv')

Чистим

In [541]:
russian_df.drop('Unnamed: 0', axis=1, inplace=True)
russian_df.drop('calories', axis=1, inplace=True)
russian_df.drop('description', axis=1, inplace=True)
russian_df.drop('query', axis=1, inplace=True)
russian_df.drop('macros', axis=1, inplace=True)
russian_df['time'] = russian_df['time'].str.replace(" ч", '*60').str.replace(' мин','*1').str.replace(' д','*1440').str.replace(' ','+').apply(eval)
russian_df['ingredients'] = [x.strip('[]').split(',') for x in russian_df['ingredients']]

Тоже чистим

In [542]:
russian_exploded = russian_df.explode('ingredients')
russian_exploded['ingredients'] = russian_exploded['ingredients'].str.strip(" ''")
russian_exploded[['ingredient_name', 'quantities']] = russian_exploded['ingredients'].str.split(pat=" - ", expand=True)
russian_exploded[['ingredient_qty', 'ingredient_unit']] = russian_exploded['quantities'].str.split(expand=True)
russian_exploded.drop('ingredients', axis=1, inplace=True)
russian_exploded.drop('quantities', axis=1, inplace=True)

In [543]:
unique_ingredients = russian_exploded['ingredient_name'].unique()

Делает dict с переводами всех уникальных ингридиентов. Качество перевода так-себе, работает долго (у меня крутится где-то минут 8)

In [544]:
translation_dict = {}
for russian in unique_ingredients:
    try:
        translation = translator.translate(russian)
        translation_dict[russian] = translation
    except:
        pass

Чистим dict с переводами (иногда перевод вообще неправильный)

In [545]:
translation_dict['какао'] = 'cocoa'
translation_dict['сахар'] = 'sugar'
translation_dict['куриные яйца'] = 'eggs'
translation_dict['сахарная пудра'] = 'powdered sugar'
translation_dict['разрыхлитель'] = 'baking powder'
translation_dict['горький шоколад'] = 'dark chocolate'
translation_dict['яичные белки'] = 'egg whites'
translation_dict['мята'] = 'mint'
translation_dict['кофейный ликер'] = 'coffee liqueur'
translation_dict['апельсины'] = 'oranges'
translation_dict['сода'] = 'baking soda'
translation_dict['куриное филе'] = 'chicken breast'
translation_dict['ягоды'] = 'berries'
translation_dict['копченая колбаса'] = 'smoked sausage'
translation_dict['куриные грудки'] = 'chicken breast'
translation_dict['поваренная соль'] = 'salt'
translation_dict['черника'] = 'blueberry'
translation_dict['колбаса'] = 'sausage'
translation_dict['мак пищевой'] = 'poppy seeds'
translation_dict['лимонная цедра'] = 'lemon peel'
translation_dict['кукурузный крахмал'] = 'cornstartch'
translation_dict['творожный сыр'] = 'cottage cheese'
translation_dict['столовый уксус'] = 'vinegar'
translation_dict['пищевой краситель'] = 'food coloring'
translation_dict['сироп'] = 'syrup'
translation_dict['апельсиновый ликер'] = 'orange liqueur'
translation_dict['черемуха'] = 'bird cherry'
translation_dict['стручок ванили'] = 'vanilla pod'
translation_dict['вино красное'] = 'red wine'
translation_dict['душистый перец'] = 'allspice'
translation_dict['капуста белокочанная'] = 'cabbage'
translation_dict['виски'] = 'whisky'
translation_dict['печень говяжья'] = 'cow liver'
translation_dict['перец чили'] = 'chili'
translation_dict['томатная паста'] = 'tomato paste'
translation_dict['мойва'] = 'capelin'
translation_dict['настойка'] = 'tincture'
translation_dict['хурма'] = 'persimmon'
translation_dict['фейхоа'] = 'feijoa'

dict с переводами единиц измерения

In [546]:
unit_dict = {
    'гр':'gram',
    'стак.':'cup',
    'стол.л.':'tablespoon',
    'чайн.л.':'teaspoon',
    'шт.':'units',
    'вкусу':'to taste',
    'л': 'liter',
    'мл':'ml',
    'пак.':'package',
    'кг':'kilogram',
    'банк.':'can',
    'десерт.л.':'teaspoon',
    'зубч.':'clove'
}

Переводим (иногда с первого раза не работает и надо несколько раз нажать. Не могу даже представить почему)

In [547]:
russian_exploded["ingredient_name"] = russian_exploded["ingredient_name"].apply(
    lambda name: translation_dict.get(name, name))
russian_exploded["ingredient_unit"] = russian_exploded["ingredient_unit"].apply(
    lambda name: unit_dict.get(name, name))
russian_exploded["ingredient_name"] = russian_exploded["ingredient_name"].str.lower()

Чистим

In [548]:
russian_exploded['source'] = "russian"
russian_exploded.rename(columns={'link': 'url', 'name': 'title', 'time':'total_time_minutes'}, inplace=True)
russian_exploded = russian_exploded.loc[:, ['url', 'title', 'total_time_minutes', 'ingredient_name', 'ingredient_qty', 'ingredient_unit', 'source']]
russian_exploded['ingredient_name'] = russian_exploded['ingredient_name'].str.replace('/[^a-zA-Z ]+/g', '', regex=True)

Результат

In [549]:
russian_exploded

Unnamed: 0,url,title,total_time_minutes,ingredient_name,ingredient_qty,ingredient_unit,source
0,https://1000.menu/cooking/43048-domashnii-klas...,Домашний классический тирамису,200,savoyardi cookies,200,gram,russian
0,https://1000.menu/cooking/43048-domashnii-klas...,Домашний классический тирамису,200,coffee,1,cup,russian
0,https://1000.menu/cooking/43048-domashnii-klas...,Домашний классический тирамису,200,cocoa,70,gram,russian
0,https://1000.menu/cooking/43048-domashnii-klas...,Домашний классический тирамису,200,brandy,1,tablespoon,russian
0,https://1000.menu/cooking/43048-domashnii-klas...,Домашний классический тирамису,200,mascarpone,250,gram,russian
...,...,...,...,...,...,...,...
774,https://1000.menu/cooking/10221-varene-iz-ship...,Варенье из шиповника,180,sugar,6,cup,russian
774,https://1000.menu/cooking/10221-varene-iz-ship...,Варенье из шиповника,180,citric acid,по,to taste,russian
775,https://1000.menu/cooking/9773-varene-iz-irgi,Варенье из ирги на зиму пятиминутка,360,amelanchier,900,gram,russian
775,https://1000.menu/cooking/9773-varene-iz-irgi,Варенье из ирги на зиму пятиминутка,360,sugar,450,gram,russian


Соединяем

In [550]:
all_and_russian = pd.concat([df, russian_exploded], ignore_index=True)

In [551]:
all_and_russian

Unnamed: 0,url,title,total_time_minutes,ingredient_name,ingredient_qty,ingredient_unit,source
0,http://allrecipes.com/Recipe/6664/,"Basil, Roasted Peppers and Monterey Jack Cornb...",100,butter,1/2,cup,allrecipes
1,http://allrecipes.com/Recipe/6664/,"Basil, Roasted Peppers and Monterey Jack Cornb...",100,onion,1,cup,allrecipes
2,http://allrecipes.com/Recipe/6664/,"Basil, Roasted Peppers and Monterey Jack Cornb...",100,cornmeal,1 3/4,cup,allrecipes
3,http://allrecipes.com/Recipe/6664/,"Basil, Roasted Peppers and Monterey Jack Cornb...",100,flour,1 1/4,cup,allrecipes
4,http://allrecipes.com/Recipe/6664/,"Basil, Roasted Peppers and Monterey Jack Cornb...",100,sugar,1/4,cup,allrecipes
...,...,...,...,...,...,...,...
843959,https://1000.menu/cooking/10221-varene-iz-ship...,Варенье из шиповника,180,sugar,6,cup,russian
843960,https://1000.menu/cooking/10221-varene-iz-ship...,Варенье из шиповника,180,citric acid,по,to taste,russian
843961,https://1000.menu/cooking/9773-varene-iz-irgi,Варенье из ирги на зиму пятиминутка,360,amelanchier,900,gram,russian
843962,https://1000.menu/cooking/9773-varene-iz-irgi,Варенье из ирги на зиму пятиминутка,360,sugar,450,gram,russian


### 3. Собираем дополнительные данные с recipetineats.com

In [209]:
from bs4 import BeautifulSoup
import requests

Собираем ссылки и названия

In [216]:
site = 'https://www.recipetineats.com/category/main-dishes/?fwp_paged='
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"}
links = []
names = []
for i in range(16):
    response = requests.post(site+str(i+1), headers=headers)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        for recipe in soup.find_all(class_='entry-title-link'):
            links.append(recipe.get('href'))
            names.append(recipe.text.strip())
    

In [217]:
recipetineats = pd.DataFrame({
    'link':links,
    'name':names
})

In [218]:
recipetineats

Unnamed: 0,link,name
0,https://www.recipetineats.com/fall-apart-massa...,5 Minute Fall-apart Massaman Lamb Shoulder
1,https://www.recipetineats.com/sausage-ragu/,Sausage Ragu with Pappardelle Pasta
2,https://www.recipetineats.com/qeema-indian-cur...,Qeema – Easy Indian Curried Beef Mince
3,https://www.recipetineats.com/moroccan-baked-e...,Easy Moroccan Stuffed Eggplant (beef or lamb)
4,https://www.recipetineats.com/moroccan-lamb-me...,Moroccan Lamb Meatballs
...,...,...
315,https://www.recipetineats.com/slow-cooker-cris...,Slow Cooker Crispy Chinese Shredded Chicken
316,https://www.recipetineats.com/fish-piccata-cri...,Fish Piccata With Crispy Smashed Potatoes (15 ...
317,https://www.recipetineats.com/chicken-green-ve...,Chicken & Vegetable Rice (One Pot)
318,https://www.recipetineats.com/syrian-chicken-g...,Syrian Chicken


Для каждой ссылки собираем время приготовления в минутах и ингридиенты

In [271]:
times = []
mega_ingredients = []
mega_links = []
for link in recipetineats['link']:
    response = requests.post(link, headers=headers)
    if response.status_code == 200:
        complete_ingredients = []
        soup = BeautifulSoup(response.text, 'html.parser')
        hours = soup.find(class_ = 'wprm-recipe-cook_time-hours')
        if hours:
            sr_tag = hours.find(class_='sr-only')
            sr_tag.extract()
            hours = hours.text.strip()
        else: hours = 0
        minutes = soup.find(class_ = 'wprm-recipe-cook_time-minutes')
        if minutes:
            sr_tag = minutes.find(class_='sr-only')
            sr_tag.extract()
            minutes = minutes.text.strip()
        else: minutes = 0
        time = int(hours)*60+int(minutes)
        times.append(time)
        ingredients =soup.find_all(class_= 'wprm-recipe-ingredient')
        for ingredient in ingredients:
            try: 
                quantity = ingredient.find(class_ = 'wprm-recipe-ingredient-amount').text.strip()
            except: quantity = 0
            try:
                name = ingredient.find(class_ = 'wprm-recipe-ingredient-name').text.strip()
            except: name = 0
            try:
                unit = ingredient.find(class_ = 'wprm-recipe-ingredient-unit').text.strip()
                if unit=='':
                    unit = 0
            except: unit = 0
            text_ingredient = str(quantity) + "-----" + str(unit) + "-----" + str(name)
            complete_ingredients.append(text_ingredient)
        mega_ingredients.append(complete_ingredients)
        mega_links.append(link)

In [510]:
recipetineats_ingredients = pd.DataFrame({
    'link':mega_links,
    'total_time_minutes':times,
    'mega_ingredients':mega_ingredients
})

In [511]:
recipetineats_ingredients

Unnamed: 0,link,total_time_minutes,mega_ingredients
0,https://www.recipetineats.com/fall-apart-massa...,270,"[2-2.25kg/ 4 – 4.5 lb-----0-----lamb shoulder,..."
1,https://www.recipetineats.com/sausage-ragu/,135,"[2-----tbsp-----olive oil, 1-----0-----small o..."
2,https://www.recipetineats.com/qeema-indian-cur...,15,"[3-----tbsp-----vegetable or canola oil, 4----..."
3,https://www.recipetineats.com/moroccan-baked-e...,45,"[2-----x 250g/8oz-----eggplants, 3/4-----tsp--..."
4,https://www.recipetineats.com/moroccan-lamb-me...,10,"[1 1/2-----tbsp-----olive oil, 500 g / 1 lb---..."
...,...,...,...
315,https://www.recipetineats.com/slow-cooker-cris...,480,[3 - 4 lb / 1.5 - 2 kg-----0-----whole chicken...
316,https://www.recipetineats.com/fish-piccata-cri...,10,"[2-----pieces-----firm white fish fillets, 6--..."
317,https://www.recipetineats.com/chicken-green-ve...,15,"[300g (10oz)-----0-----chicken breast, 2-----t..."
318,https://www.recipetineats.com/syrian-chicken-g...,30,"[2-----tbsp-----olive oil, 2 lb / 1 kg-----0--..."


Соединяем с названиями

In [512]:
recipetineats_exploded = pd.merge(recipetineats, recipetineats_ingredients, on='link', how='left')

Чистим

In [513]:
recipetineats_exploded = recipetineats_exploded.explode('mega_ingredients')
recipetineats_exploded[['ingredient_qty_bad', 'ingredient_unit_bad', 'ingredient_name']] = recipetineats_exploded['mega_ingredients'].str.split(pat="-----", expand=True)
recipetineats_exploded.drop('mega_ingredients', axis=1, inplace=True)
recipetineats_exploded.ingredient_qty_bad.replace('\/ (.*)', '', regex=True, inplace=True)

  recipetineats_exploded.ingredient_qty_bad.replace('\/ (.*)', '', regex=True, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  recipetineats_exploded.ingredient_qty_bad.replace('\/ (.*)', '', regex=True, inplace=True)


Чистим

In [None]:
recipetineats_exploded[['ingredient_qty', 'ingredient_unit_bad2']] = recipetineats_exploded['ingredient_qty_bad'].str.extract('([^a-zA-Z ]+)( ?[a-zA-Z]+)?', expand=True)
recipetineats_exploded.drop('ingredient_qty_bad', axis=1, inplace=True)
recipetineats_exploded.replace(np.nan, '', inplace=True)
recipetineats_exploded.replace('0', '', inplace=True)
recipetineats_exploded['ingredient_unit'] = recipetineats_exploded['ingredient_unit_bad'] + recipetineats_exploded['ingredient_unit_bad2']
recipetineats_exploded.drop('ingredient_unit_bad', axis=1, inplace=True)
recipetineats_exploded.drop('ingredient_unit_bad2', axis=1, inplace=True)
recipetineats_exploded["ingredient_name"] = recipetineats_exploded["ingredient_name"].str.lower()

Опять много уникальных ингридиентов. Чистим

In [516]:
len(recipetineats_exploded['ingredient_name'].value_counts().index.tolist())

1618

In [518]:
for simple in simple_ingredients:
    recipetineats_exploded.loc[recipetineats_exploded['ingredient_name'].str.contains(simple), 'ingredient_name'] = simple

In [519]:
len(recipetineats_exploded['ingredient_name'].value_counts().index.tolist())

854

In [520]:
recipetineats_exploded

Unnamed: 0,link,name,total_time_minutes,ingredient_name,ingredient_qty,ingredient_unit
0,https://www.recipetineats.com/fall-apart-massa...,5 Minute Fall-apart Massaman Lamb Shoulder,270,lamb shoulder,2-2.25,kg
0,https://www.recipetineats.com/fall-apart-massa...,5 Minute Fall-apart Massaman Lamb Shoulder,270,curry,114,g
0,https://www.recipetineats.com/fall-apart-massa...,5 Minute Fall-apart Massaman Lamb Shoulder,270,milk,400,g
0,https://www.recipetineats.com/fall-apart-massa...,5 Minute Fall-apart Massaman Lamb Shoulder,270,chicken stock/broth,3,cups
0,https://www.recipetineats.com/fall-apart-massa...,5 Minute Fall-apart Massaman Lamb Shoulder,270,onion,1,
...,...,...,...,...,...,...
318,https://www.recipetineats.com/syrian-chicken-g...,Syrian Chicken,30,thyme,3,sprigs of
318,https://www.recipetineats.com/syrian-chicken-g...,Syrian Chicken,30,dried currants or sultanas,1/4,cup
318,https://www.recipetineats.com/syrian-chicken-g...,Syrian Chicken,30,giant couscous,8,oz
318,https://www.recipetineats.com/syrian-chicken-g...,Syrian Chicken,30,yoghurt,,


In [521]:
recipetineats_exploded['source'] = "recipetineats"
recipetineats_exploded.rename(columns={'link': 'url', 'name': 'title'}, inplace=True)
recipetineats_exploded = recipetineats_exploded.loc[:, ['url', 'title', 'total_time_minutes', 'ingredient_name', 'ingredient_qty', 'ingredient_unit', 'source']]

Соединяем все вместе

In [522]:
all_recipes = pd.concat([all_and_russian, recipetineats_exploded], ignore_index=True)

In [26]:
all_recipes.loc[all_recipes['ingredient_name'] == 'vanilla sugar', 'ingredient_name'] = 'vanilla'
all_recipes.loc[all_recipes['ingredient_name'] == 'ground cinnamon', 'ingredient_name'] = 'cinnamon'
all_recipes.loc[all_recipes['ingredient_name'] == 'vanilla extract', 'ingredient_name'] = 'vanilla'
all_recipes.loc[all_recipes['ingredient_name'] == 'black pepper powder', 'ingredient_name'] = 'black pepper'
all_recipes.loc[all_recipes['ingredient_name'] == 'vanilla tincture', 'ingredient_name'] = 'vanilla'

In [27]:
len(all_recipes['ingredient_name'].unique())

31295

In [28]:
all_recipes.to_csv('my_all_recipes.csv')

In [3]:
all_recipes = pd.read_csv('my_all_recipes.csv')