#  Прототип рекомендательного сервиса рецептов

## Описание данных

датасет из Epicurious, подготовленный HugoDarwood

## План работы

 1.Исследование:
 - Подготовка данных
 - Прогноз рейтинга с помощью регрессионных алгоритмов и их ансамблей (), метрика RMSE
 - Бинаризация целевой переменной в классы `bad` (0, 1) (невкусное), `so-so` (2, 3) (нормальное), `great` (4, 5) (вкусное). 
 - Алгоритмы классификации и подбор метрики для решения задачи прогноза.
 - Определение метрики на тестовой выборке.
 - Сохранение лучшей модели.
 
 2.Пищевая ценность и похожие рецепты:
 - Сбор в датафрейм информации о пищевой ценности продуктов из `FoodData Central API`
 - Сбор в датафрейм для каждого рецепта из набора данных ссылки на сайт epicurious.com и подробную информацию о нем (название рецепта, рейтинг на платформе и URl). 



## 0. Импорты

In [3]:
#Импорт библиотек
import pandas as pd
import numpy as np
import joblib
import itertools
#Выбор модели
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
#Модели
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
#Ансамбли
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
#Метрики
from sklearn.metrics import precision_score
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.metrics import plot_confusion_matrix
#Наивный классификатор
from sklearn.dummy import DummyClassifier
from sklearn.dummy import DummyRegressor
#парсинг
import requests
from bs4 import BeautifulSoup
#графики
import matplotlib.pyplot as plt

In [4]:
import warnings
warnings.filterwarnings('ignore')

## 1. Прогноз

### 1.1 Подготовка данных

Будем использовать датасет из Epicurious, подготовленный HugoDarwood

Отфильтруем поля: удалим как можно больше столбцов, которые не относятся к названиями ингридиентов. Если модель обучится на других данных, которые не будут передаваться на вход, то точность прогнозов будет заведомо ниже.

In [40]:
#Загрузим датасет и ознакомимся с ним
df = pd.read_csv('./data/epi_r.csv')
df.head()

Unnamed: 0,title,rating,calories,protein,fat,sodium,#cakeweek,#wasteless,22-minute meals,3-ingredient recipes,...,yellow squash,yogurt,yonkers,yuca,zucchini,cookbooks,leftovers,snack,snack week,turkey
0,"Lentil, Apple, and Turkey Wrap",2.5,426.0,30.0,7.0,559.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,Boudin Blanc Terrine with Red Onion Confit,4.375,403.0,18.0,23.0,1439.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Potato and Fennel Soup Hodge,3.75,165.0,6.0,7.0,165.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Mahi-Mahi in Tomato Olive Sauce,5.0,,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Spinach Noodle Casserole,3.125,547.0,20.0,32.0,452.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20052 entries, 0 to 20051
Columns: 680 entries, title to turkey
dtypes: float64(679), object(1)
memory usage: 104.0+ MB


In [42]:
df.describe(include='all')

Unnamed: 0,title,rating,calories,protein,fat,sodium,#cakeweek,#wasteless,22-minute meals,3-ingredient recipes,...,yellow squash,yogurt,yonkers,yuca,zucchini,cookbooks,leftovers,snack,snack week,turkey
count,20052,20052.0,15935.0,15890.0,15869.0,15933.0,20052.0,20052.0,20052.0,20052.0,...,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0,20052.0
unique,17736,,,,,,,,,,...,,,,,,,,,,
top,Pastry Dough,,,,,,,,,,...,,,,,,,,,,
freq,28,,,,,,,,,,...,,,,,,,,,,
mean,,3.714467,6322.958,100.160793,346.8775,6225.975,0.000299,5e-05,0.000848,0.001346,...,0.001247,0.026332,5e-05,0.000299,0.014861,0.00015,0.000349,0.001396,0.000948,0.022741
std,,1.340829,359046.0,3840.318527,20456.11,333318.2,0.017296,0.007062,0.029105,0.036671,...,0.035288,0.160123,0.007062,0.017296,0.121001,0.012231,0.018681,0.037343,0.030768,0.14908
min,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,3.75,198.0,3.0,7.0,80.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,,4.375,331.0,8.0,17.0,294.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,,4.375,586.0,27.0,33.0,711.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
df['yellow squash'].unique()

array([0., 1.])

In [44]:
stroka = df.loc[0]
stroka[stroka!=0]

title           Lentil, Apple, and Turkey Wrap 
rating                                      2.5
calories                                  426.0
protein                                    30.0
fat                                         7.0
sodium                                    559.0
apple                                       1.0
bean                                        1.0
cookie                                      1.0
fruit                                       1.0
kid-friendly                                1.0
lentil                                      1.0
lettuce                                     1.0
sandwich                                    1.0
tomato                                      1.0
vegetable                                   1.0
turkey                                      1.0
Name: 0, dtype: object

Видно, что столбцы с 0-6 не относятся к ингридиентам и мы можем их удалить. Для каждого рецепта необходимые ингридиенты отмечены "1" в соответствующем ингридиенту столбце. Также есть столбцы kid-friendly, bastille day, bon appétit, food processor, new year's eve, winter - которые так же не относятся к ингридиентам, а характеризуют само блюдо.  

Надо как-то отделить столбцы, которые не относятся к ингридиентам. Уберем из наименований столбцов те, названия которых встречаются в title - это скорее всего ингридиенты.

In [45]:
#создаем множество на основании слов, которые встречаются в title
set_title = set()
for i, row in df.iterrows():
    title_list = row['title'].lower().replace(',', ' ').replace('-', ' ').replace('(', ' ').replace(')', ' ').replace('"', ' ').split(' ')
    exceptions = ['and', 'with', 'in', 'on', ]
    set_title = set_title.union({x for x in title_list if x not in exceptions})
set_title

{'',
 'saffroned',
 'pappadam',
 'whale',
 'side',
 'earth',
 'mini',
 'zing',
 'three',
 "peng's",
 'ramos',
 'leek',
 'suckling',
 'adult',
 'croquetas',
 'waikaloa',
 'elixir',
 'shortcakes',
 'comte',
 'cow',
 'al',
 'blossoms',
 'multi',
 'mignonette',
 'nika',
 'hamburgers',
 'fillet',
 'sachertorte',
 'sex',
 'lobster',
 'year',
 'shabu',
 'wrap',
 'banh',
 '10',
 "bubbe's",
 'flatbread',
 'cannoli',
 'press',
 'pavé',
 'spicy',
 'papillote',
 'res',
 'hamburger',
 'croissant',
 'loudspeaker',
 'espresso',
 'hudson',
 'chilled',
 'buffalo',
 'ouzo',
 'feeling',
 'mizuna',
 'lassi',
 'peanutty',
 'lamb',
 'reviver',
 'cheeses',
 'royale',
 'slow',
 'hawaiian',
 'blazer',
 'energy',
 'chile–rubbed',
 'mosel',
 'blis',
 'nachos',
 'bazooka',
 'render',
 "d'orange",
 'strong',
 'thick',
 'coats',
 'gemelli',
 'palmiers',
 'late',
 'celery',
 'gpc',
 'prairie',
 'gulasch',
 "carla's",
 'dogs',
 'pisco',
 "buffet's",
 'provencales',
 'chupacabra',
 'francis',
 'slab',
 'ba',
 'garnie'

In [46]:
len(df.columns)

680

In [47]:
#Предполагаем, что скорее всего можно отнести к ингридиентам те столбцы, в названиях которых встречаются слова из title
columns = [col for col in df.columns if col not in set_title]
len(columns)

306

In [48]:
#В названиях столбцов встречаются названия штатов и крупных городов США, а также названия праздников. Попробуем отсортировать хоть часть из них.
list_of_USA_states = ['Alabama', 'Montgomery', 'Birmingham', 'Alaska', 'Juneau', 'Anchorage', 'Arizona', 'Phoenix', 'Arkansas', 'Little Rock', 
'California', 'Sacramento', 'Los Angeles', 'Colorado', 'Denver', 'Connecticut', 'Hartford', 'Bridgeport', 'Delaware', 'Dover', 
'Wilmington', 'Florida', 'Tallahassee', 'Jacksonville', 'Georgia', 'Atlanta', 'Hawaii', 'Honolulu', 'Idaho', 'Boise', 
'Illinois', 'Springfield', 'Chicago', 'Indiana', 'Indianapolis', 'Iowa', 'Des Moines', 'Kansas', 'Topeka', 'Wichita', 
'Kentucky', 'Frankfort', 'Louisville', 'Louisiana', 'Baton Rouge', 'New Orleans', 'Maine', 'Augusta', 'Portland', 'Maryland', 
'Annapolis', 'Baltimore', 'Massachusetts', 'Boston', 'Michigan', 'Lansing', 'Detroit', 'Minnesota', 'St. Paul', 'Minneapolis', 
'Mississippi', 'Jackson', 'Missouri', 'Jefferson City', 'Kansas City', 'Montana', 'Helena', 'Billings', 'Nebraska', 'Lincoln', 
'Omaha', 'Nevada', 'Carson City', 'Las Vegas', 'New Hampshire', 'Concord', 'Manchester', 'New Jersey', 'Trenton', 'Newark', 
'New Mexico', 'Santa Fe', 'Albuquerque', 'New York', 'Albany', 'New York', 'North Carolina', 'Raleigh', 'Charlotte', 
'North Dakota', 'Bismarck', 'Fargo', 'Ohio', 'Columbus', 'Oklahoma', 'Oklahoma City', 'Oregon', 'Salem', 'Portland', 
'Pennsylvania', 'Harrisburg', 'Philadelphia', 'Rhode Island', 'Providence', 'South Carolina', 'Columbia', 'South Dakota', 
'Pierre', 'Sioux Falls', 'Tennessee', 'Nashville', 'Memphis', 'Texas', 'Austin', 'Houston', 'Utah', 'Salt Lake City', 'Vermont',
'Montpelier', 'Burlington', 'Virginia', 'Richmond', 'Virginia Beach', 'Washington', 'Olympia', 'Seattle', 'West Virginia', 
'Charleston', 'Wisconsin', 'Madison', 'Milwaukee', 'Wyoming', 'Cheyenne']
list_of_USA_states = [x.lower() for x in list_of_USA_states]

In [49]:
#Оставляем как ингридиенты те, значения, которых так же нет в list_of_USA_states
columns = [col for col in columns if col not in list_of_USA_states]
len(columns)

264

Остальные значения придется обработать вручную )

In [50]:
#Список наименований, которых нет в set_title надо проверить вручную, чтобы найти оставшиеся ингридиенты
columns

['title',
 'rating',
 'calories',
 'sodium',
 '#cakeweek',
 '#wasteless',
 '22-minute meals',
 '3-ingredient recipes',
 '30 days of groceries',
 'advance prep required',
 'alcoholic',
 'anthony bourdain',
 'aperitif',
 'apple juice',
 'asian pear',
 'aspen',
 'australia',
 'back to school',
 'backyard bbq',
 'bastille day',
 'beef rib',
 'beef shank',
 'beef tenderloin',
 'bell pepper',
 'beverly hills',
 'blue cheese',
 'bok choy',
 'bon appétit',
 'bon app��tit',
 'broccoli rabe',
 'brown rice',
 'brunch',
 'brussel sprout',
 'bulgaria',
 'butternut squash',
 'butterscotch/caramel',
 'cambridge',
 'camping',
 'canada',
 'candy thermometer',
 'casserole/gratin',
 'chile pepper',
 'christmas eve',
 'cinco de mayo',
 'cobbler/crumble',
 'cocktail party',
 'coffee grinder',
 'cognac/armagnac',
 'collard greens',
 'condiment',
 'condiment/spread',
 'cook like a diner',
 'cookbook critic',
 'costa mesa',
 'cottage cheese',
 'cranberry sauce',
 'cream cheese',
 'créme de cacao',
 'cr��me de

In [51]:
#Получился список ингридиентов, которые надо оставить, столбец rating - оставляем так как он - target
ingridients = ['apple juice',  'asian pear','beef rib', 'beef shank','beef tenderloin','bell pepper','blue cheese',  'bok choy', 'broccoli rabe','brussel sprout', 'butternut squash', 'chile pepper','collard greens','condiment',
 'brown rice', 'cottage cheese','cranberry sauce', 'eau de vie', 'egg nog','flat bread','fortified wine',
 'cream cheese', 'dried fruit','fritter','fruit juice','goat cheese', 'green bean', 'ground beef',
 'créme de cacao','ground lamb',  'hot pepper','ice cream','iced coffee',
 'iced tea', 'jam or jelly', 'jerusalem artichoke','lamb chop','leafy green', 'legume', 'lemon juice', 'lima bean','lime juice',
 'lamb shank','macadamia nut', 'maple syrup', 'monterey jack', 'mustard greens', 'orange juice',
 'passion fruit',  'peanut butter','pine nut', 'pomegranate juice','pork chop',
 'pork rib', 'poultry sausage','rack of lamb','red wine','root vegetable','salad dressing','sesame oil',
 'pork tenderloin', 'soup/stew', 'sour cream', 'soy sauce', 'sparkling wine', 'sugar snap pea',  'swiss cheese','tree nut', 'triple sec', 'tropical fruit','white wine','whole wheat',
 'wild rice', 'yellow squash', 'spirit', 'butterscotch/caramel', 'cognac/armagnac', 'sweet potato/yam', 'milk/cream', 
              'condiment/spread', 'green onion/scallion', 'hominy/cornmeal/masa', 'phyllo/puff pastry dough',
              'soufflé/meringue', 'stuffing/dressing', 'rating']

In [52]:
#Создаем список названий ингридиентов - те значения, которые есть в set_title и их нет в list_of_USA_states
ingridients_1 = [col for col in df.columns if col in set_title]
ingridients_1 = [col for col in ingridients_1 if col not in list_of_USA_states]
print(len(ingridients_1))

358


In [53]:
#эти названия столбцов убираем их из датасета
under_question = ['protein',  'boil','chile','chill','drink','drinks','easter', 'england','fall', 'fat', 'fruit',
            'fry','grill', 'london','mexico', 'orzo','paleo', 'paris','party','passover', 'pernod', 'quiche','raw','roast',
            'rub','self','steam','stock','summer','thanksgiving','vegan','vegetarian', 'wedding', 'winter','snack', 'fat', 
            'anniversary', 'bake', 'birthday', 'braise', 'blender', 'halloween', 'jícama', 'kale','kirsch', 'microwave', 
            'picnic', 'wok', 'breakfast']
len(under_question)

49

In [54]:
#Сохраняем названия, которых нет в списке under_question - это ингридиенты
ingridients_1 = [ing for ing in ingridients_1 if ing not in under_question]
len(ingridients_1)

310

In [55]:
#Столбцы, которые оставим в датасете
total_ingridients = ingridients_1 + ingridients
len(total_ingridients)

396

In [56]:
#Датасет, состоящий из ингридиентов
df_new = df[total_ingridients]
df_new.head()

Unnamed: 0,almond,amaretto,anchovy,anise,appetizer,apple,apricot,artichoke,arugula,asparagus,...,cognac/armagnac,sweet potato/yam,milk/cream,condiment/spread,green onion/scallion,hominy/cornmeal/masa,phyllo/puff pastry dough,soufflé/meringue,stuffing/dressing,rating
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.5
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.375
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.75
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.125


In [57]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20052 entries, 0 to 20051
Columns: 396 entries, almond to rating
dtypes: float64(396)
memory usage: 60.6 MB


In [58]:
#Приведем тип данных столбцов (кроме rating) к int
column_names = df_new.columns[:-1]
df_new[column_names] = df_new[column_names].astype('int8')
df_new['rating'] = pd.to_numeric(df_new['rating'], downcast="float")

In [59]:
df_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20052 entries, 0 to 20051
Columns: 396 entries, almond to rating
dtypes: float32(1), int8(395)
memory usage: 7.6 MB


In [60]:
#Проверка на полные дубликаты
df_new.duplicated().sum() 

2134

In [61]:
#Удаление дубликатоов
df_new = df_new.drop_duplicates(keep='first')
df_new.duplicated().sum() 

0

In [62]:
#Сохраняем датафрейм в файл
df_new.to_csv('./data/ingridients.csv', index=False, compression={'method':'zip'})

In [63]:
#разделяем датасет на test и trein
X_train, X_test, y_train, y_test = train_test_split(df_new.drop('rating', axis=1), df_new.rating, stratify=df_new.rating,
                                                    test_size=0.2, random_state=21)

In [64]:
#очистка памяти
%reset_selective -f "^df$"

### 1.2 Регрессия

Для прогнозирования рейтинга попробуем различные алгоритмы и их гиперпараметры. Выберем лучшее решение на основе gridsearch и кроссвалидации и оценим RMSE на тестовой подвыборке.  

Посчитаем RMSE для наивного регрессора, в котором просто высчитали средний рейтинг.

In [29]:
def modelselection(grids, grid_dict):
    
    """Принимает на вход список экземпляров GridSearchCV и словарь, в котором ключами являются индексы из этого списка, 
    а значениями – названия моделей. Возвращает лучшую модель в своем классе, ее результат на тестовой 
    выборке """
    
    evaluations=[]
    
    for i in range(len(grids)):
        grids[i].fit(X_train, y_train)
        score_test = grids[i].score(X_test, y_test)
        evaluations.append(grids[i].best_score_)
            
        print('Estimator:', grid_dict[i])
        print('Best params:', grids[i].best_params_)
        print('Best score: ', grids[i].best_score_)
        print('Best test score: ', score_test)
        print()
            
    return print('Best estimator:', grid_dict[evaluations.index(max(evaluations))])

In [30]:
#Список экземпляров GridSearchCV

#LinearRegression
params = {'fit_intercept':[True,False],  
          'copy_X':[True, False]}
gs_lr = GridSearchCV(LinearRegression(), params, error_score='raise', n_jobs=-1, cv=3, 
                     scoring='neg_root_mean_squared_error')

#DecisionTreeRegressor
params = {'max_depth':[i for i in range(4, 10, 2)],
         'min_samples_split':[i for i in range(2, 4)],
          'min_samples_leaf':[i for i in range(2, 5)]
         }
gs_tree = GridSearchCV(DecisionTreeRegressor(random_state=21), params, error_score='raise', n_jobs=-1, 
                       cv=3, scoring='neg_root_mean_squared_error')

#RandomForestRegressor
params = {'n_estimators':[5, 10, 15, 20, 30, 50, 100, 150], 
          'max_depth':[i for i in range(4, 12, 2)],
          'min_samples_split':[i for i in range(2, 5)],
          'min_samples_leaf':[i for i in range(1, 5)]
         }
gs_rf = GridSearchCV(RandomForestRegressor(random_state=21), params, error_score='raise', n_jobs=-1, 
                       cv=3, scoring='neg_root_mean_squared_error')

#GradientBoostingRegressor
params = {'learning_rate': [0.01, 0.05],
          'n_estimators' : [50, 100, 150],
          'max_depth'    : [4,6,8],
          'min_samples_split' : [i for i in range(2, 5)],
          'min_samples_leaf' : [i for i in range(1, 5)],
                 }
gs_gbr = GridSearchCV(GradientBoostingRegressor(random_state=21), params, error_score='raise', n_jobs=-1, 
                       cv=3, scoring='neg_root_mean_squared_error')

#SVR
params = {'kernel' : ('linear', 'poly', 'rbf', 'sigmoid'),
          'C' : [1,5,10],
          'degree' : [3,8],
          'gamma' : ('auto','scale')}
gs_svr = GridSearchCV(SVR(), params, error_score='raise', n_jobs=-1, cv=3, scoring='neg_root_mean_squared_error')

grids = [gs_lr, gs_tree, gs_rf, gs_gbr, gs_svr ]

In [31]:
#Словарь с названиями моделей
grid_dict = {0:'LinearRegression', 1:'DecisionTreeRegressor', 2:'RandomForestRegressor', 3:'GradientBoostingRegressor', 4:'SVR'}

In [32]:
modelselection(grids, grid_dict)

Estimator: LinearRegression
Best params: {'copy_X': True, 'fit_intercept': True}
Best score:  -63913268814.17646
Best test score:  -1002564779098.9299

Estimator: DecisionTreeRegressor
Best params: {'max_depth': 4, 'min_samples_leaf': 4, 'min_samples_split': 2}
Best score:  -1.2834451375344054
Best test score:  -1.2811137366561487

Estimator: RandomForestRegressor
Best params: {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 150}
Best score:  -1.264673216661259
Best test score:  -1.2599552926913014

Estimator: GradientBoostingRegressor
Best params: {'learning_rate': 0.05, 'max_depth': 6, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 150}
Best score:  -1.2587524087384694
Best test score:  -1.254072298508256

Estimator: SVR
Best params: {'C': 1, 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf'}
Best score:  -1.293084321773685
Best test score:  -1.282169590536687

Best estimator: GradientBoostingRegressor


In [33]:
##Список экземпляров GridSearchCV для ансамблей
gbr = GradientBoostingRegressor(learning_rate=0.05, max_depth=6, min_samples_split=4,
                          n_estimators=150, random_state=21)
rf = RandomForestRegressor(max_depth=10, min_samples_leaf=4, n_estimators=150,
                      random_state=21)
svr = SVR(C=1)

#VotingRegressor
params = {'weights': [(1, 1, 1), (2, 1, 1), (1, 2, 1), (1, 1, 2), (2, 2, 1), (1, 2, 2), (2, 1, 2), (3, 2, 1), (1, 3, 2), 
                      (2, 1, 3), (3, 1, 2)]
         }
gs_vr = GridSearchCV(VotingRegressor(estimators=[('gbr', gbr), ('rf', rf), ('svr', svr)]), params, error_score='raise', 
                     n_jobs=-1, cv=3, scoring='neg_root_mean_squared_error')

#BaggingRegressor
params = {'base_estimator': [None, gbr, svr, rf],
          'n_estimators': [5, 10, 15]
}
gs_br = GridSearchCV(BaggingRegressor(), params, error_score='raise', n_jobs=-1, cv=3, scoring='neg_root_mean_squared_error')

#StackingRegressor
params = {'passthrough': [True, False],
          'final_estimator': [gbr, rf, svr, None]}
gs_sr = GridSearchCV(StackingRegressor(estimators=[('gbr', gbr), ('rf', rf), ('svr', svr)]), params, error_score='raise', 
                     n_jobs=-1, cv=3, scoring='neg_root_mean_squared_error')

grids = [gs_vr, gs_br, gs_sr]

In [34]:
#Словарь с названиями моделей
grid_dict = {0:'VotingRegressor', 1:'BaggingRegressor', 2:'StackingRegressor'}

In [35]:
modelselection(grids, grid_dict)

Estimator: VotingRegressor
Best params: {'weights': (2, 1, 1)}
Best score:  -1.257745668747706
Best test score:  -1.2501523123522564

Estimator: BaggingRegressor
Best params: {'base_estimator': GradientBoostingRegressor(learning_rate=0.05, max_depth=6, min_samples_split=4,
                          n_estimators=150, random_state=21), 'n_estimators': 15}
Best score:  -1.255572997696328
Best test score:  -1.2514304121452255

Estimator: StackingRegressor
Best params: {'final_estimator': None, 'passthrough': True}
Best score:  -1.249622975471454
Best test score:  -1.244102128369942

Best estimator: StackingRegressor


In [37]:
#Посчитайте RMSE для наивного регрессора, в котором вы просто высчитали средний рейтинг.
dummy = DummyRegressor(strategy="constant", constant=df_new.rating.mean())
dummy.fit(df_new.drop('rating', axis=1), df_new.rating)
dummy_pred = dummy.predict(df_new.rating)

print(f'RMSE наивного регрессора: {mean_squared_error(dummy_pred, df_new.rating, squared=False)}')

RMSE наивного регрессора: 1.3166418859981708


Выводы: Лучшие результаты показал ансамбль StackingRegressor, RMSE на test - 1.24, что немного лучше наивного регрессора (1.32)

### 1.3 Классификация

Бинаризируем значения целевой переменной путем округления рейтингов до ближайшего целого числа. Это и будут классы.

Для прогнозирования классов попробуем различные алгоритмы и их гиперпараметры. Выберем лучшее решение на основе gridsearch и кроссвалидации и посчитаем accuracy на тестовой подвыборке.

In [30]:
#Бинаризируйте значения целевой переменной
y_train =  round(y_train, 0)
y_test = round(y_test, 0)

In [42]:
#Список экземпляров GridSearchCV

#LogisticRegression
params = [{'solver' : ['saga'],
      'penalty' : ['l1', 'l2', 'none'],
      'fit_intercept' : [True, False],
      'C' : [0.01, 0.1, 1],
      'class_weight':['balanced', None]},
    {'solver' : ['newton-cg', 'lbfgs'],
      'penalty' : ['l2','none'],
      'fit_intercept' : [True, False],
      'C' : [0.01, 0.1, 1],
      'class_weight':['balanced', None],}
    ]
gs_logreg = GridSearchCV(LogisticRegression(), params, error_score='raise', n_jobs=-1, cv=3, scoring='accuracy')

#DecisionTreeClassifier
params = {'max_depth':[i for i in range(4, 10, 2)],
         'min_samples_split':[i for i in range(2, 4)],
          'min_samples_leaf':[i for i in range(2, 5)],
          'criterion':['gini', 'entropy']
         }
gs_clf_tree = GridSearchCV(DecisionTreeClassifier(random_state=21), params, error_score='raise', n_jobs=-1, cv=3, 
                           scoring='accuracy')

#RandomForestClassifier
params = {'n_estimators':[5, 10, 15, 20, 30, 50, 100, 150], 
          'max_depth':[i for i in range(4, 12, 2)],
          'min_samples_split':[i for i in range(2, 5)],
          'min_samples_leaf':[i for i in range(1, 5)]
         }
gs_clf_rf = GridSearchCV(RandomForestClassifier(random_state=21), params, error_score='raise', n_jobs=-1, cv=3, 
                           scoring='accuracy')

#GradientBoostingClassifier
params = {'learning_rate': [0.01, 0.05],
          'n_estimators' : [50, 100, 150],
          'max_depth'    : [4,6,8],
          'min_samples_split' : [i for i in range(2, 4)],
          'min_samples_leaf' : [i for i in range(1, 4)],
                 }
gs_clf_xgb = GridSearchCV(GradientBoostingClassifier(random_state=21), params, error_score='raise', n_jobs=-1, 
                          cv=3, scoring='accuracy')

#SVC
params = {'kernel' : ('linear', 'poly', 'rbf', 'sigmoid'),
          'C' : [1,5,10],
          'degree' : [3,8],
          'gamma' : ('auto','scale')
         }
gs_clf_svc = GridSearchCV(SVC(), params, error_score='raise', n_jobs=-1, cv=3, scoring='accuracy')

grids = [gs_logreg, gs_clf_tree, gs_clf_rf, gs_clf_xgb, gs_clf_svc]

In [43]:
#Словарь с названиями моделей
grid_dict = {0:'LogisticRegression', 1:'DecisionTreeClassifier', 2:'RandomForestClassifier', 3:'GradientBoostingClassifier', 
             4:'SVC'}

In [44]:
modelselection(grids, grid_dict)

Estimator: LogisticRegression
Best params: {'C': 0.1, 'class_weight': None, 'fit_intercept': True, 'penalty': 'l2', 'solver': 'saga'}
Best score:  0.6785265801590623
Best test score:  0.677734375

Estimator: DecisionTreeClassifier
Best params: {'criterion': 'entropy', 'max_depth': 4, 'min_samples_leaf': 3, 'min_samples_split': 2}
Best score:  0.6739221431561323
Best test score:  0.6746651785714286

Estimator: RandomForestClassifier
Best params: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 10}
Best score:  0.6723873308218223
Best test score:  0.6732700892857143

Estimator: GradientBoostingClassifier
Best params: {'learning_rate': 0.05, 'max_depth': 4, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 100}
Best score:  0.6744104925352309
Best test score:  0.6780133928571429

Estimator: SVC
Best params: {'C': 1, 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf'}
Best score:  0.6771312962187804
Best test score:  0.6788504464285714

Best estimat

In [45]:
#Рассчитаем accuracy наивного классификатора
dummy_clf = DummyClassifier(strategy="most_frequent", random_state=21)
dummy_clf.fit(X_train, y_train)
dummy_pred = dummy_clf.predict(y_test)

print(f'Accuracy наивного классификатора: {accuracy_score(y_test, dummy_pred)}')

Accuracy наивного классификатора: 0.6665736607142857


Выводы: Лучшая модель - SVC, accuracy на test - 67.9%, что всего на 1% лучше наивного классификатора.

Снова проведем бинаризацию целевого столбца, преобразовав теперь целые числа в классы bad (0, 1) (невкусное), so-so (2, 3) (нормальное), great (4, 5) (вкусное).

Снова для прогнозирования классов попробуем различные алгоритмы и их гиперпараметры. Выберем лучшее решение на основе кроссвалидации и посчитаем accuracy на тестовой подвыборке.

In [31]:
#Проведем бинаризацию, преобразовав теперь целые числа в классы bad (0, 1) (невкусное), so-so (2, 3) (нормальное), great (4, 5) (вкусное).
bins = [0, 1, 3, 5]
labels = ['bad', 'so-so', 'great']
y_test = pd.cut(y_test, bins=bins, labels=labels, include_lowest=True)
y_train = pd.cut(y_train, bins=bins, labels=labels, include_lowest=True)

In [47]:
#Снова для прогнозирования классов попробуйте различные алгоритмы и их гиперпараметры.
modelselection(grids, grid_dict)

Estimator: LogisticRegression
Best params: {'C': 0.1, 'class_weight': None, 'fit_intercept': False, 'penalty': 'l2', 'solver': 'saga'}
Best score:  0.8034742570113019
Best test score:  0.8035714285714286

Estimator: DecisionTreeClassifier
Best params: {'criterion': 'gini', 'max_depth': 6, 'min_samples_leaf': 4, 'min_samples_split': 2}
Best score:  0.8011022743128228
Best test score:  0.8018973214285714

Estimator: RandomForestClassifier
Best params: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 10}
Best score:  0.7996372261755268
Best test score:  0.8002232142857143

Estimator: GradientBoostingClassifier
Best params: {'learning_rate': 0.05, 'max_depth': 4, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 50}
Best score:  0.8024277940560904
Best test score:  0.8030133928571429

Estimator: SVC
Best params: {'C': 1, 'degree': 3, 'gamma': 'scale', 'kernel': 'rbf'}
Best score:  0.80361378540533
Best test score:  0.8055245535714286

Best estim

In [203]:
#Рассчитаем accuracy наивного классификатора
dummy_clf = DummyClassifier(strategy="most_frequent", random_state=21)
dummy_clf.fit(X_train, y_train)
dummy_pred = dummy_clf.predict(y_test)

print(f'Accuracy наивного классификатора: {accuracy_score(y_test, dummy_pred)}')

Accuracy наивного классификатора: 0.7965959821428571


Вывод: Лучшая модель по прежнему SVC, accuracy на test - 80.6%. Но accuracy наивного классификатора 79.7%.

Что хуже: спрогнозировать плохой рейтинг, который на самом деле окажется хорошим, или спрогнозировать хороший рейтинг, который на самом деле окажется плохим? Заменим метрику accuracy другой соответствующей метрикой.

Хуже спрогнозировать хороший рейтинг, который окажется плохим. Будем для оценки использовать метрику precision. Именно введение precision не позволяет нам записывать все объекты в один класс, так как в этом случае мы получаем рост уровня False Positive. Цель precision – классифицировать все Positive семплы как Positive, не допуская ложных определений Negative как Positive:

$\large precision = \frac{TP}{TP + FP}$

Для прогнозирования классов с новой метрикой попробуем различные алгоритмы и их гиперпараметры. Выберем лучшее решение и посчитаем метрику на тестовой подвыборке.

In [32]:
#Для прогноза с новой метрикой воспользуемся make_scorer

custom_scorer = make_scorer(precision_score, average="weighted", zero_division=0)

In [33]:
#Список экземпляров GridSearchCV

#LogisticRegression
params = {'solver' : ['saga'],
      'penalty' : ['elasticnet'],
      'fit_intercept' : [True, False],
      'C' : [0.01, 0.1, 1],
      'class_weight':['balanced', None],
          'l1_ratio': [0.2, 0.4, 0.6, 0.8]}

gs_logreg = GridSearchCV(LogisticRegression(), params, error_score='raise', n_jobs=-1, cv=3, scoring=custom_scorer)

#DecisionTreeClassifier
params = {'max_depth':[i for i in range(4, 10, 2)],
         'min_samples_split':[i for i in range(2, 4)],
          'min_samples_leaf':[i for i in range(2, 5)],
          'criterion':['gini', 'entropy']
         }
gs_clf_tree = GridSearchCV(DecisionTreeClassifier(random_state=21), params, error_score='raise', n_jobs=-1, cv=3, 
                           scoring=custom_scorer)

#RandomForestClassifier
params = {'n_estimators':[50, 100, 150], 
          'max_depth':[i for i in range(6, 10, 2)],
          'min_samples_split':[i for i in range(2, 5)],
          'min_samples_leaf':[i for i in range(1, 5)],
          'bootstrap': [True, False]
         }
gs_clf_rf = GridSearchCV(RandomForestClassifier(random_state=21), params, error_score='raise', n_jobs=-1, cv=3, 
                           scoring=custom_scorer)

#GradientBoostingClassifier(random_state=21)
params = {'learning_rate': [0.01, 0.05],
          'n_estimators' : [100, 150],
          'max_depth'    : [6, 8],
          'min_samples_split' : [i for i in range(2, 4)],
          'min_samples_leaf' : [i for i in range(2, 4)],
                 }
gs_clf_xgb = GridSearchCV(GradientBoostingClassifier(random_state=21), params, error_score='raise', n_jobs=-1, cv=3, 
                                scoring=custom_scorer)

#SVC
params = {'kernel' : ('linear', 'poly', 'rbf', 'sigmoid'),
          'C' : [1,5,10],
          'degree' : [3,8],
          'gamma' : ('auto','scale')
         }
gs_clf_svc = GridSearchCV(SVC(), params, error_score='raise', n_jobs=-1, cv=3, scoring=custom_scorer)

grids = [gs_logreg, gs_clf_tree, gs_clf_rf, gs_clf_xgb, gs_clf_svc]

In [36]:
#Словарь с названиями моделей
grid_dict = {0:'LogisticRegression', 1:'DecisionTreeClassifier', 2:'RandomForestClassifier', 3:'GradientBoostingClassifier', 
             4:'SVC'}

In [38]:
modelselection(grids, grid_dict)

Estimator: LogisticRegression
Best params: {'C': 0.1, 'class_weight': 'balanced', 'fit_intercept': True, 'l1_ratio': 0.2, 'penalty': 'elasticnet', 'solver': 'saga'}
Best score:  0.719958790226292
Best test score:  0.729981960857308

Estimator: DecisionTreeClassifier
Best params: {'criterion': 'gini', 'max_depth': 8, 'min_samples_leaf': 3, 'min_samples_split': 2}
Best score:  0.7081627271690647
Best test score:  0.6967656550853726

Estimator: RandomForestClassifier
Best params: {'bootstrap': True, 'max_depth': 8, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 100}
Best score:  0.7226769913437016
Best test score:  0.7151592372118636

Estimator: GradientBoostingClassifier
Best params: {'learning_rate': 0.01, 'max_depth': 6, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Best score:  0.7390416397961834
Best test score:  0.708326578955168

Estimator: SVC
Best params: {'C': 1, 'degree': 3, 'gamma': 'scale', 'kernel': 'poly'}
Best score:  0.719898907711064

### 1.4 Принятие решения

Выводы: Лучшая модель - GradientBoostingClassifier с precision 73.9%, ее будем использовать дальше.

In [65]:
#Сохранение лучшей модели регрессора
estimators = [
    ('gbr', GradientBoostingRegressor(learning_rate=0.05, max_depth=6, min_samples_split=4,
                          n_estimators=150, random_state=21)),
    ('rf', RandomForestRegressor(max_depth=10, min_samples_leaf=4, n_estimators=150,
                      random_state=21)),
    ('svr', SVR(C=1))]

best_model_reg = StackingRegressor(estimators = estimators, final_estimator=None, passthrough=True).fit(X_train, y_train) 

joblib.dump(best_model_reg, 'best_model_reg.pkl') 

['best_model_reg.pkl']

In [39]:
#Сохранение лучшей модели классификатора 

best_model_clf = GradientBoostingClassifier(learning_rate=0.01, max_depth=6, min_samples_leaf=2, 
                             min_samples_split=2, n_estimators=100, random_state=21).fit(X_train, y_train)

joblib.dump(best_model_clf, 'best_model_clf.pkl')

['best_model_clf.pkl']

## 2. Пищевая ценность

Соберем в датафрейм всю информацию о пищевой ценности продуктов из подготовленного и отфильтрованного набора данных (только столбцы с продуктами). Используйте для этого API.

Конвертируем все значения в % от суточной нормы потребления.

In [35]:
#Прочитаем список нутриентов, которые необходимо конвертировать в % от суточной нормы потребления. 
must_nutrients = pd.read_csv('./data/daily values.csv')
must_nutrients

Unnamed: 0,Nutrient,Unit of measure,Adults and Children ≥ 4 years
0,Vitamin A,Micrograms RAE2 (mcg),900.0
1,Vitamin C,Milligrams (mg),90.0
2,Calcium,Milligrams (mg),1300.0
3,Iron,Milligrams (mg),18.0
4,Vitamin D,Micrograms (mcg)3,20.0
5,Vitamin E,Milligrams (mg)4,15.0
6,Vitamin K,Micrograms (mcg),120.0
7,Thiamin,Milligrams (mg),1.2
8,Riboflavin,Milligrams (mg),1.3
9,Niacin,Milligrams NE5 (mg),16.0


In [90]:
#Сохраняем информацию по нутриентам из апи в список

dict_col=[]
col_names = []

for col in df_new.columns:
    url_search = f'https://api.nal.usda.gov/fdc/v1/foods/search?query={col}&api_key=S0MlcT2g7gaK9H3ifNoupHuaecg1ZxTI7hC5csF4'
    data_search = requests.get(url_search).json()
    try:
        foodNutrients = data_search['foods'][0]['foodNutrients']
        col_names.append(col)
        
        for i in range(len(foodNutrients)):
            col_values = []
            col_values.append(col)
            col_values.append(foodNutrients[i]['nutrientName'])
            col_values.append(foodNutrients[i]['value'])
            col_values.append(foodNutrients[i]['unitName'])
            dict_col.append(col_values)
    except:
        pass

In [391]:
#преобразуем полученные данные в датафрейм
data_nutrients = pd.DataFrame(dict_col, columns=['item', 'nutrient_name', 'value', 'unit_name'])
data_nutrients.head()

Unnamed: 0,item,nutrient_name,value,unit_name
0,almond,Protein,21.0,G
1,almond,Total lipid (fat),55.5,G
2,almond,"Carbohydrate, by difference",18.8,G
3,almond,Energy,614.0,KCAL
4,almond,"Alcohol, ethyl",0.0,G


In [392]:
#Приведем все к нижнему регистру
must_nutrients['Nutrient']=must_nutrients['Nutrient'].str.lower()
data_nutrients['nutrient_name'] = data_nutrients['nutrient_name'].str.lower()

In [393]:
#Вручную сопоставим названия нутриентов  из must_nutrients и data_nutrients
nutr_uniq = {
    'vitamin a':'vitamin a, rae',
    'vitamin c':'vitamin c, total ascorbic acid',
    'calcium':'calcium, ca',
    'iron': 'iron, fe',
    'vitamin d':'vitamin d (d2 + d3)', 
    'vitamin e': 'vitamin e (alpha-tocopherol)',
    'vitamin k': 'vitamin k (phylloquinone)',
    'thiamin':'thiamin',
    'riboflavin': 'riboflavin',
    'niacin': 'niacin',
    'folate': 'folate, total',
    'biotin': 'biotin',
    'pantothenic acid': 'pantothenic acid',
    'phosphorus': 'phosphorus, p',
    'magnesium': 'magnesium, mg',
    'zinc': 'zinc, zn',
    'selenium': 'selenium, se',
    'copper': 'copper, cu',
    'manganese': 'manganese, mn',
    'potassium': 'potassium, k',
    'choline': 'choline, total',
    'fat': 'total lipid (fat)',
    'saturated fat': 'fatty acids, total saturated',
    'cholesterol': 'cholesterol',
    'sodium': 'sodium, na',
    'dietary fiber': 'fiber, total dietary',
    'protein': 'protein',
    'added sugars': 'sugars, added'
}

In [394]:
def get_key(value):
    for k, v in nutr_uniq.items():
        if v == value:
            return k

In [395]:
#Заменим значения столбца nutrient_name на соответствующие им ключи словаря nutr_uniq
data_nutrients['nutrient_name'] = data_nutrients['nutrient_name'].apply(get_key)
data_nutrients['nutrient_name'].unique()

array(['protein', 'fat', None, 'dietary fiber', 'calcium', 'iron',
       'magnesium', 'phosphorus', 'potassium', 'sodium', 'zinc', 'copper',
       'selenium', 'vitamin a', 'vitamin e', 'vitamin d', 'vitamin c',
       'thiamin', 'riboflavin', 'niacin', 'folate', 'choline',
       'vitamin k', 'cholesterol', 'saturated fat', 'added sugars',
       'pantothenic acid', 'manganese', 'biotin'], dtype=object)

In [396]:
#Оставим в датасете только те строки, нутриенты которых есть в словаре
data_nutrients = data_nutrients.loc[data_nutrients['nutrient_name'].isin(nutr_uniq.keys())].reset_index(drop=True)
data_nutrients['nutrient_name'].unique()

array(['protein', 'fat', 'dietary fiber', 'calcium', 'iron', 'magnesium',
       'phosphorus', 'potassium', 'sodium', 'zinc', 'copper', 'selenium',
       'vitamin a', 'vitamin e', 'vitamin d', 'vitamin c', 'thiamin',
       'riboflavin', 'niacin', 'folate', 'choline', 'vitamin k',
       'cholesterol', 'saturated fat', 'added sugars', 'pantothenic acid',
       'manganese', 'biotin'], dtype=object)

In [397]:
#Проверим единицы измерения, совпадают ли они в обоих датафреймах
def un_nu(name):
    un = must_nutrients.loc[must_nutrients['Nutrient']==name, 'Unit of measure']
    try:
        ind1=str(un).index('(')
        ind2=str(un).index(')')
        un = str(un)[ind1+1:ind2]
        return un
    except:
        pass

In [399]:
#Единицы измерения совпадают, так как UG==mcg
data_nutrients['Unit of measure'] = data_nutrients['nutrient_name'].apply(un_nu)
data_nutrients.loc[data_nutrients['unit_name']=='UG', 'unit_name'] = 'mcg'

data_nutrients.loc[data_nutrients['unit_name'].str.lower()!=data_nutrients['Unit of measure'].str.lower()]

Unnamed: 0,item,nutrient_name,value,unit_name,Unit of measure


In [400]:
#переведем значения столбца value в % от суточной нормы. 
def day_norm(row):
    name = row['nutrient_name']
    value = row['value']
    norm = must_nutrients.loc[must_nutrients['Nutrient']==name, 'Adults and Children ≥ 4 years']
    new_value = float(value)/float(norm)*100
    return new_value

In [401]:
#Удалим ненужные столбцы
data_nutrients['daily_value'] = data_nutrients.apply(day_norm, axis=1)
data_nutrients.drop(['value', 'value', 'unit_name', 'Unit of measure'], axis=1, inplace=True)

In [40]:
#Сохраним датафрейм
nutrients_daily_value.to_csv('./data/nutrients_daily_value.csv', index=False)
dff = pd.read_csv('./data/nutrients_daily_value.csv')
dff.head()

Unnamed: 0,item,nutrient_name,value
0,almond,thiamin,3.416667
1,almond,sodium,9.869565
2,almond,selenium,4.363636
3,almond,saturated fat,32.75
4,almond,riboflavin,72.230769


## 3. Похожие рецепты

Для каждого рецепта из набора данных найдем ссылку на сайте epicurious.com и подробную информацию о нем (название рецепта, рейтинг на платформе и URl). Если сделать это не удалось, найдем для данного рецепта похожую ссылку в Интернете.

In [122]:
#Загрузим снова датасет
df = pd.read_csv('./data/epi_r.csv')
df.head()

Unnamed: 0,title,rating,calories,protein,fat,sodium,#cakeweek,#wasteless,22-minute meals,3-ingredient recipes,...,yellow squash,yogurt,yonkers,yuca,zucchini,cookbooks,leftovers,snack,snack week,turkey
0,"Lentil, Apple, and Turkey Wrap",2.5,426.0,30.0,7.0,559.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,Boudin Blanc Terrine with Red Onion Confit,4.375,403.0,18.0,23.0,1439.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Potato and Fennel Soup Hodge,3.75,165.0,6.0,7.0,165.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Mahi-Mahi in Tomato Olive Sauce,5.0,,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Spinach Noodle Casserole,3.125,547.0,20.0,32.0,452.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [123]:
#Удалим полные дубликаты и дубликаты в столбце title
df.duplicated().sum() 

1801

In [124]:
df = df.drop_duplicates()
print(df.title.duplicated().sum())

515


In [127]:
#Окончательный список без дубликатов
titles = df.title
titles = titles.drop_duplicates()
print(titles.duplicated().sum())
print(len(titles))

0
17736


In [154]:
df = df.drop_duplicates(subset=['title'], keep='first').reset_index(drop=True)
len(df)

17736

In [128]:
def find_url_rating(title):
    
    """Функция принимает на вход название рецепта и возвращает его рейтинг и ссылку на сайте epicurious.com в виде списка."""
    
    title = title.strip()
    url = f'https://www.epicurious.com/search/{title}?content=recipe'
    try:
        res = requests.get(url)
        soup = BeautifulSoup(res.text, "html.parser")
        url_ending = soup.find(class_="view-complete-item", title=title)['href']
        url_title = f'https://www.epicurious.com{url_ending}'
        rating = soup.find(class_="view-complete-item", title=title).parent.find("dd")['data-rating']
        return [title, url_title, rating]
    except:
        return []

In [132]:
#Найдем ссылку для каждого рецепта
title_list = []
for elem in titles:
    title_list.append(find_url_rating(elem))

In [136]:
#Преобразуем список в датафрейм и сохраним его
data_url = pd.DataFrame(title_list)
data_url.to_csv('./data/links_to_recipes.csv', index=False)

In [137]:
#Прочитаем датафрейм и посмотрим есть ли пропущенные значения
links_to_recipes = pd.read_csv('./data/links_to_recipes.csv', names=['title', 'url', 'rating_url'], header=0)
links_to_recipes

Unnamed: 0,title,url,rating_url
0,"Lentil, Apple, and Turkey Wrap",https://www.epicurious.com/recipes/food/views/...,2.5
1,Boudin Blanc Terrine with Red Onion Confit,https://www.epicurious.com/recipes/food/views/...,4.6
2,Potato and Fennel Soup Hodge,https://www.epicurious.com/recipes/food/views/...,3.8
3,Mahi-Mahi in Tomato Olive Sauce,https://www.epicurious.com/recipes/food/views/...,4.8
4,Spinach Noodle Casserole,https://www.epicurious.com/recipes/food/views/...,3.4
...,...,...,...
17731,Chinese Barbecued Spareribs,https://www.epicurious.com/recipes/food/views/...,4.0
17732,Artichoke and Parmesan Risotto,https://www.epicurious.com/recipes/food/views/...,4.1
17733,Turkey Cream Puff Pie,https://www.epicurious.com/recipes/food/views/...,4.5
17734,Snapper on Angel Hair with Citrus Cream,https://www.epicurious.com/recipes/food/views/...,4.3


In [138]:
#Всего пропущено
links_to_recipes['title'].isna().sum()

1661

In [139]:
#Найдем пропущенные значения
missed_titles = titles.iloc[links_to_recipes.loc[links_to_recipes['title'].isna()].index]
missed_titles

8                                   Korean Marinated Beef 
12       Banana-Chocolate Chip Cake With Peanut Butter ...
21                                        "Fried" Chicken 
24                                Sea Salt-Roasted Pecans 
40                            Coconut-Key Lime Sheet Cake 
                               ...                        
20018     Stuffed Onions with Spiced Lamb and Pomegranate 
20021    Dill-Crusted Pork Tenderloin With Farro, Pea, ...
20026               Spinach with Chickpeas and Fried Eggs 
20029                         White Miso Peach/Pear/Apple 
20037                                             Russian 
Name: title, Length: 1661, dtype: object

In [160]:
#Удалим пропуски
links_to_recipes = links_to_recipes.loc[~links_to_recipes.title.isna()]

In [140]:
#Заполним пропущенные значения похожими ссылками из гугла

def find_in_google(query):
    
    """Функция принимает на вход название рецепта и возвращает первую ссылку на рецепт в google."""
    
    g_url = "http://www.google.com/search?q=" + query 
    res = requests.get(g_url)
    soup = BeautifulSoup(res.text, "html.parser")
    try:
        for link in soup.find_all('a'):
            if link.get('href').find('https') != -1:
                l = link.get('href')
                break
        ind1=l.index('https')
        ind2=l.index('&')
        url = l[ind1:ind2]
        return url
    except:
        pass

In [141]:
missed_titles_list = []
for elem in missed_titles:
    missed_titles_list.append(find_in_google(elem))

In [142]:
missed_titles_list[:5]

['https://damndelicious.net/2019/04/21/korean-beef-bulgogi/',
 'https://www.bonappetit.com/recipe/banana-chocolate-chip-cake-with-peanut-butter-frosting',
 'https://volshebnaya-eda.ru/kollekcia-receptov/zharenaya-kurica-na-skovorode-s-korochkoj-i-chesnokom/',
 'https://www.epicurious.com/recipes/food/views/sea-salt-roasted-pecans-233690',
 'https://www.epicurious.com/recipes/food/views/coconut-key-lime-sheet-cake']

In [143]:
#Преобразуем список в датафрейм
df_missed = pd.DataFrame()
df_missed['title'] = missed_titles.reset_index(drop=True)
df_missed['url'] = missed
df_missed['rating_url'] = title_rating.loc[missed_titles.index, 'rating'].reset_index(drop=True)
df_missed.tail()

Unnamed: 0,title,url,rating_url
1656,Stuffed Onions with Spiced Lamb and Pomegranate,https://www.epicurious.com/recipes/food/views/...,5.0
1657,"Dill-Crusted Pork Tenderloin With Farro, Pea, ...",https://www.allrecipes.com/recipe/70770/apple-...,5.0
1658,Spinach with Chickpeas and Fried Eggs,https://www.epicurious.com/recipes/food/views/...,5.0
1659,White Miso Peach/Pear/Apple,https://www.bbcgoodfood.com/recipes/thai-green...,3.75
1660,Russian,https://www.epicurious.com/recipes/food/views/...,0.0


In [187]:
#Объединим оба датафрейма
links_to_recipes_total = pd.concat([links_to_recipes, df_missed], axis=0).reset_index(drop=True)
links_to_recipes_total

Unnamed: 0,title,url,rating_url
0,"Lentil, Apple, and Turkey Wrap",https://www.epicurious.com/recipes/food/views/...,2.50
1,Boudin Blanc Terrine with Red Onion Confit,https://www.epicurious.com/recipes/food/views/...,4.60
2,Potato and Fennel Soup Hodge,https://www.epicurious.com/recipes/food/views/...,3.80
3,Mahi-Mahi in Tomato Olive Sauce,https://www.epicurious.com/recipes/food/views/...,4.80
4,Spinach Noodle Casserole,https://www.epicurious.com/recipes/food/views/...,3.40
...,...,...,...
17731,Stuffed Onions with Spiced Lamb and Pomegranate,https://www.epicurious.com/recipes/food/views/...,5.00
17732,"Dill-Crusted Pork Tenderloin With Farro, Pea, ...",https://www.allrecipes.com/recipe/70770/apple-...,5.00
17733,Spinach with Chickpeas and Fried Eggs,https://www.epicurious.com/recipes/food/views/...,5.00
17734,White Miso Peach/Pear/Apple,https://www.bbcgoodfood.com/recipes/thai-green...,3.75


In [188]:
#Добавим к датафрейму столбцы с ингридиентами
links_to_recipes_total = pd.concat([links_to_recipes_total, df], axis=1)
links_to_recipes_total.drop('title', axis=1, inplace=True)

Задание:
 - Сохраните новый датафрейм в CSV-файл, который вы будете использовать в своей основной программе.

In [190]:
#Cохраним итоговый датафрейм
links_to_recipes_total.to_csv('./data/url_result.csv', index=False)
pd.read_csv('./data/url_result.csv')

Unnamed: 0,url,rating_url,rating,calories,protein,fat,sodium,#cakeweek,#wasteless,22-minute meals,...,yellow squash,yogurt,yonkers,yuca,zucchini,cookbooks,leftovers,snack,snack week,turkey
0,https://www.epicurious.com/recipes/food/views/...,2.50,2.500,426.0,30.0,7.0,559.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,https://www.epicurious.com/recipes/food/views/...,4.60,4.375,403.0,18.0,23.0,1439.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,https://www.epicurious.com/recipes/food/views/...,3.80,3.750,165.0,6.0,7.0,165.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,https://www.epicurious.com/recipes/food/views/...,4.80,5.000,,,,,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,https://www.epicurious.com/recipes/food/views/...,3.40,3.125,547.0,20.0,32.0,452.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17731,https://www.epicurious.com/recipes/food/views/...,5.00,3.750,998.0,55.0,80.0,2027.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17732,https://www.allrecipes.com/recipe/70770/apple-...,5.00,4.375,671.0,22.0,28.0,583.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17733,https://www.epicurious.com/recipes/food/views/...,5.00,4.375,563.0,31.0,38.0,652.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
17734,https://www.bbcgoodfood.com/recipes/thai-green...,3.75,4.375,631.0,45.0,24.0,517.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [241]:
#В отдельный файл сохраним список всех ингридиентов
list_of_possible_ingredients =  df_new.columns[:-1]
joblib.dump(list_of_possible_ingredients, './data/list_of_possible_ingredients.pkl')

['./data/list_of_possible_ingredients.pkl']

## Выводы

Исходный датасет содержал 20052 рецептов и 680 признаков (ингридиенты). Из датасета удалены дубликаты и признаки, которые не относятся к ингридиентам.

Машинное обучение - регерессия:
 - RMSE наивного регрессора: 1.32
 - Лучшие результаты показал ансамбль StackingRegressor, RMSE на test - 1.24, что немного лучше наивного регрессора.

Машинное обучение - классификация:
 - После бинаризации целевой переменной в классы bad (0, 1) (невкусное), so-so (2, 3) (нормальное), great (4, 5) (вкусное):
   
   Accuracy наивного классификатора: 0.79

   Лучшая модель SVC, accuracy на test - 0.80
 - Изменение метрики на precision_score, average="weighted":

   Лучшая модель - GradientBoostingClassifier с precision 0.74 - будет использоваться далее.


Что можно улучшить:
 - улучшить прогноз за счет корректировки дисбаланса классов
 - оформление консольного приложения в виде telegram bot
 - дополнительный функционал (составление меню на день).