In [5]:
import pandas as pd 
from bs4 import BeautifulSoup
import requests
import re
import random

# Wine table, indexed by its 'wine' column. boolToPredict reads one column of
# this frame per ingredient category.
df = pd.read_csv('wine_pairings_v7.csv', index_col = 'wine')

# Second frame without the rows whose 'style' is 'bold red' or 'medium red';
# boolToPredictVeg uses it for recipes in which no protein keyword was found.
no_protein_remove = ['bold red', 'medium red']
df2 = df[~df['style'].isin(no_protein_remove)]



In [36]:
# Integer token for each protein category. proteinTokenize looks the flagged
# categories up here and returns the smallest token it finds.
protein_tokens = {'red_meat': 1,
 'cured_meat': 8,
 'game': 2,
 'pork': 3,
 'poultry': 4,
 'fish': 5,
 'shellfish': 6,
 'mollusk': 7}
protein_tokens

{'red_meat': 1,
 'cured_meat': 8,
 'game': 2,
 'pork': 3,
 'poultry': 4,
 'fish': 5,
 'shellfish': 6,
 'mollusk': 7}

## model build 

In [7]:
def recipe_info(url, timeout=30):
    """Download a recipe page and return the text of its ingredient spans.

    Parameters
    ----------
    url : str
        Address of the recipe page.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    list of str
        Whitespace-stripped text of every matching ``<span>``. The list is
        empty when neither of the two known class names is present.

    Raises
    ------
    requests.RequestException
        Propagated from ``requests.get`` (connection failure, timeout, ...).
    """
    recipe_search = requests.get(url, timeout=timeout).text
    soup_recipe = BeautifulSoup(recipe_search, "html.parser")

    # find_all() returns an empty result instead of raising when nothing
    # matches, so the second class name has to be tried explicitly.
    ingr_soup = soup_recipe.find_all("span", class_='ingredients-item-name')
    if not ingr_soup:
        ingr_soup = soup_recipe.find_all("span", class_="recipe-ingred_txt added")

    ## ingredients
    return [tag.text.strip() for tag in ingr_soup]
def listToString(s):
    """Return the strings in ``s`` joined into one string, separated by single spaces."""
    return " ".join(s)

def _match_categories(s, lookup):
    """Flag every category of ``lookup`` that has a keyword occurring in ``s``.

    Parameters
    ----------
    s : str
        Ingredient text to search.
    lookup : dict
        Maps a category name to a list of lower-case keywords.

    Returns
    -------
    dict
        ``{category: bool}`` in the key order of ``lookup``.

    Notes
    -----
    Matching is plain substring containment, so a short keyword also fires
    inside longer words (for example 'ham' inside 'hamburger').
    """
    # All keywords are lower-case; fold the text so capitalised ingredient
    # names are not missed. Non-string input is passed through untouched.
    text = s.lower() if isinstance(s, str) else s
    return {category: any(keyword in text for keyword in keywords)
            for category, keywords in lookup.items()}


def ing_classifier(s):
    """Return ``{category: bool}`` for every category in ``bin_dict``."""
    return _match_categories(s, bin_dict)


# classifiers for ingredient subsets, helps to identify vegetarian dishes and unpairable recipes
def protein_classifier(s):
    """Return ``{category: bool}`` for the categories in ``protein_dict``."""
    return _match_categories(s, protein_dict)


def veg_classifier(s):
    """Return ``{category: bool}`` for the categories in ``veg_dict``."""
    return _match_categories(s, veg_dict)




In [8]:
# Keyword lists per ingredient category. Each list is written exactly once;
# the three lookup tables used by the classifiers (protein_dict, bin_dict,
# veg_dict) are assembled from these groups further down, so a keyword added
# here appears in every table that carries its category.
# Keywords are lower-case. Known misspellings are kept so earlier matches still
# fire, and the correct spelling is listed next to them.

#meat
_PROTEIN_KEYWORDS = {
    'red_meat': ['beef', 'hamburger', 'steak', 'ground round', 'veal', 'bison', 'buffalo', 'lamb', 'mutton', 'goat', 'venison', 'deer', 'elk', 'caribou', 'moose'],
    'pork': ['pork', 'boar', 'pig', 'bratwurst', 'italian sausage', 'ham', 'knackwurst', 'frankfurter', 'hot dog', 'sausage'],
    'poultry': ['chicken', 'turkey'],
    'game': ['duck', 'pheasant', 'rabbit', 'quail', 'lapin', 'goose', 'grouse'],
    'cured_meat': ['charcuterie', 'salumi', 'bacon', 'pancetta', 'mortadella', 'salami', 'pepperoni', 'guanciale', 'capocollo', 'soppressata', 'pastrami', 'jamon iberico', 'bresaola', 'nduja', 'jamon serrano'],
    'fish': ['salmon', 'tuna', 'trout', 'bass', 'seabass', 'snapper', 'cod', 'steelhead', 'yellowtail', 'hamachi', 'kampachi', 'amberjack', 'yellowjack', 'yellow jack', 'tilapia', 'mahi-mahi', 'flounder', 'halibut', 'swordfish', 'anchovy', 'sardine', 'catfish', 'grouper', 'haddock', 'mackerel', 'perch', 'whitefish', 'white fish', 'smelt'],
    'shellfish': ['crab', 'lobster', 'crawfish', 'crayfish', 'langostino', 'shrimp', 'prawn', 'dungeness'],
    'mollusk': ['oyster', 'cuttlefish', 'clam', 'scallop', 'octopus', 'squid', 'conch', 'mussel', 'periwinkle'],
}

#herb
_HERB_KEYWORDS = {
    'fresh_green': ['cilantro', 'basil', 'thai basil', 'mint', 'chervil', 'peppermint', 'borage', 'chamomile'],
    'earthy_green': ['parsley', 'oregano', 'thyme', 'tarragon', 'marjoram', 'dill'],
    'bitter_floral': ['sage', 'rosemary', 'lavender', 'bay leaf', 'pine', 'fir'],
    'savory_brown': ['coriander', 'cumin', 'caraway', 'curry powder'],
    'sharp_spicy': ['mustard', 'horseradish', 'szechuan pepper', 'wasabi'],
    'perfumed_citrus_spicy': ['ginger', 'gingerroot', 'sorrel', 'galangal', 'turmeric', 'cardamom', 'saffron'],
    'smoky_spicy': ['paprika', 'cayenne pepper', 'chili powder', 'chili pepper', 'ancho pepper', 'chili flakes', 'ancho chili', 'alleppo pepper', 'aleppo pepper', 'adobo', 'chipotle', 'chilpotle'],
    'umami_spicy': ['white pepper', 'pink pepper', 'black pepper', 'green pepper', 'white peppercorn', 'pink peppercorn', 'black peppercorn', 'green peppercorn', 'soy sauce', 'olive'],
    'baking_spice': ['cinnamon', 'clove', 'allspice', 'fenugreek', 'vanilla', 'nutmeg'],
    'anise': ['anise', 'licorice', 'star anise', 'fennel', 'celery'],
}

#cheese
_CHEESE_KEYWORDS = {
    'butter_cream': ['butter', 'heavy cream', 'cream cheese', 'sour cream', 'half and half', 'margarine'],
    'fresh_salty': ['goat cheese', 'chevre', 'feta', 'cotilla', 'cotija', 'queso fresco', 'oaxaca', 'halloumi', 'fromage blanc', 'cottage cheese', 'sour cream', 'paneer'],
    'delicate_nutty': ['brie', 'comte', 'comté', 'gruyere', 'havarti', 'mascarpone', 'mozzarella', 'creme fraiche', 'crème fraîche', 'ricotta', 'swiss cheese', 'emmental', 'raclette', 'colby', 'jack cheese', 'provolone', 'burrata', 'triple cream', 'morbier', 'camembert', 'boursin', 'fontina'],
    'strong_firm': ['asiago', 'cheddar', 'gouda', 'manchego', 'parmesan', 'pecorino', 'cheshire', 'cantal', 'munster', 'parmagiano', 'parmigiano', 'iberico cheese', 'queso iberico', 'quexo iberico', 'idiazabal'],
    'pungent': ['blue cheese', 'epoisses', 'époisses', 'gorgonzola', 'roquefort', 'stilton', 'taleggio', 'valdeon'],
}

#veg
_VEG_KEYWORDS = {
    'acid': ['lemon', 'lemons', 'lime', 'limes', 'vinegar', 'tomato', 'tomatoes', 'white wine'],
    'green_veg': ['lettuce', 'cabbage', 'spinach', 'kale', 'watercress', 'brussels sprout', 'zucchini', 'okra', 'asparagus', 'artichoke', 'cucumber', 'collard', 'chard', 'green bean', 'endive', 'broccolini', 'avocado', 'romanesco', 'cauliflower'],
    'root_veg': ['sweet potato', 'squash', 'pumpkin', 'carrot', 'carrots', 'turnip', 'turnips', 'beet', 'beets', 'radish', 'radishes', 'parsnip', 'parsnips', 'daikon', 'rutabaga', 'salsify', 'yam', 'yuca', 'yucca', 'butternut', 'gourd'],
    'allium': ['onion', 'garlic', 'shallot', 'chive', 'scallion', 'leek', 'ramps'],
    'nightshade': ['potato', 'bell pepper', 'tomato', 'tomatoes', 'eggplant', 'tomatillo', 'potatoes', 'bell peppers', 'pizza sauce'],
    'hot_pepper': ['jalapeno', 'jalapeño', 'habanero', 'birdseye', 'thai chili', 'chili pepper', 'chilies', 'tabasco', 'chile paste', 'chili paste'],
    'bean': ['bean', 'chickpea', 'lentil', 'edamame', 'pea'],
    'fungi': ['mushroom', 'mushrooms', 'chantarelle', 'chanterelle', 'shitake', 'shiitake', 'crimini', 'cremini', 'oyster mushroom', 'porcini', 'maitake', 'portobello', 'champignon', 'boletus', 'hen of the woods', 'truffle'],
}

# Protein categories only.
protein_dict = {category: list(words) for category, words in _PROTEIN_KEYWORDS.items()}

# Every category, in the order protein -> herb -> cheese -> veg.
bin_dict = {category: list(words)
            for group in (_PROTEIN_KEYWORDS, _HERB_KEYWORDS, _CHEESE_KEYWORDS, _VEG_KEYWORDS)
            for category, words in group.items()}

# Vegetable categories followed by the protein categories (no herbs or cheese).
veg_dict = {category: list(words)
            for group in (_VEG_KEYWORDS, _PROTEIN_KEYWORDS)
            for category, words in group.items()}

In [31]:
# Scrape one recipe page (network call) and join its ingredient lines into a
# single string.
t = recipe_info('https://www.allrecipes.com/recipe/18074/marinated-flank-steak/?internalSource=hub%20recipe&referringContentType=Search&clickId=cardslot%204')
s = listToString(t)
# NOTE(review): both classifiers are run on a hard-coded sample string, not on
# the scraped text in `s` -- confirm this is intended.
test = ing_classifier('steak, chicken')
protest = protein_classifier('steak, chicken')
test

{'red_meat': True,
 'pork': False,
 'poultry': True,
 'game': False,
 'cured_meat': False,
 'fish': False,
 'shellfish': False,
 'mollusk': False,
 'fresh_green': False,
 'earthy_green': False,
 'bitter_floral': False,
 'savory_brown': False,
 'sharp_spicy': False,
 'perfumed_citrus_spicy': False,
 'smoky_spicy': False,
 'umami_spicy': False,
 'baking_spice': False,
 'anise': False,
 'butter_cream': False,
 'fresh_salty': False,
 'delicate_nutty': False,
 'strong_firm': False,
 'pungent': False,
 'acid': False,
 'green_veg': False,
 'root_veg': False,
 'allium': False,
 'nightshade': False,
 'hot_pepper': False,
 'bean': False,
 'fungi': False}

In [75]:
#wineSeries = pd.Series()
def _top_wines(flags, frame, per_category=100, n_results=3):
    """Rank the wines in ``frame`` for the categories flagged in ``flags``.

    For each flagged category the ``per_category`` highest-valued rows of that
    column are kept. A wine's total is the sum of its kept values, and a wine
    that is not among the kept rows of a category contributes 0 for it.

    Parameters
    ----------
    flags : dict
        ``{category: bool}``; each truthy category must be a column of ``frame``.
    frame : pandas.DataFrame
        Wine table with one column per category.
        NOTE(review): assumes the index labels are unique -- confirm.
    per_category : int, optional
        Rows kept per flagged column (default 100).
    n_results : int, optional
        Number of index labels returned (default 3).

    Returns
    -------
    numpy.ndarray
        Up to ``n_results`` index labels, highest total first; empty when no
        category is flagged.
    """
    columns = [frame[key].sort_values(ascending=False).iloc[:per_category]
               for key, present in flags.items() if present]
    if not columns:
        # pd.concat raises on an empty list; an empty result is returned instead.
        return pd.Index([], dtype=object).values

    # One concat instead of growing a frame with DataFrame.append, which was
    # removed in pandas 2.0. The outer join leaves NaN where a wine missed a
    # category's top rows; those count as 0.
    scores = pd.concat(columns, axis=1).fillna(0)
    totals = scores.sum(axis=1)
    return totals.sort_values(ascending=False).index.values[:n_results]


def boolToPredict(t):
    """Return the three best-scoring wine labels from ``df`` for the flags in ``t``."""
    return _top_wines(t, df)


#Same function, just run on df2 (no big reds)
def boolToPredictVeg(t):
    """Return the three best-scoring wine labels from ``df2`` for the flags in ``t``."""
    return _top_wines(t, df2)
#p = boolToPredict(test) 

def proteinTokenize(t):
    """Reduce protein flags to a single integer token.

    Parameters
    ----------
    t : dict
        ``{category: bool}``, as produced by ``protein_classifier``.

    Returns
    -------
    int
        The smallest ``protein_tokens`` value among the flagged categories, or
        9 when no flagged category has a token (this includes an empty ``t``).
    """
    no_protein_token = 9
    flagged = (protein_tokens.get(key) for key, present in t.items() if present)
    # Categories missing from protein_tokens have no token and are skipped.
    tokens = [token for token in flagged if token is not None]
    return min(tokens, default=no_protein_token)
# Token for the sample flags in `protest` (built earlier with protein_classifier).
p = proteinTokenize(protest)

In [76]:
p

1

## scrape and test

In [21]:
# Recipe table. The 'ingredients' column is passed to str_clean as text, and
# 'name' holds the recipe names.
df_food = pd.read_csv("cleaned_ingredients.csv")
food_raw = df_food["ingredients"]
rnames = df_food['name']
# Disabled: restrict the names to rows 2..201.
#rnames = rnames[2:202]
# `sample` is the full ingredients column under another name; nothing is subset here.
sample = food_raw

In [22]:
rnames[2]

'lou s fabulous bruschetta'

In [23]:
# Raw ingredients entry with label 2, used as a single worked example below.
l1 = sample[2]
def str_clean(s):
    """Flatten the text of a bracketed, quoted, comma-separated list into one string.

    Leading and trailing square brackets are stripped, every apostrophe is
    removed, the text is split on commas, and the pieces are joined with a
    single space. The space that followed each comma is kept, so items end up
    separated by two spaces.
    """
    unbracketed = s.strip("[]")
    pieces = unbracketed.replace("'", "").split(",")
    return " ".join(pieces)
# Worked example: clean and classify the single sample entry.
s1 = str_clean(l1)
test1 = ing_classifier(s1)
# Cleaned ingredient string for every recipe, in the order of `sample`.
food_clean = [str_clean(raw) for raw in sample]


['fresh mushrooms  butter  boneless skinless chicken breast halves  flour  marsala  chicken broth  salt  mozzarella cheese  parmesan cheese  green onion',
 'beef eye round  vegetable oil  dried thyme leaves  salt  pepper  ready-to-serve beef broth  burgundy wine  garlic  baby carrots  frozen whole pearl onion  cornstarch  frozen sugar snap peas',
 'french baguette  butter  garlic powder  ricotta cheese  parmesan cheese  salt  tomatoes  olive oil  fresh basil leaves',
 'fresh lemon juice  olive oil  black beans  fresh corn kernels  plum tomato  scallion  fresh parsley leaves  cayenne  boston lettuce leaves',
 'olive oil  italian sausage  white onions  garlic  red pepper flakes  salt  black pepper  sun-dried tomato  white wine  cabbage  chicken stock  garlic toast',
 'butter  flour  salt  margarine  egg yolk  milk  water  shrimp  onion  tomatoes  parsley  lemon  pepper  tabasco sauce  garlic  cornstarch  olive oil  cumin  bell peppers',
 'brown rice  chicken broth  unsalted butter  orega

In [24]:
# One entry per recipe in each list; both are later assigned as DataFrame
# columns, so they must stay the same length as food_clean.
scrapePredicts = []
proteinTokens = []
for f in food_clean:
    protein = protein_classifier(f)
    veg = veg_classifier(f)

    if not any(veg.values()):
        # no protein or veg ingredients: nothing to pair against
        scrapePredicts.append('invalid pairing')
        # Placeholder keeps proteinTokens aligned with scrapePredicts; without
        # it the two lists drift apart and the column assignment fails.
        proteinTokens.append(None)
    elif not any(protein.values()):
        # no protein ingredients: predict from the frame without the big reds
        scrapePredicts.append(boolToPredictVeg(ing_classifier(f)))
        proteinTokens.append(0)
    else:
        # original path: full wine table plus a protein token
        scrapePredicts.append(boolToPredict(ing_classifier(f)))
        proteinTokens.append(proteinTokenize(protein))

assert len(scrapePredicts) == len(proteinTokens) == len(food_clean)

KeyboardInterrupt: 

In [None]:
# Recipe names plus the cleaned ingredients, the predictions, the protein
# tokens and the nutrition columns copied from df_food under shorter names.
testPredictions = pd.DataFrame(rnames).assign(**{
    'ingredients': food_clean,
    'Predicts': scrapePredicts,
    'protein_tokens': proteinTokens,
    'calories': df_food['calories (number)'],
    'fat (pdv)': df_food['total fat (PDV)'],
    'sugar (pdv)': df_food['sugar (PDV)'],
    'sodium (pdv)': df_food['sodium (PDV)'],
    'protein (pdv)': df_food['protein (PDV)'],
    'saturated fat (pdv)': df_food['saturated fat (PDV)'],
    'carbohydrates (pdv)': df_food['carbohydrates (PDV)'],
})

In [None]:
proteinTokens