## 0. import packages

In [1]:
import pandas as pd
import numpy as np
from scipy.spatial import distance

import gensim
import fasttext

In [72]:
import re
import requests
from bs4 import BeautifulSoup

## 1. load cvs files

In [3]:
# read recipe csv file
df_recipe = pd.read_csv("recipes_test.csv")

In [60]:
df_recipe.head(3)

Unnamed: 0,name,id,minutes,contributor_id,submitted,tags,nutrition,n_steps,steps,description,n_ingredients,ingredients
0,pretty freaking awesome pulled pork crock pot,484624,1260,197045,2012-07-30,"['course', 'preparation', 'main-dish', 'easy']","[685.3, 55.0, 127.0, 83.0, 86.0, 62.0, 14.0]",23,"['1', '6 hours before cooking , mix all ingred...",in the search for the best pulled pork recipe ...,11,3 -4 lbs pork butt 2 large onions 1 cup ging...
1,acadia s eggplant parmesan,399771,60,804550,2009-11-17,"['60-minutes-or-less', 'time-to-make', 'course...","[206.7, 21.0, 23.0, 19.0, 20.0, 39.0, 3.0]",10,['slice eggplant cross wise into desired thick...,i love eggplant and i decided to make a layere...,6,salt 2 tablespoons butter (or margarine) 1 e...
2,african chicken curry,455721,60,400708,2011-05-10,"['curries', '60-minutes-or-less', 'time-to-mak...","[758.4, 92.0, 16.0, 16.0, 91.0, 148.0, 4.0]",7,"['heat olive oil in a large , heavy skillet ov...",this is an african (mozambique) curry that can...,10,"1 tablespoon olive oil 1 onion, chopped 2 ga..."


In [5]:
df_emissions = pd.read_csv("emissions.csv")

In [61]:
df_emissions.head(3)

Unnamed: 0,ingredient,variety,certification,emissions
0,mushrooms,canned,conventional,2.55
1,corn,canned,conventional,1.2
2,tomato,canned,conventional,1.87


## 2. load language model 

In [7]:
ft = fasttext.load_model('../model/fastText/cc.en.300.bin')



## 3. compute all word vectors of ingredients in emission df 

In [23]:
# list of unique ingredient name in db (total 441)
ing_db_words = df_emissions['ingredient'].unique()

In [24]:
# list of word-vector of ingredient name in db (total 441)
ing_db_vecs = []
for ing in ing_db_words:
    ing_db_vecs.append(ft.get_word_vector(ing))

## 4. scrap ingredients and quantities from food.com

### 4.1 generate query for url 

In [74]:
def generate_query(name,id):
    name = name.split(" ")
    name = '-'.join(name)
    query = name + "-" + str(id)
    return query

In [75]:
name = 'pretty freaking awesome pulled pork crock pot'
id = 484624

query = generate_query(name,id)

### 4.2 scrapping

In [77]:
def requestRecipeUrl(input:str):
    # # INPUT(string) : Recipe name from food.com
    # # OUTPUT(Constructor | BeautifulSoup) : BeautifulSoup constructor of Recipe URL html

    # EXPECTED_RECIPE_PAGE = 'food.com/recipe/'
    # input.index(EXPECTED_RECIPE_PAGE)
    # input = input.split('?')[0]
    
    """
    SWEET AND SPICY VEGETARIAN CHILI
    If scale is 0, there exist 0 quantity ingredients
    Solution : search with scale 10 and divide each quantity by 10 later
    """
    url = 'https://www.food.com/recipe/' + input + '?units=metric&scale=10'
    

    r = requests.get(url)
    html_doc = r.text
    soup = BeautifulSoup(html_doc, features="html.parser")

    return soup

In [79]:
soup = requestRecipeUrl(query)

In [80]:
def parseRecipeName(soup:BeautifulSoup):
    # # INPUT(Constructor | BeautifulSoup) : BeautifulSoup constructor of Recipe URL html
    # # OUTPUT(tuple | string, List of dictionary) : Recipe Name
    recipeTitle = soup.title.text.split(' - Food.com')[0]
    return recipeTitle

def parseRecipeIngrd(soup:BeautifulSoup):
    # # INPUT(Constructor | BeautifulSoup) : BeautifulSoup constructor of Recipe URL html
    # # OUTPUT(List of dictionary) : Ingredients List
    ingrdList = findIngrd(soup)
    return ingrdList

def findIngrd(soup:BeautifulSoup):
    # # INPUT(Constructor | BeautifulSoup) : BeautifulSoup constructor of Recipe URL html
    # # OUTPUT(List of dictionary) : Ingredient Information List
    # #                              {'ingredient' : string, 'quantity' : float, 'unit' : string}

    ingrdList = []

    ultag = soup.find('ul', {'class': re.compile('^ingredient-list')})
    i = 0
    for litag in ultag.find_all('li'):
        # print(f"{i+1}/{len(ultag.find_all('li'))}")
        quant_obj = litag.find('span', {'class': re.compile('quantity')})
        ingrd_obj = litag.find('span', {'class': re.compile('text')})

        # CHECK : the item of list is the information of an ingredient.
        if (quant_obj != None) and (ingrd_obj != None):
            q_str = quant_obj.text.split('-')
            q_str = q_str[0] # [CornerCase] e.g. 10-13
            q = float(q_str)/10 if q_str != '' else float(0) # scale down from 10 to 1

            # ASSUME : If there is no unit, the unit as 'ea'
            _u = ingrd_obj.text.split()[0]
            u = _u if (_u =='ml' or _u =='g') else 'ea'

            # CHECK : the ingredient has a url for a detail.
            ingrdPage = ingrd_obj.find('a')
            if (ingrdPage != None):
                sub_url = ingrdPage['href']
                # CHECK : the ingrdient url is a Ingredient Detail page
                if ('about' in sub_url):
                    sub_url ='https://www.food.com' + sub_url
                    sub_r = requests.get(sub_url)
                    sub_html_doc = sub_r.text
                    sub_soup = BeautifulSoup(sub_html_doc, features="html.parser")
                    ingrd = sub_soup.find('h1').text

                    ingrdList.append({'ingredient' : ingrd.lower(), 'quantity' : q, 'unit' : u})
                else:
                    True
                    # TODO [CORNER CASE] Recursive Call of Scraping Recipe Page. Skip now.
            else:
                # TODO [CORNER CASE] case for the ingredient doesn't have a url link
                True
        else:
            True
        i+=1

    return ingrdList

In [81]:
recipe = findIngrd(soup)

In [82]:
print(recipe)

[{'ingredient': 'onion', 'quantity': 0.3, 'unit': 'ea'}, {'ingredient': 'ginger ale', 'quantity': 39.431, 'unit': 'ml'}, {'ingredient': 'barbecue sauce', 'quantity': 75.598, 'unit': 'g'}, {'ingredient': 'brown sugar', 'quantity': 2.464, 'unit': 'ml'}, {'ingredient': 'sugar', 'quantity': 2.464, 'unit': 'ml'}, {'ingredient': 'paprika', 'quantity': 2.464, 'unit': 'ml'}, {'ingredient': 'salt', 'quantity': 2.464, 'unit': 'ml'}, {'ingredient': 'pepper', 'quantity': 0.8210000000000001, 'unit': 'ml'}, {'ingredient': 'mustard, seed and powder', 'quantity': 0.8210000000000001, 'unit': 'ml'}, {'ingredient': 'cayenne pepper', 'quantity': 0.205, 'unit': 'ml'}]


In [51]:
# input - list of dictionary : recipe id 
'''
recipe = [{'ingredient': 'chicken legs', 'quantity': 1.0, 'unit': 'ea'}, 
         {'ingredient': 'ricoatta cheese', 'quantity': 56.699, 'unit': 'g'}]
'''

## 5. compute cos sim and match to the most similar ingrediant in DB

In [83]:
def replace_sim_words(input):
    for i, ing_org in enumerate(input):
        print("===="*10)
        print("Find most similar words of <<",ing_org['ingredient'],">> in DB")
        distances = []
        ing_org_vec = ft.get_word_vector(ing_org['ingredient'])   

        for ing_db_vec in ing_db_vecs:
            cos_sim= distance.cosine(ing_org_vec, ing_db_vec)
            distances.append(cos_sim)

        min_dis = min(distances)
        sim_word = ing_db_words[distances.index(min_dis)]
        print("found! We will repace it to <<",sim_word,">> !")
        input[i]['ingredient']= sim_word
        
    return input
       

In [84]:
replace_sim_words(recipe)

Find most similar words of << onion >> in DB
found! We will repace it to << onion >> !
Find most similar words of << ginger ale >> in DB
found! We will repace it to << white cabbage >> !
Find most similar words of << barbecue sauce >> in DB
found! We will repace it to << fish sauce >> !
Find most similar words of << brown sugar >> in DB
found! We will repace it to << brown sugar >> !
Find most similar words of << sugar >> in DB
found! We will repace it to << sugar >> !
Find most similar words of << paprika >> in DB
found! We will repace it to << paprika >> !
Find most similar words of << salt >> in DB
found! We will repace it to << salt >> !
Find most similar words of << pepper >> in DB
found! We will repace it to << pepper >> !
Find most similar words of << mustard, seed and powder >> in DB
found! We will repace it to << granulated sugar >> !
Find most similar words of << cayenne pepper >> in DB
found! We will repace it to << bell pepper >> !


[{'ingredient': 'onion', 'quantity': 0.3, 'unit': 'ea'},
 {'ingredient': 'white cabbage', 'quantity': 39.431, 'unit': 'ml'},
 {'ingredient': 'fish sauce', 'quantity': 75.598, 'unit': 'g'},
 {'ingredient': 'brown sugar', 'quantity': 2.464, 'unit': 'ml'},
 {'ingredient': 'sugar', 'quantity': 2.464, 'unit': 'ml'},
 {'ingredient': 'paprika', 'quantity': 2.464, 'unit': 'ml'},
 {'ingredient': 'salt', 'quantity': 2.464, 'unit': 'ml'},
 {'ingredient': 'pepper', 'quantity': 0.8210000000000001, 'unit': 'ml'},
 {'ingredient': 'granulated sugar',
  'quantity': 0.8210000000000001,
  'unit': 'ml'},
 {'ingredient': 'bell pepper', 'quantity': 0.205, 'unit': 'ml'}]

## 6. calculate total CO2

In [85]:
def compute_emission(input):
    emission = 0
    for recipe in input:
        emission += recipe['quantity']*(df_emissions.loc[df_emissions['ingredient'] == recipe['ingredient'], 'emissions'].iloc[0])
    return emission
               
        

In [87]:
compute_emission(recipe)

216.9877662

## 7. run test file and generate result csv file 

In [90]:
recipe_names = df_recipe['name']
recipe_ids = df_recipe['id']
emissions = []

for name,id in zip(recipe_names,recipe_ids):
    query = generate_query(name,id)
    soup = requestRecipeUrl(query)
    recipe = findIngrd(soup)
    recipe_new = replace_sim_words(recipe)
    emission = compute_emission(recipe_new)
    emissions.append(emission)

ValueError: could not convert string to float: '1⁄4'

In [None]:
emission_series = pd.Series(emissions)
frame = { 'id': recipe_ids, 'total': emission_series}
result = pd.DataFrame(frame)
#Printing elements of Dataframe
print(result)

#export to csv file
