# 0. Dependencies

In [1]:
# Mount Google Drive inside the Colab runtime so the project files are reachable.
from google.colab import drive
drive.mount('/content/gdrive')
# Make the project folder the working directory; the relative paths used later
# in this notebook (./data, ./model, ./result) are resolved from here.
%cd /content/gdrive/MyDrive/!_2022_fall/PDSP/kaggle/

Mounted at /content/gdrive
/content/gdrive/MyDrive/!_2022_fall/PDSP/kaggle


### import package

In [2]:
import re
import requests
from bs4 import BeautifulSoup

import pandas as pd
from scipy.spatial import distance
from gensim.models import FastText
from textblob import TextBlob

from keras.preprocessing.text import Tokenizer
import re
import nltk
from string import punctuation 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import WordPunctTokenizer

# Download the NLTK resources this notebook relies on.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')
# English stop words as a set; text_postprocessing reads this to drop stop-word tokens.
# Needs the 'stopwords' corpus downloaded above.
en_stop = set(nltk.corpus.stopwords.words('english'))
# NOTE(review): presumably required by the TextBlob POS tagging in text_postprocessing — confirm.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

# 1. load dataset & model

In [3]:
# read recipe csv file
# df_recipe    : recipes to score; its 'name' and 'id' columns are used later to build food.com URLs
# df_emissions : ingredient table; its 'ingredient' and 'emissions' columns are read later
df_recipe = pd.read_csv("./data/recipes_test.csv")
df_emissions = pd.read_csv("./data/emissions.csv")

In [30]:
# Load the previously saved FastText model; its word vectors (loaded_model.wv)
# are used below to embed ingredient names.
loaded_model = FastText.load('./model/fooddotcom_model')
# Print the model summary as a quick sanity check that loading worked.
print(loaded_model)

FastText(vocab=12232, size=100, alpha=0.025)


# 2. data preparation

### build word-vector of ingredients in DB

In [31]:
# unique ingredient names found in the emissions table (441 according to the original note)
ing_db_words = df_emissions['ingredient'].unique()

# one FastText vector per DB ingredient name, in the same order as ing_db_words
ing_db_vecs = [loaded_model.wv.get_vector(db_name) for db_name in ing_db_words]

### scraping clean data

In [6]:
def generate_query(name,id):
    """Build the food.com URL slug of a recipe.

    Every single space in `name` becomes a hyphen (consecutive spaces therefore
    yield consecutive hyphens), and '-<id>' is appended.
    """
    slug = name.replace(" ", "-")
    return slug + "-" + str(id)

def requestRecipeUrl(input:str):
    """Download a food.com recipe page rendered in metric units.

    The recipe page is requested once to read its serving count and the final
    URL reported by the response, then requested again with ``units=metric``
    and ``scale=<servings>`` added to that URL.

    INPUT(string)         : recipe slug, e.g. the value built by generate_query
    OUTPUT(BeautifulSoup) : parsed html of the metric version of the page
    RAISES : requests.HTTPError for an error status code,
             requests.Timeout when the server does not answer in time,
             ValueError when the serving count is not present in the page.
    """
    # local import: quote is only needed here for percent-encoding the scale value
    from urllib.parse import quote

    print(input)
    # request by url 'https://www.food.com/recipe/' + name + id
    url = 'https://www.food.com/recipe/' + input
    # a timeout keeps one stalled connection from hanging the whole run
    r = requests.get(url, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, features="html.parser")

    # NOTE(review): this looks like a build-generated class name and may change
    # when the site is redeployed — confirm it still matches the servings element.
    serves_tag = soup.find(class_="value svelte-1o10zxc")
    serves = serves_tag.string if serves_tag is not None else None
    if serves is None:
        raise ValueError("serving count not found in page: " + r.url)
    serves = serves.strip()
    print(serves)

    # percent-encode the value so a fraction such as 1/10 is sent as 1%2F10
    serves = quote(serves, safe='')
    print(serves)

    # r.url is the URL of the response actually received; extend its query
    # string instead of assuming it has none
    separator = '&' if '?' in r.url else '?'
    url_metric = r.url + separator + 'units=metric&scale=' + serves
    print(url_metric)

    # create html soup object to scrape
    r = requests.get(url_metric, timeout=30)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, features="html.parser")

    return soup

In [21]:
def text_postprocessing(word):
  """Reduce a raw ingredient string to its nouns.

  Steps: strip non-word characters and digits, drop isolated single letters,
  collapse whitespace, lowercase, drop tokens of length <= 2 and English stop
  words (en_stop), then keep only the tokens TextBlob tags as 'NN' or 'NNS'.

  INPUT  : any value; it is converted with str() first
  OUTPUT(string) : the kept nouns joined by single spaces; an empty string
                   when no token is tagged as a noun
  """
  # remove special characters
  word = re.sub(r'\W', ' ', str(word))
  # remove numbers
  word = re.sub('[0-9]+', '', word)
  # remove single characters surrounded by whitespace
  word = re.sub(r'\s+[a-zA-Z]\s+', ' ', word)
  # remove a single character at the very start of the text
  # (the caret must be an unescaped anchor; escaped it only matches a literal '^')
  word = re.sub(r'^[a-zA-Z]\s+', ' ', word)
  # substituting multiple spaces with single space
  word = re.sub(r'\s+', ' ', word)
  # converting to lowercase
  word = word.lower()
  # remove short tokens and stop words
  tokens = word.split()
  clean = [token for token in tokens if len(token) > 2 and token not in en_stop]
  word = " ".join(clean)
  print(word)
  # leave only nouns (singular 'NN' and plural 'NNS' tags)
  word = TextBlob(word)
  pos = word.tags
  print(pos)
  word = [p[0] for p in pos if p[1] in ('NN', 'NNS')]
  print(word)
  word = " ".join(word)
  print(word)

  return word


In [11]:
def parseRecipeName(soup:BeautifulSoup):
    # # INPUT(BeautifulSoup) : parsed html of a recipe page
    # # OUTPUT(string) : text of the <title> tag up to the first ' - Food.com'
    # #                  (the whole title when that marker is absent)
    page_title = soup.title.text
    recipe_name, _sep, _rest = page_title.partition(' - Food.com')
    return recipe_name

def parseRecipeIngrd(soup:BeautifulSoup):
    # # INPUT(BeautifulSoup) : parsed html of a recipe page
    # # OUTPUT(List of dictionary) : ingredient list exactly as produced by findIngrd
    # Thin wrapper kept as the parsing entry point; all the work happens in findIngrd.
    return findIngrd(soup)

def _parse_quantity(text):
    """Convert the text of a quantity element into a number.

    Accepted forms: plain numbers ("2", "236.59"), fractions written with "/"
    or the unicode fraction slash "⁄" ("1/2", "1⁄2"), mixed numbers ("1 1⁄2"),
    ranges ("1360.77-1814.36", returned as the average of both ends) and blank
    text (returned as 0). Anything else raises ValueError.
    """
    # treat both slash characters alike and ignore surrounding whitespace
    text = text.replace("⁄", "/").strip()
    if not text:
        return 0
    # "1 / 2" must stay one fraction, so drop the spaces around the slash
    text = re.sub(r'\s*/\s*', '/', text)

    def _number(token):
        # sum of the whitespace-separated pieces: "1 1/2" -> 1 + 0.5
        pieces = token.split()
        if not pieces:
            raise ValueError("empty quantity in: " + repr(text))
        total = 0.0
        for piece in pieces:
            if "/" in piece:
                num, den = piece.split("/", 1)
                total += float(num) / float(den)
            else:
                total += float(piece)
        return total

    # a range is split first so that each end may itself be a fraction
    if "-" in text:
        low, high = text.split("-", 1)
        return (_number(low) + _number(high)) / 2
    return _number(text)


def findIngrd(soup:BeautifulSoup):
    # # INPUT(BeautifulSoup) : parsed html of a recipe page
    # # OUTPUT(List of dictionary) : Ingredient Information List
    # #                              {'ingredient' : string, 'quantity' : float, 'unit' : string}
    # # Returns an empty list when the page has no ingredient list.
    # # Ingredients that link to an 'about' page take their name from that page's <h1>
    # # (one extra HTTP request each); ingredients without a link are cleaned with
    # # text_postprocessing; ingredients linking anywhere else are skipped.

    ingrdList = []

    ultag = soup.find('ul', {'class': re.compile('^ingredient-list')})
    if ultag is None:
        print("ingredient list not found in page")
        return ingrdList

    for litag in ultag.find_all('li'):
        quant_obj = litag.find('span', {'class': re.compile('quantity')})
        ingrd_obj = litag.find('span', {'class': re.compile('text')})

        # CHECK : the item of list is the information of an ingredient.
        if quant_obj is None or ingrd_obj is None:
            continue

        print(quant_obj.text)
        q = _parse_quantity(quant_obj.text)

        # ASSUME : If there is no unit, the unit as 'ea'
        words = ingrd_obj.text.split()
        _u = words[0] if words else ''
        u = _u if _u in ('ml', 'g') else 'ea'

        # CHECK : the ingredient has a url for a detail.
        ingrdPage = ingrd_obj.find('a')
        if ingrdPage is None:
            # TODO [CORNER CASE] case for the ingredient doesn't have a url link
            ingrd = ingrd_obj.text
            print(ingrd)
            ingrd = text_postprocessing(ingrd)
            ingrdList.append({'ingredient' : ingrd, 'quantity' : q, 'unit' : u})
            continue

        sub_url = ingrdPage['href']
        # CHECK : the ingredient url is an Ingredient Detail page
        if 'about' not in sub_url:
            # TODO [CORNER CASE] Recursive Call of Scraping Recipe Page. Skip now.
            continue

        sub_url = 'https://www.food.com' + sub_url
        # a timeout keeps one stalled ingredient page from hanging the whole run
        sub_r = requests.get(sub_url, timeout=30)
        sub_soup = BeautifulSoup(sub_r.text, features="html.parser")
        ingrd = sub_soup.find('h1').text

        ingrdList.append({'ingredient' : ingrd.lower(), 'quantity' : q, 'unit' : u})

    return ingrdList

# 3. find similar words 

In [33]:
def replace_sim_words(input, threshold=0.1):
    """Map every scraped ingredient onto the closest ingredient name of the DB.

    Each ingredient name is embedded with loaded_model and compared with every
    vector in ing_db_vecs using cosine distance (0 means identical direction).
    The closest DB name replaces the original when the distance is at most
    `threshold`; otherwise the ingredient is set to the string "none".

    INPUT  : input     - list of dicts with an 'ingredient' key; modified in place
             threshold - largest cosine distance still accepted as a match
    OUTPUT : the same list object that was passed in
    """
    for i, ing_org in enumerate(input):
        print("===="*10)
        print("Find most similar words of <<",ing_org['ingredient'],">> in DB")

        # a blank name has nothing to embed, so it cannot be matched
        if not str(ing_org['ingredient']).strip():
            print("empty ingredient name, we will mark as none from the recipe")
            input[i]['ingredient']= "none"
            continue

        ing_org_vec = loaded_model.wv.get_vector(ing_org['ingredient'])

        # cosine distance to every DB ingredient, in the order of ing_db_words
        distances = [distance.cosine(ing_org_vec, ing_db_vec) for ing_db_vec in ing_db_vecs]

        min_dis = min(distances)
        sim_word = ing_db_words[distances.index(min_dis)]
        # note: the value printed after "similarity is" is a distance (smaller = closer)
        print("found! We will repace it to <<",sim_word,">> !  similarity is", min_dis)

        # written as "not <=" so that a NaN distance is rejected rather than accepted
        if not (min_dis <= threshold):
          print("but it looks not so similar :( we will mark as none from the recipe")
          input[i]['ingredient']= "none"
        else:
          input[i]['ingredient']= sim_word

    return input
       

# 4. Compute CO2 emission

In [19]:
def compute_emission(input):
    """Total emission of one recipe.

    Entries whose 'ingredient' is the string 'none' are ignored. For every other
    entry the first row of df_emissions with the same ingredient name supplies
    the 'emissions' value, which is multiplied by the entry's 'quantity'.
    Returns the sum of those products (0 for an empty recipe).
    """
    kept = [item for item in input if item['ingredient'] != 'none']
    emission = 0
    for item in kept:
        same_name = df_emissions['ingredient'] == item['ingredient']
        factor = df_emissions.loc[same_name, 'emissions'].iloc[0]
        emission += item['quantity']*(factor)
    return emission
               

# 5. test file run

In [34]:
# Run the whole pipeline for every recipe in the test file and collect one
# emission value per recipe, in the row order of df_recipe.
recipe_names = df_recipe['name']
recipe_ids = df_recipe['id']
emissions = []

# Each iteration performs several HTTP requests (recipe page twice, plus one
# page per linked ingredient), so this cell is slow and needs network access.
# Note: the loop variable `id` shadows the Python builtin of the same name.
for name,id in zip(recipe_names,recipe_ids):
    # slug '<name>-<id>' used in the recipe URL
    query = generate_query(name,id)
    # parsed html of the metric version of the recipe page
    soup = requestRecipeUrl(query)
    # list of {'ingredient', 'quantity', 'unit'} dicts scraped from the page
    recipe = findIngrd(soup)
    # ingredient names replaced by DB names, or "none" when nothing is close enough
    recipe_new = replace_sim_words(recipe)
    emission = compute_emission(recipe_new)
    emissions.append(emission)

pretty-freaking-awesome--pulled-pork--crock-pot-484624
6
6
https://www.food.com/recipe/pretty-freaking-awesome-pulled-pork-crock-pot-484624?units=metric&scale=6
1360.77-1814.36

  g    pork butt

pork butt
[('pork', 'NN'), ('butt', 'NN')]
['pork', 'butt']
pork butt
2
236.59
453.59
14.79
14.79
14.79
14.79
4.92
4.92
1.23
Find most similar words of << pork butt >> in DB
found! We will repace it to << pork >> !  similarity is 0.14629054069519043
but it looks not so similar :( we will mark as none from the recipe
Find most similar words of << onion >> in DB
found! We will repace it to << onion >> !  similarity is 0
Find most similar words of << ginger ale >> in DB
found! We will repace it to << ginger >> !  similarity is 0.09138751029968262
Find most similar words of << barbecue sauce >> in DB
found! We will repace it to << bbq sauce >> !  similarity is 0.1862044334411621
but it looks not so similar :( we will mark as none from the recipe
Find most similar words of << brown sugar >> in DB
f

# 6. export test result to csv

In [35]:
# Build the submission table: one row per recipe with its id and predicted emission.
# NOTE(review): the division by 1000 looks like a unit conversion (e.g. g -> kg) —
# confirm against the units of the 'emissions' column.
emission_series = pd.Series(emissions)/1000
frame = { 'Id': recipe_ids, 'Predicted': emission_series}
result = pd.DataFrame(frame)
#Printing elements of Dataframe
print(result)
# Written without the index column.
result.to_csv('./result/result10.csv', index=False)  # NOTE(review): originally labelled "threshold 0.2", but replace_sim_words uses a 0.1 cutoff — confirm which setting produced this file


        Id  Predicted
0   484624   0.430138
1   399771   1.333170
2   455721   3.842630
3   454994   0.610500
4   361646   1.005724
5    26219   0.059421
6    53252   3.024486
7   186382   1.372856
8   130632  10.519877
9   257067   0.598148
10   28149   1.623842
11  112082   1.106923
12  154088   3.765148
13  313094   0.899948
14  382812   1.278242
15   64322   1.149287
16   10328  15.545166
17  274736   0.454920
18  424780   0.248795
19  390247   1.313074
20  513985   2.692046
21  206463   1.117212
22   38596   0.120678
23   23706   5.151267
24  292213   0.309960
25  370726   4.733672
26  240843   8.642292
27   49249   0.557817
28  110813   0.041742
29   66190   4.542498
30  252674  14.339432
31  512810   0.091080
32  113725   2.341371
33   41874   7.674466
34   61210   0.255851
35   41725   4.564554
36  177958   1.429016
37   53907   0.103609
38  137627   0.224852
39  485002   0.908696
40   19441   0.954266
41  169079   0.697364
42  150554   0.472167
43   37000   1.078887
44   91195