In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
%cd /content/gdrive/MyDrive/!_2022_fall/PDSP/kaggle/

/content/gdrive/MyDrive/!_2022_fall/PDSP/kaggle


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Prepare dataset

In [4]:
# read recipe csv file
df_train = pd.read_csv("RAW_recipes.csv")

In [5]:
df_train.shape

(231637, 12)

In [6]:
steps = df_train['steps']
reviews = df_train['description']
ingredients = df_train['ingredients']

### normalization & tokenizing

In [None]:
from keras.preprocessing.text import Tokenizer
import re
import nltk
from string import punctuation 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import WordPunctTokenizer

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')
en_stop = set(nltk.corpus.stopwords.words('english'))


In [8]:
tokenizer = nltk.WordPunctTokenizer()

def preprocessing(document):
  # remove special characters
  document = re.sub(r'\W', ' ', str(document))

  # remove numbers 
  document = re.sub('[0-9]+', '', document)

  # remove single characters
  document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
  document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)

  # substituting multiple spaces with single space
  document = re.sub(r'\s+', ' ', document, flags=re.I)

  # converting to lowercase
  dodument = document.lower()

  # tokenizing 
  document = tokenizer.tokenize(document) 

  # remove stop words 
  document = [w for w in document if len(w) > 2 if not w in en_stop]
  '''
  # lemmatization 
  tokens = document.split()
  stemmer = WordNetLemmatizer()
  tokens = [stemmer.lemmatize(word) for word in tokens]
  tokens = [word for word in tokens if word not in en_stop]
  tokens = [word for word in tokens if len(word)>3]

  res = ' '.join(tokens)
  '''

  return document

In [9]:
# preprocessing dataset 
preprocessed_steps = [preprocessing(sentence) for sentence in steps if sentence.strip() != '']

In [None]:
preprocessed_steps[0]

## Training fastText 

learn more about the gensim fastText model parameter : https://radimrehurek.com/gensim/models/fasttext.html

quick tutorial : https://github.com/PacktPublishing/fastText-Quick-Start-Guide/blob/master/chapter5/fasttext%20with%20gensim.ipynb

In [12]:
from gensim.models import FastText

In [14]:
model = FastText(preprocessed_steps, size=100, window=5, min_count=5, workers=4,sg=1)

In [22]:
model.wv.most_similar("macadamia nut")

[('macadamia', 0.9698609113693237),
 ('macadamias', 0.9531890153884888),
 ('walnuts', 0.734894871711731),
 ('cashew', 0.7241073846817017),
 ('almonds', 0.7235787510871887),
 ('nuts', 0.7228017449378967),
 ('pistachio', 0.7166469097137451),
 ('pistachios', 0.7151767015457153),
 ('pecans', 0.6972396969795227),
 ('cashewnuts', 0.6871625781059265)]

In [18]:
model.wv.most_similar("riccota cheese")

[('riccota', 0.8869120478630066),
 ('cheese', 0.8205503821372986),
 ('ricotta', 0.7524741888046265),
 ('cheeses', 0.7492947578430176),
 ('asiago', 0.7462842464447021),
 ('mozzeralla', 0.7407122850418091),
 ('creamcheese', 0.7369866371154785),
 ('cottage', 0.7360917925834656),
 ('mozza', 0.7321102023124695),
 ('chees', 0.7266157865524292)]

## save model

In [25]:
model.save('fooddotcom_model')


In [26]:
loaded_model = FastText.load('fooddotcom_model')
print(loaded_model)

FastText(vocab=12232, size=100, alpha=0.025)


# Test 

In [27]:
import re
import requests
from bs4 import BeautifulSoup

In [28]:
def generate_query(name,id):
    name = name.split(" ")
    name = '-'.join(name)
    query = name + "-" + str(id)
    return query

In [81]:
def requestRecipeUrl(input:str):
    print(input)
    # request by url 'https://www.food.com/recipe/' + name + id
    url = 'https://www.food.com/recipe/' + input 
    r = requests.get(url)
    # get a correct url and scale to create full url
    html_doc = r.text
    soup = BeautifulSoup(html_doc, features="html.parser")
    serves = soup.find(class_="value svelte-1o10zxc").string
    if len(serves) > 0 :
      serves.split("-")
      serves = serves[0]
    url_metric = r.url + '?units=metric&scale='+serves
    print(url_metric)
    # create html soup object to scrap 
    r = requests.get(url_metric)   
    html_doc = r.text
    soup = BeautifulSoup(html_doc, features="html.parser")

    return soup

In [82]:
input = 'alyssa-s-favorite-fish-361646'

In [83]:
res = requestRecipeUrl(input)

alyssa-s-favorite-fish-361646
https://www.food.com/recipe/alyssas-favorite-fish-361646?units=metric&scale=4


In [30]:
def parseRecipeName(soup:BeautifulSoup):
    # # INPUT(Constructor | BeautifulSoup) : BeautifulSoup constructor of Recipe URL html
    # # OUTPUT(tuple | string, List of dictionary) : Recipe Name
    recipeTitle = soup.title.text.split(' - Food.com')[0]
    return recipeTitle

def parseRecipeIngrd(soup:BeautifulSoup):
    # # INPUT(Constructor | BeautifulSoup) : BeautifulSoup constructor of Recipe URL html
    # # OUTPUT(List of dictionary) : Ingredients List
    ingrdList = findIngrd(soup)
    return ingrdList

def findIngrd(soup:BeautifulSoup):
    # # INPUT(Constructor | BeautifulSoup) : BeautifulSoup constructor of Recipe URL html
    # # OUTPUT(List of dictionary) : Ingredient Information List
    # #                              {'ingredient' : string, 'quantity' : float, 'unit' : string}

    ingrdList = []

    ultag = soup.find('ul', {'class': re.compile('^ingredient-list')})
    i = 0
    for litag in ultag.find_all('li'):
        # print(f"{i+1}/{len(ultag.find_all('li'))}")
        quant_obj = litag.find('span', {'class': re.compile('quantity')})
        ingrd_obj = litag.find('span', {'class': re.compile('text')})

        # CHECK : the item of list is the information of an ingredient.
        if (quant_obj != None) and (ingrd_obj != None):
            q_str = quant_obj.text.split('-')
            q_str = q_str[0] # [CornerCase] e.g. 10-13
            q = float(q_str)/10 if q_str != '' else float(0) # scale down from 10 to 1

            # ASSUME : If there is no unit, the unit as 'ea'
            _u = ingrd_obj.text.split()[0]
            u = _u if (_u =='ml' or _u =='g') else 'ea'

            # CHECK : the ingredient has a url for a detail.
            ingrdPage = ingrd_obj.find('a')
            if (ingrdPage != None):
                sub_url = ingrdPage['href']
                # CHECK : the ingrdient url is a Ingredient Detail page
                if ('about' in sub_url):
                    sub_url ='https://www.food.com' + sub_url
                    sub_r = requests.get(sub_url)
                    sub_html_doc = sub_r.text
                    sub_soup = BeautifulSoup(sub_html_doc, features="html.parser")
                    ingrd = sub_soup.find('h1').text

                    ingrdList.append({'ingredient' : ingrd.lower(), 'quantity' : q, 'unit' : u})
                else:
                    True
                    # TODO [CORNER CASE] Recursive Call of Scraping Recipe Page. Skip now.
            else:
                # TODO [CORNER CASE] case for the ingredient doesn't have a url link
                True
        else:
            True
        i+=1

    return ingrdList

In [32]:
name = 'pretty freaking awesome  pulled pork crock pot'
id = 484624

query = generate_query(name,id)
soup = requestRecipeUrl(query)
recipe = findIngrd(soup)

print(recipe)

pretty-freaking-awesome--pulled-pork-crock-pot-484624
https://www.food.com/recipe/pretty-freaking-awesome-pulled-pork-crock-pot-484624?units=metric&scale=6
[{'ingredient': 'onion', 'quantity': 0.2, 'unit': 'ea'}, {'ingredient': 'ginger ale', 'quantity': 23.659, 'unit': 'ml'}, {'ingredient': 'barbecue sauce', 'quantity': 45.358999999999995, 'unit': 'g'}, {'ingredient': 'brown sugar', 'quantity': 1.4789999999999999, 'unit': 'ml'}, {'ingredient': 'sugar', 'quantity': 1.4789999999999999, 'unit': 'ml'}, {'ingredient': 'paprika', 'quantity': 1.4789999999999999, 'unit': 'ml'}, {'ingredient': 'salt', 'quantity': 1.4789999999999999, 'unit': 'ml'}, {'ingredient': 'pepper', 'quantity': 0.492, 'unit': 'ml'}, {'ingredient': 'mustard, seed and powder', 'quantity': 0.492, 'unit': 'ml'}, {'ingredient': 'cayenne pepper', 'quantity': 0.123, 'unit': 'ml'}]


### load test file

In [33]:
# read recipe csv file
df_recipe = pd.read_csv("recipes_test.csv")
df_emissions = pd.read_csv("emissions.csv")

### compute word vectors 

In [35]:
# list of unique ingredient name in db (total 441)
ing_db_words = df_emissions['ingredient'].unique()

In [42]:
# list of word-vector of ingredient name in db (total 441)
ing_db_vecs = []
for ing in ing_db_words:
    ing_db_vecs.append(loaded_model.wv.get_vector(ing))

### compute com sim

In [45]:
from scipy.spatial import distance

In [46]:
def replace_sim_words(input):
    for i, ing_org in enumerate(input):
        print("===="*10)
        print("Find most similar words of <<",ing_org['ingredient'],">> in DB")
        distances = []
        ing_org_vec = loaded_model.wv.get_vector(ing_org['ingredient'])   

        for ing_db_vec in ing_db_vecs:
            cos_sim= distance.cosine(ing_org_vec, ing_db_vec)
            distances.append(cos_sim)

        min_dis = min(distances)
        sim_word = ing_db_words[distances.index(min_dis)]
        print("found! We will repace it to <<",sim_word,">> !")
        input[i]['ingredient']= sim_word
        
    return input
       

In [47]:
replace_sim_words(recipe)

Find most similar words of << onion >> in DB
found! We will repace it to << onion >> !
Find most similar words of << ginger ale >> in DB
found! We will repace it to << ginger >> !
Find most similar words of << barbecue sauce >> in DB
found! We will repace it to << bbq sauce >> !
Find most similar words of << brown sugar >> in DB
found! We will repace it to << brown sugar >> !
Find most similar words of << sugar >> in DB
found! We will repace it to << sugar >> !
Find most similar words of << paprika >> in DB
found! We will repace it to << paprika >> !
Find most similar words of << salt >> in DB
found! We will repace it to << salt >> !
Find most similar words of << pepper >> in DB
found! We will repace it to << pepper >> !
Find most similar words of << mustard, seed and powder >> in DB
found! We will repace it to << mustard seeds >> !
Find most similar words of << cayenne pepper >> in DB
found! We will repace it to << pepper >> !


[{'ingredient': 'onion', 'quantity': 0.2, 'unit': 'ea'},
 {'ingredient': 'ginger', 'quantity': 23.659, 'unit': 'ml'},
 {'ingredient': 'bbq sauce', 'quantity': 45.358999999999995, 'unit': 'g'},
 {'ingredient': 'brown sugar', 'quantity': 1.4789999999999999, 'unit': 'ml'},
 {'ingredient': 'sugar', 'quantity': 1.4789999999999999, 'unit': 'ml'},
 {'ingredient': 'paprika', 'quantity': 1.4789999999999999, 'unit': 'ml'},
 {'ingredient': 'salt', 'quantity': 1.4789999999999999, 'unit': 'ml'},
 {'ingredient': 'pepper', 'quantity': 0.492, 'unit': 'ml'},
 {'ingredient': 'mustard seeds', 'quantity': 0.492, 'unit': 'ml'},
 {'ingredient': 'pepper', 'quantity': 0.123, 'unit': 'ml'}]

### compute emission

In [48]:
def compute_emission(input):
    emission = 0
    for recipe in input:
        emission += recipe['quantity']*(df_emissions.loc[df_emissions['ingredient'] == recipe['ingredient'], 'emissions'].iloc[0])
    return emission
               

In [49]:
compute_emission(recipe)

88.2553417

### Test file run and save as csv

In [84]:
recipe_names = df_recipe['name']
recipe_ids = df_recipe['id']
emissions = []

for name,id in zip(recipe_names,recipe_ids):
    query = generate_query(name,id)
    soup = requestRecipeUrl(query)
    recipe = findIngrd(soup)
    recipe_new = replace_sim_words(recipe)
    emission = compute_emission(recipe_new)
    emissions.append(emission)

pretty-freaking-awesome--pulled-pork--crock-pot-484624
https://www.food.com/recipe/pretty-freaking-awesome-pulled-pork-crock-pot-484624?units=metric&scale=6
Find most similar words of << onion >> in DB
found! We will repace it to << onion >> !
Find most similar words of << ginger ale >> in DB
found! We will repace it to << ginger >> !
Find most similar words of << barbecue sauce >> in DB
found! We will repace it to << bbq sauce >> !
Find most similar words of << brown sugar >> in DB
found! We will repace it to << brown sugar >> !
Find most similar words of << sugar >> in DB
found! We will repace it to << sugar >> !
Find most similar words of << paprika >> in DB
found! We will repace it to << paprika >> !
Find most similar words of << salt >> in DB
found! We will repace it to << salt >> !
Find most similar words of << pepper >> in DB
found! We will repace it to << pepper >> !
Find most similar words of << mustard, seed and powder >> in DB
found! We will repace it to << mustard seeds >> 

In [89]:
emission_series = pd.Series(emissions)/100
frame = { 'Id': recipe_ids, 'Predicted': emission_series}
result = pd.DataFrame(frame)
#Printing elements of Dataframe
print(result)
result.to_csv('result2.csv', index=False)  


        Id  Predicted
0   484624   0.882553
1   399771   1.333170
2   455721   3.320890
3   454994   0.840349
4   361646   1.005315
5    26219   0.192102
6    53252   0.176495
7   186382   1.564091
8   130632  10.626037
9   257067   0.929582
10   28149   1.623842
11  112082   1.231292
12  154088   4.055881
13  313094   1.576812
14  382812   2.331059
15   64322   1.282282
16   10328  18.148326
17  274736   0.456249
18  424780   0.269431
19  390247   9.336082
20  513985   3.021318
21  206463   1.759975
22   38596   0.118993
23   23706   5.445188
24  292213   0.303840
25  370726   5.545767
26  240843   8.567903
27   49249   0.569649
28  110813   0.054420
29   66190   0.233198
30  252674  15.141033
31  512810   0.315134
32  113725   2.974072
33   41874   8.024616
34   61210   0.334840
35   41725   0.670065
36  177958   1.454059
37   53907   0.104849
38  137627   0.856539
39  485002   0.947022
40   19441   0.056655
41  169079   0.093830
42  150554   0.123857
43   37000   1.110399
44   91195