## spaCy and regex implementation for extracting food items from recipes

### import library, loading data

In [1]:
import spacy
import regex
import json
# probably other dependency

In [None]:
with open("./recipe/recipe.json","r") as f:
    data = json.load(f)

In [2]:
from collections import Counter
nlp =  spacy.load('en_core_web_lg')
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES
from spacy.tokenizer import Tokenizer
Tokenizer = Tokenizer(nlp.vocab)
Lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
# import en_core_seb_sm

In [4]:
doc = nlp(data[0]["food_ingredients"][-1])
for token in doc:
    print(token.text, token.lemma_, token.shape_, token.pos_, token.tag_)

1/2 1/2 d/d NUM CD
cup cup xxx NOUN NN
confectioners confectioner xxxx NOUN NNS
' ' ' PART POS
sugar sugar xxxx NOUN NN
for for xxx ADP IN
decoration decoration xxxx NOUN NN


### define lemmatizer

In [3]:
def lemmatizer(text):
    """Lemmatise ``text`` and return the re-parsed result.

    Every token produced by ``nlp(text)`` is replaced by its lemma, the
    lemmas are joined with single spaces, and that string is passed through
    ``nlp`` again so the caller gets a parsed document of the lemmatised text.
    """
    lemmas = [token.lemma_ for token in nlp(text)]
    return nlp(" ".join(lemmas))

# test= nlp('1 1/4 cups butter')
# for token in test:
#     if token.tag_ == "NNS":
#         import pdb
#         pdb.set_trace()
#         token = Tokenizer(Lemmatizer(token.text, "NOUN")[0])

In [12]:
test= nlp('1 1/4 cups butter')
t = lemmatizer('1 1/4 cups butter')

In [28]:
str(t)

'1 1/4 cup butter'

In [17]:
from spacy import displacy
displacy.render(doc, style='ent', jupyter=True)

In [None]:
# nlp = spacy.load('en_core_web_lg')
dog = nlp.vocab["dog"]
cat = nlp.vocab["cat"]
apple = nlp.vocab["apple"]
orange = nlp.vocab["orange"]

In [None]:
dog.similarity(cat)

### setting up entity class and entity list

In [18]:
all_entities = []

In [4]:
class entity:
    """Per-recipe container for the parsed ingredient data.

    Holds four parallel lists with one slot per entry of
    ``recipe["food_ingredients"]``: the ingredient text, the numeric amount
    (``0.0`` until parsed), the unit and the food name (both ``"TBD"`` until
    resolved).

    Args:
        recipe: Mapping that provides a ``"food_ingredients"`` list of strings.
    """

    def __init__(self, recipe):
        self.recipe = recipe
        self.ingredient = list(recipe["food_ingredients"])
        slots = len(self.ingredient)
        self.number = [0.0] * slots
        self.unit = ["TBD"] * slots
        self.entities = ["TBD"] * slots

    @staticmethod
    def _parse_amount(token):
        """Return ``token`` as a float, or ``None`` if it is not a plain number.

        Accepts integers, decimals and ``a/b`` fractions. The text is parsed,
        never evaluated, so ingredient strings cannot execute code.
        """
        # Local import keeps the class usable in any cell without touching
        # the notebook's import cell.
        from fractions import Fraction

        try:
            return float(Fraction(token))
        except (ValueError, ZeroDivisionError, OverflowError):
            return None

    def get_nlpres(self, nlp_output, index):
        """Store the amount and unit found in one ingredient line.

        Args:
            nlp_output: Sequence of ``(text, label)`` pairs for the recognised
                entities of the ingredient. Only the labels ``"CARDINAL"``
                and ``"QUANTITY"`` are used; all others are ignored.
            index: Slot of the ingredient in the parallel lists.

        When ``nlp_output`` is empty, number, unit and entity of the slot are
        all set to ``"TBD"``. Otherwise the numeric tokens are summed into
        ``self.number[index]``, and for a ``"QUANTITY"`` entity the first
        non-numeric token becomes ``self.unit[index]``.
        """
        if not nlp_output:
            self.number[index] = "TBD"
            self.unit[index] = "TBD"
            self.entities[index] = "TBD"
            return

        # One running total for the whole line: amounts from several entities
        # add up (the per-entity reset was disabled in the original code).
        total = 0.0
        for v in nlp_output:
            text, label = v[0], v[1]
            if label not in ("CARDINAL", "QUANTITY"):
                continue
            for token in text.split():
                amount = self._parse_amount(token)
                if amount is None:
                    # First non-number ends the entity; for a QUANTITY that
                    # token is taken as the unit.
                    if label == "QUANTITY":
                        self.unit[index] = token
                    break
                total += amount
            self.number[index] = total

In [39]:
an_entity = entity(data[0])


In [40]:
all_entities = list(map(lambda x:entity(x), data))

In [41]:
len(all_entities)

3828

In [45]:
w = an_entity.ingredient
for i, _ in enumerate(w):
    tmp = lemmatizer(_)
    an_entity.ingredient[i] = str(tmp)
#     print(tmp)
    recog = [(d.text, d.label_) for d in tmp.ents]
#     print(recog)
    an_entity.get_nlpres(recog, i)

In [46]:
an_entity.number

[1.25, 0.6666666666666666, 1.0, 2.0, 0.125, 0.5, 2.0, 0.5]

In [47]:
an_entity.ingredient

['1 1/4 cup butter',
 '2/3 cup white sugar',
 '1 teaspoon vanilla extract',
 '2 cup all - purpose flour',
 '1/8 teaspoon salt',
 '1/2 cup unsweetened cocoa powder',
 '2 cup chop pecan',
 "1/2 cup confectioner ' sugar for decoration"]

### Run NLP over all_entities and save the result as the step-1 pickle file. This step resolves numbers and units; the item names are left as "TBD".

In [48]:
# First pass: lemmatise every ingredient line in place, then let the entity
# read the amount and unit from the entities recognised in the lemmatised text.
for e in all_entities:
    for i, w in enumerate(e.ingredient):
        tmp = lemmatizer(w)
        e.ingredient[i] = str(tmp)
        recog = [(d.text, d.label_) for d in tmp.ents]
        e.get_nlpres(recog, i)
    
        # automaticly find
        
        

In [49]:
# Discard every ingredient slot for which no number was recognised: all four
# parallel fields of that slot are overwritten with "not valid".
for l in all_entities:
    for i, n in enumerate(l.number):
        if n != "TBD":
            continue
        l.ingredient[i] = l.unit[i] = l.entities[i] = l.number[i] = "not valid"

In [50]:
import pickle
with open("first_step_number.pkl", "wb") as p:
    pickle.dump(all_entities, p)

### Reload pickled file, start with items

In [51]:
# Reload the step-1 result. The context manager closes the file handle, which
# the bare open() call left dangling.
# NOTE: pickle can execute code on load; only load files written by this notebook.
with open("first_step_number.pkl", "rb") as p:
    loaded_all_entities = pickle.load(p)

In [52]:
len(loaded_all_entities)

3828

In [53]:
loaded_all_entities[0].number

[1.25, 0.6666666666666666, 1.0, 2.0, 0.125, 0.5, 2.0, 0.5]

In [64]:
loaded_all_entities[5].unit

['cup',
 'cup',
 'cup',
 'TBD',
 'cup',
 'TBD',
 'TBD',
 'teaspoon',
 'teaspoon',
 'teaspoon',
 'teaspoon',
 'teaspoon']

In [65]:
loaded_all_entities[5].ingredient

['1 1/4 cup butter , soften',
 '1 1/4 cup white sugar',
 '3/4 cup light corn syrup',
 '2 small egg',
 '3 cup all - purpose flour',
 '1 1/2 teaspoon baking powder',
 '1 teaspoon baking soda',
 '1/2 teaspoon salt',
 '2 teaspoon ground cinnamon',
 '2 teaspoon ground clove',
 '1 teaspoon ground ginger',
 '1/4 teaspoon ground black pepper']

In [56]:
# Statistics for the first entity-recognition step: count the numbers still marked "TBD"
cnt = 0
for l in loaded_all_entities:
    for n in l.number:
        if n == "TBD":
            cnt += 1
    

In [57]:
cnt

0

In [58]:
cnt = 0
for l in loaded_all_entities:
    for n in l.number:
        if n == "not valid":
            cnt += 1
cnt

971

In [59]:
loaded_all_entities[0].unit

['cup', 'cup', 'teaspoon', 'cup', 'teaspoon', 'cup', 'cup', 'TBD']

In [None]:
for token in test:
    print(token.text, token.lemma_, token.shape_, token.pos_, token.tag_)

In [66]:
cnt = 0
for l in loaded_all_entities:
    for n in l.unit:
        if n == "TBD":
            cnt += 1
cnt

9365

In [None]:
test

In [68]:
cnt = 0
for l in loaded_all_entities:
    for n in l.unit:
        cnt+=1
cnt

33456

In [69]:
9365/33456

0.27991989478718315

In [70]:
test_sentence = '3 cup all - purpose flour'

In [72]:
for token in nlp(test_sentence):
    print(token.text, token.lemma_, token.shape_, token.pos_, token.tag_)

3 3 d NUM CD
cup cup xxx NOUN NN
all all xxx DET DT
- - - PUNCT HYPH
purpose purpose xxxx NOUN NN
flour flour xxxx NOUN NN


In [75]:
test_sentence2 = "3 small egg"

In [76]:
for token in nlp(test_sentence2):
    print(token.text, token.lemma_, token.shape_, token.pos_, token.tag_)

3 3 d NUM CD
small small xxxx ADJ JJ
egg egg xxx NOUN NN


In [77]:
test_sentence3 = "2 teaspoon ground cinnamon"

In [78]:
for token in nlp(test_sentence3):
    print(token.text, token.lemma_, token.shape_, token.pos_, token.tag_)

2 2 d NUM CD
teaspoon teaspoon xxxx NOUN NN
ground ground xxxx NOUN NN
cinnamon cinnamon xxxx NOUN NN


In [79]:
test_sentence4 = '1 1/4 cup butter , soften'

In [80]:
for token in nlp(test_sentence4):
    print(token.text, token.lemma_, token.shape_, token.pos_, token.tag_)

1 1 d NUM CD
1/4 1/4 d/d NUM CD
cup cup xxx NOUN NN
butter butter xxxx NOUN NN
, , , PUNCT ,
soften soften xxxx VERB VB


## Keep only NN/JJ tokens as the food name, drop tokens that equal the unit, and replace "TBD" units with the default unit ("ge"), stored as "DEFAULT"

In [81]:
# Replace the "TBD" unit placeholder with "DEFAULT"; the slice assignment
# updates each existing unit list in place.
for l in loaded_all_entities:
    l.unit[:] = ["DEFAULT" if u == "TBD" else u for u in l.unit]

In [83]:
cnt = 0
for l in loaded_all_entities:
    for n in l.unit:
        if n == "DEFAULT":
            cnt += 1
cnt

9365

In [91]:
# Build the food name of every valid ingredient from its NN / JJ tokens.
for l in loaded_all_entities:
    for i, text in enumerate(l.ingredient):
        if text == "not valid":
            continue
        words = []
        for token in nlp(text):
            if token.tag_ == ",":
                # Everything after the first comma is ignored.
                break
            # Keep nouns and adjectives, except the token already used as unit.
            if token.tag_ in ("NN", "JJ") and token.text != l.unit[i]:
                words.append(token.text)
        l.entities[i] = " ".join(words)
                

In [92]:
doc = nlp(data[0]["food_ingredients"][-1])
for token in doc:
    print(token.text, token.lemma_, token.shape_, token.pos_, token.tag_)

1/2 1/2 d/d NUM CD
cup cup xxx NOUN NN
confectioners confectioner xxxx NOUN NNS
' ' ' PART POS
sugar sugar xxxx NOUN NN
for for xxx ADP IN
decoration decoration xxxx NOUN NN


In [93]:
loaded_all_entities[0].entities

['butter',
 'white sugar',
 'vanilla extract',
 'purpose flour',
 'salt',
 'unsweetened cocoa powder',
 'chop pecan',
 'cup confectioner sugar decoration']

In [94]:
loaded_all_entities[0].unit

['cup', 'cup', 'teaspoon', 'cup', 'teaspoon', 'cup', 'cup', 'DEFAULT']

In [88]:
loaded_all_entities[0].ingredient

['1 1/4 cup butter',
 '2/3 cup white sugar',
 '1 teaspoon vanilla extract',
 '2 cup all - purpose flour',
 '1/8 teaspoon salt',
 '1/2 cup unsweetened cocoa powder',
 '2 cup chop pecan',
 "1/2 cup confectioner ' sugar for decoration"]

In [90]:
t = '1/2 cup unsweetened cocoa powder'
for token in nlp(t):
    print(token.text, token.lemma_, token.shape_, token.pos_, token.tag_)

1/2 1/2 d/d NUM CD
cup cup xxx NOUN NN
unsweetened unsweetened xxxx ADJ JJ
cocoa cocoa xxxx NOUN NN
powder powder xxxx NOUN NN


In [95]:
cnt = 0
for l in loaded_all_entities:
    for e in l.entities:
        if e == "TBD":
            cnt += 1
cnt

0

In [99]:
loaded_all_entities[5].entities

['butter',
 'white sugar',
 'light corn syrup',
 'small egg',
 'purpose flour',
 'teaspoon baking powder',
 'teaspoon baking soda',
 'salt',
 'ground cinnamon',
 'ground clove',
 'ground ginger',
 'ground black pepper']

In [98]:
loaded_all_entities[5].ingredient

['1 1/4 cup butter , soften',
 '1 1/4 cup white sugar',
 '3/4 cup light corn syrup',
 '2 small egg',
 '3 cup all - purpose flour',
 '1 1/2 teaspoon baking powder',
 '1 teaspoon baking soda',
 '1/2 teaspoon salt',
 '2 teaspoon ground cinnamon',
 '2 teaspoon ground clove',
 '1 teaspoon ground ginger',
 '1/4 teaspoon ground black pepper']

### saving result to pkl

In [100]:
with open("extraction.pkl", "wb") as p:
    pickle.dump(loaded_all_entities, p)

### saving new result to json

In [107]:
# Attach the (number, unit, food name) triples to each recipe record.
outputs = []
for l in loaded_all_entities:
    # Same dict object as l.recipe: the new key is added to the recipe itself.
    output = l.recipe
    output["extraction_results"] = list(zip(l.number, l.unit, l.entities))
    outputs.append(output)

In [108]:
type(outputs)

list

In [109]:
outputs[0]

{'food_url': 'https://www.allrecipes.com/recipe/15253/chocolate-snowballs/',
 'type_url': 'https://www.allrecipes.com/recipes/841/holidays-and-events/christmas/desserts/christmas-cookies/',
 'type_name': 'Christmas Cookies',
 'food_name': 'Chocolate Snowballs',
 'food_ingredients': ['1 1/4 cups butter',
  '2/3 cup white sugar',
  '1 teaspoon vanilla extract',
  '2 cups all-purpose flour',
  '1/8 teaspoon salt',
  '1/2 cup unsweetened cocoa powder',
  '2 cups chopped pecans',
  "1/2 cup confectioners' sugar for decoration"],
 'extraction_results': [(1.25, 'cup', 'butter'),
  (0.6666666666666666, 'cup', 'white sugar'),
  (1.0, 'teaspoon', 'vanilla extract'),
  (2.0, 'cup', 'purpose flour'),
  (0.125, 'teaspoon', 'salt'),
  (0.5, 'cup', 'unsweetened cocoa powder'),
  (2.0, 'cup', 'chop pecan'),
  (0.5, 'DEFAULT', 'cup confectioner sugar decoration')]}

In [110]:
with open("recipe_extraction.json", "w") as j:
    json.dump(outputs, j)

### Second Run of NLP, resolving irregular units

In [1]:
import spacy
import regex
import json
from collections import Counter
nlp =  spacy.load('en_core_web_lg')
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES
from spacy.tokenizer import Tokenizer
Tokenizer = Tokenizer(nlp.vocab)
Lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
def lemmatizer(text):
    """Lemmatise ``text`` and return the re-parsed result.

    Every token produced by ``nlp(text)`` is replaced by its lemma, the
    lemmas are joined with single spaces, and that string is passed through
    ``nlp`` again so the caller gets a parsed document of the lemmatised text.
    """
    lemmas = [token.lemma_ for token in nlp(text)]
    return nlp(" ".join(lemmas))
class entity:
    """Per-recipe container for the parsed ingredient data.

    Holds four parallel lists with one slot per entry of
    ``recipe["food_ingredients"]``: the ingredient text, the numeric amount
    (``0.0`` until parsed), the unit and the food name (both ``"TBD"`` until
    resolved).

    Args:
        recipe: Mapping that provides a ``"food_ingredients"`` list of strings.
    """

    def __init__(self, recipe):
        self.recipe = recipe
        self.ingredient = list(recipe["food_ingredients"])
        slots = len(self.ingredient)
        self.number = [0.0] * slots
        self.unit = ["TBD"] * slots
        self.entities = ["TBD"] * slots

    @staticmethod
    def _parse_amount(token):
        """Return ``token`` as a float, or ``None`` if it is not a plain number.

        Accepts integers, decimals and ``a/b`` fractions. The text is parsed,
        never evaluated, so ingredient strings cannot execute code.
        """
        # Local import keeps the class usable in any cell without touching
        # the notebook's import cell.
        from fractions import Fraction

        try:
            return float(Fraction(token))
        except (ValueError, ZeroDivisionError, OverflowError):
            return None

    def get_nlpres(self, nlp_output, index):
        """Store the amount and unit found in one ingredient line.

        Args:
            nlp_output: Sequence of ``(text, label)`` pairs for the recognised
                entities of the ingredient. Only the labels ``"CARDINAL"``
                and ``"QUANTITY"`` are used; all others are ignored.
            index: Slot of the ingredient in the parallel lists.

        When ``nlp_output`` is empty, number, unit and entity of the slot are
        all set to ``"TBD"``. Otherwise the numeric tokens are summed into
        ``self.number[index]``, and for a ``"QUANTITY"`` entity the first
        non-numeric token becomes ``self.unit[index]``.
        """
        if not nlp_output:
            self.number[index] = "TBD"
            self.unit[index] = "TBD"
            self.entities[index] = "TBD"
            return

        # One running total for the whole line: amounts from several entities
        # add up (the per-entity reset was disabled in the original code).
        total = 0.0
        for v in nlp_output:
            text, label = v[0], v[1]
            if label not in ("CARDINAL", "QUANTITY"):
                continue
            for token in text.split():
                amount = self._parse_amount(token)
                if amount is None:
                    # First non-number ends the entity; for a QUANTITY that
                    # token is taken as the unit.
                    if label == "QUANTITY":
                        self.unit[index] = token
                    break
                total += amount
            self.number[index] = total

In [24]:
import pickle
with open("./extraction.pkl", "rb") as p:
    recipies = pickle.load(p)

In [25]:
len(recipies)

3828

In [4]:
nlp_set = {"1/4-inch", "4x4-inch", "1-inch", "1/2-inch", "1/8-inch", "12-inch", "2-inch", "degree", "f/45", "fluid", "12x12", "half"}
to_default_set = {"caramel", "chicken", "cloves","garlic", "graham", "ice", "maraschino", "pint", "portobello", "whole", "(", "clove"}
removal_set = {"not valid", "red", "yellow", "toothpick"}
special_treat = {"skinless":"chicken breast halves"}
ounce = {"ounce":"oz"}
DEFAULT = "DEFAULT"

In [5]:
testing_ingredient = "1 slice prepared polenta, cut into 4x4-inch piece"
testing_ingredient[35+8+1]

'p'

In [6]:
pos = testing_ingredient.find("4x4-inch")
new = testing_ingredient[:pos]+testing_ingredient[pos+len("4x4-inch")+1:]
new

'1 slice prepared polenta, cut into piece'

In [7]:
def remove_word(ingredient, keyword):
    """Return ``ingredient`` with the first occurrence of ``keyword`` removed.

    A single whitespace character directly after the keyword is removed too,
    so no double space is left behind. Any other following character is kept.

    Args:
        ingredient: The ingredient text to edit.
        keyword: The word to cut out.

    Returns:
        The edited text, or ``ingredient`` unchanged when ``keyword`` is empty
        or does not occur in it.
    """
    pos = ingredient.find(keyword)
    # find() returns -1 for a missing keyword; slicing with it would silently
    # mangle the text, so bail out instead.
    if not keyword or pos == -1:
        return ingredient
    end = pos + len(keyword)
    if end < len(ingredient) and ingredient[end].isspace():
        end += 1
    return ingredient[:pos] + ingredient[end:]
new = remove_word(testing_ingredient, "4x4-inch")
new

'1 slice prepared polenta, cut into piece'

In [26]:
# Second pass over the extracted units: each unit value is routed through one
# of the lookup tables (removal_set, to_default_set, special_treat, ounce,
# nlp_set) and the matching slot of the recipe is rewritten accordingly.
for recipe in recipies:
    for index, value in enumerate(recipe.unit): # unit/ingredient/number/entities share the same index
        tmp = lemmatizer(recipe.ingredient[index]) # lemmatize the stored text once more
        recipe.ingredient[index] = str(tmp)
        if value in removal_set: # mark the whole slot as "not valid"
            recipe.unit[index] = "not valid"
            recipe.entities[index] = "not valid"
            recipe.number[index] = "not valid"
            recipe.ingredient[index] = "not valid"
        elif value in to_default_set: # the "unit" is not a real unit: fall back to DEFAULT
            recipe.unit[index] = DEFAULT

        # NOTE(review): the replacement is hard-coded to the "skinless" entry,
        # which only works while special_treat has that single key.
        elif value in special_treat or recipe.entities[index] in special_treat:
            recipe.unit[index] = DEFAULT
            recipe.entities[index] = special_treat["skinless"]

        elif value in ounce: # normalise the unit spelling through the ounce table
            recipe.unit[index] = ounce[value]

        elif value in nlp_set:
            # Cut the bogus unit word out of the text, then re-run NER on the
            # shortened text to pick a unit again.
            to_be_remove = recipe.ingredient[index]
            recipe.ingredient[index] = remove_word(to_be_remove, value)
            tmp = nlp(recipe.ingredient[index])
            recog = [(d.text, d.label_) for d in tmp.ents] # recognise again, reset unit
            for v in recog:
                if v[1] == "QUANTITY":
                    vals = v[0].split()
                    # res is only a scratch total here: it is never stored, so
                    # recipe.number[index] keeps its earlier value.
                    res = 0.0
                    for _ in vals:
                        # NOTE(review): eval() runs on scraped recipe text and the
                        # bare except hides every error; a strict number parser
                        # would be safer.
                        try:
                            res += eval(_)
                        except:
                            # First token that is not a number becomes the unit.
                            if _ == "ounce":
                                recipe.unit[index] = ounce[_]
                            else:
                                recipe.unit[index] = _
                            break
                else:
                    # Any non-QUANTITY entity resets the unit, so the last
                    # entity in recog decides the final value.
                    recipe.unit[index] = DEFAULT
            # Rebuild the food name from the NN / JJ tokens before the first
            # comma, skipping the token that equals the unit.
            res = []
            for token in tmp:
                if token.tag_ == ",":
                    break
                if token.tag_ == "NN" or token.tag_ == "JJ":
                    if token.text != recipe.unit[index]:
                        res.append(token.text)
            recipe.entities[index] = " ".join(res)
        else:
            # Unit is in none of the tables: keep the slot as it is.
            pass

In [27]:
len(recipies)

3828

In [28]:
from functools import reduce
all_units = reduce(lambda x,y:x.union(y) ,(map(lambda x: set(x.unit), recipies)))


In [29]:
len(all_units)

20

In [30]:
all_units

{'1/4-inch',
 'DEFAULT',
 'clove',
 'cube',
 'cup',
 'degree',
 'fluid',
 'gallon',
 'gram',
 'inch',
 'liter',
 'milliliter',
 'not valid',
 'oz',
 'pound',
 'quart',
 'skinless',
 'tablespoon',
 'teaspoon',
 'toothpick'}

In [37]:
file = "./second_round_extraction.pkl"
with open(file, "wb") as w:
    pickle.dump(recipies, w)

### Save and reload the second round

In [38]:
with open(file,"rb") as f:
    new_recipies = pickle.load(f)

In [39]:
all_units = reduce(lambda x,y:x.union(y) ,(map(lambda x: set(x.unit), recipies)))

In [40]:
all_units

{'1/4-inch',
 'DEFAULT',
 'clove',
 'cube',
 'cup',
 'degree',
 'fluid',
 'gallon',
 'gram',
 'inch',
 'liter',
 'milliliter',
 'not valid',
 'oz',
 'pound',
 'quart',
 'skinless',
 'tablespoon',
 'teaspoon',
 'toothpick'}

In [93]:
q = []
for j, r in enumerate(recipies):
    for i, v in enumerate(r.unit):
        if 'skinless' in v:
            q.append(j)

In [94]:
q

[3567]

In [95]:
recipies[q[1]].unit

IndexError: list index out of range

In [91]:
recipies[q[1]].ingredient

['1 cup coconut milk',
 '1 cup pineapple juice',
 '1/2 cup rum',
 '4 tablespoon white sugar',
 '8 cube ice']

In [92]:
recipies[q[1]].recipe

{'food_url': 'https://www.allrecipes.com/recipe/32632/pina-colada-iii/',
 'type_url': 'https://www.allrecipes.com/recipes/133/drinks/cocktails/',
 'type_name': 'Cocktail Recipes',
 'food_name': 'Pina Colada III',
 'food_ingredients': ['1 cup coconut milk',
  '1 cup pineapple juice',
  '1/2 cup rum',
  '4 tablespoons white sugar',
  '8 cubes ice']}

In [49]:
t = nlp("1 ( 14 ounce ) package deluxe macaroni and cheese dinner mix ( such as kraft ® )")
recog = [(d.text, d.label_) for d in t.ents]
rotk = []
for v in recog:
    if v[1] == "QUANTITY":        
        vals = v[0].split()
#                 res = 0.0
        for _ in vals:
            res = 0.0
            try:
                res += eval(_)
            except:
                rotk.append(_)
                
    else:
        pass

In [50]:
rotk

[]

In [86]:
remove_word('1/2 cup warm water ( 110 degree degree c )', "degree")

'1/2 cup warm water ( 110 degree c )'

In [49]:
remove_word('1 bunch slender asparagus spear , trim , cut on diagonal into 1-inch piece', "1-inch")

'1 bunch slender asparagus spear , trim , cut on diagonal into piece'

In [87]:
nlp_set = {"1/4-inch", "4x4-inch", "1-inch", "1/2-inch", "1/8-inch", "12-inch", "2-inch", "degree", "f/45", "fluid", "12x12", "half", "degree degree"}
to_default_set = {"caramel", "chicken", "cloves","garlic", "graham", "ice", "maraschino", "pint", "portobello", "whole", "(", "clove"}
removal_set = {"not valid", "red", "yellow", "toothpick"}
special_treat = {"skinless":"chicken breast halves"}
ounce = {"ounce":"oz"}
DEFAULT = "DEFAULT"

In [104]:
# Third pass: the same unit clean-up as before, now applied to new_recipies
# with the lookup tables defined in the preceding cell.
for recipe in new_recipies:
    for index, value in enumerate(recipe.unit): # unit/ingredient/number/entities share the same index
        tmp = lemmatizer(recipe.ingredient[index]) # lemmatize the stored text once more
        recipe.ingredient[index] = str(tmp)
        if value in removal_set: # mark the whole slot as "not valid"
            recipe.unit[index] = "not valid"
            recipe.entities[index] = "not valid"
            recipe.number[index] = "not valid"
            recipe.ingredient[index] = "not valid"
        elif value in to_default_set: # the "unit" is not a real unit: fall back to DEFAULT
            recipe.unit[index] = DEFAULT

        # NOTE(review): the replacement is hard-coded to the "skinless" entry,
        # which only works while special_treat has that single key.
        elif value in special_treat or recipe.entities[index] in special_treat:
            recipe.unit[index] = DEFAULT
            recipe.entities[index] = special_treat["skinless"]

        elif value in ounce: # normalise the unit spelling through the ounce table
            recipe.unit[index] = ounce[value]

        elif value in nlp_set:
            # Cut the bogus unit word out of the text, then re-run NER on the
            # shortened text to pick a unit again.
            to_be_remove = recipe.ingredient[index]
            recipe.ingredient[index] = remove_word(to_be_remove, value)
            tmp = nlp(recipe.ingredient[index])
            recog = [(d.text, d.label_) for d in tmp.ents] # recognise again, reset unit
            for v in recog:
                if v[1] == "QUANTITY":
                    vals = v[0].split()
                    # res is only a scratch total here: it is never stored, so
                    # recipe.number[index] keeps its earlier value.
                    res = 0.0
                    for _ in vals:
                        # NOTE(review): eval() runs on scraped recipe text and the
                        # bare except hides every error; a strict number parser
                        # would be safer.
                        try:
                            res += eval(_)
                        except:
                            # First token that is not a number becomes the unit.
                            if _ == "ounce":
                                recipe.unit[index] = ounce[_]
                            else:
                                recipe.unit[index] = _
                            break
                else:
                    # Any non-QUANTITY entity resets the unit, so the last
                    # entity in recog decides the final value.
                    recipe.unit[index] = DEFAULT
            # Rebuild the food name from the NN / JJ tokens before the first
            # comma, skipping the token that equals the unit.
            res = []
            for token in tmp:
                if token.tag_ == ",":
                    break
                if token.tag_ == "NN" or token.tag_ == "JJ":
                    if token.text != recipe.unit[index]:
                        res.append(token.text)
            recipe.entities[index] = " ".join(res)
        else:
            # Unit is in none of the tables: keep the slot as it is.
            pass

In [105]:
len(new_recipies)

3828

In [106]:
all_units = reduce(lambda x,y:x.union(y) ,(map(lambda x: set(x.unit), new_recipies)))

In [107]:
all_units

{'DEFAULT',
 'cube',
 'cup',
 'gallon',
 'gram',
 'inch',
 'liter',
 'milliliter',
 'not valid',
 'oz',
 'pound',
 'quart',
 'tablespoon',
 'teaspoon'}

In [108]:
q = []
for j, r in enumerate(new_recipies):
    for i, v in enumerate(r.unit):
        if 'degree' in v:
            q.append(j)

In [109]:
q

[]

### After three NLP passes, all invalid units have been removed

In [110]:
file = "./third_round_extraction.pkl"
with open(file, "wb") as w:
    pickle.dump(new_recipies, w)

In [111]:
# outputs = []
# for l in loaded_all_entities:
#     output = l.recipe
#     output["extraction_results"] = []
#     for i, n in enumerate(l.number):
#         output["extraction_results"].append((l.number[i], l.unit[i], l.entities[i]))
#     outputs.append(output)
# Attach the (number, unit, food name) triples to each recipe record.
outputs = []
for l in new_recipies:
    # Same dict object as l.recipe: the new key is added to the recipe itself.
    output = l.recipe
    output["extraction_results"] = list(zip(l.number, l.unit, l.entities))
    outputs.append(output)

In [112]:
outputs[0]

{'food_url': 'https://www.allrecipes.com/recipe/15253/chocolate-snowballs/',
 'type_url': 'https://www.allrecipes.com/recipes/841/holidays-and-events/christmas/desserts/christmas-cookies/',
 'type_name': 'Christmas Cookies',
 'food_name': 'Chocolate Snowballs',
 'food_ingredients': ['1 1/4 cups butter',
  '2/3 cup white sugar',
  '1 teaspoon vanilla extract',
  '2 cups all-purpose flour',
  '1/8 teaspoon salt',
  '1/2 cup unsweetened cocoa powder',
  '2 cups chopped pecans',
  "1/2 cup confectioners' sugar for decoration"],
 'extraction_results': [(1.25, 'cup', 'butter'),
  (0.6666666666666666, 'cup', 'white sugar'),
  (1.0, 'teaspoon', 'vanilla extract'),
  (2.0, 'cup', 'purpose flour'),
  (0.125, 'teaspoon', 'salt'),
  (0.5, 'cup', 'unsweetened cocoa powder'),
  (2.0, 'cup', 'chop pecan'),
  (0.5, 'DEFAULT', 'cup confectioner sugar decoration')]}

In [113]:
with open("recipe_extraction.json", "w") as j:
    json.dump(outputs, j)

In [None]:
cc