In [1]:
import stanza as st
import numpy as np
import re as re
from typing import Optional
import transformers as tra

# Fetch the default English Stanza packages (the log below shows the archive is reused when it already exists locally).
st.download('en')
# English Stanza pipeline with the default processors; used as `depgram` by the parsing helpers in later cells.
# The explicit processor list is kept in the trailing comment for reference.
depgram = st.Pipeline('en')#, processors='tokenize,mwt,pos,lemma,depparse,ner')
# Hugging Face pipeline for facebook/bart-large-mnli.
# NOTE(review): `pipe2` is not referenced by any other cell visible here — confirm it is still needed.
pipe2 = tra.pipeline(model="facebook/bart-large-mnli")

  from .autonotebook import tqdm as notebook_tqdm
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json: 193kB [00:00, 64.4MB/s]                    
2023-03-12 05:14:44 INFO: Downloading default packages for language: en (English) ...
2023-03-12 05:14:46 INFO: File exists: C:\Users\Typic\stanza_resources\en\default.zip
2023-03-12 05:14:50 INFO: Finished downloading models and saved to C:\Users\Typic\stanza_resources.
2023-03-12 05:14:50 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json: 193kB [00:00, 32.2MB/s]                    
2023-03-12 05:14:51 INFO: Loading these models for language: en (English):
| Processor    | Package   |
----------------------------
| tokenize     | combined  |
| pos     

In [2]:
# if head_id is -1 or id is 0, this is the root node
class depNode():
    """One word of a dependency parse plus the words that directly depend on it.

    Attributes:
        id: id of the word within its sentence (0 is the synthetic ROOT node).
        head_id: id of the word's head (-1 for the ROOT node).
        text: the word's text.
        typ: the word's tag (getDependency stores the word's xpos here).
        deps: maps dependent id -> (dependent id, dependent text, relation type).
    """
    def __init__(self, id:int, head_id:int, text:str, typ: str, deps:Optional[dict]):
        self.id = id
        self.head_id = head_id
        self.text = text
        self.typ = typ
        # Identity test instead of `== None`; a caller-supplied dict is stored as-is (not copied).
        self.deps = {} if deps is None else deps

    def addDependent(self, dep_id:int, dep_text:str, rel_type:str):
        """Register a dependent word; the first registration for a given id wins."""
        if dep_id in self.deps:
            return
        self.deps[dep_id] = (dep_id, dep_text, rel_type)

    def __str__(self):
        """Multi-line, tab-indented dump of the node and its dependents."""
        # str() on every field so a missing text/tag (None) is printed instead of raising TypeError.
        pieces = [
            "ID: " + str(self.id),
            "\n\tHead ID: " + str(self.head_id),
            "\n\tText: " + str(self.text),
            "\n\tType: " + str(self.typ),
            "\n\tDependent Words: ",
        ]
        for dependent in self.deps.values():
            pieces.append("\n\t\tDep. ID: " + str(dependent[0]))
            pieces.append("\n\t\t\tDep. Text: " + str(dependent[1]))
            pieces.append("\n\t\t\tRelation Type: " + str(dependent[2]))
        return "".join(pieces)
            

# returns an easier-to-traverse dependency tree keyed by each word's id in the sentence, plus a dictionary mapping each word's text to the list of ids where it occurs
def getDependency(input_dep:list):
    """Turn a list of (head word, relation, dependent word) entries into depNode form.

    Each entry is indexed as entry[0] (head), entry[1] (relation type) and
    entry[2] (dependent); the word objects are read through their `id`,
    `text`, `head` and `xpos` attributes.

    Returns:
        (dependency_dict, text_to_ids) where dependency_dict maps word id ->
        depNode (id 0 is a synthetic ROOT node) and text_to_ids maps lowercased
        word text -> list of ids with that text, e.g. for "cakes are cakes":
        text_to_ids["cakes"] == [1, 3].
    """
    # id 0 is reserved for the special ROOT node
    nodes_by_id = {0: depNode(0, -1, "ROOT", "N/A", {})}
    ids_by_text = {}

    def _register(word, lowered_text):
        # Only words not seen before get a node and an entry in the text index.
        if word.id in nodes_by_id:
            return
        nodes_by_id[word.id] = depNode(word.id, word.head, lowered_text, word.xpos, {})
        known_ids = ids_by_text.get(lowered_text)
        if known_ids is None:
            ids_by_text[lowered_text] = [word.id]
        elif word.id not in known_ids:
            ids_by_text[lowered_text] = known_ids + [word.id]

    for entry in input_dep:
        governor = entry[0]
        dependent = entry[2]
        governor_text = governor.text.lower()
        dependent_text = dependent.text.lower()
        relation = entry[1]

        _register(governor, governor_text)
        _register(dependent, dependent_text)

        # hang the dependent off its head word
        nodes_by_id[governor.id].addDependent(dependent.id, dependent_text, relation)

    return (nodes_by_id, ids_by_text)


In [3]:
# makes sentence parsing better through removing problematic verbs that make sentences more complex.
# this should only be done if we know it isn't about the recipe or next/previous/current step!!!! it removes some stuff important to those
def removeProblemWords(text:str):
    """Strip filler words that make the dependency parse harder.

    For each of "let('s)", "can", "know", "me", "you", "we", "love(s/d)",
    "like(s/d)" and "step(s)" (in that order), the FIRST case-insensitive
    occurrence that starts at the beginning of the string or after whitespace
    and is followed by a non-word character is replaced by a single space, and
    the result is stripped of leading/trailing whitespace.

    If nothing matches, `text` is returned unchanged (not stripped).
    """
    # Raw strings: "\s" / "\W" in a normal string literal are invalid escape sequences.
    problem_patterns = (
        r"(^|\s)let\W(s\W)*",
        r"(^|\s)can\W",
        r"(^|\s)know\W",
        r"(^|\s)me\W",
        r"(^|\s)you\W",
        r"(^|\s)we\W",
        r"(^|\s)love(s|d)*\W",  # evil programmer removes love
        r"(^|\s)like(s|d)*\W",
        r"(^|\s)steps*\W",
    )

    final = text
    for pattern in problem_patterns:
        # Match case-insensitively on the string that is actually sliced. Searching a
        # lowercased copy gives misaligned spans when lowercasing changes the length.
        hit = re.search(pattern, final, re.IGNORECASE)
        if hit is not None:
            final = (final[:hit.start()] + " " + final[hit.end():]).strip()

    return final

# returns a list with each entry corresponding to a dependent word on the head word provided
# (word text, relation to head, word type, id in dependency dict)
# ex: ("it", "obj", "PRP", 5)
def getDepInfo(input_deps:dict, head:depNode):
    """List the direct dependents of `head`.

    Each item is (dependent text, relation to head, dependent's `typ` looked up
    in `input_deps`, dependent id), e.g. ("it", "obj", "PRP", 5). Items follow
    the iteration order of `head.deps`.
    """
    return [
        (dependent[1], dependent[2], input_deps[dependent[0]].typ, dependent[0])
        for dependent in head.deps.values()
    ]

def isVague(text:str, typ:str):
    """Return True when a word counts as vague.

    A word is vague when its tag contains any of PRP, DT, WP, WRB, IN or JJ,
    when its lowercased text contains "thing", or when it is exactly "stuff".
    """
    lowered = text.lower()
    vague_tag_fragments = ("PRP", "DT", "WP", "WRB", "IN", "JJ")
    if any(fragment in typ for fragment in vague_tag_fragments):
        return True
    return "thing" in lowered or lowered == "stuff"

def determineVaguenessFromDep(input_deps:dict):
    """Classify a parsed input as specific (False) or not, based on its head word.

    Prints the head node. When the head's tag contains "VB", the first
    dependent whose relation contains "obj" is checked, then the first whose
    relation contains "nsubj"; a non-vague one (per isVague) means the input
    is specific and False is returned. The "these" case is handled by the caller.

    NOTE(review): every other path returns the head word's tag string rather
    than a boolean; noun and WRB heads have no dedicated handling yet.
    """
    root_word = getHeadWord(input_deps)
    root_tag = root_word.typ

    print(root_word)
    dependents = getDepInfo(input_deps, root_word)

    if "VB" in root_tag:
        # object first, then subject: only the first dependent of each kind is consulted
        for relation_fragment in ("obj", "nsubj"):
            first_hit = next((dep for dep in dependents if relation_fragment in dep[1]), None)
            if first_hit is not None and not isVague(first_hit[0], first_hit[2]):
                return False

    return root_tag

# wrapper for determineVaguenessFromDep, with a few easy exit cases
# NOTE that vagueness doesn't really check how vague something is, but is more of a distinguisher between situations we would
# call a google search or where we would check recipe steps.
# so for example, "What is it?" would be an actually vague question that this says is vague.
# However, "How do I cut these strawberries?" isn't actually vague by definition since it is being very specific, but it would fall under the same
# type of procedure we would use for a vague step, where we check the instructions for info.
# Something like "How do I cut a strawberry?" is kind of middle-of-the-road vagueness compared to the previous examples, but we would say it is being
# specific since they aren't explicitly referring to the instructions (there may be some implicit intention there but ambiguity is hard).
def determineVagueness(text:str):
    """Wrapper around determineVaguenessFromDep with an easy exit case.

    Returns True immediately when "these" appears anywhere in the lowercased
    text. Otherwise the text is cleaned with removeProblemWords, parsed with
    the module-level `depgram` pipeline, and the first sentence's dependencies
    are handed to determineVaguenessFromDep, whose result is returned.
    """
    # if "these" is in the text, then it must be referring to something specifically in relation to the recipe
    # this is also an important exception to my other logic, bc "strawberries" is Q4 BUT "these strawberries" is Q3
    if "these" in text.lower():
        return True

    test_doc = depgram(removeProblemWords(text))
    # getDependency returns (dependency_dict, text_to_ids). Only the dict goes downstream:
    # passing the whole tuple makes getHeadWord fail on `input_deps[0].deps`.
    dependency_dict = getDependency(test_doc.sentences[0].dependencies)[0]
    return determineVaguenessFromDep(dependency_dict)

# returns the depNode (id, text, type and dependents) of the head word, i.e. the first dependent of ROOT, in the given dependency dict
def getHeadWord(input_deps:dict):
    """Return the node for the first dependent registered on the ROOT entry (key 0)."""
    root_entry = input_deps[0]
    root_dependent_ids = list(root_entry.deps.keys())
    return input_deps[root_dependent_ids[0]]

In [6]:
class Ingredient:
    """Structured form of one ingredient line of a recipe."""
    def __init__(   self, 
                    og_text:str, # the original string
                    main_comp:str, # the main part of the ingredient component, IE chicken
                    quantity:str, # a number. if a non-numerical amount, this should be None (ex: some raisins). "a" -> 1. is a string rather than a float bc fractions are more readable for recipes
                    measurement:str, # the measurement the quantity is referring to (like a cup). if no measurement (like "2 apples"), is None. if vague, non-committal amount (ex: "some"), this goes here.
                    sub_quantity:str, # number found inside a parenthetical, ex: "8" in "1 (8 ounce) can"; None if absent
                    sub_measurement: str, # rest of the parenthetical after that number, ex: "ounce"; None if absent
                    descriptors:list # other details of the ingredient listing, IE dependent nouns, adjectives, preparation verb parts (ex: "finely chopped")
                    ):
        self.og_text = og_text
        self.main_comp = main_comp
        self.quantity = quantity
        self.measurement = measurement
        self.sub_quantity = sub_quantity
        self.sub_measurement = sub_measurement
        self.descriptors = descriptors

    def __str__(self):
        """Multi-line, tab-indented summary; missing fields are shown as N/A."""
        quantity_text = "N/A" if self.quantity is None else str(self.quantity)

        # Each present part is appended independently, so a lone sub_quantity or
        # sub_measurement no longer raises TypeError from `None + " "`.
        measurement_text = ""
        if self.sub_quantity is not None:
            measurement_text += str(self.sub_quantity) + " "
        if self.sub_measurement is not None:
            measurement_text += str(self.sub_measurement) + " "
        if self.measurement is not None:
            measurement_text += str(self.measurement)
        if self.measurement is None and self.sub_quantity is None and self.sub_measurement is None:
            measurement_text = "N/A"

        if len(self.descriptors) < 1:
            descriptor_text = "\n\t\tN/A"
        else:
            descriptor_text = "".join("\n\t\t" + str(dt) for dt in self.descriptors)

        return (
            "Ingredient: " + str(self.main_comp)
            + "\n\tQuantity: " + quantity_text
            + "\n\tMeasurement: " + measurement_text
            + "\n\tDescriptors: " + descriptor_text
            + "\n\tOriginal text: " + str(self.og_text)
        )

def combineItemsIntoPhrase(its:list):
    """Join tokens with single spaces, gluing punctuation-led tokens to the left.

    A token whose first character is one of . , ' ; : - / is attached directly
    to the text before it (any trailing whitespace accumulated so far is
    dropped). The HTML entity "&#39;" is turned into an apostrophe at the end,
    and trailing whitespace is removed from the result.
    """
    glue_left = (".", ",", "'", ";", ":", "-", "/")
    phrase = ""
    for token in its:
        if token[:1] in glue_left:
            phrase = phrase.rstrip()
        phrase = phrase + token + " "
    return phrase.replace("&#39;", "'").rstrip()

def floatFromFractionString(frac: str):
    """Convert a decimal string ("0.5") or a simple fraction string ("1/2") to a float.

    The string is first tried as a plain float; if that fails it is split on
    the first "/" and numerator / denominator is returned.

    Raises:
        ValueError: if `frac` is neither a number nor a "numerator/denominator" pair.
        ZeroDivisionError: if the denominator is zero.
    """
    try:
        return float(frac)
    except ValueError:
        # Not a plain number: fall through to fraction parsing. Only ValueError is
        # handled so unrelated errors are not silently swallowed by a bare except.
        pass

    numerator, slash, denominator = frac.partition("/")
    if not slash:
        raise ValueError("not a number or fraction: " + repr(frac))
    return float(numerator) / float(denominator)

# tries to find a quantity and measurement if none were found by directly analyzing the head word
def tryFindQuantity(input_deps:dict, head:depNode, head_rel_type:str, meas: str):
    """Depth-first search below `head` for a quantity and a measurement word.

    Args:
        input_deps: word id -> depNode for the whole phrase.
        head: node currently being inspected.
        head_rel_type: relation of `head` to its own head (None for the starting node).
        meas: measurement already known by the caller, or None.

    Returns:
        (quantity, measurement); either may be None. A "nummod" node yields its
        text as quantity, a "det" node yields "1" for "a"/"an" (or its text when
        `meas` is set), and an "nmod:npmod" node supplies the measurement. When
        several quantities are found under one node they are multiplied together.
    """
    quantity = None
    measurement = None

    # if measurement, then set measurement
    if head_rel_type == "nmod:npmod":
        measurement = head.text

    if head_rel_type == "nummod":
        return (head.text, measurement)
    if head_rel_type == "det":
        if head.text.lower() in ["a", "an"]:
            return (str(1), measurement)
        if meas is not None:
            return (head.text, measurement)

    for dd in head.deps:
        rel_type = head.deps[dd][2]
        (temp_quantity, temp_measurement) = tryFindQuantity(input_deps, input_deps[head.deps[dd][0]], rel_type, measurement)
        if temp_measurement is not None:
            measurement = temp_measurement
        if temp_quantity is not None:
            if quantity is None:
                quantity = temp_quantity
            else:
                # Assignment, not `==`: the comparison computed the product and then threw it away.
                quantity = str(floatFromFractionString(quantity) * floatFromFractionString(temp_quantity))
            if measurement is None:
                # a quantity with no measurement word: the counted word itself acts as the measurement
                measurement = head.text

    return (quantity, measurement)

def _splitParenthetical(ingred: str):
    """Cut a "(...)" group out of `ingred`; return (ingred, sub_quantity, sub_measurement)."""
    sub_quantity = None
    sub_measurement = None
    if "(" in ingred and ")" in ingred:
        paren_match = re.search(r"\((.+)\)", ingred)
        if paren_match is not None:
            sub_phrase = paren_match.group(1)
            # Remove exactly the matched span. str.replace(sub_phrase, "") would also delete
            # the same text anywhere else in the line.
            ingred = ingred[:paren_match.start()] + ingred[paren_match.end():]
            nums = re.search(r"\s*([\d/\.]+)\s*", sub_phrase)
            if nums is not None:
                sub_quantity = nums.group(1)
                sub_measurement = sub_phrase[nums.end():]
        # drop any parentheses that are still left
        ingred = ingred.replace("(", "").replace(")", "")
    return ingred, sub_quantity, sub_measurement


def _splitAfterComma(ingred: str):
    """Cut everything after the first comma out of `ingred`; return (ingred, after_comma)."""
    comma_match = re.search(r",(.+)", ingred)
    if comma_match is None:
        # no comma, or nothing after it: leave the phrase untouched
        return ingred, None
    after_comma = comma_match.group(1)
    # Slice by position. str.replace(after_comma, "") would also strip an identical word
    # that appears BEFORE the comma (ex: "1 cup diced tomatoes, diced").
    ingred = ingred[:comma_match.start()] + ingred[comma_match.end():]
    return ingred.replace(",", ""), after_comma


def getIngredientParameters(depgram, ingred:str):
    """Parse one ingredient line into an Ingredient.

    Args:
        depgram: callable that parses text and returns a document whose
            `sentences[0].dependencies` can be fed to getDependency.
        ingred: the ingredient line, ex: "1 (8 ounce) can tomato sauce, drained".

    Returns:
        Ingredient built from the original text, the main component (head noun
        plus compounds), quantity, measurement, the parenthetical
        sub-quantity/sub-measurement, and the descriptors (modifiers plus the
        text after the first comma).
    """
    og_ingred = ingred

    ingred, sub_quantity, sub_measurement = _splitParenthetical(ingred)
    ingred, after_comma = _splitAfterComma(ingred)

    doc = depgram(ingred)

    depend = getDependency(doc.sentences[0].dependencies)[0]
    head_dep = getHeadWord(depend)

    # from what I can tell, the head noun will reference 4 common types of dependent relations:
    #   1: "amod" | "parataxis": should go into the descriptors list [note that the parataxis is something weird where if there is a comma or something]
    #   2: "compound": should be part of the main_comp (make sure to figure out how to enjoin with the head word)
    #   3: "nmod:npmod": should be the measurement field. if it exists, we need to go there and get the nummod or a det ("a", "some", etc.)
    #   4: "nummod": this should be the quantity field. if this is dependent on the root noun, then there is no measurement word, so set it to None

    descriptors = []
    compounds = []
    measurement = None
    quantity = None
    main_comp = head_dep.text

    deps = getDepInfo(depend, head_dep)

    for dd in deps:
        if dd[1] == 'nmod:npmod':
            measurement = dd[0]
            for tq in getDepInfo(depend, depend[dd[3]]):
                if tq[0] == 'no':
                    continue
                if tq[1] == 'nummod':
                    quantity = tq[0]
                elif tq[1] == 'det' and tq[0].lower() in ('a', 'an'):
                    quantity = str(1)
                elif quantity is None:
                    quantity = tq[0]
                    # note no break here bc this is a failsafe in case there is no appropriate quantity descriptor
        elif dd[1] in ('amod', 'parataxis', 'conj', 'acl'):
            # also add certain dependents on the amod if they exist (ex: "all-purpose flour" has "all" as dependent on "purpose")
            temp_text = dd[0]
            temp_adds = [td[0] for td in getDepInfo(depend, depend[dd[3]]) if td[1] in ['det', 'amod', 'obl']]
            if len(temp_adds) > 0:
                temp_text = combineItemsIntoPhrase([temp_text] + temp_adds)
            descriptors.append(temp_text)
        elif dd[1] in ('compound', 'appos', 'aux'):
            # container words act as the measurement rather than as part of the name
            remove_list = ["package", "packages", "can", "cans", "jar", "jars"]
            if dd[0] in remove_list:
                measurement = dd[0]
            else:
                # adjectives hanging off the compound come first in the main component
                for abc in list(depend[dd[3]].deps.keys()):
                    if depend[dd[3]].deps[abc][2] == 'amod':
                        compounds.append(depend[abc].text)
                compounds.append(dd[0])
        elif dd[1] == 'nummod':
            quantity = dd[0]
        elif dd[1] == 'det':
            if dd[0].lower() in ('a', 'an'):
                quantity = str(1)
            else:
                if dd[0].lower() == 'no':
                    compounds.append(dd[0])
                    continue
                quantity = dd[0]
        elif dd[1] in ['aux', 'punct']:
            # ('aux' is already consumed by the compound branch above; only 'punct' reaches here)
            continue
        else:
            # unknown relation: report it so new cases can be added
            print("??????")
            print(dd)

    if quantity is None and measurement is None:
        # we should have at least one of them, so try going through each dependent recursively, depth-first
        for cc in deps:
            (temp_quant, temp_meas) = tryFindQuantity(depend, depend[cc[3]], None, None)
            if quantity is None and temp_quant is not None:
                quantity = temp_quant

            if measurement is None and temp_meas is not None:
                measurement = temp_meas

            if quantity is not None and measurement is not None:
                break

    if len(compounds) > 0:
        compounds.append(main_comp)
        main_comp = combineItemsIntoPhrase(compounds)

    if after_comma is not None:
        descriptors.append(after_comma.strip())

    return Ingredient(og_ingred, main_comp, quantity, measurement, sub_quantity, sub_measurement, descriptors)
    



In [7]:
# Sample ingredient lists used while developing getIngredientParameters.
# Each assignment overwrites the previous one, so only the LAST uncommented
# `test_phrases = ...` line below is the list that actually gets parsed.
test_phrases = ['1 cup butter, softened', '1 cup white sugar', '1 cup packed brown sugar', '2 eggs', '2 teaspoons vanilla extract', '1 teaspoon baking soda', '2 teaspoons hot water', '0.5 teaspoon salt', '3 cups all-purpose flour', '2 cups semisweet chocolate chips', 'a cup chopped walnuts', 'an orange']
test_phrases = ['1 tablespoon olive oil', '1 small onion, diced', '4 cloves garlic, minced', '1.5 pounds ground beef', '1 teaspoon garlic powder', '1 (28 ounce) jar sausage flavored spaghetti sauce', '1 (8 ounce) can tomato sauce', '1 teaspoon dried oregano', '1 (8 ounce) package shredded mozzarella cheese', '1 (8 ounce) package shredded provolone cheese', '1 (15 ounce) container ricotta cheese', '0.25 cup milk', '2 large eggs', '0.5 teaspoon dried oregano', '9 no-boil lasagna noodles', '0.25 cup grated Parmesan cheese']
test_phrases = ['2 (7 ounce) packages shirataki noodles, drained', '1 tablespoon vegetable oil', '1 (12 ounce) package tofu, cut into chunks', '0.25 cup reduced-sodium soy sauce', '0.5 cup lemon juice', '0.25 cup white sugar', '2 tablespoons peanut butter', '1 tablespoon sriracha hot sauce', '2 eggs', '1 (4.5 ounce) can mushrooms', '0.5 cup chopped cashews, divided', '1 cup bean sprouts', '1 lime, cut into wedges']
test_phrases = ['3 cups Burgundy wine', '2 onions, thinly sliced', '2 carrots, chopped', '2 tablespoons brandy', '1 clove garlic, crushed', '10 whole black peppercorns', '1 teaspoon salt', '1 sprig fresh parsley', '1 bay leaf', '2 pounds cubed beef chuck roast', '4 tablespoons olive oil, divided', '0.25 pound bacon, cubed', '2 onions, chopped', '3 tablespoons all-purpose flour', '2 cloves garlic, crushed', '1 tablespoon tomato paste', '1 (10.5 ounce) can beef broth', 'salt and pepper to taste', '4 tablespoons butter', '1 pound fresh mushrooms, sliced']
test_phrases = ['0.25 cup sesame oil', '0.25 cup lemon juice', '0.25 cup soy sauce', '2 tablespoons brown sugar, or more to taste', '1 tablespoon sesame seeds', '1 teaspoon ground mustard', '1 teaspoon ground ginger', '0.25 teaspoon garlic powder', '4 (6 ounce) salmon steaks']
test_phrases = ['2 tablespoons olive oil', '1 carrot, diced', '0.5 green bell pepper, diced', '2 cups shrimp, peeled and deveined', '0.5 onion, diced', '0.5 (15.25 ounce) can whole kernel corn, drained', '2 cloves garlic, thinly sliced', '1 tablespoon olive oil', '2 eggs, beaten', '4 cups cooked rice, cooled - or more to taste', '2 tablespoons oyster sauce, or more to taste', '2 tablespoons soy sauce', '1 tablespoon butter', '0.5 teaspoon salt', '1 teaspoon butter, or as needed', '4 eggs, divided']
test_phrases = ['1 (12 ounce) package ladyfingers', '0.25 cup unsalted butter, melted', '2 tablespoons coffee-flavored liqueur', '3 (8 ounce) packages cream cheese, softened', '1 (8 ounce) container mascarpone cheese, softened', '1 cup white sugar', '2 tablespoons coffee-flavored liqueur', '0.25 cup all-purpose flour', '2 large eggs', '1 teaspoon heavy cream, or more as needed', '0.25 ounce semisweet chocolate'] 
test_phrases = ['1 tablespoon vegetable oil', '1 cup long grain white rice', '1.5 cups chicken broth', '1 tomato, seeded and chopped', '0.5 onion, finely chopped', '0.5 green bell pepper, finely chopped', '1 fresh jalapeno pepper, chopped', '0.5 cup chopped fresh cilantro', '1 cube chicken bouillon', '1 clove garlic, halved', '0.5 teaspoon ground cumin', 'salt and pepper to taste']







# test_phrases = ['1 (28 ounce) jar sausage flavored spaghetti sauce', '1 (8 ounce) can tomato sauce', '1 (8 ounce) package shredded mozzarella cheese', '1 (8 ounce) package shredded provolone cheese', '1 (15 ounce) container ricotta cheese']
# test_phrases = ['1 (12 ounce) package tofu, cut into chunks']
# Active fixture: this is the list the loop further down parses.
test_phrases = ['1 (8 ounce) container mascarpone cheese, softened']
# test_doc = depgram(removeProblemWords(test_phrase))

# print(test_doc.sentences[0].constituency, 0)
# print(test_doc.sentences[0].constituency, 0)
# depe = getDependency(test_doc.sentences[0].dependencies)
# for dd in depe[0]:
#     print(depe[0][dd])

# # removeProblemWords(test_phrase)
# getHeadWord(depe[0])
# determineVaguenessFromDep(depe[0])

# print(type(["hiya"]))
# Parse every test phrase into Ingredient objects and print them.
ingredients = []

for tp in test_phrases:
    # A phrase such as "salt and pepper to taste" names two ingredients, so it is split on " and ".
    # Only the part before the first comma is searched: an "and" after the comma belongs to a
    # preparation note (ex: "2 cups shrimp, peeled and deveined") and must not create a bogus ingredient.
    before_comma, comma, after_comma_text = tp.partition(",")
    and_result = re.search(r"(.+)\sand\s(.+)", before_comma)
    if and_result is not None:
        ingredients.append(getIngredientParameters(depgram, and_result.group(1).strip()))
        # the second ingredient keeps whatever followed the comma
        tp = (and_result.group(2) + comma + after_comma_text).strip()
    ingredients.append(getIngredientParameters(depgram, tp))

for ing in ingredients:
    print(ing)

Ingredient: mascarpone cheese
	Quantity: 1
	Measurement: 8 ounce container
	Descriptors: 
		softened
	Original text: 1 (8 ounce) container mascarpone cheese, softened


In [200]:
def floatFromFractionString(frac: str):
    """Convert a decimal string ("0.5") or a simple fraction string ("1/2") to a float.

    The string is first tried as a plain float; if that fails it is split on
    the first "/" and numerator / denominator is returned.

    Raises:
        ValueError: if `frac` is neither a number nor a "numerator/denominator" pair.
        ZeroDivisionError: if the denominator is zero.
    """
    try:
        return float(frac)
    except ValueError:
        # Not a plain number: fall through to fraction parsing. Only ValueError is
        # handled so unrelated errors are not silently swallowed by a bare except.
        pass

    numerator, slash, denominator = frac.partition("/")
    if not slash:
        raise ValueError("not a number or fraction: " + repr(frac))
    return float(numerator) / float(denominator)

# Quick sanity check: the fraction string "1/2" should print 0.5.
print(floatFromFractionString("1/2"))

0.5
