# Cleaning The NYT Data for CRF Training 

In order to parse recipe data into a useable vector representation, we need a way of labeling the different parts of the natural language of the recipe ingredients, e.g. Ingredient name, unit of measurement, quantity, etc.

To do this we chose to use Conditional Random Fields (https://en.wikipedia.org/wiki/Conditional_random_field), through the use of pycrfsuite. 

To train the model we will use labelled ingredient data published by the New York Times, that they used for their own internal CRF model to label ingredients. That data can be found here: https://github.com/nytimes/ingredient-phrase-tagger.

Unfortunately, the quality of the labels is very poor, and the data needs fixing, cleaning, and standardizing.
So let's take a look.

In [1]:
from collections import Counter
import operator
import decimal
import re
from fractions import Fraction
import string
import sys
import unicodedata
import pandas as pd
from sklearn.model_selection import train_test_split
import unidecode
import spacy
nlp = spacy.load("en_core_web_lg", disable=["tagger", "parser", "ner", "textcat"])
%pprint

Pretty printing has been turned OFF


This is what the labeled NYT data looks like.

In [2]:
# Read the labelled NYT snapshot, using the CSV's own "index" column as the row index.
# The NYT has a column to label ingredient quantity ranges, e.g. 2-3 apples.
# We use the average of a range, so the range end point is not kept; the
# "comment" column is dropped along with it.
raw_data = pd.read_csv(
    "../data/raw/nyt-ingredients-snapshot-2015.csv", index_col="index"
).drop(columns=["range_end", "comment"])
# Rows without an input phrase cannot be used for training.
raw_data = raw_data.dropna(axis=0, subset=["input"])
raw_data.head(25)

Unnamed: 0_level_0,input,name,qty,unit
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1 1/4 cups cooked and pureed fresh butternut s...,butternut squash,1.25,cup
1,1 cup peeled and cooked fresh chestnuts (about...,chestnuts,1.0,cup
2,"1 medium-size onion, peeled and chopped",onion,1.0,
3,"2 stalks celery, chopped coarse",celery,2.0,stalk
4,1 1/2 tablespoons vegetable oil,vegetable oil,1.5,tablespoon
6,"2 tablespoons unflavored gelatin, dissolved in...",gelatin,2.0,tablespoon
7,Salt,Salt,0.0,
8,1 cup canned plum tomatoes with juice,plum tomatoes,1.0,cup
9,6 cups veal or beef stock,stock,6.0,cup
10,1/3 cup Worcestershire sauce,Worcestershire sauce,0.33,cup


First, let's strip out all the leftover HTML tags and cruft.

In [3]:
def clean_nyt_html(string):
    """
    Remove the HTML markup that was not stripped from the NYT data.

    The following are removed, in this order:
      * "<a href=...>see recipe</a>" links (optionally parenthesised)
      * "see <a href=...>recipe</a>" links (optionally parenthesised)
      * any remaining tag such as <span> or <a href=...>
      * literal backslash-n / backslash-t sequences left over from the
        original html, which are replaced by a space

    Returns the cleaned string with surrounding whitespace stripped.
    Missing values (NaN or None) are returned as the empty string so the
    caller can filter out blank rows.
    """
    # NaN is the only value that is not equal to itself
    if string is None or string != string:
        return ""

    # re.sub already replaces every occurrence, so one call per pattern
    # is enough.
    # this will remove all: <a href=...>see recipe</a>
    string = re.sub(r"\(?<.*see\s*recipe.*>\)?", "", string)
    # this will remove all: see <a href=...>recipe</a>
    string = re.sub(r"\(?\s*(see)\s*?<.*recipe.*>\)?", "", string)
    # This will remove all <span> and misc <a href=...>...</a>
    string = re.sub(r"<.*?>", "", string)
    # literal "\n" and "\t" text left over from the original html
    string = re.sub(r"\\n", " ", string)
    string = re.sub(r"\\t", " ", string)
    return string.strip()

In [4]:
# Work on a copy with the html stripped from both text columns.
cleaned_html = raw_data.copy()
for text_column in ("input", "name"):
    stripped = cleaned_html[text_column].apply(clean_nyt_html)
    cleaned_html[text_column] = stripped
print(cleaned_html.shape)
# Rows whose input is missing, or became empty once the html was removed,
# are worthless and are dropped.
cleaned_html = cleaned_html.dropna(axis=0, subset=["input"])
has_input = cleaned_html["input"].astype(str) != ""
cleaned_html = cleaned_html[has_input]
print(cleaned_html.shape)

(179063, 4)
(179060, 4)


Now we fix various spelling errors and standardise spelling

In [5]:
# (pattern, replacement) pairs applied in order by fix_spelling.
_SPELLING_FIXES = (
    (r"([Cc])(hipolte|hipottle)", r"\1hipotle"),
    (r"([Ff])(ritata|rittatta|ritatta|ritartar)", r"\1rittata"),
    (r"([Cc])reme\s[Ff](resh|raishe)", r"\1reme fraiche"),
    (r"([Mm])(ascapone|ascaprone)", r"\1ascarpone"),
    (r"([Bb])(russel|russle)\s[Ss]prout", r"\1russels sprout"),
    (r"([Gg])nocci", r"\1nocchi"),
    (r"([Mm])(accaroni|acarroni)", r"\1acaroni"),
    (r"([Mm])(acaroon|accaron|acarron)", r"\1acaron"),
    (r"([Ff])(ettuccini|ettucine|ettucchine)", r"\1ettuccine"),
    (r"([Ee])xpresso", r"\1spresso"),
    # No trailing space in the last alternative: it used to swallow the
    # space after the word and miss the word at the end of a string.
    (r"([Mm])(ozzarrella|ozarela|ozzarela)", r"\1ozzarella"),
    (r"([Ss])herbert", r"\1herbet"),
    (r"([Cc])ardamon", r"\1ardamom"),
    (r"([Ll])inguini", r"\1inguine"),
    (r"([Ll])iquer", r"\1iqueur"),
    (r"([Ww])on\ston", r"\1onton"),
    # "chile"/"chiles" -> "chili"/"chilis", but leave "Chilean" alone
    (r"([Cc])hile(?!an)", r"\1hili"),
    (r"([Cc])hilies", r"\1hilis"),
    # html entities, possibly double escaped as &amp;...;
    (r"(\&amp\;|\&)e(acute|grave)\;", "e"),
    (r"(\&amp\;|\&)icirc\;", "i"),
    (r"(\&amp\;|\&)ucirc\;", "u"),
    (r"(\&amp\;|\&)\#231\;", "c"),
    (r"(\&amp\;|\&)rsquo\;", "'"),
    (r"(\&amp\;|\&)ntilde\;", "n"),
    # words that were run together
    (r"redpepper", "red pepper"),
    (r"blackpepper", "black pepper"),
    (r"roastedalmonds", "roasted almonds"),
    (r"XXshiitake", "shiitake"),
    # Handling misc edge case
    (r"1 1\/2\½", "1 1/2"),
    (r"1\#3", "1/3"),
    (r"1\#12", "1 12"),
)


def fix_spelling(string):
    """
    Fix known misspellings and standardise the spelling of ingredients.

    Every (pattern, replacement) pair in _SPELLING_FIXES is applied in
    order with re.sub. Besides spelling, the table also replaces a few
    html entities with plain letters, splits some run-together words and
    repairs a handful of malformed quantities.

    Values that are not strings (e.g. NaN for a missing label) are
    returned unchanged.
    """
    if not isinstance(string, str):
        return string
    for pattern, replacement in _SPELLING_FIXES:
        string = re.sub(pattern, replacement, string)
    return string

In [6]:
# Apply the spelling fixes to both text columns of a fresh copy.
fixed_spelling = cleaned_html.assign(
    **{
        text_column: cleaned_html[text_column].apply(fix_spelling)
        for text_column in ("input", "name")
    }
)
fixed_spelling.head(25)

Unnamed: 0_level_0,input,name,qty,unit
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1 1/4 cups cooked and pureed fresh butternut s...,butternut squash,1.25,cup
1,1 cup peeled and cooked fresh chestnuts (about...,chestnuts,1.0,cup
2,"1 medium-size onion, peeled and chopped",onion,1.0,
3,"2 stalks celery, chopped coarse",celery,2.0,stalk
4,1 1/2 tablespoons vegetable oil,vegetable oil,1.5,tablespoon
6,"2 tablespoons unflavored gelatin, dissolved in...",gelatin,2.0,tablespoon
7,Salt,Salt,0.0,
8,1 cup canned plum tomatoes with juice,plum tomatoes,1.0,cup
9,6 cups veal or beef stock,stock,6.0,cup
10,1/3 cup Worcestershire sauce,Worcestershire sauce,0.33,cup


Let's take a look at the characters present and see if there's anything we wouldn't expect in a recipe, e.g. unescaped unicode, bizarre punctuation, etc.

In [7]:
# Count every character that occurs in the text columns.
c = Counter()
for col in ["input", "name", "unit"]:
    for cell in fixed_spelling[col]:
        c.update(str(cell))

# Letters and digits are expected, so drop them and look at what is left.
alphanumeric = [char for char in c if char.isalpha() or char.isdigit()]
for char in alphanumeric:
    del c[char]
sorted(c)

[' ', '!', '"', '#', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '?', '@', '[', ']', '`', '|', '\x90', '\xa0', '¼', '½', '¾', '×', '–', '—', '‘', '’', '“', '”', '⁄', '⅓']

OK, so we have unescaped unicode, non-standard hyphens and quotes, as well as unicode fractions. Let's fix this.

In [89]:
def fix_characters(string):
    """
    Normalise unusual characters in an ingredient string.

    * non-breaking spaces become plain spaces and the \\x90 control
      character is dropped
    * the multiplication sign becomes the letter x
    * stray punctuation and symbols are removed
    * curly single/double quotes become their plain ascii counterpart
    * "&" becomes "and", the fraction slash becomes "/"
    * runs of spaces are collapsed into a single space

    Hyphens are deliberately left alone here; they are needed later to
    recognise quantity ranges.

    Values that are not strings (e.g. NaN for a missing label) are
    returned unchanged.
    """
    if not isinstance(string, str):
        return string

    string = string.replace("\xa0", " ")
    string = string.replace("\x90", "")
    string = string.replace("×", "x")
    # Wait to process hyphens until after ingredient ranges are processed.
    # The following characters only appear a very small number of times
    # each in the data and are removed
    string = re.sub(r"[\!\*\|\`\@\+\?\�\™\‿\•\®\§\¤\[\]\u2028]", "", string)
    # a curly quote becomes ONE straight quote (it used to become two)
    string = re.sub(r"[\‘\’]", "'", string)
    string = re.sub(r"[\“\”\″\‟]", '"', string)
    string = re.sub(r"\&", "and", string)
    string = string.replace("⁄", "/")
    # NOTE(review): this character is mapped to "n", presumably a mangled
    # n-tilde - confirm against the raw data
    string = string.replace("‱", "n")
    # the result of this step used to be assigned to a misspelled name
    # and thrown away
    string = re.sub(r" {2,}", " ", string)
    return string

In [90]:
# Normalise the characters of every text column on a fresh copy.
fixed_characters = fixed_spelling.copy()
for text_column in ("input", "name", "unit"):
    normalised = fixed_characters[text_column].apply(fix_characters)
    fixed_characters[text_column] = normalised
fixed_characters.iloc[38:54]

Unnamed: 0_level_0,input,name,qty,unit
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
39,"Vegetable oil, for deep-frying",Vegetable oil,0.0,
40,pound elbow macaroni,elbow macaroni,0.0,pound
41,Freshly ground black pepper,black pepper,0.0,
42,2 teaspoons Dijon mustard,Dijon mustard,2.0,teaspoon
43,cup heavy cream,heavy cream,0.0,cup
44,2 cups whole milk,milk,2.0,cup
45,1/2 teaspoon hot red pepper flakes,hot red pepper flakes,0.5,teaspoon
46,3 tablespoons flour,flour,3.0,tablespoon
47,5 cups coarsely grated extra-sharp Cheddar,Cheddar,5.0,cup
48,2 cups panko or coarse dry bread crumbs,bread crumbs,2.0,cup


All unit labels are words, and the training of the CRF model can't associate "oz." with "ounce", so we need to expand all abbreviations here. On top of this, the NYT data is missing labels that we can easily and accurately assign here.

In [91]:
def fix_abbreviations(string):
    """
    Expand unit abbreviations to the full word used by the unit labels.

        oz / oz.      => ounce
        ml / ml.      => milliliter
        <number>g(.)  => <number> gram
        tbsp(.)       => tablespoon
        tsp(.)        => teaspoon

    "oz" and "ml" are only expanded when they stand alone, i.e. are not
    part of a longer word; this includes the start and the end of the
    string, so a bare "oz" unit label is expanded too. "g" is only
    expanded directly after a number.

    Values that are not strings (e.g. NaN for a missing unit) are
    returned unchanged.
    """
    if not isinstance(string, str):
        return string

    # Lookarounds instead of captured neighbours: nothing but the
    # abbreviation (and its optional period) is consumed, and the
    # abbreviation may sit at either end of the string.
    string = re.sub(r"(?<!\w)oz\.?(?!\w)", "ounce", string)
    # replace ml. with milliliter
    string = re.sub(r"(?<!\w)ml\.?(?!\w)", "milliliter", string)
    # replace g. with gram
    string = re.sub(r"(\d+)\s?g\.?(?!\w)", r"\1 gram", string)
    # replace tbsp with tablespoon
    string = re.sub(r"[Tt]bsp\.*", "tablespoon", string)
    # replace tsp with teaspoon
    string = re.sub(r"[Tt]sp\.*", "teaspoon", string)
    return string

In [92]:
# Expand abbreviations in the ingredient text and in the unit label.
fixed_abr = fixed_characters.assign(
    **{
        column: fixed_characters[column].apply(fix_abbreviations)
        for column in ("input", "unit")
    }
)
fixed_abr.head(25)

Unnamed: 0_level_0,input,name,qty,unit
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1 1/4 cups cooked and pureed fresh butternut s...,butternut squash,1.25,cup
1,1 cup peeled and cooked fresh chestnuts (about...,chestnuts,1.0,cup
2,"1 medium-size onion, peeled and chopped",onion,1.0,
3,"2 stalks celery, chopped coarse",celery,2.0,stalk
4,1 1/2 tablespoons vegetable oil,vegetable oil,1.5,tablespoon
6,"2 tablespoons unflavored gelatin, dissolved in...",gelatin,2.0,tablespoon
7,Salt,Salt,0.0,
8,1 cup canned plum tomatoes with juice,plum tomatoes,1.0,cup
9,6 cups veal or beef stock,stock,6.0,cup
10,1/3 cup Worcestershire sauce,Worcestershire sauce,0.33,cup


In [94]:
numbers = {"one":1, "two":2, "three":3, "four":4, "five":5, "six":6, "seven":7, "eight":8, "nine":9, "ten":10, "dozen": 12}

# A number word standing on its own: not glued to other word characters
# (so "bone", "often" and "gluten" are left alone) and not the second half
# of a hyphenated word such as "twenty-one".
_NUMBER_WORD_RE = re.compile(
    r"(?<![\w\-])(" + "|".join(re.escape(word) for word in numbers) + r")(?!\w)",
    re.IGNORECASE,
)


def fix_numeric_words(ingredient):
    """
    Replace numbers that are written out as words with digits.

    A few common mixed amounts ("one and a half", "two and one-quarter",
    ...) are converted to their decimal value first; after that every
    stand-alone number word listed in `numbers` is replaced by its own
    value, e.g. "Two or three apples" => "2 or 3 apples" and
    "2 dozen eggs" => "2 12 eggs".

    Values that are not strings (e.g. NaN) are returned unchanged.
    """
    if not isinstance(ingredient, str):
        return ingredient

    ingredient = re.sub(r"(\sone and a half|\sone and one[\s\-]half)", " 1.5", ingredient)
    ingredient = re.sub(r"one and one[\s\-]quarter", "1.25", ingredient)
    ingredient = re.sub(r"two and one[\s\-]quarter", "2.25", ingredient)
    ingredient = re.sub(r"two and one[\s\-]half", "2.5", ingredient)
    ingredient = re.sub(r"three and a half", "3.5", ingredient)
    # A replacement function so that each word gets its own value instead
    # of every word being overwritten with the value of the first one.
    return _NUMBER_WORD_RE.sub(
        lambda found: str(numbers[found.group(1).lower()]), ingredient
    )


In [95]:
# Only the ingredient text contains written-out numbers; labels are untouched.
fixed_numeric = fixed_abr.assign(
    input=fixed_abr["input"].apply(fix_numeric_words)
)
fixed_numeric.head(25)

Unnamed: 0_level_0,input,name,qty,unit
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1 1/4 cups cooked and pureed fresh butternut s...,butternut squash,1.25,cup
1,1 cup peeled and cooked fresh chestnuts (about...,chestnuts,1.0,cup
2,"1 medium-size onion, peeled and chopped",onion,1.0,
3,"2 stalks celery, chopped coarse",celery,2.0,stalk
4,1 1/2 tablespoons vegetable oil,vegetable oil,1.5,tablespoon
6,"2 tablespoons unflavored gelatin, dissolved in...",gelatin,2.0,tablespoon
7,Salt,Salt,0.0,
8,1 cup canned plum tomatoes with juice,plum tomatoes,1.0,cup
9,6 cups veal or beef stock,stock,6.0,cup
10,1/3 cup Worcestershire sauce,Worcestershire sauce,0.33,cup


In [96]:
# Qty in data are rounded up to two decimal places
decimal.getcontext().rounding = decimal.ROUND_HALF_UP
def clean_unicode_fractions(string):
    """
    Replace unicode fractions with their decimal value, rounded half up
    to two decimal places. A whole number directly in front of the
    fraction (with or without a space) is added to it.

    "1\u00be cups" => "1.75 cups"
    "\u215b teaspoon" => "0.13 teaspoon"

    Values that are not strings (e.g. NaN) are returned unchanged.
    """
    # UNHANDLED EDGE CASE: There are a handful of ingredients in which the whole number is a quantity
    # multiplier and not part of the fraction, e.g. 2 1/4 in cinnamon sticks, should be 0.5 not 2.25
    if not isinstance(string, str):
        return string

    def _as_decimal_text(found):
        # group 1: optional whole number, group 2: the unicode fraction
        whole, fraction = found.groups()
        value = unicodedata.numeric(fraction)
        if whole:
            value += float(whole)
        # explicit rounding mode, so the result does not depend on the
        # state of the global decimal context
        rounded = decimal.Decimal(value).quantize(
            decimal.Decimal("0.01"), rounding=decimal.ROUND_HALF_UP
        )
        # "f" keeps normalize() from switching to exponent notation
        return format(rounded.normalize(), "f")

    # Each match is converted by the callback, so the value always
    # belongs to the text it replaces.
    return re.sub(
        r"(\d+\s?)?([\u2150-\u215E\u00BC-\u00BE])", _as_decimal_text, string
    )

In [97]:
# Unicode has numerous characters to represent fractions like ¾; they are
# converted to plain decimal numbers in the ingredient text.
fixed_uni_fractions = fixed_numeric.assign(
    input=fixed_numeric["input"].apply(clean_unicode_fractions)
)
fixed_uni_fractions.head(25)

Unnamed: 0_level_0,input,name,qty,unit
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1 1/4 cups cooked and pureed fresh butternut s...,butternut squash,1.25,cup
1,1 cup peeled and cooked fresh chestnuts (about...,chestnuts,1.0,cup
2,"1 medium-size onion, peeled and chopped",onion,1.0,
3,"2 stalks celery, chopped coarse",celery,2.0,stalk
4,1 1/2 tablespoons vegetable oil,vegetable oil,1.5,tablespoon
6,"2 tablespoons unflavored gelatin, dissolved in...",gelatin,2.0,tablespoon
7,Salt,Salt,0.0,
8,1 cup canned plum tomatoes with juice,plum tomatoes,1.0,cup
9,6 cups veal or beef stock,stock,6.0,cup
10,1/3 cup Worcestershire sauce,Worcestershire sauce,0.33,cup


In [98]:
fixed_unicode = fixed_uni_fractions.copy()
c = Counter()
for col in ["input", "name"]:
    # remove all accent characters by transliterating to plain ascii
    ascii_text = fixed_unicode[col].apply(lambda cell: unidecode.unidecode(str(cell)))
    fixed_unicode[col] = ascii_text
    # count the characters that are left so the result can be checked
    for cell in ascii_text:
        c.update(str(cell))
sorted(c)

[' ', '"', '#', '%', "'", '(', ')', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

In [99]:
# Qty in data are rounded up to two decimal places
decimal.getcontext().rounding = decimal.ROUND_HALF_UP
def merge_fractions(string):
    """
    Convert fractions to decimal numbers, rounded half up to two decimal
    places.

    Mixed fractions are merged first:   "1 2/3" or "1-2/3" => "1.67"
    then the remaining plain fractions: "1/3" => "0.33"

    A fraction with a zero denominator is left as it is. Values that are
    not strings (e.g. NaN) are returned unchanged.
    """
    if not isinstance(string, str):
        return string

    def _as_text(value):
        # explicit rounding mode, so the result does not depend on the
        # state of the global decimal context
        rounded = decimal.Decimal(value).quantize(
            decimal.Decimal("0.01"), rounding=decimal.ROUND_HALF_UP
        )
        # "f" keeps normalize() from writing 10 as "1E+1"
        return format(rounded.normalize(), "f")

    def _merge_mixed(found):
        try:
            value = float(found.group(1)) + float(Fraction(found.group(2)))
        except ZeroDivisionError:
            return found.group(0)
        return _as_text(value)

    def _merge_plain(found):
        try:
            value = float(Fraction(found.group(1)))
        except ZeroDivisionError:
            return found.group(0)
        return _as_text(value)

    # Each match is converted by a callback, so the value always belongs
    # to the text it replaces, even when a string holds several fractions.
    string = re.sub(r"(\d+)[\-\s](\d+\/\d+)", _merge_mixed, string)
    return re.sub(r"(\d+\/\d+)", _merge_plain, string)

In [100]:
# Many ingredient quantities are written as 1 1/2 to represent 1.5.
# The quantity label, however, is always written as 1.5, so these
# fractions are converted to let the crf match text and label.
merged_fractions = fixed_unicode.assign(
    input=fixed_unicode["input"].apply(merge_fractions)
)
merged_fractions.head(25)

Unnamed: 0_level_0,input,name,qty,unit
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1.25 cups cooked and pureed fresh butternut sq...,butternut squash,1.25,cup
1,1 cup peeled and cooked fresh chestnuts (about...,chestnuts,1.0,cup
2,"1 medium-size onion, peeled and chopped",onion,1.0,
3,"2 stalks celery, chopped coarse",celery,2.0,stalk
4,1.5 tablespoons vegetable oil,vegetable oil,1.5,tablespoon
6,"2 tablespoons unflavored gelatin, dissolved in...",gelatin,2.0,tablespoon
7,Salt,Salt,0.0,
8,1 cup canned plum tomatoes with juice,plum tomatoes,1.0,cup
9,6 cups veal or beef stock,stock,6.0,cup
10,0.33 cup Worcestershire sauce,Worcestershire sauce,0.33,cup


In [101]:
decimal.getcontext().rounding = decimal.ROUND_HALF_UP
def merge_quantities(string):
    """
    Reduce the quantity of an ingredient to a single number.

    Many ingredients are written in the form 2 8.5-ounce cans...
    This is both tricky for the model to parse and made worse because
    the labeled data inconsistently labels the quantity as 2, 8.5, or 17.
    Three steps are applied to the text, in this order:

      1. number ranges are averaged:    "3 to 4 pounds" => "3.5 pounds"
      2. a number followed by a second number is treated as a multiplier:
                                        "2 8.5-ounce"   => "17-ounce"
      3. all hyphens and dashes are replaced by a space:
                                        "17-ounce"      => "17 ounce"

    Results are rounded half up to two decimal places. Only the text is
    changed; the quantity label is not touched by this function.

    Values that are not strings (e.g. NaN) are returned unchanged.
    """
    if not isinstance(string, str):
        return string

    # An integer or a well-formed decimal. Matching the decimal as a
    # whole keeps "3.5 2" from being read as 5 x 2.
    number = r"(\d+(?:\.\d+)?)"

    def _as_text(value):
        # explicit rounding mode, so the result does not depend on the
        # state of the global decimal context
        rounded = decimal.Decimal(value).quantize(
            decimal.Decimal("0.01"), rounding=decimal.ROUND_HALF_UP
        )
        # "f" keeps normalize() from writing 10 as "1E+1"
        return format(rounded.normalize(), "f")

    def _average(found):
        return _as_text((float(found.group(1)) + float(found.group(2))) / 2)

    def _multiply(found):
        return _as_text(float(found.group(1)) * float(found.group(2)))

    # Ok first we need to average any number ranges, e.g. "3 to 4 pounds" becomes "3.5 pounds"
    string = re.sub(number + r"[\s\-]*[tor\-]+[\s\-]*" + number, _average, string)
    # now we do quantity multipliers
    string = re.sub(number + r"\s+" + number, _multiply, string)
    # Remove hyphens we skipped before
    return re.sub(r"[\–\—\‐\‑\-]", " ", string)
    

In [103]:
# Collapse ranges and multipliers in the ingredient text into one quantity.
merged_quantities = merged_fractions.assign(
    input=merged_fractions["input"].apply(merge_quantities)
)
merged_quantities.iloc[38:54]

Unnamed: 0_level_0,input,name,qty,unit
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
39,"Vegetable oil, for deep frying",Vegetable oil,0.0,
40,pound elbow macaroni,elbow macaroni,0.0,pound
41,Freshly ground black pepper,black pepper,0.0,
42,2 teaspoons Dijon mustard,Dijon mustard,2.0,teaspoon
43,cup heavy cream,heavy cream,0.0,cup
44,2 cups whole milk,milk,2.0,cup
45,0.5 teaspoon hot red pepper flakes,hot red pepper flakes,0.5,teaspoon
46,3 tablespoons flour,flour,3.0,tablespoon
47,5 cups coarsely grated extra sharp Cheddar,Cheddar,5.0,cup
48,2 cups panko or coarse dry bread crumbs,bread crumbs,2.0,cup


OK, so up to this point we have fixed all of the most "technical" issues. HTML cruft has been removed, spelling errors fixed, unicode stripped, and fractions and number values merged into a single value.

Now we need to fix a large amount of the terrible labeling.

Case in point:

In [None]:
# Show the rows whose name label contains "scallions," i.e. the name plus trailing text.
merged_quantities[merged_quantities["name"].str.contains("scallions,")]

The ingredient name for all those should just be "scallion" and the rest in the comment section.
The strategy here is to fix the most common ingredients, things related to stock, oils, vinegars, onions, garlic, flour. This will have the most impact in parsing recipes down the road.

The goal here is not to fix everything, but to fix enough.

In [169]:
# Words accepted as a unit label. fix_ingredients compares tokens of the
# lemmatized ingredient text against this list.
measurementUnit = (
    "teaspoon tablespoon cup container packet bag quart pound can bottle "
    "pint package ounce jar head gallon drop envelope bar box pinch "
    "dash bunch recipe layer slice link bulb stalk square sprig fillet "
    "piece leg thigh cube granule strip tray leave loaf halve clove "
    "cleave leaf stick"
).split()

def fix_ingredients(row):
    input_tokens = nlp(row["input"])
    lemmatized_input = " ".join([x.lemma_ for x in input_tokens])
    
    # First thing, fix oils, by far one of the most common ingredients with many different permutations
    if "oil" in lemmatized_input:
        match = re.findall(r"^(\d+\.?\d*) (\w+) (coconut|extra virgin olive|olive|avocado|pumpkin\s?seed|walnut|grapeseed|peanut|untoasted sesame|toasted sesame|dark sesame|light sesame|sesame|flaxseed|sunflower seed|sunflower|safflower|canola|vegetable|neutral|corn|ghee|clarified butter)( oil)? \, (\w+)( \w+)?( oil)? \, (\w+)( \w+)?( oil)? or (\w+)( \w+)? oil", lemmatized_input)
        for m in match:
            row["qty"] = float(m[0])
            if m[1] in measurementUnit:
                row["unit"] = m[1]
            if "ghee" in m[2] or "clarify butter" in m[2]:
                row["name"] = m[2]
            else:
                row["name"] = m[2] + " oil"
        if match == []:
            match = re.findall(r"^(\d+\.?\d*) (\w+) (coconut|extra virgin olive|olive|avocado|pumpkin\s?seed|walnut|grapeseed|peanut|untoasted sesame|toasted sesame|dark sesame|light sesame|sesame|flaxseed|sunflower seed|sunflower|safflower|canola|vegetable|neutral|corn|ghee|clarified butter)( oil)? \, (\w+)( \w+)?( oil)? or (\w+)( \w+)? oil", lemmatized_input)
            for m in match:
                row["qty"] = float(m[0])
                if m[1] in measurementUnit:
                    row["unit"] = m[1]
                if "ghee" in m[2] or "clarify butter" in m[2]:
                    row["name"] = m[2]
                else:
                    row["name"] = m[2] + " oil"
            if match == []:
                match = re.findall(r"^(\d+\.?\d*) (\w+) (coconut|extra virgin olive|olive|avocado|pumpkin\s?seed|walnut|grapeseed|peanut|untoasted sesame|toasted sesame|dark sesame|light sesame|sesame|flaxseed|sunflower seed|sunflower|safflower|canola|vegetable|neutral|corn|ghee|clarified butter)( oil)? or (\w+)( \w+)?( oil)? or (\w+)( \w+)? oil", lemmatized_input)
                for m in match:
                    row["qty"] = float(m[0])
                    if m[1] in measurementUnit:
                        row["unit"] = m[1]
                    if "ghee" in m[2] or "clarify butter" in m[2]:
                        row["name"] = m[2]
                    else:
                        row["name"] = m[2] + " oil"

                if match == []:
                    match = re.findall(r"^(\d+\.?\d*) (\w+) (coconut|extra virgin olive|olive|avocado|pumpkin\s?seed|walnut|grapeseed|peanut|untoasted sesame|toasted sesame|dark sesame|light sesame|sesame|flaxseed|sunflower seed|sunflower|safflower|canola|vegetable|neutral|corn|ghee|clarified butter)( oil)? or (\w+)( \w+)? oil", lemmatized_input)
                    for m in match:
                        row["qty"] = float(m[0])
                        if m[1] in measurementUnit:
                            row["unit"] = m[1]
                        if "ghee" in m[2] or "clarify butter" in m[2]:
                            row["name"] = m[2]
                        else:
                            row["name"] = m[2] + " oil"

                    if match == []:
                        match = re.findall(r"^(\d+\.?\d*) (\w+)( \w+)?( \w+)? (chili|coconut|olive|avocado|pumpkin\s?seed|walnut|grapeseed|peanut|untoasted sesame|toasted sesame|dark sesame|light sesame|sesame|flaxseed|sunflower seed|sunflower|safflower|canola|vegetable|neutral|corn) oil", lemmatized_input)
                        for m in match:
                            row["qty"] = float(m[0])
                            if m[1] in measurementUnit:
                                row["unit"] = m[1]
                            if "olive" in m[-1] and "extra" in m[2] and "virgin" in m[3]:
                                row["name"] = "extra virgin olive oil"
                            else:
                                row["name"] = m[-1] + " oil"
        
        match = re.findall(r"^(\w+|[Ee]xtra virgin olive|[Hh]ot chili|[Cc]hinese hot chili) oil", lemmatized_input)
        for m in match:
            row["name"] = m + " oil"
        match = re.findall(r"Corn , grapeseed , canola or other neutral oil", lemmatized_input)
        for m in match:
            row["name"] = "Corn oil"
    
    match = re.findall(r"^(\d+\.?\d*)( \w+)?( \w+)?( \w+)? (scallion|spring onion|[Vv]idalia|[Ww]alla [Ww]alla|[Mm]aui|sweet onion|ramp|garlic scape|yellow onion|white onion|red onion|pearl onion|shallot|leek|Spanish onion|[Cc]ippolini|onion)", lemmatized_input)
    for m in match:
        row["qty"] = float(m[0])
        if len(m) > 1 and m[1] in measurementUnit:
            row["unit"] = m[1].strip()
        elif len(m) > 2 and m[2] in measurementUnit:
            row["unit"] = m[2].strip()
        if m[-1].lower() == "vidalia":
            row["name"] = m[-1] + " onion"
        elif m[-1].lower() == "walla walla":
            row["name"] = m[-1] + " onion"
        elif m[-1].lower() == "maui":
            row["name"] = m[-1] + " onion"
        elif m[-1].lower() == "cippolini":
            row["name"] = m[-1] + " onion"
        else:
            row["name"] = m[-1]

    
    match = re.findall(r"^(\d+\.?\d*)( \w+)?( \w+)?( \w+)? (green garlic|garlic powder|garlic clove|garlic bulb|garlic head|garlic)", lemmatized_input)
    for m in match:
        row["qty"] = float(m[0])
        if len(m) > 1 and m[1] in measurementUnit:
            row["unit"] = m[1].strip()
        elif len(m) > 2 and m[2] in measurementUnit:
            row["unit"] = m[2].strip()
        if "clove" in m[-1].lower():
            row["name"] = "garlic"
            row["unit"] = "clove"
        elif "bulb" in m[-1].lower():
            row["name"] = "garlic"
            row["unit"] = "bulb"
        elif "head" in m[-1].lower():
            row["name"] = "garlic"
            row["unit"] = "head"
        else:
            row["name"] = m[-1].strip()

    match = re.findall(r"^(\d+\.?\d*)( \w+)?( \w+)?( \w+)?( \w+)? (table salt|kosher salt|pink salt|pickle salt)", lemmatized_input)
    for m in match:
        #print(row)
        row["qty"] = float(m[0])
        if len(m) > 1 and m[1] in measurementUnit:
            row["unit"] = m[1].strip()
        row["name"] = m[-1]
        #print(row)
        
    match = re.findall(r"^(\d+\.?\d*)( \w+) (bake soda|bake powder|cocoa powder)", lemmatized_input)
    for m in match:
        #print(row)
        row["qty"] = float(m[0])
        if len(m) > 1 and m[1] in measurementUnit:
            row["unit"] = m[1].strip()
        row["name"] = m[-1]
        #print(row)
        
    match = re.findall(r"^(\d+\.?\d*)( \w+)?( \w+)?( \w+)? (bay|thai basil|basil|oregano|parsley|rosemary|sage|thyme|tarragon)( \w+)?", lemmatized_input)
    for m in match:
        #if "bay" not in lemmatized_input:
        #    print(row)
        row["qty"] = float(m[0])
        if m[1] in measurementUnit:
            row["unit"] = m[1].strip()
        elif m[-1] in measurementUnit:
            row["unit"] = m[-1]
        row["name"] = m[-2]
        #if "bay" not in lemmatized_input:
        #    print(row)
        
    match = re.findall(r"^(\d+\.?\d*)( \w+) (honeycomb tripe|corn syrup|molasses|honey|maple syrup|agave syrup|agave nectar)", lemmatized_input)
    for m in match:
        #print(row)
        row["qty"] = float(m[0])
        if m[1] in measurementUnit:
            row["unit"] = m[1]
        row["name"] = m[-1]
        #print(row)
        
    match = re.findall(r"^(\d+\.?\d*) (\w+)( freshly)? grind (cardamom|ginger|cumin|cinnamon|coriander|nutmeg|clove|black pepper|white pepper|red pepper|pepper)", lemmatized_input)
    for m in match:
        #print(row)
        row["qty"] = float(m[0])
        if m[1] in measurementUnit:
            row["unit"] = m[1]
        row["name"] = m[-1]
        #print(row)
        
    match = re.findall(r"^(\d+\.?\d*) (\w+)( \w+)? sweet potato", lemmatized_input)
    for m in match:
        #print(row)
        row["qty"] = float(m[0])
        if m[1] in measurementUnit:
            row["unit"] = m[1]
        if m[1] == "yam":
            row["name"] = "yam"
        else:
            row["name"] = "sweet potato"
        #print(row)
        
    match = re.findall(r"^[Ff]reshly grind (black )?pepper", lemmatized_input)
    for m in match:
        #print(row)
        row["qty"] = float("nan")
        row["unit"] = ""
        row["name"] = m + "pepper"
        #print(row)
        
    match = re.findall(r"^([Kk]osher )?[Ss]alt and freshly grind (black )?pepper", lemmatized_input)
    for m in match:
        #print(row)
        row["qty"] = float("nan")
        row["unit"] = ""
        row["name"] = m[0] + "salt" + " " + m[1] + "pepper"
        #print(row)
        
    match = re.findall(r"^([Kk]osher )?[Ss]alt and (black )?pepper", lemmatized_input)
    for m in match:
        #print(row)
        row["qty"] = float("nan")
        row["unit"] = ""
        row["name"] = m[0] + "salt" + " " + m[1] + "pepper"
        #print(row)
        
    match = re.findall(r"^(\d+\.?\d*) (\w+) (winter|butternut|kabocha|hubbard|acorn|buttercup|turban|spaghetti|delicata|carnival) squash", lemmatized_input)
    for m in match:
        #print(row)
        row["qty"] = float(m[0])
        if m[1] in measurementUnit:
            row["unit"] = m[1]
        row["name"] = m[-1] + " squash"
        #print(row)
        
    match = re.findall(r"^(\d+\.?\d*) (\w+) corn kernel", lemmatized_input)
    for m in match:
        #print(row)
        row["qty"] = float(m[0])
        if m[1] in measurementUnit:
            row["unit"] = m[1]
        row["name"] = "corn kernel"
        #print(row)
        
    match = re.findall(r"([Jj]uice|[Zz]est) of (\d+\.?\d*) (lemon|lime|orage)", lemmatized_input)
    for m in match:
        #print(row)
        row["qty"] = float(m[1])
        row["name"] = m[-1] + " " + m[0]
        #print(row)
        
    match = re.findall(r"^(\d+\.?\d*)( \w+)( \w+)? (heavy cream|lemon juice)", lemmatized_input)
    for m in match:
        #print(row)
        row["qty"] = float(m[0])
        if m[1] in measurementUnit:
            row["unit"] = m[1]
        row["name"] = m[-1]
        #print(row)
        
    match = re.findall(r"^(\d+\.?\d*)( \w+) (black pepper|flour)", lemmatized_input)
    for m in match:
        #print(row)
        row["qty"] = float(m[0])
        if m[1] in measurementUnit:
            row["unit"] = m[1]
        row["name"] = m[-1]
        #print(row)
        
    match = re.findall(r"^(\w+) crouton", lemmatized_input)
    for m in match:
        row["name"] = "crouton"
        
    match = re.findall(r"^(\w+) salt", lemmatized_input)
    for m in match:
        if m[0] in measurementUnit:
            row["unit"] = m[0]
        elif m[0].lower() == "kosher":
            row["name"] = "kosher salt"
        row["name"] = "salt"

        
    match = re.findall(r"^(\d+\.?\d*) (\w+)( \w+)? crouton", lemmatized_input)
    for m in match:
        #print(row)
        row["qty"] = float(m[0])
        if m[1] in measurementUnit:
            row["unit"] = m[1]
        row["name"] = "crouton"
        #print(row)
        
    match = re.findall(r"^(\d+\.?\d*) (\w+) (boneless , )?(skinless )?(chicken|turkey) (breast|thigh)", lemmatized_input)
    for m in match:
        row["qty"] = float(m[0])
        if m[1] in measurementUnit:
            row["unit"] = m[1]
        row["name"] = m[-2] + " " + m[-1]
        
    match = re.findall(r"^(\d+\.?\d*) (poblano|[Aa]naheim|pasilla) or (poblano|[Aa]naheim|pasilla) (pepper|chili)", lemmatized_input)
    for m in match:
        #print(row)
        row["qty"] = float(m[0])
        row["name"] = m[1] + " " + m[3]
        #print(row)
        
    match = re.findall(r"^(\d+\.?\d*) (\w+)( \w+)?( \w+)?( \w+)? tomato( paste)?", lemmatized_input)
    for m in match:
        #print(row)
        row["qty"] = float(m[0])
        if m[1] in measurementUnit:
            row["unit"] = m[1]
        tomato_type = "tomato"
        for word in m[::-1]:
            if "grape" in word:
                tomato_type = "grape tomato"
                break
            elif "roma" in word:
                tomato_type = "roma tomato"
                break
            elif "cherry" in word:
                tomato_type = "cherry tomato"
                break
            elif "beefsteak" in word:
                tomato_type = "beefsteak tomato"
                break
            elif "plum" in word:
                tomato_type = "plum tomato"
                break
            elif "paste" in word:
                tomato_type = "tomato paste"
        row["name"] = tomato_type 
        #print(row)
        
    match = re.findall(r"^(\d+\.?\d*) (\w+) (salt|water)$", lemmatized_input)
    for m in match:
        #print(row)
        row["qty"] = float(m[0])
        if m[1] in measurementUnit:
            row["unit"] = m[1]
        row["name"] = m[-1]
        #print(row)
        
    match = re.findall(r"^(\d+\.?\d*) (\w+) (beef|chicken|pork|duck|veal|turkey|vegetable|shrimp|fish|lobster|mushroom|garlic|water|white wine)\, (\w+) or (\w+) (stock|broth)", lemmatized_input)
    for m in match:
        row["qty"] = float(m[0])
        if m[1] in measurementUnit:
            row["unit"] = m[1]
        if m[2] == "white wine":
            row["name"] = m[2]
        elif m[2] == "water":
            row["name"] = m[2]
        else:
            row["name"] = m[2] + " " + m[-1]
    if match == []:
        match = re.findall(r"^(\d+\.?\d*) (\w+) (beef|chicken|pork|duck|veal|turkey|vegetable|shrimp|fish|lobster|mushroom|garlic|water|white wine) or (\w+) (stock|broth)", lemmatized_input)
        for m in match:
            row["qty"] = float(m[0])
            if m[1] in measurementUnit:
                row["unit"] = m[1]
            if m[2] == "white wine":
                row["name"] = m[2]
            elif m[2] == "water":
                row["name"] = m[2]
            else:
                row["name"] = m[2] + " " + m[-1]
        if match == []:
            match = re.findall(r"^(\d+\.?\d*) (\w+) (beef|chicken|pork|duck|veal|turkey|vegetable|shrimp|fish|lobster|mushroom|garlic) (stock|broth)", lemmatized_input)
            for m in match:
                row["qty"] = float(m[0])
                if m[1] in measurementUnit:
                    row["unit"] = m[1]
                row["name"] = m[2] + " " + m[-1]

    match = re.findall(r"(beef|chicken|duck|pork|veal|turkey|vegetable|shrimp|fish|lobster|mushroom|garlic) from stock", lemmatized_input)
    for m in match:
        row["name"] = m + " stock"
    
    

                    
    match = re.findall(r"^(\d+\.?\d*) (\w+) (distilled white|white wine|white balsamic|white|champagne|red wine|red|seasoned rice|rice|apple cider|cider|sherry|malt)( vinegar)?\, (\w+)( vinegar)? or (\w+) vinegar", lemmatized_input)
    for m in match:
        #print(row)
        row["qty"] = float(m[0])
        if m[1] in measurementUnit:
            row["unit"] = m[1]
        row["name"] = m[2] + " vinegar"
        #print(row)
    if match == []:
        match = re.findall(r"^(\d+\.?\d*) (\w+) (distilled white|white wine|white balsamic|white|champagne|red wine|red|seasoned rice|rice|apple cider|cider|sherry|malt)( vinegar)? or (\w+) vinegar", lemmatized_input)
        for m in match:
            #print(row)
            row["qty"] = float(m[0])
            if m[1] in measurementUnit:
                row["unit"] = m[1]
            row["name"] = m[2] + " vinegar"
            #print(row)
        if match == []:
            match = re.findall(r"^(\d+\.?\d*) (\w+) (distilled white|white wine|white balsamic|white|champagne|red wine|red|seasoned rice|rice|apple cider|cider|sherry|malt) vinegar", lemmatized_input)
            for m in match:
                #print(row)
                row["qty"] = float(m[0])
                if m[1] in measurementUnit:
                    row["unit"] = m[1]
                row["name"] = m[2] + " vinegar"
                #print(row)

    match = re.findall(r"^([Aa] )?([Bb]ouquet garni)", lemmatized_input)
    for m in match:
        #print(row)
        row["qty"] = 1
        row["unit"] = float("nan")
        row["name"] = m[-1]
        #print(row)

    return row
    

In [170]:
# Work on a copy of merged_quantities and run fix_ingredients over every row
# (axis=1), which rewrites the name/qty/unit labels from regex matches on the
# lemmatized input. The last line previews the first 25 rows.
fixed_names = merged_quantities.copy()
fixed_names = fixed_names.apply(fix_ingredients, axis=1)
fixed_names.head(25)

Unnamed: 0_level_0,input,name,qty,unit
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1.25 cups cooked and pureed fresh butternut sq...,butternut squash,1.25,cup
1,1 cup peeled and cooked fresh chestnuts (about...,chestnuts,1.0,cup
2,"1 medium size onion, peeled and chopped",onion,1.0,
3,"2 stalks celery, chopped coarse",celery,2.0,stalk
4,1.5 tablespoons vegetable oil,vegetable oil,1.5,tablespoon
6,"2 tablespoons unflavored gelatin, dissolved in...",gelatin,2.0,tablespoon
7,Salt,Salt,0.0,
8,1 cup canned plum tomatoes with juice,plum tomato,1.0,cup
9,6 cups veal or beef stock,veal stock,6.0,cup
10,0.33 cup Worcestershire sauce,Worcestershire sauce,0.33,cup


So far we've only fixed various word- and character-level issues in the data. Now we will start making larger but necessary changes to the data and the labels. First, let's look at all the unique unit values.

In [174]:
# Spot check: rows whose name label contains "oil" ...
oil_test = fixed_names[fixed_names.name.str.contains("oil")]
# ... restricted to names longer than 25 characters.
oil_test[oil_test.name.map(len) > 25]

Unnamed: 0_level_0,input,name,qty,unit
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
11038,Nonstick vegetable oil spray,Nonstick vegetable oil spray,0.0,
12970,"6 cans (3.75 ounces each) oil packed sardines,...","oil-packed sardines, drained and lightly crushed",22.5,ounce
31581,"Approximately 1 quart vegetable oil, for frying","vegetable oil for frying, approximately",1.0,quart
39187,"Bones and head from 1 flounder, sole or other ...","Bones and head from 1 flounder, sole or other ...",0.0,
41488,2 tablespoons plus 2 teaspoons porcini infused...,plus 2 teaspoons olive oil,2.0,tablespoon
...,...,...,...,...
168447,"2 tablespoons peanut, canola, rice bran, sunfl...","peanut, canola, rice bran, sunflower or grape ...",2.0,tablespoon
170101,"Canola or vegetable oil, for frying","Canola or vegetable oil, for frying",0.0,
170270,6 ounces (usually 2 cans) smoked sardines in o...,smoked sardines in olive oil,6.0,ounce
174614,"2. In a bowl, whisk together lemon juice, salt...","2. In a bowl, whisk together lemon juice, salt...",0.0,


In [175]:
# Tally the string form of every unit label, then list the distinct labels
# in sorted order.
c = Counter(str(unit) for unit in fixed_names["unit"].values)
sorted(c)

['', '1 1/2-pound', '1 inch', '1-inch-thick slice', '1-pound bag', '1/2 inch', '1/4-inch piece', '10-pound piece', '12-ounce bottle', '12-ounce bottles', '12-ounce cans', '12-ounce piece', '12-ounce piece filet', '14-ounce can', '15-ounce can', '15-ounce cans', '15-ounce container', '2 1/2-pound', '2 by 4 inches', '2-inch piece', '2-ounce piece', '2-pound', '20-ounce can', '28-ounce can', '3 1/2 pound', '3 1/2- to 4-pound', '3 1/2-pound', '3- to 4-pound', '3-inch', '3-inch piece', '3-inch stick', '3/4- inch-thick', '4 inches long', '4- to 5-pound', '4-inch round', '4-inch stick', '46-ounce', '5-to-7-pound', '6-ounce bag', '6-pound', '7 1/2 -ounce packages', '7-inch', '7-ounce tube', '7-pound', '750-milliliter bottle', '8 to 9 ounces', '8- to 10-pound', '8-inch', '8-ounce packages', 'Handful', 'Pinch', 'Scant pinch', 'Slice', 'Slices', 'Small piece', 'bag', 'ball', 'bar', 'basket', 'batch', 'bottle', 'bowl', 'box', 'branch', 'bulb', 'bulbs', 'bunch', 'bunches', 'bushy sprigs', 'cake', '

This is a mess. While more obscure units like "sprig" or "thread" are fine, many are just adjectives or adjective-unit combinations, e.g. "heaping teaspoon". We will keep only the recognised unit-of-measure word in the unit field and drop all the other words.

In [176]:
def fix_units(string):
    """Reduce a raw unit label to a single recognised unit word.

    The label is lemmatized with ``nlp`` and the first resulting word that is
    a member of ``measurementUnit`` is returned. An empty string is returned
    when the label is NaN or contains no recognised unit.

    NOTE: inside this function the parameter name hides the imported
    ``string`` module.
    """
    # NaN is the only value that is not equal to itself.
    if string != string:
        return ""
    lemmas = " ".join(token.lemma_ for token in nlp(string))
    return next((word for word in lemmas.split() if word in measurementUnit), "")


In [177]:
# Normalise the unit column on a copy of fixed_names: fix_units maps each
# label to one recognised unit word, or "" when there is none.
fixed_units = fixed_names.copy()
fixed_units["unit"] = fixed_units["unit"].apply(fix_units)
fixed_units.head(25)

Unnamed: 0_level_0,input,name,qty,unit
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1.25 cups cooked and pureed fresh butternut sq...,butternut squash,1.25,cup
1,1 cup peeled and cooked fresh chestnuts (about...,chestnuts,1.0,cup
2,"1 medium size onion, peeled and chopped",onion,1.0,
3,"2 stalks celery, chopped coarse",celery,2.0,stalk
4,1.5 tablespoons vegetable oil,vegetable oil,1.5,tablespoon
6,"2 tablespoons unflavored gelatin, dissolved in...",gelatin,2.0,tablespoon
7,Salt,Salt,0.0,
8,1 cup canned plum tomatoes with juice,plum tomato,1.0,cup
9,6 cups veal or beef stock,veal stock,6.0,cup
10,0.33 cup Worcestershire sauce,Worcestershire sauce,0.33,cup


In [178]:
# Same tally as before, now on the cleaned unit column, to check which
# distinct labels remain.
c = Counter(str(unit) for unit in fixed_units["unit"].values)
sorted(c)

['', 'bag', 'bar', 'bottle', 'box', 'bulb', 'bunch', 'can', 'cleave', 'clove', 'container', 'cube', 'cup', 'dash', 'drop', 'envelope', 'fillet', 'gallon', 'halve', 'head', 'jar', 'layer', 'leaf', 'leave', 'link', 'loaf', 'ounce', 'package', 'packet', 'piece', 'pinch', 'pint', 'pound', 'quart', 'recipe', 'slice', 'sprig', 'square', 'stalk', 'stick', 'strip', 'tablespoon', 'teaspoon']

In [179]:
# A quantity of 0 is replaced by NaN.
fixed_units.loc[fixed_units.qty == 0, "qty"] = float("nan")
# Names shorter than 3 characters are blanked out.
fixed_units.loc[fixed_units.name.map(len) < 3, "name"] = ""
fixed_units.head(25)

Unnamed: 0_level_0,input,name,qty,unit
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1.25 cups cooked and pureed fresh butternut sq...,butternut squash,1.25,cup
1,1 cup peeled and cooked fresh chestnuts (about...,chestnuts,1.0,cup
2,"1 medium size onion, peeled and chopped",onion,1.0,
3,"2 stalks celery, chopped coarse",celery,2.0,stalk
4,1.5 tablespoons vegetable oil,vegetable oil,1.5,tablespoon
6,"2 tablespoons unflavored gelatin, dissolved in...",gelatin,2.0,tablespoon
7,Salt,Salt,,
8,1 cup canned plum tomatoes with juice,plum tomato,1.0,cup
9,6 cups veal or beef stock,veal stock,6.0,cup
10,0.33 cup Worcestershire sauce,Worcestershire sauce,0.33,cup


In [180]:
fixed_units[fixed_units["name"].map(len)>25]

Unnamed: 0_level_0,input,name,qty,unit
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
121,0.75 cup of liquid from cooked mussels,liquid from cooked mussels,0.75,cup
178,Chopped fresh parsley leaves for garnish,chopped fresh parsley leaves for garnish,,
182,"10 thin slices veal, preferably from the leg, ...","veal, preferably from the leg, pounded slightl...",8.00,slice
207,"2 tablespoons unsalted butter, at room tempera...","unsalted butter, at room temperature and cut i...",2.00,tablespoon
208,"5 ounces bittersweet chocolate, melted and sti...","bittersweet chocolate, melted and still warm",5.00,ounce
...,...,...,...,...
178858,6 cups thinly sliced bell pepper or other swee...,bell pepper or other sweet pepper,6.00,cup
179046,"0.5 pound orechiette, farfalle or other short ...","orechiette, farfalle or other short pasta",0.50,pound
179103,0.33 cup nonhydrogenated shortening,nonhydrogenated shortening,0.33,cup
179179,14 ounce bag corn tortilla chips,12- to-16-ounce bag corn tortilla chips,1.00,


Much better.

In the following cells we will start modifying most of the numerical information in the input string.
There exist the following problems:


1. The source of 80% of the issues we will fix in the following cells is that the quantity label is a decimal number, but the quantity in the input often has a non-decimal representation. Because of how labels are assigned to the input, the CRF cannot learn the relationship between the two (among other issues), e.g. "0.5 teaspoon cumin" is fine, "1/2 teaspoon cumin" is not. Specific cases are as follows:

2. The quantity is represented as a numeric word or phrase, e.g. "One stalk rhubarb", "one and one-half". We convert these words to their decimal equivalent.

3. The quantity has a unicode fraction representation, e.g. ¾, or a mixed fraction, e.g. 1¾; we convert these to 0.75 and 1.75 respectively.

4. 

In [181]:
def fix_misc(row):
    """Apply one-off name patches, re-read qty/unit from the input, lemmatize labels.

    Steps, in order:
      1. Two hand-picked inputs get their name label overwritten.
      2. If the lemmatized input starts with "<number> <word> <word>" and the
         first word is in ``measurementUnit``, qty and unit are taken from it.
      3. The name and unit labels are replaced by their lemmatized form
         (both are passed through ``str`` first).

    Returns the same ``row`` object, modified in place.
    """
    text = row["input"]

    # One-off corrections keyed on a distinctive fragment of the input;
    # when both fragments are present the later entry wins.
    patches = (
        ("8 thin slices of beef,", "beef"),
        ("cup finely diced hot chil", "hot chili"),
    )
    for fragment, patched_name in patches:
        if fragment in text:
            row["name"] = patched_name

    def lemmatize(value):
        return " ".join(token.lemma_ for token in nlp(value))

    # The pattern is anchored at the start, so there is at most one match.
    leading = re.match(r"^(\d+\.?\d*) (\w+) (\w+)", lemmatize(text))
    if leading is not None and leading.group(2) in measurementUnit:
        row["qty"] = float(leading.group(1))
        row["unit"] = leading.group(2)

    row["name"] = lemmatize(str(row["name"]))
    row["unit"] = lemmatize(str(row["unit"]))
    return row
    
    

In [182]:
# Apply fix_misc to every row (axis=1) of a copy of fixed_units; this
# lemmatizes the name and unit labels and re-reads qty/unit from the input.
fixed_misc = fixed_units.copy()
fixed_misc = fixed_misc.apply(fix_misc, axis=1)
fixed_misc.head(25)

Unnamed: 0_level_0,input,name,qty,unit
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1.25 cups cooked and pureed fresh butternut sq...,butternut squash,1.25,cup
1,1 cup peeled and cooked fresh chestnuts (about...,chestnut,1.0,cup
2,"1 medium size onion, peeled and chopped",onion,1.0,
3,"2 stalks celery, chopped coarse",celery,2.0,stalk
4,1.5 tablespoons vegetable oil,vegetable oil,1.5,tablespoon
6,"2 tablespoons unflavored gelatin, dissolved in...",gelatin,2.0,tablespoon
7,Salt,Salt,,
8,1 cup canned plum tomatoes with juice,plum tomato,1.0,cup
9,6 cups veal or beef stock,veal stock,6.0,cup
10,0.33 cup Worcestershire sauce,Worcestershire sauce,0.33,cup


In [183]:
fixed_misc.iloc[50:75]

Unnamed: 0_level_0,input,name,qty,unit
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
51,Sprigs of watercress,watercress,3.0,sprig
52,"0.75 pound shiitake mushrooms, stemmed and qua...",shiitake mushroom,0.75,pound
53,2 tablespoons extra virgin olive oil,extra virgin olive oil,2.0,tablespoon
54,Kosher salt and black pepper,salt,,
55,"2 pound beef tenderloin roast, cut from the la...",beef tenderloin,2.0,pound
56,0.13 teaspoon ground cinnamon,cinnamon,0.13,teaspoon
57,1 tablespoon cocoa powder,cocoa powder,1.0,tablespoon
58,"3 tablespoons medium roast coffee beans, finel...",coffee bean,3.0,tablespoon
59,1 teaspoon light brown sugar,brown sugar,1.0,teaspoon
60,Kosher salt,salt,,


In [85]:
fixed_misc[fixed_misc["name"].map(len)>25]

Unnamed: 0_level_0,input,name,qty,unit
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
178,Chopped fresh parsley leaves for garnish,chop fresh parsley leave for garnish,,
182,"10 thin slices veal, preferably from the leg, ...","veal , preferably from the leg , pound slightl...",8.0,slice
186,Extra virgin olive oil as needed,extra virgin olive oil a need,,
207,"2 tablespoons unsalted butter, at room tempera...","unsalted butter , at room temperature and cut ...",2.0,tablespoon
208,"5 ounces bittersweet chocolate, melted and sti...","bittersweet chocolate , melt and still warm",5.0,ounce
...,...,...,...,...
178858,6 cups thinly sliced bell pepper or other swee...,bell pepper or other sweet pepper,6.0,cup
178979,"1 teaspoon French four-spice powder, or substi...",French four - spice powder,1.0,teaspoon
179046,"0.5 pound orechiette, farfalle or other short ...","orechiette , farfalle or other short pasta",0.5,pound
179179,14 ounce bag corn tortilla chips,12 to-16 ounce bag corn tortilla chip,14.0,ounce


In [79]:
test = nlp("0.50 head iceberg or romaine lettuce, shredded 	")

In [80]:
for word in test:
    print(word.lemma_)

0.50
head
iceberg
or
romaine
lettuce
,
shred
	


In [81]:
str(decimal.Decimal(float(0.50)).normalize())

'0.5'

In [120]:
fixed_misc[fixed_misc.input.str.contains("oil")].iloc[0:25]

Unnamed: 0_level_0,input,name,qty,unit
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
4,1.5 tablespoons vegetable oil,vegetable oil,1.5,tablespoon
39,"Vegetable oil, for deep frying",Vegetable oil,,
53,2 tablespoons extra virgin olive oil,extra virgin olive oil,2.0,tablespoon
101,1 cup olive oil,olive oil,1.0,cup
118,0.5 cup olive oil,olive oil,0.5,cup
125,0.25 cup good quality olive oil,good quality olive oil,0.25,cup
134,1 teaspoon Asian sesame oil,Asian sesame oil,1.0,teaspoon
139,1 teaspoon olive oil,olive oil,1.0,teaspoon
186,Extra virgin olive oil as needed,extra virgin olive oil a need,,
199,2 tablespoons extra virgin olive oil,extra virgin olive oil,2.0,tablespoon


Now that we've handled the unicode fractions, we will remove the remaining unicode characters (these are almost exclusively letters with accent marks). This, among other things, will standardize the spellings of things like creme fraiche and jalapeno.

In [184]:
# Build the final training table from fixed_misc, printing the shape after
# each filter so the number of dropped rows is visible.
final = fixed_misc.copy()
print(final.shape)
# Keep only rows whose name has at least 3 characters ...
final = final[final['name'].map(len) >= 3]
print(final.shape)
# ... and fewer than 25 characters (a name of exactly 25 is dropped).
final = final[final['name'].map(len) < 25]
print(final.shape)

final.to_pickle("../data/interim/crf_data.pickle")

(179060, 4)
(178772, 4)
(174006, 4)


Beautiful.

In [None]:
final.tail(25)

In [None]:
merged_data.head()

Phase two of merging quantities: we are going to start making some more assumptions,
starting with the assumption that everything in the name label after a comma should be a comment.

In [None]:
def replace_volume_with_weight(row):
    """
    By default the NYT labelled the qty and unit using volume,
    this changes those labels to weight if its in the ingredient input.

    The input is searched for parenthesised amounts such as "(14 ounces)".
    For each one whose unit differs from the labelled unit, the amount and
    unit strings are removed from the comment, the displaced qty/unit are
    appended to the comment, and the qty/unit labels are set to the
    parenthesised amount.

    Args:
        row: mapping-like row (e.g. a pandas Series or dict) with the keys
            "input", "unit", "qty" and "comment".

    Returns:
        The same row object, modified in place.

    Rows whose unit or comment is NaN are left unchanged.
    NOTE(review): the row must carry a "comment" entry; the loading cell at
    the top of the notebook drops that column, so confirm which frame this
    is meant to run on.
    """
    # "\.?" (not "\.*"): a string like "1..5" must not match, because
    # float() cannot parse it.
    amounts = re.findall(
        r"\((\d+\.?\d*)\s(ounces?|pounds?|grams?|milliliters?)\)", row["input"]
    )
    for amount, unit in amounts:
        old_unit = row["unit"]
        comment = row["comment"]
        # NaN is the only value that differs from itself.
        if old_unit != old_unit or comment != comment:
            continue
        # Compare ignoring the plural "s": a label of "ounce" and a match of
        # "ounces" are the same unit, so there is nothing to replace.
        if old_unit.rstrip("s") == unit.rstrip("s"):
            continue

        comment = comment.replace(amount, '').replace(unit, '')
        old_qty = row["qty"]
        if old_qty == old_qty:
            # Whole quantities drop the ".0" when the parenthesised amount
            # has no decimal point; a fractional quantity is never truncated.
            if '.' in amount or old_qty != int(old_qty):
                comment += " " + str(old_qty) + " " + old_unit
            else:
                comment += " " + str(int(old_qty)) + " " + old_unit
        else:
            comment += " " + old_unit
        row["comment"] = comment
        row["unit"] = unit
        row["qty"] = float(amount)
    return row

In [None]:
replaced_volume = merged_range.apply(replace_volume_with_weight, axis=1)

In [None]:
replaced_volume.head(15)

In [None]:
# Words that describe how an ingredient is prepared, its state or its size.
# NOTE(review): 'roasted' appears twice. The list is not referenced in the
# nearby cells — presumably it is meant for separating descriptive words from
# ingredient names; confirm before relying on it.
descriptions = ['baked', 'beaten', 'blanched', 'boiled', 'boiling', 'boned', 'breaded', 'brewed', 'broken', 'chilled',
        'chopped', 'cleaned', 'coarse', 'cold', 'cooked', 'cool', 'cooled', 'cored', 'creamed', 'crisp', 'crumbled',
        'crushed', 'cubed', 'cut', 'deboned', 'deseeded', 'diced', 'dissolved', 'divided', 'drained', 'dried', 'dry',
        'fine', 'firm', 'fluid', 'fresh', 'frozen', 'grated', 'grilled', 'ground', 'halved', 'hard', 'hardened',
        'heated', 'heavy', 'juiced', 'julienned', 'jumbo', 'large', 'lean', 'light', 'lukewarm', 'marinated',
        'mashed', 'medium', 'melted', 'minced', 'near', 'opened', 'optional', 'packed', 'peeled', 'pitted', 'popped',
        'pounded', 'prepared', 'pressed', 'pureed', 'quartered', 'refrigerated', 'rinsed', 'ripe', 'roasted',
        'roasted', 'rolled', 'rough', 'scalded', 'scrubbed', 'seasoned', 'seeded', 'segmented', 'separated',
        'shredded', 'sifted', 'skinless', 'sliced', 'slight', 'slivered', 'small', 'soaked', 'soft', 'softened',
        'split', 'squeezed', 'stemmed', 'stewed', 'stiff', 'strained', 'strong', 'thawed', 'thick', 'thin', 'tied', 
        'toasted', 'torn', 'trimmed', 'wrapped', 'vained', 'warm', 'washed', 'weak', 'zested', 'wedged',
        'skinned', 'gutted', 'browned', 'patted', 'raw', 'flaked', 'deveined', 'shelled', 'shucked', 'crumbs',
        'halves', 'squares', 'zest', 'peel', 'uncooked', 'butterflied', 'unwrapped', 'unbaked', 'warmed']




In [None]:
import spacy
nlp = spacy.load("en_core_web_lg", disable=["tagger", "parser", "ner", "textcat"])

In [None]:


# Size of each supported volume unit, expressed in teaspoons.
_TEASPOONS_PER_UNIT = {"teaspoon": 1, "tablespoon": 3, "cup": 48}

# "<qty> <unit> plus <qty> <unit>" for the three supported volume units.
_PLUS_AMOUNT = re.compile(
    r"(\d+\.?\d*) (tablespoons?|teaspoons?|cups?) plus (\d+\.?\d*) (tablespoons?|teaspoons?|cups?)"
)


def _merge_plus_amount(found):
    """Collapse one "<qty> <unit> plus <qty> <unit>" match into "<total> <first unit>"."""
    first_qty, first_unit, second_qty, second_unit = found.groups()
    first_size = _TEASPOONS_PER_UNIT[first_unit.rstrip("s")]
    second_size = _TEASPOONS_PER_UNIT[second_unit.rstrip("s")]
    # Express the second amount in the first amount's unit before adding.
    total = float(first_qty) + float(second_qty) * second_size / first_size
    return str(round(total, 2)) + " " + first_unit


def check_names(row):
    """Merge "X plus Y" amounts in the input and reconcile unit/qty with it.

    1. Every "<qty> <unit> plus <qty> <unit>" phrase in ``row["input"]``
       (teaspoons, tablespoons, cups) is replaced by a single amount in the
       first unit, rounded to two decimals, e.g.
       "1 tablespoon plus 2 teaspoons" -> "1.67 tablespoon".
       Each occurrence is converted with its own numbers.
    2. If the input then starts with "<number> <word> " and the lemma of
       that word is in ``measurementUnit``:
         * the unit label is set to that lemma when the label is missing or
           empty, or when its lemma differs from the input's;
         * the qty label is set to the leading number when it is NaN or 0.

    Args:
        row: mapping-like row (e.g. a pandas Series or dict) with the keys
            "input", "unit" and "qty".

    Returns:
        The same row object, modified in place.
    """
    row["input"] = _PLUS_AMOUNT.sub(_merge_plus_amount, row["input"])

    # Anchored at the start of the input, so there is at most one match.
    leading = re.match(r"^\d+\.?\d*\s[a-zA-Z]+\s", row["input"])
    if leading is None:
        return row

    qty_text, unit_text = leading.group().split()
    input_unit = nlp(unit_text)[0].lemma_
    # Nothing is changed unless the word after the number is a known unit.
    if input_unit not in measurementUnit:
        return row

    labelled_unit = row["unit"]
    # NaN is the only value that differs from itself.
    if labelled_unit == labelled_unit and len(labelled_unit) > 0:
        # input_unit is a known unit here, so an equal label lemma is too;
        # only a differing label needs to be overwritten.
        if nlp(labelled_unit)[0].lemma_ != input_unit:
            row["unit"] = input_unit
    else:
        row["unit"] = input_unit

    if row["qty"] != row["qty"] or row["qty"] == 0.0:
        row["qty"] = float(qty_text)
    return row
            
# Run check_names over every row (axis=1) of fixed_names.
fixed_units_again = fixed_names.apply(check_names, axis=1)

In [None]:
fixed_units_again.head(25)

In [None]:
replaced_volume[replaced_volume["name"].str.contains("bouquet garni")]

In [None]:
replaced_volume.to_csv("../data/interim/nyt_partial_clean.csv")

In [None]:
# NOTE(review): this is the same path that `final.to_pickle(...)` writes to
# earlier in the notebook, so whichever cell runs last overwrites the other.
replaced_volume.to_pickle("../data/interim/crf_data.pickle")

In [None]:
# Hold out 20% of the rows for testing.
# NOTE(review): no random_state is passed, so the split is different on every run.
training_data, test_data = train_test_split(replaced_volume, test_size=0.2)

In [None]:
# Write the two partitions to separate pickle files.
training_data.to_pickle("../data/interim/crf_training_data.pickle")
test_data.to_pickle("../data/interim/crf_test_data.pickle")