# Cleaning The NYT Data for CRF Training 

In order to parse recipe data into a useable vector representation, we need a way of labeling the different parts of the natural language of the recipe ingredients, e.g. Ingredient name, unit of measurement, quantity, etc.

To do this we chose to use Conditional Random Fields (https://en.wikipedia.org/wiki/Conditional_random_field), through the use of pycrfsuite.

To train the model we will use labelled ingredient data published by the New York Times, that they used for their own internal CRF model to label ingredients. That data can be found here: https://github.com/nytimes/ingredient-phrase-tagger.

Unfortunately, the quality of the labels is very poor, and the data needs fixing, cleaning, and standardizing.
So let's take a look.

In [1]:
from collections import Counter
import decimal
import re
from fractions import Fraction
import string
import sys
import unicodedata
import pandas as pd
from sklearn.model_selection import train_test_split
import unidecode
%pprint

Pretty printing has been turned OFF


This is what the labeled NYT data looks like.

In [2]:
# Load the raw NYT snapshot, using the CSV's own "index" column as the row index.
input_data = pd.read_csv(
    "../data/raw/nyt-ingredients-snapshot-2015.csv", index_col="index"
)
# The NYT has a column to label ingredient quantity ranges, e.g. 2-3 apples.
# We use the average of a range and so don't need to keep track of range end points.
input_data = input_data.drop(columns="range_end")
# Preview the first rows.
input_data.head()

Unnamed: 0_level_0,input,name,qty,unit,comment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1 1/4 cups cooked and pureed fresh butternut s...,butternut squash,1.25,cup,"cooked and pureed fresh, or 1 10-ounce package..."
1,1 cup peeled and cooked fresh chestnuts (about...,chestnuts,1.0,cup,"peeled and cooked fresh (about 20), or 1 cup c..."
2,"1 medium-size onion, peeled and chopped",onion,1.0,,"medium-size, peeled and chopped"
3,"2 stalks celery, chopped coarse",celery,2.0,stalk,chopped coarse
4,1 1/2 tablespoons vegetable oil,vegetable oil,1.5,tablespoon,


Let's take a look at the characters present and see if there's anything we wouldn't expect in a recipe, e.g. unescaped unicode, bizarre punctuation, etc.

In [3]:
# Count every character that occurs in the free-text columns. Each cell is
# passed through str() first, so missing values are counted as the text "nan".
c = Counter()
for col in ["input", "name", "unit", "comment"]:
    input_data[col].apply(lambda x: c.update(list(str(x))))

# Letters and digits are expected, so drop them from the tally; what is left
# is whitespace, punctuation and any stray symbols worth inspecting.
# (Iterate over a copy of the keys because entries are deleted in the loop.)
for i in list(c):
    if i.isalpha() or i.isdigit():
        del c[i] 
sorted(c.keys())

[' ', '!', '"', '#', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '_', '`', '|', '\x90', '\xa0', '¼', '½', '¾', '×', '–', '—', '‘', '’', '“', '”', '\u2028', '⁄', '⅓']

OK, so we have unescaped unicode, non-standard hyphens and quotes, as well as unicode fractions. Let's fix this.

In [6]:
# Literal substitutions applied, in this order, BEFORE repeated "+" signs are
# stripped. None of the replacement texts is itself a key, so the order only
# matters relative to the two regex steps in fix_characters.
_CHARACTER_FIXES_EARLY = (
    ("\xa0", " "),  # non-breaking space
    ("\x90", ""),  # stray control character
    ("×", "x"),
    # non-standard hyphens / dashes
    ("–", "-"),
    ("—", "-"),
    ("‐", "-"),
    ("‑", "-"),
    # non-standard quotes
    ("‘", "'"),
    ("’", "'"),
    ("“", '"'),
    ("”", '"'),
    ("″", '"'),
    ("‟", '"'),
    ("\u2028", ""),  # unicode line separator
    ("⁄", "/"),  # fraction slash
    # The following characters only appear a very small number of times each
    # in the data and are removed.
    ("|", ""),
    ("`", ""),
    ("@", ""),
)

# Literal substitutions applied AFTER repeated "+" signs are stripped.
_CHARACTER_FIXES_LATE = (
    ("[", ""),
    ("]", ""),
    ("?", ""),
    ("\ufffd", ""),  # unicode replacement character left behind by a bad decode
    ("™", ""),
    ("‿", ""),
    # for whatever reason the n in jalepeno is scraped as this character
    ("‱", "n"),
    ("•", ""),
    ("®", ""),
    ("§", ""),
    ("¤", ""),
)


def fix_characters(string):
    """
    Normalise unusual characters in a single cell of text.

    Non-standard spaces, dashes, quotes and the fraction slash are mapped to
    their plain ASCII equivalents, a handful of rare junk symbols are removed,
    runs of two or more "+" are removed, and a hyphen directly between a
    number and a word character is padded with spaces ("8-ounce" becomes
    "8 - ounce").

    Anything that is not a string (in particular NaN for a missing cell) is
    returned unchanged.
    """
    if not isinstance(string, str):
        return string

    for old, new in _CHARACTER_FIXES_EARLY:
        string = string.replace(old, new)

    # A single "+" is left alone; only runs of two or more are dropped.
    string = re.sub(r"\+{2,}", "", string)

    for old, new in _CHARACTER_FIXES_LATE:
        string = string.replace(old, new)

    # Runs last so that dashes normalised above are padded as well.
    string = re.sub(r"(\d+)\-(\w)", r"\1 - \2", string)

    return string

In [7]:
# Apply the character clean-up to every free-text column, cell by cell.
for col in ["input", "name", "unit", "comment"]:
    input_data[col] = input_data[col].apply(fix_characters)

Now we fix various spelling errors and standardise spelling

In [8]:
# (pattern, replacement) pairs, applied in this order to every string.
_SPELLING_FIXES = tuple(
    (re.compile(pattern), replacement)
    for pattern, replacement in (
        (r"([Cc])(hipolte|hipottle)", r"\1hipotle"),
        (r"([Ff])(ritata|rittatta|ritatta|ritartar)", r"\1rittata"),
        (r"([Cc])reme\s[Ff](resh|raishe)", r"\1reme fraiche"),
        (r"([Mm])(ascapone|ascaprone)", r"\1ascarpone"),
        (r"([Bb])(russel|russle)\s[Ss]prout", r"\1russels sprout"),
        (r"([Gg])nocci", r"\1nocchi"),
        (r"([Mm])(accaroni|acarroni)", r"\1acaroni"),
        (r"([Mm])(acaroon|accaron|acarron)", r"\1acaron"),
        (r"([Ff])(ettuccini|ettucine|ettucchine)", r"\1ettuccine"),
        (r"([Ee])xpresso", r"\1spresso"),
        # No trailing space inside the alternation: it used to swallow the
        # space after the word ("mozzarela cheese" -> "mozzarellacheese").
        (r"([Mm])(ozzarrella|ozarela|ozzarela)", r"\1ozzarella"),
        (r"([Ss])herbert", r"\1herbet"),
        (r"([Cc])ardamon", r"\1ardamom"),
        (r"([Ll])inguini", r"\1inguine"),
        (r"([Ll])iquer", r"\1iqueur"),
        (r"([Ww])on\ston", r"\1onton"),
        # Standardise on "chili"/"chilis"; the lookahead keeps "Chilean"
        # (as in "Chilean sea bass") from becoming "Chilian".
        (r"([Cc])hile(?!an)", r"\1hili"),
        (r"([Cc])hilies", r"\1hilis"),
        # Left-over (possibly double-escaped) HTML entities -> plain ASCII.
        (r"(\&amp\;|\&)e(acute|grave)\;", "e"),
        (r"(\&amp\;|\&)icirc\;", "i"),
        (r"(\&amp\;|\&)ucirc\;", "u"),
        (r"(\&amp\;|\&)\#231\;", "c"),
        (r"(\&amp\;|\&)rsquo\;", "'"),
        (r"(\&amp\;|\&)ntilde\;", "n"),
        # Handling misc edge case
        (r"1\#3", "1/3"),
        (r"1\#12", "1 12"),
    )
)


def fix_spelling(string):
    """
    Correct known misspellings and standardise spelling in one cell of text.

    Also replaces a few left-over HTML entities with their unaccented ASCII
    letter and repairs two "#"-garbled numbers. Anything that is not a string
    (in particular NaN for a missing cell) is returned unchanged.
    """
    if not isinstance(string, str):
        return string
    for pattern, replacement in _SPELLING_FIXES:
        string = pattern.sub(replacement, string)
    return string

In [9]:
# Apply the spelling fixes to every free-text column, cell by cell.
for col in ["input", "name", "unit", "comment"]:
    input_data[col] = input_data[col].apply(fix_spelling)

Next up, there are leftover HTML tags and markup that need to be removed. This has the added benefit of stripping out the extraneous "see recipe" from both input and comment.

In [10]:
# <a href=...>see recipe</a>, optionally wrapped in parentheses.
_LINKED_SEE_RECIPE = re.compile(r"\(?<.*see\s*recipe.*>\)?")
# see <a href=...>recipe</a>, optionally wrapped in parentheses.
_SEE_LINKED_RECIPE = re.compile(r"\(?\s*(see)\s*?<.*recipe.*>\)?")
# The plain-text remnant the NYT left in the comment label.
_PLAIN_SEE_RECIPE = re.compile(r"\(?see recipe\)?")
# Any remaining tag: <span>, misc <a href=...>, closing tags, ...
_ANY_TAG = re.compile(r"<.*?>")


def clean_nyt_html(row, verbose=False):
    """
    Remove the HTML that was not stripped from the NYT data.

    For each of the "input", "name" and "comment" fields that holds a string:
    "see recipe" links are deleted, every remaining tag is deleted, literal
    backslash-n / backslash-t sequences become a space, and the result is
    stripped of surrounding whitespace (so a field that ends up empty becomes
    the empty string, not NaN). When a "see recipe" link is found in "input",
    the plain text "see recipe" is also removed from "comment".

    Fields that are not strings (NaN for missing values) are left untouched.

    Parameters
    ----------
    row : mapping with "input", "name" and "comment" keys (a DataFrame row).
    verbose : currently unused; kept so existing callers keep working.

    Returns
    -------
    A cleaned copy of ``row``; the object passed in is not modified, because
    pandas does not support functions that mutate the row given to ``apply``.
    """
    row = row.copy()
    for col in ("input", "name", "comment"):
        text = row[col]
        if not isinstance(text, str):
            continue

        for link_pattern in (_LINKED_SEE_RECIPE, _SEE_LINKED_RECIPE):
            if link_pattern.search(text):
                # One sub() call already removes every occurrence.
                text = link_pattern.sub("", text)
                if col == "input" and isinstance(row["comment"], str):
                    row["comment"] = _PLAIN_SEE_RECIPE.sub("", row["comment"])

        text = _ANY_TAG.sub("", text)
        # These are the two-character sequences backslash + n / backslash + t
        # left over from the original html, not real newlines or tabs.
        text = text.replace("\\n", " ").replace("\\t", " ")
        row[col] = text.strip()
    return row

In [11]:
# Run the html clean-up row by row (axis=1 passes each row to the function).
cleaned_html = input_data.apply(clean_nyt_html, axis=1)

In [12]:
# Shape before dropping, for comparison with the shape printed below.
print(cleaned_html.shape)
# There are over 100 rows with a missing (NaN) input; these are worthless and are dropped.
# inplace=True modifies cleaned_html itself.
cleaned_html.dropna(axis=0, subset=["input"], inplace=True)
print(cleaned_html.shape)
# Plain assignment: dropped_missing is another name for the same DataFrame, not a copy.
dropped_missing = cleaned_html


(179207, 5)
(179063, 5)


All unit labels are words, and the CRF model can't associate "oz." with "ounce" during training, so we need to fix all abbreviations here. On top of this, the NYT data is missing labels that we can easily and accurately assign here.

In [13]:
# (pattern, replacement) pairs for the unit abbreviations. The lookarounds
# stand in for "not part of a longer word" and, unlike requiring an actual
# neighbouring character, also hold at the very start or end of the text.
_UNIT_ABBREVIATIONS = (
    (re.compile(r"(?<!\w)oz\.?(?!\w)"), "ounce"),
    (re.compile(r"(?<!\w)ml\.?(?!\w)"), "milliliter"),
    # "g" only counts as gram directly after a number: "100g", "100 g."
    (re.compile(r"(\d+)\s?g\.?(?!\w)"), r"\1 gram"),
    (re.compile(r"(?<![A-Za-z])[Tt]bsp\.*"), "tablespoon"),
)
_TEASPOON_ABBREVIATION = re.compile(r"(?<![A-Za-z])[Tt]sp\.*")


def fix_abbreviations(row):
    """
    Expand unit abbreviations in the "input", "unit" and "comment" fields.

    oz. -> ounce, ml. -> milliliter, <number>g. -> <number> gram,
    tbsp. -> tablespoon and tsp. -> teaspoon (the trailing period is
    optional and is dropped).

    Whenever "tsp" is found in any of those fields the unit label is set to
    "teaspoon": there are a handful of rows where "tsp." is present and the
    unit is either missing or incorrectly labeled "tablespoon".
    NOTE(review): this also fires when "tsp" only occurs in the comment --
    confirm that is intended.

    Fields that are not strings (NaN for missing values) are skipped.
    Returns a fixed copy of ``row``; the object passed in is not modified.
    """
    row = row.copy()
    for col in ("input", "unit", "comment"):
        text = row[col]
        if not isinstance(text, str):
            continue

        for pattern, replacement in _UNIT_ABBREVIATIONS:
            text = pattern.sub(replacement, text)

        text, teaspoon_hits = _TEASPOON_ABBREVIATION.subn("teaspoon", text)
        row[col] = text
        if teaspoon_hits:
            # Assigned after row[col] so that, for col == "unit", the forced
            # label wins, as it did before.
            row["unit"] = "teaspoon"
    return row

In [14]:
# Expand unit abbreviations row by row.
fixed_abr = dropped_missing.apply(fix_abbreviations, axis=1)

So far we've only fixed various word- and character-level issues in the data; now we will start making larger but necessary changes to the data and the labels. First, let's look at all the unique unit values.

In [15]:
# Tally the distinct unit labels; str() makes missing units show up as 'nan'.
c = Counter()  
c.update([str(x) for x in fixed_abr["unit"].values])
sorted(c.keys())

['1 - inch-thick slice', '1 - pound bag', '1 1/2 - pound', '1 inch', '1/2 inch', '1/4 - inch piece', '10 - pound piece', '12 - ounce bottle', '12 - ounce bottles', '12 - ounce cans', '12 - ounce piece', '12 - ounce piece filet', '14 - ounce can', '15 - ounce can', '15 - ounce cans', '15 - ounce container', '2 - inch piece', '2 - ounce piece', '2 - pound', '2 1/2 - pound', '2 by 4 inches', '20 - ounce can', '28 - ounce can', '28 - ounce cans', '3 - inch', '3 - inch piece', '3 - inch stick', '3 1/2 - pound', '3 1/2 pound', '3 1/2- to 4 - pound', '3- to 4 - pound', '3/4- inch-thick', '4 - inch round', '4 - inch stick', '4 inches long', '4- to 5 - pound', '46 - ounce', '48 - ounce can', '5 - to-7 - pound', '6 - ounce bag', '6 - pound', '7 - inch', '7 - ounce tube', '7 - pound', '7 1/2 -ounce packages', '750 - milliliter bottle', '8 - inch', '8 - ounce packages', '8 to 9 ounces', '8- to 10 - pound', 'Handful', 'Pinch', 'Scant pinch', 'Slice', 'Slices', 'Small piece', 'bag', 'ball', 'bar', '

This is a mess. While more obscure units like "sprig" or "thread" are fine, many are just adjectives or adjective-unit combinations, e.g. "heaping teaspoon". We will move all words that are not units of measure to the comment field.

In [16]:
# Unit labels that are really just adjectives.
_ADJECTIVE_ONLY_UNITS = frozenset({
    "very large", "large", "medium", "medium-size", "medium-sized",
    "medium-small", "small-to-medium", "small", "very-small", "very small",
    "chopped", "crushed", "fresh", "thinly sliced", "ripe", "long", "smoked",
    "dried", "hatch",
})

# "<adjective> <unit>" combinations such as "heaping teaspoon".
_ADJECTIVE_UNIT_COMBO = re.compile(
    r"([Ss]mall|medium|large|full|bushy|leafy|heaping|[Ss]cant|thumb\-size|thin|very\ssmall|thick)"
    r"\s(sprigs?|leaves|bunch|cloves?|chunk|knob|teaspoon|can|pinch|handfuls|ears|piece|slices?|cups?|head)"
)

# "inch" that is not the tail of a longer word such as "pinch".
_INCH_WORD = re.compile(r"(?<![A-Za-z])inch")


def _append_to_comment(row, text):
    """Add ``text`` to the comment, starting a new one if it is missing (NaN)."""
    if isinstance(row["comment"], str):
        row["comment"] += " " + text
    else:
        row["comment"] = text


def fix_units(row):
    """
    Reduce the unit label to an actual unit of measure.

    * A unit that is only an adjective is moved to the comment and the unit
      is blanked ("").
    * "dozen" / "dozen medium" multiply the quantity by 12 and blank the unit
      ("medium" goes to the comment).
    * "<adjective> <unit>" keeps the unit and moves the adjective to the
      comment.
    * Otherwise, compound labels that mention inch, ounce, pound or
      milliliter (e.g. "12 - ounce can") collapse to that bare unit.

    A missing (NaN) unit is left alone. Returns a fixed copy of ``row``; the
    object passed in is not modified.
    """
    row = row.copy()

    # If the unit is really just an adjective move it to the comment field.
    if isinstance(row["unit"], str) and row["unit"] in _ADJECTIVE_ONLY_UNITS:
        _append_to_comment(row, row["unit"])
        row["unit"] = ""

    # For some reason a dozen is a unit
    if row["unit"] == "dozen":
        row["qty"] *= 12
        row["unit"] = ""
    elif row["unit"] == "dozen medium":
        row["qty"] *= 12
        row["unit"] = ""
        # Goes through the NaN-safe helper: "+=" on a missing comment raises.
        _append_to_comment(row, "medium")

    unit = row["unit"]
    if not isinstance(unit, str):
        return row

    # Find and replace adjective unit combinations
    combos = _ADJECTIVE_UNIT_COMBO.findall(unit)
    if combos:
        for adjective, bare_unit in combos:
            row["unit"] = bare_unit
            _append_to_comment(row, adjective)
    elif unit not in ("inch", "inches") and _INCH_WORD.search(unit):
        row["unit"] = "inch"
    elif unit not in ("ounce", "ounces", "fluid ounce", "fluid ounces") and "ounce" in unit:
        row["unit"] = "ounce"
    elif unit not in ("pound", "pounds") and "pound" in unit:
        row["unit"] = "pound"
    elif unit != "milliliter" and "milliliter" in unit:
        row["unit"] = "milliliter"

    return row


In [17]:
# Clean up the unit labels row by row.
fixed_units = fixed_abr.apply(fix_units, axis=1)

In [18]:
# Re-tally the distinct unit labels to check the effect of fix_units.
c = Counter()  
c.update([str(x) for x in fixed_units["unit"].values])
sorted(c.keys())

['', 'Handful', 'Pinch', 'Slice', 'Slices', 'bag', 'ball', 'bar', 'basket', 'batch', 'bottle', 'bowl', 'box', 'branch', 'bulb', 'bulbs', 'bunch', 'bunches', 'cake', 'can', 'cans', 'chunk', 'chunks', 'clove', 'cloves', 'cluster', 'cube', 'cubes', 'cup', 'cups', 'dash', 'dashes', 'drop', 'ear', 'ears', 'envelope', 'feet', 'fifth', 'fillet', 'fluid ounce', 'foot', 'gallon', 'glass', 'gram', 'grams', 'grind', 'half', 'handful', 'handfuls', 'head', 'heads', 'hunk', 'inch', 'jar', 'knob', 'layer', 'leaf', 'leaves', 'length', 'link', 'liter', 'liters', 'loaf', 'lobe', 'log', 'milliliter', 'nan', 'ounce', 'ounces', 'package', 'packet', 'pair', 'part', 'pat', 'piece', 'pieces', 'pinch', 'pinches', 'pint', 'pints', 'plate', 'pods', 'portion', 'pound', 'pounds', 'quart', 'quarts', 'rack', 'racks', 'rectangle', 'ribs', 'root', 'scoop', 'segment', 'segments', 'serving', 'shake', 'sheet', 'shot', 'side', 'slab', 'slice', 'slices', 'sliver', 'spiral', 'splash', 'sprig', 'sprigs', 'square', 'stalk', '

Much better.

In the following cells we will start modifying most of the numerical information in the input string.
There exist the following problems:


1. The source of 80% of the issues we will fix in the following cells is that the quantity label is a decimal number, while the quantity in the input often has a non-decimal representation. Because of how labels are assigned to the input, the CRF cannot learn the relationship between the two (among other issues). For example, "0.5 teaspoon cumin" is fine, but "1/2 teaspoon cumin" is not. Specific cases are as follows:

2. The quantity is represented as a numeric word or phrase, e.g. "One stalk rhubarb" or "one and one-half". We convert these words to their decimal equivalent.

3. The quantity has a unicode fraction representation, e.g. ¾, or a mixed fraction, e.g. 1¾. We convert these to 0.75 and 1.75 respectively.

4. 

In [19]:
numbers = {"one":1, "two":2, "three":3, "four":4, "five":5, "six":6, "seven":7, "eight":8, "nine":9, "ten":10}

# Multi-word quantities; these must run before the single-word pass so that
# their "one"/"two"/"three" are not converted on their own.
_NUMBER_PHRASES = (
    (re.compile(r"(?<!\S)one and (?:a half|one-half)", re.IGNORECASE), "1.5"),
    (re.compile(r"one and one-quarter", re.IGNORECASE), "1.25"),
    (re.compile(r"two and one-quarter", re.IGNORECASE), "2.25"),
    (re.compile(r"two and one-half", re.IGNORECASE), "2.5"),
    (re.compile(r"three and a half", re.IGNORECASE), "3.5"),
)

# A number word standing on its own: bounded by whitespace or by the start /
# end of the text (so "one-half" and "(one" are left alone).
_NUMBER_WORD = re.compile(
    r"(?<!\S)(" + "|".join(numbers) + r")(?!\S)", re.IGNORECASE
)


def _number_word_to_digits(match):
    """Return the digit string for one matched number word."""
    return str(numbers[match.group(1).lower()])


def fix_numeric_words(row):
    """
    Replace spelled-out quantities with digits in the "input", "unit", "name"
    and "comment" fields.

    A few known phrases ("one and one-half", "two and one-quarter", ...) are
    converted to their decimal value, then the stand-alone words "one" to
    "ten" are converted to 1-10. Matching is case-insensitive and also works
    at the very start or end of the text, e.g. "One stalk rhubarb".

    Each word is replaced by its own value; previously every number word in a
    field was replaced with the value of the first one found.

    Fields that are not strings (NaN for missing values) are skipped.
    Returns a fixed copy of ``row``; the object passed in is not modified.
    """
    row = row.copy()
    for col in ("input", "unit", "name", "comment"):
        text = row[col]
        if not isinstance(text, str):
            continue
        for pattern, replacement in _NUMBER_PHRASES:
            text = pattern.sub(replacement, text)
        row[col] = _NUMBER_WORD.sub(_number_word_to_digits, text)
    return row

In [20]:
# Convert spelled-out quantities to digits row by row.
fixed_numeric = fixed_units.apply(fix_numeric_words, axis=1)

In [21]:
# Spot-check a slice of the input column (selected by position, not index label).
fixed_numeric.iloc[650:660].input

index
652                               Grated zest of 1 lemon
653                             1 tablespoon lemon juice
654    1/2 cup goat cheese (chévre), (4 ounces), soft...
655              1 cup cream cheese (8 ounces), softened
656                                             1 nutmeg
657                                       Pinch sea salt
658                                   6 fresh egg whites
659                                          1 cup cream
660                                      1 1/2 cups milk
661                          2 cups good quality bourbon
Name: input, dtype: object

In [22]:
# Qty in data are rounded up to two decimal places
decimal.getcontext().rounding = decimal.ROUND_HALF_UP

# An optional whole number (with at most one trailing whitespace character)
# followed by a single unicode vulgar-fraction character.
_UNICODE_FRACTION = re.compile(r"(\d+\s?)?([\u2150-\u215E\u00BC-\u00BE])")
_UNICODE_FRACTION_PLACES = decimal.Decimal("0.01")


def _unicode_fraction_to_decimal(match):
    """Return (whole_number_text_or_None, decimal_text) for one regex match."""
    whole = match.group(1)
    value = unicodedata.numeric(match.group(2))
    if whole:
        # mixed unicode fraction e.g. 1¾
        value += float(whole)
    # Round half up explicitly so the result does not depend on whatever the
    # decimal context happens to be when this runs.
    rendered = decimal.Decimal(value).quantize(
        _UNICODE_FRACTION_PLACES, rounding=decimal.ROUND_HALF_UP
    )
    return whole, str(rendered)


def clean_unicode_fractions(row):
    """
    Replace unicode fractions with their decimal value, rounded half up to
    two places, in the "input", "name" and "comment" fields. A whole number
    directly in front of the fraction is merged into it.

    "\u00be" => "0.75",  "1\u00be" or "1 \u00be" => "1.75"

    The quantity label is then reconciled with the LAST fraction found in the
    field (quantities are stored as floats):

    * if the quantity is missing (0 or NaN) it is set to that value -- many
      rows with unicode fractions have no quantity, and in the majority of
      cases this is the correct label;
    * if the row has a unit and the quantity equals just the whole-number
      part of a mixed fraction, it is replaced by the full value. Any other
      mismatch is assumed to mean a different part of the input is the
      quantity.

    UNHANDLED EDGE CASE: there are a handful of ingredients in which the
    whole number is a quantity multiplier and not part of the fraction,
    e.g. "2 1/4 in cinnamon sticks" should be 0.5 not 2.25.

    Fields that are not strings (NaN for missing values) are skipped.
    Returns a fixed copy of ``row``; the object passed in is not modified.
    """
    row = row.copy()
    for col in ("input", "name", "comment"):
        text = row[col]
        if not isinstance(text, str):
            continue

        converted = []

        def _replace(match):
            whole, rendered = _unicode_fraction_to_decimal(match)
            converted.append((whole, rendered))
            return rendered

        row[col] = _UNICODE_FRACTION.sub(_replace, text)
        if not converted:
            continue

        whole, rendered = converted[-1]
        value = float(rendered)
        qty = row["qty"]
        if not qty or qty != qty:
            # no quantity label at all: 0 or NaN
            row["qty"] = value
        elif row["unit"] == row["unit"] and float(qty) != value:
            if whole and float(whole) == float(qty):
                row["qty"] = value

    return row

In [23]:
# Unicode has numerous characters to represent fractions like ¾; rewrite them
# row by row with clean_unicode_fractions.
cleaned_unicode = fixed_numeric.apply(clean_unicode_fractions, axis=1)

Now that we've handled the unicode fractions, we will remove the remaining unicode characters (these are almost exclusively letters with accent marks). This, among other things, will standardize the spellings of things like creme fraiche and jalapeno.

In [24]:
c = Counter()
for col in ["input", "name", "comment"]:
    # Remove all accent characters by transliterating to ASCII. Only real
    # strings are converted: forcing every cell through str() first would turn
    # missing values (NaN) into the literal text "nan", which would then pass
    # every later `value == value` missing-data check and end up in the labels.
    cleaned_unicode[col] = cleaned_unicode[col].apply(
        lambda x: unidecode.unidecode(x) if isinstance(x, str) else x
    )
    # Tally the characters that are left so the result can be inspected.
    for value in cleaned_unicode[col].dropna():
        c.update(str(value))
sorted(c.keys())

[' ', '!', '"', '#', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']

Beautiful.

In [None]:
# Spot-check the same slice of the input column after the unicode clean-up.
cleaned_unicode.iloc[650:660].input#.apply(unidecode.unidecode)

In [None]:
# Qty in data are rounded up to two decimal places
decimal.getcontext().rounding = decimal.ROUND_HALF_UP

# Whole number, at most one hyphen/whitespace separator, then a fraction.
_MIXED_FRACTION = re.compile(r"(\d+)[\-\s]?(\d+\/\d+)")
_SIMPLE_FRACTION = re.compile(r"(\d+\/\d+)")
_MERGED_FRACTION_PLACES = decimal.Decimal("0.01")


def _fraction_match_to_decimal(match):
    """
    Return the two-place decimal text for a matched fraction.

    Works for both patterns above: the last group is always the fraction and
    an optional first group is the whole-number part. A fraction with a zero
    denominator is not a number, so the matched text is returned unchanged.
    """
    *whole, fraction = match.groups()
    try:
        value = float(Fraction(fraction))
    except ZeroDivisionError:
        return match.group(0)
    if whole:
        value = float(whole[0]) + value
    # Round half up explicitly so the result does not depend on whatever the
    # decimal context happens to be when this runs.
    rendered = decimal.Decimal(value).quantize(
        _MERGED_FRACTION_PLACES, rounding=decimal.ROUND_HALF_UP
    )
    return str(rendered)


def merge_fractions(row):
    """
    Convert ASCII fractions to decimals in the "input", "name" and "comment"
    fields, rounded half up to two places.

    Mixed fractions are merged first (1 2/3 => 1.67, 1-1/2 => 1.50), then any
    remaining simple fractions are converted (1/4 => 0.25).

    Fields that are not strings (NaN for missing values) are skipped.
    Returns a fixed copy of ``row``; the object passed in is not modified.
    """
    row = row.copy()
    for col in ("input", "name", "comment"):
        text = row[col]
        if not isinstance(text, str):
            continue
        text = _MIXED_FRACTION.sub(_fraction_match_to_decimal, text)
        row[col] = _SIMPLE_FRACTION.sub(_fraction_match_to_decimal, text)
    return row

In [None]:
# Many ingredient quantities are written as 1 1/2 to represent 1.5.
# The quantity label however is always written as 1.5, so we need to
# convert these fractions so that the CRF can match it.
merged_data = cleaned_unicode.apply(merge_fractions, axis=1)

In [None]:
# Preview the first rows after merging fractions.
merged_data.head()

In [None]:
decimal.getcontext().rounding = decimal.ROUND_HALF_UP

# "<number> to <number>" / "<number> or <number>", with optional whitespace
# or hyphens around the joining word ("3 to 4", "8- to 10").
_QTY_RANGE = re.compile(r"(\d+\.?\d*)[\s\-]*(?:to|or)[\s\-]*(\d+\.?\d*)")
# "<count> <size>" such as "2 8.5". The lookbehind stops the count from
# starting inside another number, e.g. at the "5" of "1.5 2".
_QTY_MULTIPLIER = re.compile(r"(?<![\d.])(\d+)\s+(\d+\.?\d*)")
_MERGED_QTY_PLACES = decimal.Decimal("0.01")


def _merge_number_pairs(row, col, pattern, combine):
    """
    Replace every ``pattern`` match in ``row[col]`` with ``combine`` of its
    two numbers (rounded half up to two places) and return the new text.

    When the field is the ingredient input and the quantity label equals one
    of the two numbers, the label is updated to the combined value.
    """
    def _replace(match):
        first, second = float(match.group(1)), float(match.group(2))
        try:
            merged = decimal.Decimal(combine(first, second)).quantize(
                _MERGED_QTY_PLACES, rounding=decimal.ROUND_HALF_UP
            )
        except decimal.InvalidOperation:
            # Too large to express at two places; leave the text as it is.
            return match.group(0)
        # Probably a pretty good guess that the qty was only one of these two
        # numbers, so update it with the merged value.
        if col == "input" and row["qty"] in (first, second):
            row["qty"] = float(merged)
        return str(merged)

    return pattern.sub(_replace, row[col])


def merge_quantities(row):
    """
    Many ingredients are written in the form 2 8.5-ounce cans...
    This is both tricky for the model to parse and made worse because
    the labeled data inconsistently labels the quantity as 2, 8.5, or 17.
    We want to reduce all these to a single value:
    2 8.5 - ounce => 17.00 - ounce
    and update the quantity label as appropriate.

    In the "input", "name" and "comment" fields, number ranges are averaged
    first ("3 to 4 pounds" becomes "3.50 pounds"), then "<count> <size>"
    pairs are multiplied together.

    NOTE(review): the quantity label is only updated from matches in the
    "input" field. The previous conditions read `a or b and col == ...`,
    which Python evaluates as `a or (b and ...)`, and the multiplier branch
    compared against "name"; both looked unintended -- please confirm.

    Fields that are not strings (NaN for missing values) are skipped.
    Returns a fixed copy of ``row``; the object passed in is not modified.
    """
    row = row.copy()
    try:
        for col in ("input", "name", "comment"):
            if not isinstance(row[col], str):
                continue
            # Ok first we need to average any number ranges
            row[col] = _merge_number_pairs(
                row, col, _QTY_RANGE, lambda low, high: (low + high) / 2
            )
            # now we do quantity multipliers
            row[col] = _merge_number_pairs(
                row, col, _QTY_MULTIPLIER, lambda count, size: count * size
            )
    except TypeError:
        print("Error Merging Ranges: ", row)
    return row
    

In [None]:
# Average ranges and multiply out "<count> <size>" quantities row by row.
merged_range = merged_data.apply(merge_quantities, axis=1)

Phase two of merging quantities: we are going to start making some more assumptions,
starting with the assumption that everything in the name label after a comma should be a comment.

In [None]:
# Rows whose name label is missing: NaN is the only value not equal to itself.
merged_range[merged_range["name"] != merged_range["name"]]

In [None]:
def fix_names(row):
    """
    Move everything after the first comma of the name label to the comment.

    "onion, peeled, chopped" leaves the name "onion" and adds
    " peeled, chopped" to the comment. The moved text is appended to an
    existing comment exactly as it appeared after the comma, or becomes the
    comment if that was missing (NaN). All text after the first comma is
    kept, not just the segment up to the next comma.

    A missing (NaN) name is left alone. Returns a fixed copy of ``row``; the
    object passed in is not modified.
    """
    row = row.copy()
    name = row["name"]
    if isinstance(name, str) and "," in name:
        kept, _, moved = name.partition(",")
        row["name"] = kept
        if isinstance(row["comment"], str):
            row["comment"] += moved
        else:
            row["comment"] = moved
    return row


In [None]:
# Split the name labels at the first comma row by row.
fixed_names = merged_range.apply(fix_names, axis=1)

In [None]:
# A parenthesised weight such as "(4 ounces)" or "(250.5 grams)".
_PARENTHESISED_WEIGHT = re.compile(
    r"\((\d+\.?\d*)\s(ounces?|pounds?|grams?|milliliters?)\)"
)


def replace_volume_with_weight(row):
    """
    By default the NYT labelled the qty and unit using volume,
    this changes those labels to weight if its in the ingredient input.

    For every "(<number> <weight unit>)" in the input whose unit differs from
    the current unit label:

    * that weight phrase is removed from the comment,
    * the old quantity and unit are appended to the comment (whole
      quantities are written without a decimal point, e.g. "1 cup";
      fractional ones as they are, e.g. "0.5 cup"),
    * the unit and quantity labels are set to the weight.

    Rows with a missing (NaN) unit or a non-string input are left alone; a
    missing comment is treated as empty. Returns a fixed copy of ``row``; the
    object passed in is not modified.
    """
    row = row.copy()
    if not isinstance(row["input"], str):
        return row

    for amount, weight_unit in _PARENTHESISED_WEIGHT.findall(row["input"]):
        old_unit = row["unit"]
        if not isinstance(old_unit, str) or old_unit == weight_unit:
            continue

        comment = row["comment"] if isinstance(row["comment"], str) else ""
        # Remove the weight as a whole phrase (with its parentheses when
        # present). The lookarounds keep "4 ounces" from matching inside
        # "14 ounces", and no other digits in the comment are touched.
        weight_phrase = (
            r"\(?(?<![\d.])" + re.escape(amount) + r"\s"
            + re.escape(weight_unit) + r"(?!\w)\)?"
        )
        comment = re.sub(weight_phrase, "", comment)

        old_qty = row["qty"]
        if old_qty is not None and old_qty == old_qty:
            old_qty = float(old_qty)
            # Format by the OLD quantity's own value, so that 0.5 is not
            # truncated to 0.
            qty_text = str(int(old_qty)) if old_qty.is_integer() else str(old_qty)
            old_label = qty_text + " " + old_unit
        else:
            old_label = old_unit

        row["comment"] = comment + " " + old_label if comment else old_label
        row["unit"] = weight_unit
        row["qty"] = float(amount)
    return row

In [None]:
# Prefer weight labels over volume labels row by row.
replaced_volume = fixed_names.apply(replace_volume_with_weight, axis=1)

In [None]:
# Save the cleaned data as CSV.
replaced_volume.to_csv("../data/interim/nyt_partial_clean.csv")

In [None]:
# Save the full, unsplit cleaned data as a pickle.
# NOTE(review): the training split is written to this same path in the last
# cell of the notebook, which overwrites this file -- confirm which one is wanted.
replaced_volume.to_pickle("../data/interim/crf_training_data.pickle")

In [None]:
# Hold out 20% of the rows for testing. The fixed seed makes the split, and
# therefore the files saved from it, identical on every run of the notebook.
training_data, test_data = train_test_split(
    replaced_volume, test_size=0.2, random_state=42
)

In [None]:
# Save the training and test splits as pickles.
training_data.to_pickle("../data/interim/crf_training_data.pickle")
test_data.to_pickle("../data/interim/crf_test_data.pickle")