In [1]:
import re
from fractions import Fraction
import unicodedata
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load raw data
input_data = pd.read_csv(
    "../data/raw/nyt-ingredients-snapshot-2015.csv", index_col="index"
)
input_data.head()

Unnamed: 0_level_0,input,name,qty,range_end,unit,comment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1 1/4 cups cooked and pureed fresh butternut s...,butternut squash,1.25,0.0,cup,"cooked and pureed fresh, or 1 10-ounce package..."
1,1 cup peeled and cooked fresh chestnuts (about...,chestnuts,1.0,0.0,cup,"peeled and cooked fresh (about 20), or 1 cup c..."
2,"1 medium-size onion, peeled and chopped",onion,1.0,0.0,,"medium-size, peeled and chopped"
3,"2 stalks celery, chopped coarse",celery,2.0,0.0,stalk,chopped coarse
4,1 1/2 tablespoons vegetable oil,vegetable oil,1.5,0.0,tablespoon,


In [3]:
def clean_html(s):
    """
    Strip the HTML that was left in the text fields of one ingredient row.

    The "input", "name" and "comment" entries are cleaned in that order:

    1. "see recipe" links, written either as <a href=...>see recipe</a> or
       as see <a href=...>recipe</a>, are removed together with optional
       surrounding parentheses. When such a link is removed from "input",
       the plain text "see recipe" is also removed from "comment".
    2. Every remaining tag is removed.
    3. Literal backslash-n sequences left over from the original html are
       replaced with a space.
    4. The result is stripped; a field that ends up empty is set to NaN so
       pandas can treat it as missing.

    NaN fields are skipped. If a field is neither NaN nor a string, the
    error is printed and processing of that field stops.

    The row is modified in place and also returned, so the function can be
    used with DataFrame.apply(..., axis=1).
    """
    # NOTE(review): ".*" in these patterns is greedy, so on a field holding
    # several tags a match can swallow the text between an earlier tag and
    # the link. Patterns are kept as they were; confirm against the data.
    see_recipe_patterns = (
        # <a href=...>see recipe</a>
        r"\s*\(?<.*see\s*recipe.*>\)?",
        # see <a href=...>recipe</a>
        r"\(?\s*(see)\s*?<.*recipe.*>\)?",
    )
    for col in ["input", "name", "comment"]:
        # NaN is the only value that differs from itself; skip missing fields
        # so they do not end up in the except branch below.
        if s[col] != s[col]:
            continue
        try:
            for pattern in see_recipe_patterns:
                s[col], removed = re.subn(pattern, "", s[col])
                if removed and col == "input" and s["comment"] == s["comment"]:
                    s["comment"] = re.sub(r"see recipe", "", s["comment"])
            # remaining <span> and misc <a href=...>...</a> tags
            s[col] = re.sub(r"<.*?>", "", s[col])
            # un-escaped '\n' from the original html
            s[col] = re.sub(r"\\n", " ", s[col])
            # Strip BEFORE testing for emptiness: a field reduced to
            # whitespace only must become NaN, not an empty string.
            s[col] = s[col].strip()
            if not s[col]:
                s[col] = float("nan")
        except TypeError:
            print("error parsing " + col + ": ", s)
    return s

In [4]:
# There are leftover html tags that need to be removed
# Has the added benefit of stripping out the extraneous "see recipe"
# from both input and comment
input_data = input_data.apply(clean_html, axis=1)

In [5]:
print(input_data.shape)
# More than 100 rows are missing an input; these are worthless, so they are dropped.
input_data.dropna(axis=0, subset=["input"], inplace=True)
print(input_data.shape)

(179207, 6)
(179060, 6)


In [6]:
def clean_unicode_fractions(s):
    """
    Replace unicode fraction characters with their decimal value.

    A whole number followed by a fraction character (with at most one
    whitespace character in between) is treated as a mixed number and the
    two parts are added together. Any other fraction character is replaced
    on its own. Values are rounded to 3 decimal places.

    "1⅞" => "1.875"
    "¾ cup" => "0.75 cup"

    If s is not a string (e.g. NaN) the error is printed and s is returned
    unchanged.
    """
    mixed_pattern = r"(\d+)\s?([\u2150-\u215E\u00BC-\u00BE])"
    lone_pattern = r"([\u2150-\u215E\u00BC-\u00BE])"

    # Each match is converted from its OWN captured text. Substituting a
    # precomputed string would overwrite every fraction in the input with
    # the value of the first one.
    def _mixed(m):
        total = float(m.group(1)) + unicodedata.numeric(m.group(2))
        return str(round(total, 3))

    def _lone(m):
        return str(round(unicodedata.numeric(m.group(1)), 3))

    try:
        # mixed fractions first (e.g. 1 ¾ or 1¾), then whatever is left
        s = re.sub(mixed_pattern, _mixed, s)
        s = re.sub(lone_pattern, _lone, s)
    except TypeError:
        print("error parsing: ", s)
    return s

In [7]:
# Unicode has numerous characters that represent fractions, such as ¾; convert them to decimal numbers
input_data["input"] = input_data["input"].apply(clean_unicode_fractions)

In [8]:
def merge_ranges(s):
    """
    Look for quantity ranges such as "1 2-2 1/2 pound" in a row's input.

    A range like that means any amount from 2 to 2.5 is acceptable. The
    plan is to replace the range with its average to make parsing easier,
    but that replacement is not written yet: the matches are collected,
    nothing is changed and nothing is returned.

    If the row (or its input) cannot be searched, the row is printed.
    """
    range_pattern = re.compile(r"\d+\s\d+\-\d+\s*\d*\/*\d*")
    try:
        text = s["input"]
        # TODO: the matches are not used yet
        match = range_pattern.findall(text)
    except TypeError:
        print("error parsing: ", s)

In [9]:
def merge_fractions(s):
    """
    Convert ascii fractions in a string to decimals rounded to 3 places.

    Mixed numbers are merged first, so the whole part and the fraction
    become a single number; any remaining bare fraction is converted
    afterwards.

    "1 2/3 cups" => "1.667 cups"
    "1/2 cup"    => "0.5 cup"

    Numerators and denominators may have several digits ("1/16"). A
    fraction with a zero denominator is left untouched.
    """
    def _decimal(whole, numerator, denominator):
        value = float(whole) + float(Fraction(int(numerator), int(denominator)))
        return str(round(value, 3))

    # Each match is converted from its OWN captured digits. Substituting a
    # precomputed string would overwrite every fraction in the input with
    # the value of the first one.
    def _mixed(m):
        if int(m.group(3)) == 0:
            return m.group(0)
        return _decimal(m.group(1), m.group(2), m.group(3))

    def _bare(m):
        if int(m.group(2)) == 0:
            return m.group(0)
        return _decimal(0, m.group(1), m.group(2))

    s = re.sub(r"(\d+)\s+(\d+)/(\d+)", _mixed, s)
    s = re.sub(r"(\d+)/(\d+)", _bare, s)
    return s

In [10]:
# Many ingredient quantities are written as 1 1/2 to represent 1.5
# The quantity label however is always written as 1.5 so we need to 
# convert these fractions so that the crf can match it
input_data["input"] = input_data["input"].apply(merge_fractions)

In [11]:
def multiply_qty(s):
    """
    Collapse "<count> <size>" quantity pairs in a row into their product.

    Many ingredients are written in the form 2 8.5-ounce cans...
    This is tricky for the model to parse, and made worse because the
    labeled data inconsistently labels the quantity as 2, 8.5, or 17.
    All of these are reduced to a single value:
    2 8.5-ounce => 17.0-ounce

    For each pair found, if the row's "qty" equals either the count or the
    size, it is replaced by the product (stored as its string form, like
    the text written into "input").

    The row is modified in place and also returned, so the function can be
    used with DataFrame.apply(..., axis=1).
    """
    # The lookbehind stops the count from starting inside another number,
    # e.g. the "5" of "1.5" in "1.5 2.5".
    pair_pattern = re.compile(r"(?<![\d.])(\d+)\s+(\d+\.\d+)")

    def _product(count, size):
        return round(float(count) * float(size), 3)

    pairs = pair_pattern.findall(s["input"])
    if not pairs:
        return s

    # Each pair is replaced by its OWN product. Substituting a precomputed
    # string would overwrite every pair with the product of the first one.
    s["input"] = pair_pattern.sub(
        lambda m: str(_product(m.group(1), m.group(2))), s["input"]
    )

    for count, size in pairs:
        qty = float(s["qty"])
        if qty == float(count) or qty == float(size):
            # probably a pretty good guess that the qty was only one of
            # these two numbers, so update it with the product
            s["qty"] = str(_product(count, size))
    return s

In [12]:
input_data = input_data.apply(multiply_qty, axis=1)

In [13]:
def fix_inconsistencies(row):
    """
    Inspect a row's name label for a comma-separated qualifier.

    Exploratory helper: the name is split at its first comma, but the
    pieces are not used yet, so nothing is changed and nothing is returned.
    Rows whose name cannot be searched (e.g. NaN) are printed for
    inspection.
    """
    try:
        label = row["name"]
        if "," in label:
            name = label.split(",", 1)
    except TypeError:
        print(row)

In [14]:
input_data[(input_data['input'].str.contains('garlic')) & (input_data['name'] != "garlic")].apply(fix_inconsistencies, axis=1)

input        2 cloves garlic
name                     NaN
qty                        0
range_end                  0
unit                     NaN
comment                  NaN
Name: 120232, dtype: object
input        4 garlic cloves, minced
name                             NaN
qty                                0
range_end                          0
unit                             NaN
comment                          NaN
Name: 144161, dtype: object
input        Add cooled zucchini and pine nuts to food proc...
name                                                       NaN
qty                                                          0
range_end                                                    0
unit                                                       NaN
comment                                                    NaN
Name: 150489, dtype: object
input        1 small garlic clove puréed
name                                 NaN
qty                                    0
range_end          

index
185       None
238       None
773       None
782       None
1266      None
          ... 
178853    None
178993    None
178994    None
179009    None
179037    None
Length: 1380, dtype: object

In [21]:
def fix_abbreviations(s):
    """
    Expand abbreviated units in the "input" and "unit" fields of a row.

    - "oz" (plus any trailing periods) after a digit  => " ounce"
    - "g" after a digit, when not followed by a lowercase letter => " gram"
    - "tbsp" / "Tbsp" (plus any trailing periods)     => "tablespoon"
    - "tsp" / "Tsp" (plus any trailing periods)       => "teaspoon"

    For "oz" and "g", whitespace between the digit and the unit is
    replaced by a single space. NaN fields are skipped.

    The row is modified in place and also returned, so the function can be
    used with DataFrame.apply(..., axis=1).
    """
    for col in ["input", "unit"]:
        text = s[col]
        # NaN is the only value that differs from itself
        if text != text:
            continue
        # The backreference \1 keeps each match's own digit. Building the
        # replacement from the first match would rewrite every quantity in
        # the string with that first digit.
        text = re.sub(r"([0-9])\s*oz\.*", r"\1 ounce", text)
        # A lookahead instead of a consumed character means a "g" at the
        # very end of the string ("500g") is converted as well.
        text = re.sub(r"([0-9])\s*g(?![a-z])", r"\1 gram", text)
        text = re.sub(r"[Tt]bsp\.*", "tablespoon", text)
        text = re.sub(r"[Tt]sp\.*", "teaspoon", text)
        s[col] = text
    return s

In [22]:
input_data = input_data.apply(fix_abbreviations, axis=1)

In [23]:
# Fix the seed so the train/test partition (and the pickles written from it)
# is the same on every run of the notebook
training_data, test_data = train_test_split(
    input_data, test_size=0.2, random_state=42
)

In [24]:
training_data.to_pickle("../data/interim/crf_training_data.pickle")
test_data.to_pickle("../data/interim/crf_test_data.pickle")

In [25]:
# Examining some remaining outliers in the data
input_data.unit.unique()

array(['cup', nan, 'stalk', 'tablespoon', 'teaspoon', 'clove', 'pound',
       'ounce', 'pinch', 'sprig', 'dash', 'slice', 'head', 'bunch', 'box',
       'dozen', 'sheet', 'piece', 'pint', 'fillet', 'gallon', 'quart',
       'strip', 'bottle', 'drop', 'cake', 'scoop', 'stick', 'ear', 'can',
       'bulb', 'package', 'loaf', 'layer', 'rack', 'envelope', 'leaf',
       'ball', 'chunk', 'knob', 'bag', 'rectangle', 'inch', 'pair',
       'shake', 'wedge', 'branch', 'half', 'packet', 'handful', 'fifth',
       'steak', 'splash', 'log', 'slab', 'grind', 'square', 'milliliter',
       'liter', 'twist', 'cube', 'gram', 'bowl', 'stem', 'vial', 'length',
       'segment', 'plate', 'foot', 'cluster', 'fluid ounce', 'glass',
       'bar', 'link', 'spiral', 'hatch', 'basket', 'part', 'pat',
       'sliver', 'jar', 'side', 'hunk', 'thread', 'batch', 'lobe',
       'portion', 'serving', 'root', 'shot', 'tablespoons', 'small',
       'cloves', 'cups', 'ounces', 'pounds', 'very small', 'teaspoons',
   

In [None]:
def move_size_descriptors(row):
    """
    Work in progress: detect size words that were labelled as the unit.

    Checks whether the row's "unit" contains "small", "medium" or "large".
    Nothing is done with a hit yet, and nothing is returned.
    """
    # Each size must be its own list entry. A single "small, medium, large"
    # string would only match a unit containing that exact text.
    sizes = ["small", "medium", "large"]
    unit = row["unit"]
    # Rows without a unit hold NaN (a float), which does not support `in`
    if isinstance(unit, str):
        for size in sizes:
            if size in unit:
                # TODO: move the size descriptor out of the unit label
                pass