In [1]:
import re
from fractions import Fraction
import unicodedata
import pandas as pd
from sklearn.model_selection import train_test_split
import sys
sys.path.append("..")
from src.data import data_cleaning_util
from src.features import create_features

In [2]:
# Load raw data
input_data = pd.read_csv(
    "../data/raw/nyt-ingredients-snapshot-2015.csv", index_col="index"
)
input_data.head()

Unnamed: 0_level_0,input,name,qty,range_end,unit,comment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1 1/4 cups cooked and pureed fresh butternut s...,butternut squash,1.25,0.0,cup,"cooked and pureed fresh, or 1 10-ounce package..."
1,1 cup peeled and cooked fresh chestnuts (about...,chestnuts,1.0,0.0,cup,"peeled and cooked fresh (about 20), or 1 cup c..."
2,"1 medium-size onion, peeled and chopped",onion,1.0,0.0,,"medium-size, peeled and chopped"
3,"2 stalks celery, chopped coarse",celery,2.0,0.0,stalk,chopped coarse
4,1 1/2 tablespoons vegetable oil,vegetable oil,1.5,0.0,tablespoon,


In [3]:
def clean_nyt_html(s):
    """
    This will replace all html tags that were not stripped
    from the NYT data
    """
    columns = ["input", "name", "comment"]
    for col in columns:
        # This filters out NaN values so they wont get caught in the try except
        if s[col] == s[col]:
            try:
                # this will remove all: <a href=...>see recipe</a>
                match = re.findall(r"\s*\(?<.*see\s*recipe.*>\)?", s[col])
                if match:
                    for m in match:
                        s[col] = re.sub(r"\s*\(?<.*see\s*recipe.*>\)?", "", s[col])
                        if col == "input" and s["comment"] == s["comment"]:
                            s["comment"] = re.sub(r"see recipe", "", s["comment"])
            except TypeError:
                print("ERROR: Removing <see recipe>, " + col + " ", s)
            try:
                # this will remove all: see <a href=...>recipe</a>
                match = re.findall(r"\(?\s*(see)\s*?<.*recipe.*>\)?", s[col])
                if match:
                    for m in match:
                        s[col] = re.sub(r"\(?\s*(see)\s*?<.*recipe.*>\)?", "", s[col])
                        if col == "input" and s["comment"] == s["comment"]:
                            s["comment"] = re.sub(r"see recipe", "", s["comment"])
            except TypeError:
                print("ERROR: Removing see <recipe>, " + col + " ", s)

            # This will remove all <span> and misc <a href=...>...</a>
            match = re.findall(r"<.*?>", s[col])
            if match:
                for m in match:
                    s[col] = re.sub(r"<.*?>", "", s[col])
            # this will remove all un-escapped '\n' from the original html
            match = re.findall(r"\s*\\n\s*", s[col])
            if match:
                for m in match:
                    s[col] = re.sub(r"\\n", " ", s[col])
            # if the column is now blank becasue of what we removed, set it
            # to NaN so pandas can handle it easier
            if not s[col]:
                s[col] = float("nan")
            else:
                s[col] = s[col].strip()

    return s

In [4]:
# There are leftover html tags that need to be removed
# Has the added benefit of stripping out the extraneous "see recipe"
# from both input and comment
input_data = input_data.apply(clean_nyt_html, axis=1)

In [5]:
print(input_data.shape)
# There are over 100 lines missing an input, these are worthless and are dropped.
input_data.dropna(axis=0, subset=["input"], inplace=True)
print(input_data.shape)

(179207, 6)
(179060, 6)


In [6]:
def clean_unicode_fractions(s):
    """
    Replace unicode fractions with ascii representation, preceded by a
    space.

    "1\x215e" => "1 7/8"
    """
    try:
        # match all mixed fractions with a unicode fraction (e.g. 1 ¾ or 1¾) and add them together
        match = re.findall(r"(\d+)\s?([\u2150-\u215E\u00BC-\u00BE])", s)
        if match:
            for m in match:
                num = float(m[0]) + float(Fraction(unicodedata.numeric(m[1])))
                s = re.sub(
                    r"(\d+)\s?([\u2150-\u215E\u00BC-\u00BE])", str(round(num, 3)), s
                )

        # match all other unicode fractions
        match = re.findall(r"([\u2150-\u215E\u00BC-\u00BE])", s)
        if match:
            for m in match:
                s = re.sub(
                    r"([\u2150-\u215E\u00BC-\u00BE])",
                    str(round(float(Fraction(unicodedata.numeric(m))), 3)),
                    s,
                )
    except TypeError:
        print("ERROR CLEANING UNICODE: ", s)
    return s

In [7]:
# Unicode has numerous characters to represent fractions like ¾, we remove these
input_data["input"] = input_data["input"].apply(clean_unicode_fractions)

In [8]:
def merge_ranges(row):
    try:
        match = re.findall(r"(\d+)\sor\s(\d+)\s(\d+)\-\sor\s(\d+)\-", row["input"])
        if match:
            for m in match:
                num = ((float(m[0]) + float(m[1])) / 2) * ((float(m[2]) + float(m[3])) / 2)
                print(row["input"], row["qty"], num)
                row["input"] = re.sub(r"(\d+)\sor\s(\d+)\s(\d+)\-\sor\s(\d+)\-", str(round(num, 3))+"-", row["input"])
                if row["qty"] == '' or float(row["qty"]) == float(m[0]) or float(row["qty"]) == float(m[1]) or float(row["qty"]) == float(m[2]) or float(row["qty"]) == float(m[3]):
                    row["qty"] = str(round(num, 3))
                print(row["input"], row["qty"])
        match = re.findall(r"(\d+)\s(\d+)\-\sor\s(\d+)\-", row["input"])
        if match:
            for m in match:
                num = float(m[0]) * ((float(m[1])+float(m[2])) / 2)
                print(row["input"], row["qty"], num)
                row["input"] = re.sub(r"(\d+)\s(\d+)\-\sor\s(\d+)\-", str(round(num, 3))+"-", row["input"])
                if row["qty"] == '' or float(row["qty"]) == float(m[0]) or float(row["qty"]) == float(m[1]) or float(row["qty"]) == float(m[2]):
                    row["qty"] = str(round(num, 3))
                print(row["input"], row["qty"])
    except TypeError:
        print("Error Merging Ranges: ", row)
    return row
    

In [9]:
input_data = input_data.apply(merge_ranges, axis=1)

4 12-inch flour tortillas or 8 7- or 8-inch flour tortillas 4.0 60.0
4 12-inch flour tortillas or 60.0-inch flour tortillas 4.0
2 cups diced canned plum tomatoes (1 28- or 29-ounce can 2.0 28.5
2 cups diced canned plum tomatoes (28.5-ounce can 2.0
4 9- or 10-inch flour tortillas 4.0 38.0
38.0-inch flour tortillas 38.0
1 6- or 7-ounce can or pouch of salmon, drained, bones and skin removed 6.0 6.5
6.5-ounce can or pouch of salmon, drained, bones and skin removed 6.5
1 6- or 7-ounce can of wild salmon 6.0 6.5
6.5-ounce can of wild salmon 6.5
1 15- or 16-ounce can black beans 1.0 15.5
15.5-ounce can black beans 15.5
4 9- or 10-inch no-fat added flour tortillas 4.0 38.0
38.0-inch no-fat added flour tortillas 38.0
1 6- or 7-pound picnic ham or pork butt, trimmed of skin but not fat 6.0 6.5
6.5-pound picnic ham or pork butt, trimmed of skin but not fat 6.5
2 8- or 9-inch spongecake layers, homemade or storebought 2.0 17.0
17.0-inch spongecake layers, homemade or storebought 17.0
1 6- or 7-ou

In [10]:
def merge_fractions(s):
    """
    Merges mixed fractions: 1 2/3 => 1.67
    """
    try:
        match = re.findall(r"(\d+)\s+(\d\/\d)", s)
        if match:
            for m in match:
                num = float(m[0]) + float(Fraction(m[1]))
                s = re.sub(r"(\d+)\s+(\d\/\d)", str(round(num, 3)), s)

        match = re.findall(r"(\d\/\d)", s)
        if match:
            for m in match:
                num = float(Fraction(m))
                s = re.sub(r"(\d\/\d)", str(round(num, 3)), s)
    except ZeroDivisionError:
        print(s)
    return s

In [11]:
# Many ingredient quantities are written as 1 1/2 to represent 1.5
# The quantity label however is always written as 1.5 so we need to 
# convert these fractions so that the crf can match it
input_data["input"] = input_data["input"].apply(merge_fractions)

In [12]:
def multiply_qty(s):
    """
    Many ingredients are written in the form 2 8.5-ounce cans...
    This is both tricky for the model to parse and made worse because
    the labeled data incosistently labels the quanity as 2, 8.5, or 17.
    We want to reuce all these to a single value:
    2 8.5-ounce => 17.0-ounce
    and update the quantity label as appropriate
    """
    match = re.findall(r"(\d+)\s+(\d+\.*\d*)", s["input"])
    if match:
        for m in match:
            num = float(m[0]) * float(m[1])
            print(s["input"], s["qty"], num)
            s["input"] = re.sub(r"(\d+)\s+(\d+\.*\d*)", str(round(num, 3)), s["input"])
            if float(m[0]) == float(s["qty"]) or float(m[1]) == float(s["qty"]):
                # probably a pretty good guess that the qty was only one of these two numbers, update it with the new num
                s["qty"] = str(round(num, 3))
            print(s["input"], s["qty"], num)
    return s

In [13]:
input_data = input_data.apply(multiply_qty, axis=1)

1.25 cups cooked and pureed fresh butternut squash, or 1 10-ounce package frozen squash, defrosted 1.25 10.0
1.25 cups cooked and pureed fresh butternut squash, or 10.0-ounce package frozen squash, defrosted 1.25 10.0
1.25 cups cooked and pureed fresh butternut squash, or 1 10-ounce package frozen squash, defrosted 1.25 10.0
1.25 cups cooked and pureed fresh butternut squash, or 10.0-ounce package frozen squash, defrosted 1.25 10.0
1 2-pound beef tenderloin roast, cut from the large end, trimmed and tied at 0.5 -inch intervals with kitchen twine 2.0 2.0
2.0-pound beef tenderloin roast, cut from the large end, trimmed and tied at 0.5 -inch intervals with kitchen twine 2.0 2.0
1 6-inch white-corn tortilla 1.0 6.0
6.0-inch white-corn tortilla 6.0 6.0
2 28-ounce cans crushed tomatoes 56.0 56.0
56.0-ounce cans crushed tomatoes 56.0 56.0
4 9-ounce boxes frozen artichoke hearts 4.0 36.0
36.0-ounce boxes frozen artichoke hearts 36.0 36.0
2 16-ounce cans dark cherries, pitted 32.0 32.0
32.0-oun

2 8-ounce bags white pearl onions 16.0 16.0
16.0-ounce bags white pearl onions 16.0 16.0
2 8-ounce bags red pearl onions 16.0 16.0
16.0-ounce bags red pearl onions 16.0 16.0
2 10-ounce packages breakfast sausage 20.0 20.0
20.0-ounce packages breakfast sausage 20.0 20.0
1 4-pound chicken 1.0 4.0
4.0-pound chicken 4.0 4.0
1 4.5-ounce piece blue cheese (Danish, Stilton, Gorgonzola or Roquefort) 4.5 4.5
4.5-ounce piece blue cheese (Danish, Stilton, Gorgonzola or Roquefort) 4.5 4.5
2 12-ounce packages firm cranberries 24.0 24.0
24.0-ounce packages firm cranberries 24.0 24.0
1 14-pound turkey 1.0 14.0
14.0-pound turkey 14.0 14.0
1 15-ounce jar chestnuts packed in syrup, drained (see note) 15.0 15.0
15.0-ounce jar chestnuts packed in syrup, drained (see note) 15.0 15.0
1 3-inch strip of orange peel 1.0 3.0
3.0-inch strip of orange peel 3.0 3.0
1 4-pound duck 1.0 4.0
4.0-pound duck 4.0 4.0
1 12-ounce bag cranberries 1.0 12.0
12.0-ounce bag cranberries 12.0 12.0
1 17-to-20-pound ready-to-cook t

1 3-pound boneless center-cut pork loin 3.0 3.0
3.0-pound boneless center-cut pork loin 3.0 3.0
1 3-pound roasted chicken, skin removed, meat roughly chopped 3.0 3.0
3.0-pound roasted chicken, skin removed, meat roughly chopped 3.0 3.0
1 15-ounce can plain tomato sauce 15.0 15.0
15.0-ounce can plain tomato sauce 15.0 15.0
4 ripe fresh tomatoes, or 1 14-ounce can imported plum tomatoes, liquid discarded, peeled and coarsely chopped 4.0 14.0
4 ripe fresh tomatoes, or 14.0-ounce can imported plum tomatoes, liquid discarded, peeled and coarsely chopped 4.0 14.0
1 35-ounce can plus 1 28-ounce can (about 7 cups) peeled tomatoes and liquid 63.0 35.0
35.0-ounce can plus 35.0-ounce can (about 7 cups) peeled tomatoes and liquid 63.0 35.0
35.0-ounce can plus 35.0-ounce can (about 7 cups) peeled tomatoes and liquid 63.0 28.0
35.0-ounce can plus 35.0-ounce can (about 7 cups) peeled tomatoes and liquid 63.0 28.0
1 8-to-9-pound whole brisket, trimmed (see note) 8.0 8.0
8.0-to-9-pound whole brisket, t

1 19-ounce can chickpeas, drained and rinsed 1.0 19.0
19.0-ounce can chickpeas, drained and rinsed 19.0 19.0
1 3-inch knob of ginger, peeled 1.0 3.0
3.0-inch knob of ginger, peeled 3.0 3.0
1 13.5-ounce can coconut milk 13.5 13.5
13.5-ounce can coconut milk 13.5 13.5
24 12-inch wooden or bamboo skewers, soaked in hot water for 10 minutes 24.0 288.0
288.0-inch wooden or bamboo skewers, soaked in hot water for 10 minutes 288.0 288.0
2 2.5-pound chickens, rinsed, patted dry, spines removed, each cut into 12 pieces 5.0 5.0
5.0-pound chickens, rinsed, patted dry, spines removed, each cut into 12 pieces 5.0 5.0
2 16-ounce cans black beans, rinsed and drained 32.0 32.0
32.0-ounce cans black beans, rinsed and drained 32.0 32.0
1 15.5-ounce can white beans, rinsed and drained 15.5 15.5
15.5-ounce can white beans, rinsed and drained 15.5 15.5
4 8-ounce skinless and boneless chicken-breast halves, cut into 1-inch cubes 32.0 32.0
32.0-ounce skinless and boneless chicken-breast halves, cut into 1-in

36 1.5-inch chive segments 36.0 54.0
54.0-inch chive segments 54.0 54.0
1 2-inch length of leek 1.0 2.0
2.0-inch length of leek 2.0 2.0
4 8-ounce pieces swordfish (or 1 2-pound steak) 32.0 32.0
32.0-ounce pieces swordfish (or 32.0-pound steak) 32.0 32.0
32.0-ounce pieces swordfish (or 32.0-pound steak) 32.0 2.0
32.0-ounce pieces swordfish (or 32.0-pound steak) 32.0 2.0
1 1-inch chunk peeled fresh ginger 1.0 1.0
1.0-inch chunk peeled fresh ginger 1.0 1.0
2 2-pound frying chickens, each cut into 16 pieces 4.0 4.0
4.0-pound frying chickens, each cut into 16 pieces 4.0 4.0
1 1-pound halibut steak, trimmed of skin and cartilage, cut into 1-inch chunks 1.0 1.0
1.0-pound halibut steak, trimmed of skin and cartilage, cut into 1-inch chunks 1.0 1.0
2 1.25-pound lobsters, steamed, liquid reserved 2.0 2.5
2.5-pound lobsters, steamed, liquid reserved 2.5 2.5
2 1.5-pound lobsters, steamed, tail and claw meat removed and cut into 0.5-inch pieces 3.0 3.0
3.0-pound lobsters, steamed, tail and claw mea

1 4-pound chicken, cut into 6 pieces 4.0 4.0
4.0-pound chicken, cut into 6 pieces 4.0 4.0
2 1.5-pound lobsters 1.0 3.0
3.0-pound lobsters 1.0 3.0
2 8-ounce pork tenderloins 2.0 16.0
16.0-ounce pork tenderloins 16.0 16.0
4 5-ounce chicken breasts 4.0 20.0
20.0-ounce chicken breasts 20.0 20.0
1 1-by-3-inch orange rind 1.0 1.0
1.0-by-3-inch orange rind 1.0 1.0
Juice of 20 limes (about 2 3.1254 cups) 20.0 6.2508
Juice of 20 limes (about 6.251 cups) 20.0 6.2508
1 4-pound rack of venison, trimmed (see note) 4.0 4.0
4.0-pound rack of venison, trimmed (see note) 4.0 4.0
1 2-inch piece cinnamon stick 1.0 2.0
2.0-inch piece cinnamon stick 2.0 2.0
1 10-ounce can San Marzano tomatoes (and juices), crushed by hand 10.0 10.0
10.0-ounce can San Marzano tomatoes (and juices), crushed by hand 10.0 10.0
1 12-ounce bottle dark beer 12.0 12.0
12.0-ounce bottle dark beer 12.0 12.0
4 7-ounce salmon fillets, as thick as possible, preferably with skin 28.0 28.0
28.0-ounce salmon fillets, as thick as possible,

2 2-ounce cans anchovy fillets, packed in oil 4.0 4.0
4.0-ounce cans anchovy fillets, packed in oil 4.0 4.0
1 3-pound leg of lamb 3.0 3.0
3.0-pound leg of lamb 3.0 3.0
4 6-ounce sea bass fillets with skin on 24.0 24.0
24.0-ounce sea bass fillets with skin on 24.0 24.0
4 6-ounce halibut fillets (preferably thick pieces from a large fish) 24.0 24.0
24.0-ounce halibut fillets (preferably thick pieces from a large fish) 24.0 24.0
1 15-ounce can no-salt-added black beans 15.0 15.0
15.0-ounce can no-salt-added black beans 15.0 15.0
1 2-pound fillet of wild striped bass, with skin (Pacific salmon, mahi-mahi or barramundi may be substituted) 2.0 2.0
2.0-pound fillet of wild striped bass, with skin (Pacific salmon, mahi-mahi or barramundi may be substituted) 2.0 2.0
1 4-pound whole black sea bass or bluefish, cleaned 4.0 4.0
4.0-pound whole black sea bass or bluefish, cleaned 4.0 4.0
1 28-ounce can tomato sauce 1.0 28.0
28.0-ounce can tomato sauce 28.0 28.0
1 6-ounce can tomato paste 1.0 6.0
6.

1 14-ounce package puff pastry (like Dufour), defrosted according to package directions 4.0 14.0
14.0-ounce package puff pastry (like Dufour), defrosted according to package directions 4.0 14.0
1 12-pound fresh ham 1.0 12.0
12.0-pound fresh ham 12.0 12.0
2 9-inch rounds short-crust or puff pastry, unbaked 2.0 18.0
18.0-inch rounds short-crust or puff pastry, unbaked 18.0 18.0
1 750-milliliter bottle of vodka 750.0 750.0
750.0-milliliter bottle of vodka 750.0 750.0
1 750-milliliter bottle of dark rum 750.0 750.0
750.0-milliliter bottle of dark rum 750.0 750.0
2 750-ml. bottles white wine 1500.0 1500.0
1500.0-ml. bottles white wine 1500.0 1500.0
4 5-inch warm toast squares, crusts removed 4.0 20.0
20.0-inch warm toast squares, crusts removed 20.0 20.0
1 2-inch piece vanilla bean, split lengthwise 1.0 2.0
2.0-inch piece vanilla bean, split lengthwise 2.0 2.0
10.5 ounces boneless loin of veal, cut in 2 10.5-inch-thick medallions 10.5 21.0
10.5 ounces boneless loin of veal, cut in 21.0-inch

2 green bell peppers, cored and cut into pieces 1 12 inches long by 12 inch wide 2.0 12.0
2 green bell peppers, cored and cut into pieces 12.0 inches long by 12 inch wide 2.0 12.0
1 9-inch crust (recipe follows) 1.0 9.0
9.0-inch crust (recipe follows) 9.0 9.0
3 pounds peas in the pod, shelled, or 1 10-ounce box frozen tiny peas, defrosted and drained 3.0 10.0
3 pounds peas in the pod, shelled, or 10.0-ounce box frozen tiny peas, defrosted and drained 3.0 10.0
6 6-ounce center-cut salmon fillets 36.0 36.0
36.0-ounce center-cut salmon fillets 36.0 36.0
1 2-inch piece fresh ginger, peeled and diced 1.0 2.0
2.0-inch piece fresh ginger, peeled and diced 2.0 2.0
1 3- to 4-inch strip of kelp (kombu) 1.0 3.0
3.0- to 4-inch strip of kelp (kombu) 3.0 3.0
1 3-pound fryer chicken, butterflied 3.0 3.0
3.0-pound fryer chicken, butterflied 3.0 3.0
6 6-ounce skinless, boneless cod fillets 36.0 36.0
36.0-ounce skinless, boneless cod fillets 36.0 36.0
1 10-ounce package frozen baby peas 10.0 10.0
10.0-o

1 2-inch piece fresh ginger, peeled and chopped (about 0.25 cup) 1.0 2.0
2.0-inch piece fresh ginger, peeled and chopped (about 0.25 cup) 2.0 2.0
1 4- to 5-pound duck 4.0 4.0
4.0- to 5-pound duck 4.0 4.0
1 15-ounce can low-sodium kidney beans 15.0 15.0
15.0-ounce can low-sodium kidney beans 15.0 15.0
1 28-ounce can no-salt-added crushed tomatoes 28.0 28.0
28.0-ounce can no-salt-added crushed tomatoes 28.0 28.0
1 4-ounce onion or 3 ounces chopped onion (1 cup) 4.0 4.0
4.0-ounce onion or 3 ounces chopped onion (1 cup) 4.0 4.0
1 28-ounce can chopped or crushed tomatoes, no salt added 28.0 28.0
28.0-ounce can chopped or crushed tomatoes, no salt added 28.0 28.0
1 750 milliliter bottle Vernaccia di San Gimignano or other dry white wine, well chilled 750.0 750.0
750.0 milliliter bottle Vernaccia di San Gimignano or other dry white wine, well chilled 750.0 750.0
1 7-ounce jar roasted red peppers, rinsed, drained and cut into strips 1.0 7.0
7.0-ounce jar roasted red peppers, rinsed, drained an

1 16-ounce container dark chocolate fudge or milk chocolate frosting, preferably Duncan Hines Creamy Home-Style 16.0 16.0
16.0-ounce container dark chocolate fudge or milk chocolate frosting, preferably Duncan Hines Creamy Home-Style 16.0 16.0
1 1-inch piece fresh ginger, peeled 1.0 1.0
1.0-inch piece fresh ginger, peeled 1.0 1.0
2 3.5- to 4-pound chickens, fat removed, washed and blotted dry 7.0 7.0
7.0- to 4-pound chickens, fat removed, washed and blotted dry 7.0 7.0
2 12-ounce cans beer 24.0 24.0
24.0-ounce cans beer 24.0 24.0
1 6-ounce can tomato paste 6.0 6.0
6.0-ounce can tomato paste 6.0 6.0
1 28-ounce can crushed tomatoes 28.0 28.0
28.0-ounce can crushed tomatoes 28.0 28.0
2 cups homemade chicken broth or 1 17-ounce can chicken broth with 0.25 cup filtered water 2.0 17.0
2 cups homemade chicken broth or 17.0-ounce can chicken broth with 0.25 cup filtered water 2.0 17.0
1 2.75 pound chicken, with neck, rinsed and patted dry 2.75 2.75
2.75 pound chicken, with neck, rinsed and pat

1 6-ounce wedge of Brie 1.0 6.0
6.0-ounce wedge of Brie 6.0 6.0
4 6-inch sprigs tarragon 4.0 24.0
24.0-inch sprigs tarragon 24.0 24.0
4 6-to-8-ounce sturgeon fillets 24.0 24.0
24.0-to-8-ounce sturgeon fillets 24.0 24.0
2 5-ounce logs of goat cheese, cut into thirds crosswise 10.0 10.0
10.0-ounce logs of goat cheese, cut into thirds crosswise 10.0 10.0
1 6 to 8 pound Hubbard squash 6.0 6.0
6.0 to 8 pound Hubbard squash 6.0 6.0
4 8-ounce pieces black cod fillets, with skin 32.0 32.0
32.0-ounce pieces black cod fillets, with skin 32.0 32.0
1 5-inch sprig rosemary 1.0 5.0
5.0-inch sprig rosemary 5.0 5.0
1 3-pound chicken, skinned and trimmed of excess fat 3.0 3.0
3.0-pound chicken, skinned and trimmed of excess fat 3.0 3.0
3 1.5-pound lobsters 4.5 4.5
4.5-pound lobsters 4.5 4.5
4 1.5-pound lobsters 4.0 6.0
6.0-pound lobsters 6.0 6.0
2 8-ounce pork chops, with bone, trimmed of fat 16.0 16.0
16.0-ounce pork chops, with bone, trimmed of fat 16.0 16.0
1 6-inch sprig fresh rosemary 1.0 6.0
6.0-

1 4- to 4.5- pound whole red snapper, pink snapper or black sea bass, gutted and scaled 4.0 4.0
4.0- to 4.5- pound whole red snapper, pink snapper or black sea bass, gutted and scaled 4.0 4.0
1 2-inch piece fresh ginger root, peeled 1.0 2.0
2.0-inch piece fresh ginger root, peeled 2.0 2.0
1 3.5-pound arctic char, boned and left whole 3.5 3.5
3.5-pound arctic char, boned and left whole 3.5 3.5
1 3-pound chicken, giblets and excess fat removed 3.0 3.0
3.0-pound chicken, giblets and excess fat removed 3.0 3.0
6 7-ounce pieces fresh sturgeon 42.0 42.0
42.0-ounce pieces fresh sturgeon 42.0 42.0
1 28-ounce can crushed tomatoes 28.0 28.0
28.0-ounce can crushed tomatoes 28.0 28.0
8 1-inch-thick bass steaks (about 2 ounces each) 8.0 8.0
8.0-inch-thick bass steaks (about 2 ounces each) 8.0 8.0
8 1-inch-thick bass steaks (about 2 ounces each) 8.0 8.0
8.0-inch-thick bass steaks (about 2 ounces each) 8.0 8.0
2 3-pound whole free-range chickens, cut into serving pieces, fat and skin removed 2.0 6.0


12.0-to-4-ounce flounder fillets or 12.0-ounce fillets, split down the center 12.0 16.0
4 3-to-4-ounce flounder fillets or 2 8-ounce fillets, split down the center 3.0 12.0
12.0-to-4-ounce flounder fillets or 12.0-ounce fillets, split down the center 12.0 12.0
12.0-to-4-ounce flounder fillets or 12.0-ounce fillets, split down the center 12.0 16.0
12.0-to-4-ounce flounder fillets or 12.0-ounce fillets, split down the center 12.0 16.0
4 4-ounce pieces of salmon fillet 4.0 16.0
16.0-ounce pieces of salmon fillet 16.0 16.0
3 17-ounce jars Kadota figs in syrup 51.0 51.0
51.0-ounce jars Kadota figs in syrup 51.0 51.0
1 5-ounce jar cloudberry jam (see note) 5.0 5.0
5.0-ounce jar cloudberry jam (see note) 5.0 5.0
2 10-ounce bags fresh spinach 20.0 20.0
20.0-ounce bags fresh spinach 20.0 20.0
1 5-pound duck 5.0 5.0
5.0-pound duck 5.0 5.0
1 12-ounce bottle of beer 12.0 12.0
12.0-ounce bottle of beer 12.0 12.0
2 pounds of salmon fillets, poached in court bouillon, or 2 15.5-ounce cans of salmon 2

1 11-ounce can mandarin orange slices, drained, plus additional slices for garnish 11.0 11.0
11.0-ounce can mandarin orange slices, drained, plus additional slices for garnish 11.0 11.0
1 28-ounce can tomatoes packed in puree, broken up 28.0 28.0
28.0-ounce can tomatoes packed in puree, broken up 28.0 28.0
1 12-ounce package mushrooms, sliced thin 12.0 12.0
12.0-ounce package mushrooms, sliced thin 12.0 12.0
1 16-ounce can tomatoes 16.0 16.0
16.0-ounce can tomatoes 16.0 16.0
1 14-ounce can condensed milk 14.0 14.0
14.0-ounce can condensed milk 14.0 14.0
1 12-ounce jar apricot preserves 12.0 12.0
12.0-ounce jar apricot preserves 12.0 12.0
1 6-ounce mixture of glazed pears, peaches, apricots, figs and dates, cut up 6.0 6.0
6.0-ounce mixture of glazed pears, peaches, apricots, figs and dates, cut up 6.0 6.0
1 15-ounce package raisins 15.0 15.0
15.0-ounce package raisins 15.0 15.0
1 16-ounce loaf day-old French bread cut in 0.5-inch cubes 16.0 16.0
16.0-ounce loaf day-old French bread cut 

1 15-ounce container skim-milk ricotta 15.0 15.0
15.0-ounce container skim-milk ricotta 15.0 15.0
1 35-ounce can tomatoes, with liquid, chopped 35.0 35.0
35.0-ounce can tomatoes, with liquid, chopped 35.0 35.0
1 28-ounce can whole tomatoes with juices 28.0 28.0
28.0-ounce can whole tomatoes with juices 28.0 28.0
4 7-ounce trout fillets (or gutted whole fish) 4.0 28.0
28.0-ounce trout fillets (or gutted whole fish) 28.0 28.0
1 12-ounce slab smoked meaty bacon, preferably nitrate-free, rind removed 1.0 12.0
12.0-ounce slab smoked meaty bacon, preferably nitrate-free, rind removed 12.0 12.0
4 8-ounce bluefish fillets 32.0 32.0
32.0-ounce bluefish fillets 32.0 32.0
1 28-ounce can peeled tomatoes, drained and roughly chopped 28.0 28.0
28.0-ounce can peeled tomatoes, drained and roughly chopped 28.0 28.0
1 10-inch baked pie shell 1.0 10.0
10.0-inch baked pie shell 10.0 10.0
1 15-ounce can of chickpeas, drained 15.0 15.0
15.0-ounce can of chickpeas, drained 15.0 15.0
4 1-pound skate wings 4.0

1 15-pound smoked ham, on the bone 15.0 15.0
15.0-pound smoked ham, on the bone 15.0 15.0
1 6-to 8-pound roaster chicken 6.0 6.0
6.0-to 8-pound roaster chicken 6.0 6.0
1 16-ounce can of chestnuts, drained of liquid and coarsely chopped 16.0 16.0
16.0-ounce can of chestnuts, drained of liquid and coarsely chopped 16.0 16.0
1 5.5-pound wild turkey 1.0 5.5
5.5-pound wild turkey 5.5 5.5
1 3-pound chicken, cut up for sauteing 1.0 3.0
3.0-pound chicken, cut up for sauteing 3.0 3.0
1 1-pound package of bacon 1.0 1.0
1.0-pound package of bacon 1.0 1.0
1 28-ounce can Italian plum tomatoes 1.0 28.0
28.0-ounce can Italian plum tomatoes 28.0 28.0
4 4-to 6-ounce grouper, catfish or halibut fillets 16.0 16.0
16.0-to 6-ounce grouper, catfish or halibut fillets 16.0 16.0
12 4-by-4-inch sheets of homemade pasta, or 8 dried lasagna noodles 12.0 48.0
48.0-by-4-inch sheets of homemade pasta, or 8 dried lasagna noodles 48.0 48.0
1 1-inch piece fresh ginger root, peeled and minced 1.0 1.0
1.0-inch piece fre

1 11-inch pie shell, unbaked and chilled or frozen 1.0 11.0
11.0-inch pie shell, unbaked and chilled or frozen 11.0 11.0
1 11-inch inch pie shell, baked 1.0 11.0
11.0-inch inch pie shell, baked 11.0 11.0
4 6-ounce baking potatoes 4.0 24.0
24.0-ounce baking potatoes 24.0 24.0
1 20-ounce can chick peas 20.0 20.0
20.0-ounce can chick peas 20.0 20.0
1 28-ounce can no-salt-added whole tomatoes 28.0 28.0
28.0-ounce can no-salt-added whole tomatoes 28.0 28.0
1 3-ounce can moist shredded coconut 13.0 3.0
3.0-ounce can moist shredded coconut 13.0 3.0
3 19-ounce cans cannellini beans, drained and rinsed 57.0 57.0
57.0-ounce cans cannellini beans, drained and rinsed 57.0 57.0
1 3-to-4 pound chicken, including liver, heart and gizzards Juice 3.0 3.0
3.0-to-4 pound chicken, including liver, heart and gizzards Juice 3.0 3.0
2 8-ounce cans of almond paste at room temperature 16.0 16.0
16.0-ounce cans of almond paste at room temperature 16.0 16.0
2 pounds fresh spinach in bulk or 1 10-ounce package fr

1 28-ounce can chopped tomatoes with juice 28.0 28.0
28.0-ounce can chopped tomatoes with juice 28.0 28.0
1 28-ounce can chopped tomatoes with juice 28.0 28.0
28.0-ounce can chopped tomatoes with juice 28.0 28.0
1 14-ounce can tomatoes, with juice 14.0 14.0
14.0-ounce can tomatoes, with juice 14.0 14.0
1 14-ounce can chopped tomatoes 14.0 14.0
14.0-ounce can chopped tomatoes 14.0 14.0
1 12-ounce bunch spinach, stemmed, cleaned and roughly chopped, or 1 6-ounce bag baby spinach, stems removed, roughly chopped 12.0 12.0
12.0-ounce bunch spinach, stemmed, cleaned and roughly chopped, or 12.0-ounce bag baby spinach, stems removed, roughly chopped 12.0 12.0
12.0-ounce bunch spinach, stemmed, cleaned and roughly chopped, or 12.0-ounce bag baby spinach, stems removed, roughly chopped 12.0 6.0
12.0-ounce bunch spinach, stemmed, cleaned and roughly chopped, or 12.0-ounce bag baby spinach, stems removed, roughly chopped 12.0 6.0
1 12-ounce bunch spinach, stemmed and washed, or a 6-ounce bag of b

1 28-ounce can tomatoes, whole or diced, with liquid 1.0 28.0
28.0-ounce can tomatoes, whole or diced, with liquid 28.0 28.0
2 3- to 4-pound goat legs, trimmed of fat and caul 2.0 6.0
6.0- to 4-pound goat legs, trimmed of fat and caul 6.0 6.0
1 4-pound fresh free-range organic chicken, cut into 8 pieces and skinned 1.0 4.0
4.0-pound fresh free-range organic chicken, cut into 8 pieces and skinned 4.0 4.0
1 6-pound bone-in lamb shoulder; bones removed and cut into 3-inch pieces, rinsed well and reserved, or 3 pounds boneless lamb shoulder (see note) 1.0 6.0
6.0-pound bone-in lamb shoulder; bones removed and cut into 3-inch pieces, rinsed well and reserved, or 3 pounds boneless lamb shoulder (see note) 6.0 6.0
1 28-ounce can chopped tomatoes with juice, peeled, seeded and chopped 1.0 28.0
28.0-ounce can chopped tomatoes with juice, peeled, seeded and chopped 28.0 28.0
4 8-inch or 6 6-inch flour tortillas 4.0 32.0
32.0-inch or 32.0-inch flour tortillas 32.0 32.0
32.0-inch or 32.0-inch flou

1 28-ounce can peeled tomatoes, preferably San Marzano 1.0 28.0
28.0-ounce can peeled tomatoes, preferably San Marzano 28.0 28.0
1 14-ounce can sweetened condensed milk 1.0 14.0
14.0-ounce can sweetened condensed milk 14.0 14.0
1 3-inch piece Mexican cinnamon (canela) 1.0 3.0
3.0-inch piece Mexican cinnamon (canela) 3.0 3.0
1 15-ounce can chickpeas, drained 1.0 15.0
15.0-ounce can chickpeas, drained 15.0 15.0
8 6-inch corn tortillas 8.0 48.0
48.0-inch corn tortillas 48.0 48.0
1 3.5- to 4-pound chicken 1.0 3.5
3.5- to 4-pound chicken 3.5 3.5
1 4-inch cinnamon stick 1.0 4.0
4.0-inch cinnamon stick 4.0 4.0
1 14-ounce can black beans, well drained 1.0 14.0
14.0-ounce can black beans, well drained 14.0 14.0
2 9-ounce boxes dry lasagna, broken into 3-inch shards 2.0 18.0
18.0-ounce boxes dry lasagna, broken into 3-inch shards 18.0 18.0
1 pound tomatoes, peeled, seeded* and cut in 0.25-inch dice, or 1 14-ounce can diced tomatoes, partially drained 1.0 14.0
1 pound tomatoes, peeled, seeded* an

2 28-ounce cans San Marzano 2.0 56.0
56.0-ounce cans San Marzano 56.0 56.0
1 15-ounce can coconut milk 1.0 15.0
15.0-ounce can coconut milk 15.0 15.0
2 2-inch strips lemon peel, pith removed 2.0 4.0
4.0-inch strips lemon peel, pith removed 4.0 4.0
1 15-ounce can evaporated milk 1.0 15.0
15.0-ounce can evaporated milk 15.0 15.0
1 1-inch piece ginger, peeled and slivered 1.0 1.0
1.0-inch piece ginger, peeled and slivered 1.0 1.0
2 15- to 16-ounce cans cannellini beans, rinsed and drained 2.0 30.0
30.0- to 16-ounce cans cannellini beans, rinsed and drained 30.0 30.0
1 14-ounce can diced or chopped tomatoes 1.0 14.0
14.0-ounce can diced or chopped tomatoes 14.0 14.0
1 2-inch-long piece of orange peel 1.0 2.0
2.0-inch-long piece of orange peel 2.0 2.0
1 8-ounce package thin Vietnamese rice noodles 1.0 8.0
8.0-ounce package thin Vietnamese rice noodles 8.0 8.0
1 5-ounce can water-packed light tuna, drained and broken up with a fork, or 5 to 6 ounces cooked fresh tuna, cut in thin bite-size p

1 6-ounce bag of spinach 1.0 6.0
6.0-ounce bag of spinach 6.0 6.0
1 1-to-2-inch cinnamon stick, broken into pieces, or 1 teaspoon ground cinnamon 1.0 1.0
1.0-to-2-inch cinnamon stick, broken into pieces, or 1 teaspoon ground cinnamon 1.0 1.0
3 tomatoes, peeled and roughly chopped, or 1 15-ounce can whole tomatoes 3.0 15.0
3 tomatoes, peeled and roughly chopped, or 15.0-ounce can whole tomatoes 3.0 15.0
1 2-inch piece fresh ginger 1.0 2.0
2.0-inch piece fresh ginger 2.0 2.0
1 4-inch Chinese long red hot chile, or to taste 1.0 4.0
4.0-inch Chinese long red hot chile, or to taste 4.0 4.0
1 4-inch serrano chile, or to taste 1.0 4.0
4.0-inch serrano chile, or to taste 4.0 4.0
1 generous bunch spinach (about 0.75 pound), stemmed, washed thoroughly and coarsely chopped, or 2 6-ounce bags baby spinach 1.0 12.0
1 generous bunch spinach (about 0.75 pound), stemmed, washed thoroughly and coarsely chopped, or 12.0-ounce bags baby spinach 1.0 12.0
1 28-ounce can chopped tomatoes, with juice 1.0 28.

1 8- to 12-ounce wedge of good Brie cheese, rind removed and sliced thin 1.0 8.0
8.0- to 12-ounce wedge of good Brie cheese, rind removed and sliced thin 8.0 8.0
1 28-ounce can chopped tomatoes in juice, pulsed in a food processor 1.0 28.0
28.0-ounce can chopped tomatoes in juice, pulsed in a food processor 28.0 28.0
1 28-ounce can tomatoes in juice 1.0 28.0
28.0-ounce can tomatoes in juice 28.0 28.0
1 2-inch cinnamon stick 2.0 2.0
2.0-inch cinnamon stick 2.0 2.0
1 12-ounce bunch spinach, stemmed and washed, or 1 6-ounce bag baby spinach 1.0 12.0
12.0-ounce bunch spinach, stemmed and washed, or 12.0-ounce bag baby spinach 12.0 12.0
12.0-ounce bunch spinach, stemmed and washed, or 12.0-ounce bag baby spinach 12.0 6.0
12.0-ounce bunch spinach, stemmed and washed, or 12.0-ounce bag baby spinach 12.0 6.0
1 large bunch spinach, stemmed, or 1 6-ounce bag baby spinach 1.0 6.0
1 large bunch spinach, stemmed, or 6.0-ounce bag baby spinach 6.0 6.0
1 15-ounce can white beans, drained and rinsed 1

12 6-inch corn tortillas 12.0 72.0
72.0-inch corn tortillas 72.0 72.0
1 28-ounce can whole peeled tomatoes 1.0 28.0
28.0-ounce can whole peeled tomatoes 28.0 28.0
1 3-pound chicken 1.0 3.0
3.0-pound chicken 3.0 3.0
2 16-ounce cans yellow hominy (do not use white) 2.0 32.0
32.0-ounce cans yellow hominy (do not use white) 32.0 32.0
2 10-ounce packages whole-leaf frozen spinach (do not thaw) 2.0 20.0
20.0-ounce packages whole-leaf frozen spinach (do not thaw) 20.0 20.0
1 15-ounce can chick peas, drained and rinsed 1.0 15.0
15.0-ounce can chick peas, drained and rinsed 15.0 15.0
1 14-ounce block of organic extra-firm tofu (find one that is not too dry) 1.0 14.0
14.0-ounce block of organic extra-firm tofu (find one that is not too dry) 14.0 14.0
1 12-inch round of pizza dough, stretched 1.0 12.0
12.0-inch round of pizza dough, stretched 12.0 12.0
1 12-inch round of pizza dough, stretched 1.0 12.0
12.0-inch round of pizza dough, stretched 12.0 12.0
1 12-inch round of pizza dough, stretched 1

1 14-ounce can chopped tomatoes, with juice 14.0 14.0
14.0-ounce can chopped tomatoes, with juice 14.0 14.0
1 3.5- to 4-pound whole chicken 1.0 3.5
3.5- to 4-pound whole chicken 3.5 3.5
1 28-ounce can diced tomatoes, or 3 10-ounce cans Ro-Tel canned tomatoes with green chiles 1.0 28.0
28.0-ounce can diced tomatoes, or 28.0-ounce cans Ro-Tel canned tomatoes with green chiles 28.0 28.0
28.0-ounce can diced tomatoes, or 28.0-ounce cans Ro-Tel canned tomatoes with green chiles 28.0 30.0
28.0-ounce can diced tomatoes, or 28.0-ounce cans Ro-Tel canned tomatoes with green chiles 28.0 30.0
1 14.5-ounce can chopped tomatoes, with juice 1.0 14.5
14.5-ounce can chopped tomatoes, with juice 14.5 14.5
1 5-ounce can tuna (packed in water or olive oil), drained 1.0 5.0
5.0-ounce can tuna (packed in water or olive oil), drained 5.0 5.0
1 28-ounce can plum tomatoes, chopped 1.0 28.0
28.0-ounce can plum tomatoes, chopped 28.0 28.0
1 3-inch piece cinnamon 1.0 3.0
3.0-inch piece cinnamon 3.0 3.0
1 1-inch 

1 2.5-inch piece of ginger, peeled and grated or minced 2.5 2.5
2.5-inch piece of ginger, peeled and grated or minced 2.5 2.5
4  5-ounce pieces cod fillet, about 1 to 1.5 inches thick 4.0 20.0
20.0-ounce pieces cod fillet, about 1 to 1.5 inches thick 20.0 20.0
1 15-oz. can dark red kidney beans, drained and rinsed 1.0 15.0
15.0-oz. can dark red kidney beans, drained and rinsed 15.0 15.0
1 2-inch piece cinnamon stick 1.0 2.0
2.0-inch piece cinnamon stick 2.0 2.0
1 1-inch piece fresh ginger, peeled and chopped (about 1 tablespoon, or use 2 teaspoons powdered ginger) 1.0 1.0
1.0-inch piece fresh ginger, peeled and chopped (about 1 tablespoon, or use 2 teaspoons powdered ginger) 1.0 1.0
1 7⁄8 cups lukewarm water 1.88 7.0
7.0⁄8 cups lukewarm water 1.88 7.0
1 7⁄8 cups lukewarm water 1.88 7.0
7.0⁄8 cups lukewarm water 1.88 7.0
1 15-ounce can of chickpeas 1.0 15.0
15.0-ounce can of chickpeas 15.0 15.0
1 6-ounce can of tomato paste 1.0 6.0
6.0-ounce can of tomato paste 6.0 6.0
2 15-ounce cans (

In [14]:
def fix_abbreviations(s):
    """
    Converts instances of oz. and g. to ounce and gram respectively
    """
    columns = ["input", "unit"]
    for col in columns:
        # replace oz. with ounce
        if s[col] == s[col]:
            match = re.findall(r"([0-9])\s*oz\.*", s[col])
            if match:
                for m in match:
                    s[col] = re.sub(r"([0-9])\s*oz\.*", m + " ounce", s[col])
            # replace g. with gram
            match = re.findall(r"([0-9])\s*g([^a-z])", s[col])
            if match:
                for m in match:
                    s[col] = re.sub(
                        r"([0-9])\s*g([^a-z])", m[0] + " gram" + m[1], s[col]
                    )
            # replace tbsp with tablespoon
            match = re.findall(r"[Tt]bsp\.*", s[col])
            if match:
                for m in match:
                    s[col] = re.sub(r"[Tt]bsp\.*", "tablespoon", s[col])
            # replace tsp with teaspoon
            match = re.findall(r"[Tt]sp\.*", s[col])
            if match:
                for m in match:
                    s[col] = re.sub(r"[Tt]sp\.*", "teaspoon", s[col])
    return s

In [15]:
input_data = input_data.apply(fix_abbreviations, axis=1)

In [16]:
def fix_inconsistencies(row):
    """
    Fix various inconsistencies in the labels.
    """
    try:
        if ',' in row["name"]:
            name = row["name"].split(',',1)
            #print(name)
    except TypeError:
        print(row)  

In [17]:
input_data.head()

Unnamed: 0_level_0,input,name,qty,range_end,unit,comment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,1.25 cups cooked and pureed fresh butternut sq...,butternut squash,1.25,0.0,cup,"cooked and pureed fresh, or 1 10-ounce package..."
1,1 cup peeled and cooked fresh chestnuts (about...,chestnuts,1.0,0.0,cup,"peeled and cooked fresh (about 20), or 1 cup c..."
2,"1 medium-size onion, peeled and chopped",onion,1.0,0.0,,"medium-size, peeled and chopped"
3,"2 stalks celery, chopped coarse",celery,2.0,0.0,stalk,chopped coarse
4,1.5 tablespoons vegetable oil,vegetable oil,1.5,0.0,tablespoon,


In [18]:
input_data[(input_data['input'].str.contains('garlic')) & (input_data['name'] != "garlic")].apply(fix_inconsistencies, axis=1)


input        2 cloves garlic
name                     NaN
qty                        0
range_end                  0
unit                     NaN
comment                  NaN
Name: 120232, dtype: object
input        4 garlic cloves, minced
name                             NaN
qty                                0
range_end                          0
unit                             NaN
comment                          NaN
Name: 144161, dtype: object
input        Add cooled zucchini and pine nuts to food proc...
name                                                       NaN
qty                                                          0
range_end                                                    0
unit                                                       NaN
comment                                                    NaN
Name: 150489, dtype: object
input        1 small garlic clove puréed
name                                 NaN
qty                                    0
range_end          

index
185       None
238       None
773       None
782       None
1266      None
          ... 
178853    None
178993    None
178994    None
179009    None
179037    None
Length: 1380, dtype: object

In [19]:
training_data, test_data = train_test_split(input_data, test_size=0.2)

In [20]:
#training_data.to_pickle("../data/interim/crf_training_data.pickle")
#test_data.to_pickle("../data/interim/crf_test_data.pickle")

In [21]:
# Examining some remaining outliers in the data
input_data.unit.unique()

array(['cup', nan, 'stalk', 'tablespoon', 'teaspoon', 'clove', 'pound',
       'ounce', 'pinch', 'sprig', 'dash', 'slice', 'head', 'bunch', 'box',
       'dozen', 'sheet', 'piece', 'pint', 'fillet', 'gallon', 'quart',
       'strip', 'bottle', 'drop', 'cake', 'scoop', 'stick', 'ear', 'can',
       'bulb', 'package', 'loaf', 'layer', 'rack', 'envelope', 'leaf',
       'ball', 'chunk', 'knob', 'bag', 'rectangle', 'inch', 'pair',
       'shake', 'wedge', 'branch', 'half', 'packet', 'handful', 'fifth',
       'steak', 'splash', 'log', 'slab', 'grind', 'square', 'milliliter',
       'liter', 'twist', 'cube', 'gram', 'bowl', 'stem', 'vial', 'length',
       'segment', 'plate', 'foot', 'cluster', 'fluid ounce', 'glass',
       'bar', 'link', 'spiral', 'hatch', 'basket', 'part', 'pat',
       'sliver', 'jar', 'side', 'hunk', 'thread', 'batch', 'lobe',
       'portion', 'serving', 'root', 'shot', 'tablespoons', 'small',
       'cloves', 'cups', 'ounces', 'pounds', 'very small', 'teaspoons',
   

In [22]:
input_data[(input_data['unit']=="12-ounce bottle") | (input_data['unit']=="48-ounce can") | (input_data['unit']=="12-ounce cans")]

Unnamed: 0_level_0,input,name,qty,range_end,unit,comment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
140825,"48.0-ounce cans kidney, black, pinto or great ...","kidney, black, pinto or great Northern beans",48.0,,12-ounce cans,"or a combination, drained reserve the liquid a..."
142594,48.0-ounce can tomato juice,tomato juice,48.0,,48-ounce can,
142741,12.0-ounce bottle of beer,beer,12.0,,12-ounce bottle,


In [23]:
def fix_individual_rows(row):
    if row["input"] == "3 crushed red peppers":
        row["unit"] = ''
        row["comment"] = "crushed"
    if row["input"] == "1 small-to-medium daikon radish (cut into 1-inch cubes)":
        row["comment"] = row["unit"] + " " + row["comment"] 
        row["unit"] = ''
    if row["unit"] == "chopped":
        row["comment"] = "chopped"
        row["unit"] = ''
    if row["input"] == "1 heaping teaspoon black peppercorns":
        row["comment"] = "heaping"
        row["unit"] = "teaspoon"
    if row["input"] == "a 10-pound piece of pork belly with the skin":
        row["qty"] = "10"
        row["unit"] = "pound"
    
        
        