In [1]:
from decimal import Decimal, InvalidOperation
import spacy
import pandas as pd

In [2]:
nlp = spacy.load("en_core_web_lg", disable=["tagger", "parser", "ner", "textcat"])

In [3]:
def match_up(df):
    """
    Returns our best guess of the match between the tags and the
    words from the display text.
    This problem is difficult for the following reasons:
        * not all the words in the display name have associated tags
        * the quantity field is stored as a number, but it appears
          as a string in the display name
        * the comment is often a compilation of different comments in
          the display name
    """
    labels = []
    name = df["name"].split()
    unit = df["unit"]
    qty = df["qty"]

    for token in df["input"]:
        decimal_token = None
        try:
            decimal_token = float(token)
        except ValueError:
            pass
        if token in list(name):
            labels.append("NAME")
            name.remove(token)
        elif token == unit:
            labels.append("UNIT")
            unit = ""
        elif decimal_token is not None and decimal_token == qty:
            labels.append("QTY")
            qty = float('nan')
        #elif token in list(comment):
        #    labels.append("COMMENT")
        #    comment.remove(token)
        else:
            labels.append("COMMENT")
    return labels

def add_prefixes(data):
    """
    We use BIO tagging/chunking to differentiate between tags
    at the start of a tag sequence and those in the middle. This
    is a common technique in entity recognition.

    Reference: http://www.kdd.cis.ksu.edu/Courses/Spring-2013/CIS798/Handouts/04-ramshaw95text.pdf
    """
    prev_tag = None
    new_data = []

    for token, tag in data:

        new_tag = ""

        p = "B" if ((prev_tag is None) or (tag != prev_tag)) else "I"
        new_tag = "%s-%s" % (p, tag)
        new_data.append(new_tag)
        prev_tag = tag

    return new_data


In [4]:
# Load raw data and do some preprocessing
training_data = pd.read_pickle("../data/interim/crf_data.pickle")
#test_data = pd.read_pickle("../data/interim/crf_test_data.pickle")

In [5]:
training_data.head(25)

Unnamed: 0_level_0,input,name,qty,unit
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1.25 cups cooked and pureed fresh butternut sq...,butternut squash,1.25,cup
1,1 cup peeled and cooked fresh chestnuts (about...,chestnut,1.0,cup
2,"1 medium size onion, peeled and chopped",onion,1.0,
3,"2 stalks celery, chopped coarse",celery,2.0,stalk
4,1.5 tablespoons vegetable oil,vegetable oil,1.5,tablespoon
6,"2 tablespoons unflavored gelatin, dissolved in...",gelatin,2.0,tablespoon
7,Salt,Salt,,
8,1 cup canned plum tomatoes with juice,plum tomato,1.0,cup
9,6 cups veal or beef stock,veal stock,6.0,cup
10,0.33 cup Worcestershire sauce,Worcestershire sauce,0.33,cup


In [6]:
# have spacy parse the input string with the full pipeline to generate features this will take some time
training_data["input"] = list(
    nlp.pipe(training_data["input"].astype("unicode").values, batch_size=50)
)

#test_data["input"] = list(
#    nlp.pipe(test_data["input"].astype("unicode").values, batch_size=50)
#)

In [7]:
training_data.head(25)

Unnamed: 0_level_0,input,name,qty,unit
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,"(1.25, cups, cooked, and, pureed, fresh, butte...",butternut squash,1.25,cup
1,"(1, cup, peeled, and, cooked, fresh, chestnuts...",chestnut,1.0,cup
2,"(1, medium, size, onion, ,, peeled, and, chopped)",onion,1.0,
3,"(2, stalks, celery, ,, chopped, coarse)",celery,2.0,stalk
4,"(1.5, tablespoons, vegetable, oil)",vegetable oil,1.5,tablespoon
6,"(2, tablespoons, unflavored, gelatin, ,, disso...",gelatin,2.0,tablespoon
7,(Salt),Salt,,
8,"(1, cup, canned, plum, tomatoes, with, juice)",plum tomato,1.0,cup
9,"(6, cups, veal, or, beef, stock)",veal stock,6.0,cup
10,"(0.33, cup, Worcestershire, sauce)",Worcestershire sauce,0.33,cup


In [8]:
# for assigining labels we only need lemmas
"""training_data["name"] = list(
    nlp.pipe(training_data["name"].astype("unicode").values, batch_size=50)
)
training_data["unit"] = list(
    nlp.pipe(training_data["unit"].astype("unicode").values, batch_size=50)
)
training_data["comment"] = list(
    nlp.pipe(training_data["comment"].astype("unicode").values, batch_size=50)
)"""

"""test_data["name"] = list(
    nlp.pipe(test_data["name"].astype("unicode").values, batch_size=50)
)
test_data["unit"] = list(
    nlp.pipe(test_data["unit"].astype("unicode").values, batch_size=50)
)
test_data["comment"] = list(
    nlp.pipe(test_data["comment"].astype("unicode").values, batch_size=50)
)"""

'test_data["name"] = list(\n    nlp.pipe(test_data["name"].astype("unicode").values, batch_size=50)\n)\ntest_data["unit"] = list(\n    nlp.pipe(test_data["unit"].astype("unicode").values, batch_size=50)\n)\ntest_data["comment"] = list(\n    nlp.pipe(test_data["comment"].astype("unicode").values, batch_size=50)\n)'

In [9]:
#training_data.head()

In [10]:
training_data["input"] = training_data["input"].apply(
    lambda doc: [token.lemma_ for token in doc]
)
"""training_data["name"] = training_data["name"].apply(
    lambda doc: [token.lemma_ for token in doc]
)
training_data["unit"] = training_data["unit"].apply(
    lambda doc: [token.lemma_ for token in doc]
)
training_data["comment"] = training_data["comment"].apply(
    lambda doc: [token.lemma_ for token in doc]
)"""

"""test_data["input"] = test_data["input"].apply(
    lambda doc: [token.lemma_ for token in doc]
)
test_data["name"] = test_data["name"].apply(
    lambda doc: [token.lemma_ for token in doc]
)
test_data["unit"] = test_data["unit"].apply(
    lambda doc: [token.lemma_ for token in doc]
)
test_data["comment"] = test_data["comment"].apply(
    lambda doc: [token.lemma_ for token in doc]
)"""

'test_data["input"] = test_data["input"].apply(\n    lambda doc: [token.lemma_ for token in doc]\n)\ntest_data["name"] = test_data["name"].apply(\n    lambda doc: [token.lemma_ for token in doc]\n)\ntest_data["unit"] = test_data["unit"].apply(\n    lambda doc: [token.lemma_ for token in doc]\n)\ntest_data["comment"] = test_data["comment"].apply(\n    lambda doc: [token.lemma_ for token in doc]\n)'

In [11]:
training_data.head(25)

Unnamed: 0_level_0,input,name,qty,unit
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,"[1.25, cup, cook, and, puree, fresh, butternut...",butternut squash,1.25,cup
1,"[1, cup, peel, and, cook, fresh, chestnut, (, ...",chestnut,1.0,cup
2,"[1, medium, size, onion, ,, peel, and, chop]",onion,1.0,
3,"[2, stalk, celery, ,, chop, coarse]",celery,2.0,stalk
4,"[1.5, tablespoon, vegetable, oil]",vegetable oil,1.5,tablespoon
6,"[2, tablespoon, unflavored, gelatin, ,, dissol...",gelatin,2.0,tablespoon
7,[Salt],Salt,,
8,"[1, cup, can, plum, tomato, with, juice]",plum tomato,1.0,cup
9,"[6, cup, veal, or, beef, stock]",veal stock,6.0,cup
10,"[0.33, cup, Worcestershire, sauce]",Worcestershire sauce,0.33,cup


In [12]:
training_data["labels"] = training_data.apply(match_up, axis=1)
#match_up(training_data.iloc[0])
#test_data["labels"] = test_data.apply(match_up, axis=1)

In [13]:
training_data[training_data.labels.apply(lambda x: 'OTHER' in x)]

Unnamed: 0_level_0,input,name,qty,unit,labels
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1


In [14]:
training_data.iloc[25:50]

Unnamed: 0_level_0,input,name,qty,unit,labels
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
26,"[13, tablespoon, butter]",butter,13.0,tablespoon,"[QTY, UNIT, NAME]"
27,"[4, large, artichoke, ,, trim]",artichoke,4.0,,"[QTY, COMMENT, NAME, COMMENT, COMMENT]"
28,"[Juice, of, 1, lemon]",lemon Juice,1.0,,"[NAME, COMMENT, QTY, NAME]"
29,[Salt],Salt,,,[NAME]
30,"[Confectioners, ', sugar]",Confectioners ' sugar,,,"[NAME, NAME, NAME]"
31,"[0.25, teaspoon, vanilla, extract]",vanilla extract,0.25,teaspoon,"[QTY, UNIT, NAME, NAME]"
32,"[2, large, egg]",egg,2.0,,"[QTY, COMMENT, NAME]"
33,"[Pinch, of, freshly, grate, nutmeg]",nutmeg,1.0,pinch,"[COMMENT, COMMENT, COMMENT, COMMENT, NAME]"
34,"[2, teaspoon, bake, powder]",bake powder,2.0,teaspoon,"[QTY, UNIT, NAME, NAME]"
35,"[0.25, teaspoon, salt]",salt,0.25,teaspoon,"[QTY, UNIT, NAME]"


In [15]:
crf_training_labels = pd.Series(
    training_data.apply(
        lambda row: add_prefixes(zip(row["input"], row["labels"])), axis=1
    )
)
#crf_test_labels = pd.Series(
#    test_data.apply(
#        lambda row: add_prefixes(zip(row["input"], row["labels"])), axis=1
#    )
#)

In [16]:
crf_training_labels.head(25)

index
0     [B-QTY, B-UNIT, B-COMMENT, I-COMMENT, I-COMMEN...
1     [B-QTY, B-UNIT, B-COMMENT, I-COMMENT, I-COMMEN...
2     [B-QTY, B-COMMENT, I-COMMENT, B-NAME, B-COMMEN...
3     [B-QTY, B-UNIT, B-NAME, B-COMMENT, I-COMMENT, ...
4                       [B-QTY, B-UNIT, B-NAME, I-NAME]
6     [B-QTY, B-UNIT, B-COMMENT, B-NAME, B-COMMENT, ...
7                                              [B-NAME]
8     [B-QTY, B-UNIT, B-COMMENT, B-NAME, I-NAME, B-C...
9     [B-QTY, B-UNIT, B-NAME, B-COMMENT, I-COMMENT, ...
10                      [B-QTY, B-UNIT, B-NAME, I-NAME]
11              [B-QTY, B-UNIT, B-NAME, I-NAME, I-NAME]
12      [B-QTY, B-UNIT, B-NAME, I-NAME, I-NAME, I-NAME]
13                              [B-QTY, B-NAME, B-UNIT]
14    [B-QTY, B-COMMENT, B-NAME, B-COMMENT, I-COMMEN...
15    [B-QTY, B-NAME, B-COMMENT, I-COMMENT, I-COMMEN...
16     [B-QTY, B-COMMENT, B-NAME, B-COMMENT, I-COMMENT]
17                              [B-QTY, B-UNIT, B-NAME]
18    [B-QTY, B-UNIT, B-NAME, I-NAME, B-CO

In [17]:
crf_training_labels.to_pickle("../data/interim/crf_training_labels.pickle")
#crf_test_labels.to_pickle("../data/interim/crf_test_labels.pickle")