In [1]:
from decimal import Decimal, InvalidOperation
import spacy
import pandas as pd

In [2]:
nlp = spacy.load("en_core_web_lg", disable=["tagger", "parser", "ner", "textcat"])

In [3]:
def match_up(df):
    """
    Returns our best guess of the match between the tags and the
    words from the display text.
    This problem is difficult for the following reasons:
        * not all the words in the display name have associated tags
        * the quantity field is stored as a number, but it appears
          as a string in the display name
        * the comment is often a compilation of different comments in
          the display name
    """
    labels = []

    for token in df["input"]:
        decimal_token = None
        try:
            decimal_token = Decimal(token)
        except InvalidOperation:
            pass
        if token in df["name"]:
            labels.append("NAME")
        elif token in df["unit"]:
            labels.append("UNIT")
        elif decimal_token is not None and decimal_token == df["qty"]:
            labels.append("QTY")
        elif token in df["comment"]:
            labels.append("COMMENT")
        else:
            labels.append("OTHER")
    return labels

def add_prefixes(data):
    """
    We use BIO tagging/chunking to differentiate between tags
    at the start of a tag sequence and those in the middle. This
    is a common technique in entity recognition.

    Reference: http://www.kdd.cis.ksu.edu/Courses/Spring-2013/CIS798/Handouts/04-ramshaw95text.pdf
    """
    prev_tag = None
    new_data = []

    for token, tag in data:

        new_tag = ""

        p = "B" if ((prev_tag is None) or (tag != prev_tag)) else "I"
        new_tag = "%s-%s" % (p, tag)
        new_data.append(new_tag)
        prev_tag = tag

    return new_data


In [6]:
# Load raw data and do some preprocessing
training_data = pd.read_pickle("../data/interim/crf_data.pickle")
#test_data = pd.read_pickle("../data/interim/crf_test_data.pickle")

In [7]:
training_data.head()

Unnamed: 0_level_0,input,name,qty,unit,comment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,1.25 cups cooked and pureed fresh butternut sq...,butternut squash,1.25,cup,"cooked and pureed fresh, or 10.00 ounce packag..."
1,1 cup peeled and cooked fresh chestnuts (about...,chestnuts,1.0,cup,"peeled and cooked fresh (about 20), or 1 cup c..."
2,"1 medium size onion, peeled and chopped",onion,1.0,,"medium size, peeled and chopped"
3,"2 stalks celery, chopped coarse",celery,2.0,stalk,chopped coarse
4,1.50 tablespoons vegetable oil,vegetable oil,1.5,tablespoon,


In [8]:
# have spacy parse the input string with the full pipeline to generate features this will take some time
training_data["input"] = list(
    nlp.pipe(training_data["input"].astype("unicode").values, batch_size=50)
)

#test_data["input"] = list(
#    nlp.pipe(test_data["input"].astype("unicode").values, batch_size=50)
#)

In [9]:
training_data.head()

Unnamed: 0_level_0,input,name,qty,unit,comment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,"(1.25, cups, cooked, and, pureed, fresh, butte...",butternut squash,1.25,cup,"cooked and pureed fresh, or 10.00 ounce packag..."
1,"(1, cup, peeled, and, cooked, fresh, chestnuts...",chestnuts,1.0,cup,"peeled and cooked fresh (about 20), or 1 cup c..."
2,"(1, medium, size, onion, ,, peeled, and, chopped)",onion,1.0,,"medium size, peeled and chopped"
3,"(2, stalks, celery, ,, chopped, coarse)",celery,2.0,stalk,chopped coarse
4,"(1.50, tablespoons, vegetable, oil)",vegetable oil,1.5,tablespoon,


In [10]:
# for assigining labels we only need lemmas
training_data["name"] = list(
    nlp.pipe(training_data["name"].astype("unicode").values, batch_size=50)
)
training_data["unit"] = list(
    nlp.pipe(training_data["unit"].astype("unicode").values, batch_size=50)
)
training_data["comment"] = list(
    nlp.pipe(training_data["comment"].astype("unicode").values, batch_size=50)
)

"""test_data["name"] = list(
    nlp.pipe(test_data["name"].astype("unicode").values, batch_size=50)
)
test_data["unit"] = list(
    nlp.pipe(test_data["unit"].astype("unicode").values, batch_size=50)
)
test_data["comment"] = list(
    nlp.pipe(test_data["comment"].astype("unicode").values, batch_size=50)
)"""

'test_data["name"] = list(\n    nlp.pipe(test_data["name"].astype("unicode").values, batch_size=50)\n)\ntest_data["unit"] = list(\n    nlp.pipe(test_data["unit"].astype("unicode").values, batch_size=50)\n)\ntest_data["comment"] = list(\n    nlp.pipe(test_data["comment"].astype("unicode").values, batch_size=50)\n)'

In [11]:
training_data.head()

Unnamed: 0_level_0,input,name,qty,unit,comment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,"(1.25, cups, cooked, and, pureed, fresh, butte...","(butternut, squash)",1.25,(cup),"(cooked, and, pureed, fresh, ,, or, 10.00, oun..."
1,"(1, cup, peeled, and, cooked, fresh, chestnuts...",(chestnuts),1.0,(cup),"(peeled, and, cooked, fresh, (, about, 20, ), ..."
2,"(1, medium, size, onion, ,, peeled, and, chopped)",(onion),1.0,(nan),"(medium, size, ,, peeled, and, chopped)"
3,"(2, stalks, celery, ,, chopped, coarse)",(celery),2.0,(stalk),"(chopped, coarse)"
4,"(1.50, tablespoons, vegetable, oil)","(vegetable, oil)",1.5,(tablespoon),(nan)


In [12]:
training_data["input"] = training_data["input"].apply(
    lambda doc: [token.lemma_ for token in doc]
)
training_data["name"] = training_data["name"].apply(
    lambda doc: [token.lemma_ for token in doc]
)
training_data["unit"] = training_data["unit"].apply(
    lambda doc: [token.lemma_ for token in doc]
)
training_data["comment"] = training_data["comment"].apply(
    lambda doc: [token.lemma_ for token in doc]
)

"""test_data["input"] = test_data["input"].apply(
    lambda doc: [token.lemma_ for token in doc]
)
test_data["name"] = test_data["name"].apply(
    lambda doc: [token.lemma_ for token in doc]
)
test_data["unit"] = test_data["unit"].apply(
    lambda doc: [token.lemma_ for token in doc]
)
test_data["comment"] = test_data["comment"].apply(
    lambda doc: [token.lemma_ for token in doc]
)"""

'test_data["input"] = test_data["input"].apply(\n    lambda doc: [token.lemma_ for token in doc]\n)\ntest_data["name"] = test_data["name"].apply(\n    lambda doc: [token.lemma_ for token in doc]\n)\ntest_data["unit"] = test_data["unit"].apply(\n    lambda doc: [token.lemma_ for token in doc]\n)\ntest_data["comment"] = test_data["comment"].apply(\n    lambda doc: [token.lemma_ for token in doc]\n)'

In [13]:
training_data.head()

Unnamed: 0_level_0,input,name,qty,unit,comment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,"[1.25, cup, cook, and, puree, fresh, butternut...","[butternut, squash]",1.25,[cup],"[cook, and, puree, fresh, ,, or, 10.00, ounce,..."
1,"[1, cup, peel, and, cook, fresh, chestnut, (, ...",[chestnut],1.0,[cup],"[peel, and, cook, fresh, (, about, 20, ), ,, o..."
2,"[1, medium, size, onion, ,, peel, and, chop]",[onion],1.0,[nan],"[medium, size, ,, peel, and, chop]"
3,"[2, stalk, celery, ,, chop, coarse]",[celery],2.0,[stalk],"[chop, coarse]"
4,"[1.50, tablespoon, vegetable, oil]","[vegetable, oil]",1.5,[tablespoon],[nan]


In [14]:
training_data["labels"] = training_data.apply(match_up, axis=1)

#test_data["labels"] = test_data.apply(match_up, axis=1)

In [15]:
training_data.head()

Unnamed: 0_level_0,input,name,qty,unit,comment,labels
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,"[1.25, cup, cook, and, puree, fresh, butternut...","[butternut, squash]",1.25,[cup],"[cook, and, puree, fresh, ,, or, 10.00, ounce,...","[QTY, UNIT, COMMENT, COMMENT, COMMENT, COMMENT..."
1,"[1, cup, peel, and, cook, fresh, chestnut, (, ...",[chestnut],1.0,[cup],"[peel, and, cook, fresh, (, about, 20, ), ,, o...","[QTY, UNIT, COMMENT, COMMENT, COMMENT, COMMENT..."
2,"[1, medium, size, onion, ,, peel, and, chop]",[onion],1.0,[nan],"[medium, size, ,, peel, and, chop]","[QTY, COMMENT, COMMENT, NAME, COMMENT, COMMENT..."
3,"[2, stalk, celery, ,, chop, coarse]",[celery],2.0,[stalk],"[chop, coarse]","[QTY, UNIT, NAME, OTHER, COMMENT, COMMENT]"
4,"[1.50, tablespoon, vegetable, oil]","[vegetable, oil]",1.5,[tablespoon],[nan],"[QTY, UNIT, NAME, NAME]"


In [16]:
crf_training_labels = pd.Series(
    training_data.apply(
        lambda row: add_prefixes(zip(row["input"], row["labels"])), axis=1
    )
)
#crf_test_labels = pd.Series(
#    test_data.apply(
#        lambda row: add_prefixes(zip(row["input"], row["labels"])), axis=1
#    )
#)

In [17]:
crf_training_labels.to_pickle("../data/interim/crf_training_labels.pickle")
#crf_test_labels.to_pickle("../data/interim/crf_test_labels.pickle")