In [1]:
from decimal import Decimal, InvalidOperation
import spacy
import pandas as pd

In [2]:
# Load the spaCy model with the listed pipeline components switched off; the
# cells below only read the tokens and their `lemma_` attribute from the docs.
nlp = spacy.load("en_core_web_lg", disable=["tagger", "parser", "ner", "textcat"])

In [3]:
def _parse_decimal(text):
    """Return ``text`` as a finite Decimal, or None if it is not a finite number."""
    try:
        value = Decimal(text)
    except InvalidOperation:
        return None
    # Decimal also accepts words such as "nan" and "Infinity"; those are never
    # quantities in this context.
    return value if value.is_finite() else None


def match_up(df):
    """
    Returns our best guess of the match between the tags and the
    words from the display text.

    This problem is difficult for the following reasons:
        * not all the words in the display name have associated tags
        * the quantity field is stored as a number, but it appears
          as a string in the display name
        * the comment is often a compilation of different comments in
          the display name

    Args:
        df: one row, indexable by column name (e.g. a pandas Series or a
            dict), providing "input", "name", "unit", "comment" and "qty".
            "input" is iterated token by token; "name", "unit" and "comment"
            only need to support the ``in`` operator.

    Returns:
        A list with one label per token of ``df["input"]``. Each label is the
        first that applies, in this order: "NAME", "UNIT", "QTY", "COMMENT",
        otherwise "OTHER".
    """
    names = df["name"]
    units = df["unit"]
    comments = df["comment"]

    # The quantity is a binary float, while the token is decimal text. Comparing
    # Decimal("0.13") with the float 0.13 is exact and therefore False, so the
    # float is first converted through its shortest repr ("0.13").
    quantity = _parse_decimal(str(df["qty"]))

    labels = []
    for token in df["input"]:
        if token in names:
            labels.append("NAME")
        elif token in units:
            labels.append("UNIT")
        # A token that is not a finite number parses to None, which never
        # equals a Decimal.
        elif quantity is not None and _parse_decimal(token) == quantity:
            labels.append("QTY")
        elif token in comments:
            labels.append("COMMENT")
        else:
            labels.append("OTHER")
    return labels

def add_prefixes(data):
    """
    Convert (token, tag) pairs into BIO-style tag strings.

    A tag that opens a run (the first pair, or any pair whose tag differs
    from the one directly before it) is emitted as "B-<tag>"; a tag that
    repeats its predecessor is emitted as "I-<tag>". The tokens themselves
    are not part of the result. BIO tagging/chunking is a common technique
    in entity recognition.

    Reference: http://www.kdd.cis.ksu.edu/Courses/Spring-2013/CIS798/Handouts/04-ramshaw95text.pdf
    """
    # Collect the tags up front: `data` may be a one-shot iterator such as zip().
    tags = [tag for _token, tag in data]
    # Line every tag up with the tag before it; None stands for "no predecessor".
    predecessors = [None] + tags[:-1]
    return [
        "%s-%s" % ("B" if before is None or current != before else "I", current)
        for before, current in zip(predecessors, tags)
    ]


In [17]:
# Load raw data and do some preprocessing
# NOTE(review): unpickling can execute arbitrary code, so this assumes the
# file was written by a trusted earlier step of this project — confirm.
training_data = pd.read_pickle("../data/interim/crf_training_data.pickle")
# The test-set load is currently commented out.
#test_data = pd.read_pickle("../data/interim/crf_test_data.pickle")

In [5]:
# Preview the first rows of the training frame.
training_data.head()

Unnamed: 0_level_0,input,name,qty,unit,comment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
58015,1 teaspoon salt,salt,1.0,teaspoon,
13113,"0.50 cup bread crumbs, preferably fresh and li...",bread crumbs,0.5,cup,nan preferably fresh and lightly toasted
73732,1 teaspoon freshly ground pepper,freshly ground pepper,1.0,teaspoon,
23005,Salt to taste if desired,Salt,0.0,,to taste if desired
37293,0.50 teaspoon fennel seeds,fennel seeds,0.5,teaspoon,


In [6]:
# NOTE(review): the read_pickle call that would define `test_data` is commented
# out in the loading cell, so on a fresh kernel this line raises NameError. The
# output shown below was presumably produced while that load was still enabled.
test_data.head()

Unnamed: 0_level_0,input,name,qty,unit,comment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
53939,2 teaspoons Dijon mustard,Dijon mustard,2.0,teaspoon,
105577,"Leaves from 2 sprigs rosemary, chopped",rosemary,2.0,sprig,"Leaves from, chopped"
152379,Freshly ground pepper to taste,pepper,0.0,,
42496,0.13 teaspoon cayenne pepper,cayenne pepper,0.13,teaspoon,
100989,2 bay leaves,bay leaves,2.0,,


In [18]:
# Have spaCy tokenize every input string; this will take some time.
# NOTE: `nlp` was loaded with the tagger, parser, ner and textcat components
# disabled, so this is not the full pipeline.
training_data["input"] = list(
    nlp.pipe(training_data["input"].astype("unicode").values, batch_size=50)
)

#test_data["input"] = list(
#    nlp.pipe(test_data["input"].astype("unicode").values, batch_size=50)
#)

In [8]:
# Preview: in the output below the "input" column displays as token sequences.
training_data.head()

Unnamed: 0_level_0,input,name,qty,unit,comment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
58015,"(1, teaspoon, salt)",salt,1.0,teaspoon,
13113,"(0.50, cup, bread, crumbs, ,, preferably, fres...",bread crumbs,0.5,cup,nan preferably fresh and lightly toasted
73732,"(1, teaspoon, freshly, ground, pepper)",freshly ground pepper,1.0,teaspoon,
23005,"(Salt, to, taste, if, desired)",Salt,0.0,,to taste if desired
37293,"(0.50, teaspoon, fennel, seeds)",fennel seeds,0.5,teaspoon,


In [19]:
# For assigning labels we only need lemmas, so run the target columns through
# the same spaCy pipeline as the input text.
# NOTE(review): missing values appear to come out of astype("unicode") as the
# literal string "nan" and end up as a "nan" token (shown as "(nan)" in the
# preview output further down) — confirm this is intended.
for column in ("name", "unit", "comment"):
    training_data[column] = list(
        nlp.pipe(training_data[column].astype("unicode").values, batch_size=50)
    )

# Test-set equivalent, disabled like the test_data load. Kept as real comments:
# a bare string literal at the end of a cell is echoed as the cell's output.
#for column in ("name", "unit", "comment"):
#    test_data[column] = list(
#        nlp.pipe(test_data[column].astype("unicode").values, batch_size=50)
#    )

In [10]:
# Preview: in the output below "name", "unit" and "comment" also display as
# token sequences.
training_data.head()

Unnamed: 0_level_0,input,name,qty,unit,comment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
58015,"(1, teaspoon, salt)",(salt),1.0,(teaspoon),(nan)
13113,"(0.50, cup, bread, crumbs, ,, preferably, fres...","(bread, crumbs)",0.5,(cup),"(nan, preferably, fresh, and, lightly, toasted)"
73732,"(1, teaspoon, freshly, ground, pepper)","(freshly, ground, pepper)",1.0,(teaspoon),(nan)
23005,"(Salt, to, taste, if, desired)",(Salt),0.0,(nan),"(to, taste, if, desired)"
37293,"(0.50, teaspoon, fennel, seeds)","(fennel, seeds)",0.5,(teaspoon),(nan)


In [20]:
# Reduce every spaCy doc to the list of its tokens' lemmas; match_up compares
# these plain strings.
# NOTE: this cell is not idempotent. Once it has run, the columns hold lists of
# str, so running it a second time fails on `token.lemma_`.
for column in ("input", "name", "unit", "comment"):
    training_data[column] = training_data[column].apply(
        lambda doc: [token.lemma_ for token in doc]
    )

# Test-set equivalent, disabled like the test_data load. Kept as real comments:
# a bare string literal at the end of a cell is echoed as the cell's output.
#for column in ("input", "name", "unit", "comment"):
#    test_data[column] = test_data[column].apply(
#        lambda doc: [token.lemma_ for token in doc]
#    )

In [12]:
# Preview: in the output below the text columns display as lists of lemma
# strings.
training_data.head()

Unnamed: 0_level_0,input,name,qty,unit,comment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
58015,"[1, teaspoon, salt]",[salt],1.0,[teaspoon],[nan]
13113,"[0.50, cup, bread, crumb, ,, preferably, fresh...","[bread, crumb]",0.5,[cup],"[nan, preferably, fresh, and, lightly, toast]"
73732,"[1, teaspoon, freshly, grind, pepper]","[freshly, grind, pepper]",1.0,[teaspoon],[nan]
23005,"[Salt, to, taste, if, desire]",[Salt],0.0,[nan],"[to, taste, if, desire]"
37293,"[0.50, teaspoon, fennel, seed]","[fennel, seed]",0.5,[teaspoon],[nan]


In [21]:
# Apply match_up to each row (axis=1) and store the labels it returns for that
# row's "input" tokens in a new "labels" column.
training_data["labels"] = training_data.apply(match_up, axis=1)

#test_data["labels"] = test_data.apply(match_up, axis=1)

In [14]:
# Preview the frame with the new "labels" column.
training_data.head()

Unnamed: 0_level_0,input,name,qty,unit,comment,labels
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
58015,"[1, teaspoon, salt]",[salt],1.0,[teaspoon],[nan],"[QTY, UNIT, NAME]"
13113,"[0.50, cup, bread, crumb, ,, preferably, fresh...","[bread, crumb]",0.5,[cup],"[nan, preferably, fresh, and, lightly, toast]","[QTY, UNIT, NAME, NAME, OTHER, COMMENT, COMMEN..."
73732,"[1, teaspoon, freshly, grind, pepper]","[freshly, grind, pepper]",1.0,[teaspoon],[nan],"[QTY, UNIT, NAME, NAME, NAME]"
23005,"[Salt, to, taste, if, desire]",[Salt],0.0,[nan],"[to, taste, if, desire]","[NAME, COMMENT, COMMENT, COMMENT, COMMENT]"
37293,"[0.50, teaspoon, fennel, seed]","[fennel, seed]",0.5,[teaspoon],[nan],"[QTY, UNIT, NAME, NAME]"


In [22]:
# For every row, pair the input tokens with their labels and let add_prefixes
# turn them into BIO-prefixed labels ("B-..." / "I-...").
crf_training_labels = pd.Series(
    training_data.apply(
        lambda row: add_prefixes(zip(row["input"], row["labels"])), axis=1
    )
)
#crf_test_labels = pd.Series(
#    test_data.apply(
#        lambda row: add_prefixes(zip(row["input"], row["labels"])), axis=1
#    )
#)

In [23]:
# Persist the BIO-prefixed training labels; the test-set counterpart is
# currently commented out.
crf_training_labels.to_pickle("../data/interim/crf_training_labels.pickle")
#crf_test_labels.to_pickle("../data/interim/crf_test_labels.pickle")