In [20]:
from decimal import Decimal, InvalidOperation
import spacy
import pandas as pd

In [2]:
nlp = spacy.load("en_core_web_lg", disable=["tagger", "parser", "ner", "textcat"])

In [23]:
def match_up(df):
    """
    Returns our best guess of the match between the tags and the
    words from the display text.
    This problem is difficult for the following reasons:
        * not all the words in the display name have associated tags
        * the quantity field is stored as a number, but it appears
          as a string in the display name
        * the comment is often a compilation of different comments in
          the display name
    """
    labels = []

    for token in df["input"]:
        decimal_token = None
        try:
            decimal_token = Decimal(token)
        except InvalidOperation:
            pass
        if token in df["name"]:
            labels.append("NAME")
        elif token in df["unit"]:
            labels.append("UNIT")
        elif decimal_token is not None and decimal_token == df["qty"]:
            labels.append("QTY")
        elif token in df["comment"]:
            labels.append("COMMENT")
        else:
            labels.append("OTHER")
    return labels

def add_prefixes(data):
    """
    We use BIO tagging/chunking to differentiate between tags
    at the start of a tag sequence and those in the middle. This
    is a common technique in entity recognition.

    Reference: http://www.kdd.cis.ksu.edu/Courses/Spring-2013/CIS798/Handouts/04-ramshaw95text.pdf
    """
    prev_tag = None
    new_data = []

    for token, tag in data:

        new_tag = ""

        p = "B" if ((prev_tag is None) or (tag != prev_tag)) else "I"
        new_tag = "%s-%s" % (p, tag)
        new_data.append(new_tag)
        prev_tag = tag

    return new_data


In [7]:
# Load raw data and do some preprocessing
training_data = pd.read_pickle("../data/interim/crf_training_data.pickle")
test_data = pd.read_pickle("../data/interim/crf_test_data.pickle")

In [8]:
training_data.head()

Unnamed: 0_level_0,input,name,qty,unit,comment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
140056,3/4 cup sour cream,sour cream,0.75,cup,
69015,Salt to taste if desired,Salt,0.0,,to taste if desired
75350,Salt to taste if desired,Salt,0.0,,to taste if desired
27139,"1/2 pound baby carrots, cut into 1/2-inch piec...",carrots,0.5,pound,"baby, cut into 1/2-inch pieces and steamed unt..."
120378,2 cup flat-leaf parsley leaves,flat-leaf parsley leaves,2.0,cup,


In [9]:
test_data.head()

Unnamed: 0_level_0,input,name,qty,unit,comment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
56726,2 teaspoons Dijon-style mustard,Dijon-style mustard,2.0,teaspoon,
162845,Orange peel twist to garnish,Orange\tpeel,0.0,,
112236,2 tablespoons red-wine vinegar,red-wine vinegar,2.0,tablespoon,
91776,1/2 cup olive oil,olive oil,0.5,cup,
16599,2 1/2 teaspoons kosher salt,salt,2.5,teaspoon,kosher


In [10]:
# have spacy parse the input string with the full pipeline to generate features this will take some time
training_data["input"] = list(
    nlp.pipe(training_data["input"].astype("unicode").values, batch_size=50)
)
test_data["input"] = list(
    nlp.pipe(test_data["input"].astype("unicode").values, batch_size=50)
)

In [11]:
training_data.head()

Unnamed: 0_level_0,input,name,qty,unit,comment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
140056,"(3/4, cup, sour, cream)",sour cream,0.75,cup,
69015,"(Salt, to, taste, if, desired)",Salt,0.0,,to taste if desired
75350,"(Salt, to, taste, if, desired)",Salt,0.0,,to taste if desired
27139,"(1/2, pound, baby, carrots, ,, cut, into, 1/2-...",carrots,0.5,pound,"baby, cut into 1/2-inch pieces and steamed unt..."
120378,"(2, cup, flat, -, leaf, parsley, leaves)",flat-leaf parsley leaves,2.0,cup,


In [12]:
# for assigining labels we only need lemmas
training_data["name"] = list(
    nlp.pipe(training_data["name"].astype("unicode").values, batch_size=50)
)
training_data["unit"] = list(
    nlp.pipe(training_data["unit"].astype("unicode").values, batch_size=50)
)
training_data["comment"] = list(
    nlp.pipe(training_data["comment"].astype("unicode").values, batch_size=50)
)

test_data["name"] = list(
    nlp.pipe(test_data["name"].astype("unicode").values, batch_size=50)
)
test_data["unit"] = list(
    nlp.pipe(test_data["unit"].astype("unicode").values, batch_size=50)
)
test_data["comment"] = list(
    nlp.pipe(test_data["comment"].astype("unicode").values, batch_size=50)
)

In [13]:
training_data.head()

Unnamed: 0_level_0,input,name,qty,unit,comment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
140056,"(3/4, cup, sour, cream)","(sour, cream)",0.75,(cup),(nan)
69015,"(Salt, to, taste, if, desired)",(Salt),0.0,(nan),"(to, taste, if, desired)"
75350,"(Salt, to, taste, if, desired)",(Salt),0.0,(nan),"(to, taste, if, desired)"
27139,"(1/2, pound, baby, carrots, ,, cut, into, 1/2-...",(carrots),0.5,(pound),"(baby, ,, cut, into, 1/2-inch, pieces, and, st..."
120378,"(2, cup, flat, -, leaf, parsley, leaves)","(flat, -, leaf, parsley, leaves)",2.0,(cup),(nan)


In [14]:
training_data["input"] = training_data["input"].apply(
    lambda doc: [token.lemma_ for token in doc]
)
training_data["name"] = training_data["name"].apply(
    lambda doc: [token.lemma_ for token in doc]
)
training_data["unit"] = training_data["unit"].apply(
    lambda doc: [token.lemma_ for token in doc]
)
training_data["comment"] = training_data["comment"].apply(
    lambda doc: [token.lemma_ for token in doc]
)

test_data["input"] = test_data["input"].apply(
    lambda doc: [token.lemma_ for token in doc]
)
test_data["name"] = test_data["name"].apply(
    lambda doc: [token.lemma_ for token in doc]
)
test_data["unit"] = test_data["unit"].apply(
    lambda doc: [token.lemma_ for token in doc]
)
test_data["comment"] = test_data["comment"].apply(
    lambda doc: [token.lemma_ for token in doc]
)

In [15]:
training_data.head()

Unnamed: 0_level_0,input,name,qty,unit,comment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
140056,"[3/4, cup, sour, cream]","[sour, cream]",0.75,[cup],[nan]
69015,"[Salt, to, taste, if, desire]",[Salt],0.0,[nan],"[to, taste, if, desire]"
75350,"[Salt, to, taste, if, desire]",[Salt],0.0,[nan],"[to, taste, if, desire]"
27139,"[1/2, pound, baby, carrot, ,, cut, into, 1/2-i...",[carrot],0.5,[pound],"[baby, ,, cut, into, 1/2-inch, piece, and, ste..."
120378,"[2, cup, flat, -, leaf, parsley, leave]","[flat, -, leaf, parsley, leave]",2.0,[cup],[nan]


In [24]:
training_data["labels"] = training_data.apply(match_up, axis=1)

test_data["labels"] = test_data.apply(match_up, axis=1)

In [25]:
training_data.head()

Unnamed: 0_level_0,input,name,qty,unit,comment,labels
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
140056,"[3/4, cup, sour, cream]","[sour, cream]",0.75,[cup],[nan],"[OTHER, UNIT, NAME, NAME]"
69015,"[Salt, to, taste, if, desire]",[Salt],0.0,[nan],"[to, taste, if, desire]","[NAME, COMMENT, COMMENT, COMMENT, COMMENT]"
75350,"[Salt, to, taste, if, desire]",[Salt],0.0,[nan],"[to, taste, if, desire]","[NAME, COMMENT, COMMENT, COMMENT, COMMENT]"
27139,"[1/2, pound, baby, carrot, ,, cut, into, 1/2-i...",[carrot],0.5,[pound],"[baby, ,, cut, into, 1/2-inch, piece, and, ste...","[OTHER, UNIT, COMMENT, NAME, COMMENT, COMMENT,..."
120378,"[2, cup, flat, -, leaf, parsley, leave]","[flat, -, leaf, parsley, leave]",2.0,[cup],[nan],"[QTY, UNIT, NAME, NAME, NAME, NAME, NAME]"


In [26]:
crf_training_labels = pd.Series(
    training_data.apply(
        lambda row: add_prefixes(zip(row["input"], row["labels"])), axis=1
    )
)
crf_test_labels = pd.Series(
    test_data.apply(
        lambda row: add_prefixes(zip(row["input"], row["labels"])), axis=1
    )
)

NameError: ("name 'new_data' is not defined", 'occurred at index 140056')

In [None]:
crf_training_labels.to_pickle("../data/interim/crf_training_labels.pickle")
crf_test_labels.to_pickle("../data/interim/crf_test_labels.pickle")