In [1]:
from decimal import Decimal, InvalidOperation

import pandas as pd
import spacy

In [None]:
# Load the large English spaCy model with the tagger, parser, NER and
# text-categoriser components disabled.
nlp = spacy.load("en_core_web_lg", disable=["tagger", "parser", "ner", "textcat"])

In [2]:
def _to_decimal(value):
    """
    Best-effort conversion of a token or a stored number to ``Decimal``.

    The value goes through ``str`` first so that a float such as 0.1 becomes
    ``Decimal("0.1")`` rather than its inexact binary expansion.

    Returns ``None`` when the text is not a decimal literal (for example
    "cup", or a fraction written as "1/2").
    """
    try:
        return Decimal(str(value))
    except InvalidOperation:
        return None


def match_up(df):
    """
    Returns our best guess of the match between the tags and the
    words from the display text.
    This problem is difficult for the following reasons:
        * not all the words in the display name have associated tags
        * the quantity field is stored as a number, but it appears
          as a string in the display name
        * the comment is often a compilation of different comments in
          the display name

    Parameters
    ----------
    df : mapping
        One row, indexable by "input", "name", "unit", "comment" (containers
        of token strings) and "qty", "range_end" (numbers).

    Returns
    -------
    list of str
        One label per token of ``df["input"]``. The first matching rule wins,
        checked in this order: NAME, UNIT, QTY, COMMENT, RANGE_END, else OTHER.
    """
    # Normalise the stored numbers once per row. Comparing Decimal("0.1") with
    # the float 0.1 directly is False, because the float is not exactly 0.1.
    qty = _to_decimal(df["qty"])
    range_end = _to_decimal(df["range_end"])

    labels = []

    for token in df["input"]:
        # None for anything that is not a plain decimal literal.
        decimal_token = _to_decimal(token)

        if token in df["name"]:
            labels.append("NAME")
        elif token in df["unit"]:
            labels.append("UNIT")
        elif decimal_token is not None and decimal_token == qty:
            labels.append("QTY")
        elif token in df["comment"]:
            labels.append("COMMENT")
        elif decimal_token is not None and decimal_token == range_end:
            labels.append("RANGE_END")
        else:
            labels.append("OTHER")
    return labels



def add_prefixes(data):
    """
    We use BIO tagging/chunking to differentiate between tags
    at the start of a tag sequence and those in the middle. This
    is a common technique in entity recognition.

    Reference: http://www.kdd.cis.ksu.edu/Courses/Spring-2013/CIS798/Handouts/04-ramshaw95text.pdf

    Parameters
    ----------
    data : iterable of (token, tag) pairs
        Tokens with their plain tags, in sentence order. Only the tag is used.

    Returns
    -------
    list of str
        One "B-<tag>" or "I-<tag>" string per pair: "B" when the tag differs
        from the previous one (or is the first), "I" when it repeats.
    """
    prev_tag = None
    new_data = []

    for _token, tag in data:
        # A run starts at the first item or whenever the tag changes.
        p = "B" if ((prev_tag is None) or (tag != prev_tag)) else "I"
        new_data.append("%s-%s" % (p, tag))
        prev_tag = tag

    return new_data


In [4]:
# Load the interim training and test frames from their pickle files.
# NOTE(review): unpickling can execute arbitrary code; only load these files
# if they come from a trusted source.
training_data = pd.read_pickle("../../data/interim/crf_training_data.pickle")
test_data = pd.read_pickle("../../data/interim/crf_test_data.pickle")

In [5]:
# Preview the first rows of the training frame as loaded.
training_data.head()

Unnamed: 0_level_0,input,name,qty,range_end,unit,comment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
107273,"Freshly grated imported Parmesan cheese, prefe...",Parmesan cheese,0.0,0.0,,"Freshly grated imported, preferably parmigiano..."
58171,"1 large sweet potato, peeled and cut into 1/2-...",sweet potato,1.0,0.0,,"large, peeled and cut into 1/2-inch cubes"
6569,Freshly ground black pepper to taste,black pepper,0.0,0.0,,Freshly ground to taste
177211,0.25 cup all-purpose flour,all-purpose flour,0.25,0.0,cup,
33971,"2 2-ounce cans anchovy fillets, packed in oil",anchovy fillets,4.0,0.0,ounce,"2 2-ounce cans, packed in oil"


In [6]:
# Preview the first rows of the test frame as loaded.
test_data.head()

Unnamed: 0_level_0,input,name,qty,range_end,unit,comment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
102201,1 cup raw rice,rice,1.0,0.0,cup,raw
149431,1/2 teaspoon salt,salt,0.5,0.0,teaspoon,
50733,1 cup heavy cream,heavy cream,1.0,0.0,cup,
66931,"2 cloves garlic, peeled and minced",garlic,2.0,0.0,clove,peeled and minced
78296,1/4 cup Marsala wine,Marsala wine,0.25,0.0,cup,


In [7]:
# Have spaCy process every input string of both frames; this will take some time.
# NOTE(review): `nlp` is loaded with tagger, parser, ner and textcat disabled,
# so this is not the full pipeline. Confirm that is intended.
for frame in (training_data, test_data):
    frame["input"] = list(
        nlp.pipe(frame["input"].astype("unicode").values, batch_size=50)
    )

In [8]:
# Preview the training frame now that `input` holds spaCy docs.
training_data.head()

Unnamed: 0_level_0,input,name,qty,range_end,unit,comment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
107273,"(Freshly, grated, imported, Parmesan, cheese, ...",Parmesan cheese,0.0,0.0,,"Freshly grated imported, preferably parmigiano..."
58171,"(1, large, sweet, potato, ,, peeled, and, cut,...",sweet potato,1.0,0.0,,"large, peeled and cut into 1/2-inch cubes"
6569,"(Freshly, ground, black, pepper, to, taste)",black pepper,0.0,0.0,,Freshly ground to taste
177211,"(0.25, cup, all, -, purpose, flour)",all-purpose flour,0.25,0.0,cup,
33971,"(2, 2-ounce, cans, anchovy, fillets, ,, packed...",anchovy fillets,4.0,0.0,ounce,"2 2-ounce cans, packed in oil"


In [9]:
# For assigning labels we only need lemmas, so the tag columns are run through
# spaCy too. The training frame is processed first, then the test frame, each
# in the order name, unit, comment.
# NOTE(review): the preview output shows missing units/comments parsed as the
# token "nan" after the string cast. Verify that is acceptable.
for frame in (training_data, test_data):
    for column in ("name", "unit", "comment"):
        frame[column] = list(
            nlp.pipe(frame[column].astype("unicode").values, batch_size=50)
        )

In [10]:
# Preview the training frame now that name, unit and comment are parsed too.
training_data.head()

Unnamed: 0_level_0,input,name,qty,range_end,unit,comment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
107273,"(Freshly, grated, imported, Parmesan, cheese, ...","(Parmesan, cheese)",0.0,0.0,(nan),"(Freshly, grated, imported, ,, preferably, par..."
58171,"(1, large, sweet, potato, ,, peeled, and, cut,...","(sweet, potato)",1.0,0.0,(nan),"(large, ,, peeled, and, cut, into, 1/2-inch, c..."
6569,"(Freshly, ground, black, pepper, to, taste)","(black, pepper)",0.0,0.0,(nan),"(Freshly, ground, to, taste)"
177211,"(0.25, cup, all, -, purpose, flour)","(all, -, purpose, flour)",0.25,0.0,(cup),(nan)
33971,"(2, 2-ounce, cans, anchovy, fillets, ,, packed...","(anchovy, fillets)",4.0,0.0,(ounce),"(2, 2-ounce, cans, ,, packed, in, oil)"


In [11]:
def _lemmas(doc):
    """Return the lemma string of every token in a parsed document."""
    return [token.lemma_ for token in doc]


# Replace each parsed text column with its list of lemmas. The training frame
# is processed first, then the test frame, each in the order input, name,
# unit, comment.
for frame in (training_data, test_data):
    for column in ("input", "name", "unit", "comment"):
        frame[column] = frame[column].apply(_lemmas)

In [12]:
# Preview the training frame with the text columns reduced to lemma lists.
training_data.head()

Unnamed: 0_level_0,input,name,qty,range_end,unit,comment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
107273,"[Freshly, grate, import, Parmesan, cheese, ,, ...","[Parmesan, cheese]",0.0,0.0,[nan],"[Freshly, grate, import, ,, preferably, parmig..."
58171,"[1, large, sweet, potato, ,, peel, and, cut, i...","[sweet, potato]",1.0,0.0,[nan],"[large, ,, peel, and, cut, into, 1/2-inch, cube]"
6569,"[Freshly, grind, black, pepper, to, taste]","[black, pepper]",0.0,0.0,[nan],"[Freshly, grind, to, taste]"
177211,"[0.25, cup, all, -, purpose, flour]","[all, -, purpose, flour]",0.25,0.0,[cup],[nan]
33971,"[2, 2-ounce, can, anchovy, fillet, ,, pack, in...","[anchovy, fillet]",4.0,0.0,[ounce],"[2, 2-ounce, can, ,, pack, in, oil]"


In [13]:
# Label every token of each row by applying `match_up` row-wise.
training_data["labels"] = training_data.apply(match_up, axis=1)

test_data["labels"] = test_data.apply(match_up, axis=1)

In [14]:
# Preview the training frame with the new `labels` column.
training_data.head()

Unnamed: 0_level_0,input,name,qty,range_end,unit,comment,labels
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
107273,"[Freshly, grate, import, Parmesan, cheese, ,, ...","[Parmesan, cheese]",0.0,0.0,[nan],"[Freshly, grate, import, ,, preferably, parmig...","[COMMENT, COMMENT, COMMENT, NAME, NAME, COMMEN..."
58171,"[1, large, sweet, potato, ,, peel, and, cut, i...","[sweet, potato]",1.0,0.0,[nan],"[large, ,, peel, and, cut, into, 1/2-inch, cube]","[QTY, COMMENT, NAME, NAME, COMMENT, COMMENT, C..."
6569,"[Freshly, grind, black, pepper, to, taste]","[black, pepper]",0.0,0.0,[nan],"[Freshly, grind, to, taste]","[COMMENT, COMMENT, NAME, NAME, COMMENT, COMMENT]"
177211,"[0.25, cup, all, -, purpose, flour]","[all, -, purpose, flour]",0.25,0.0,[cup],[nan],"[QTY, UNIT, NAME, NAME, NAME, NAME]"
33971,"[2, 2-ounce, can, anchovy, fillet, ,, pack, in...","[anchovy, fillet]",4.0,0.0,[ounce],"[2, 2-ounce, can, ,, pack, in, oil]","[COMMENT, COMMENT, COMMENT, NAME, NAME, COMMEN..."


In [15]:
def _bio_labels(row):
    """Pair a row's tokens with their labels and add the B-/I- prefixes."""
    return add_prefixes(zip(row["input"], row["labels"]))


# One prefixed label sequence per row, for each frame.
crf_training_labels = pd.Series(training_data.apply(_bio_labels, axis=1))
crf_test_labels = pd.Series(test_data.apply(_bio_labels, axis=1))

In [16]:
# Write the prefixed label series to the interim data directory.
crf_training_labels.to_pickle("../../data/interim/crf_training_labels.pickle")
crf_test_labels.to_pickle("../../data/interim/crf_test_labels.pickle")