In [41]:
import csv
import unicodedata
import spacy
import pandas as pd
import re
import numpy as np
from decimal import Decimal, InvalidOperation
from fractions import Fraction

nlp = spacy.load('en_core_web_lg', disable=['tagger','parser','ner','textcat'])

In [58]:
## TODO: # handle abbreviation like "100g" by treating it as "100 grams"
#          Maybe with:
#             s = re.sub(r"(\d+)g", r"\1 grams", s)
#             s = re.sub(r"(\d+)oz", r"\1 ounces", s)



def cleanUnicodeFractions(s):
    """
    Replace unicode fractions with ascii representation, preceded by a
    space.

    "1\x215e" => "1 7/8"
    """
    match = re.match(r"([\u2150-\u215E\u00BC-\u00BE])", s)
    if match:
        for m in match.groups():
            s = re.sub(r""+re.escape(m), str(float(Fraction(unicodedata.numeric(m)))), s)
    return s


def _row_to_labels(row):
    """Extracts labels from a labelled ingredient data row.

    Args:
        A row of full data about an ingredient, including input and labels.

    Returns:
        A dictionary of the label data extracted from the row.
    """
    labels = {}
    label_keys = ["name", "qty", "range_end", "unit", "comment"]
    for key in label_keys:
        labels[key] = row[key]
    return labels


def _parseNumbers(s):
    """
    Parses a string that represents a number into a decimal data type so that
    we can match the quantity field in the db with the quantity that appears
    in the display name. Rounds the result to 2 places.
    """

    # Replacess $'s with spaces. The reverse of clumpFractions.
    ss = re.sub(r"\$", " ", s)

    m3 = re.match(r"^\d+$", ss)
    if m3 is not None:
        return Decimal(round(Decimal(ss), 2))

    m1 = re.match(r"(\d+)\s+(\d)/(\d)", ss)
    if m1 is not None:
        num = int(m1.group(1)) + (Decimal(m1.group(2)) / Decimal(m1.group(3)))
        return Decimal(str(round(num, 2)))

    m2 = re.match(r"^(\d)/(\d)$", ss)
    if m2 is not None:
        num = Decimal(m2.group(1)) / Decimal(m2.group(2))
        return Decimal(str(round(num, 2)))

    return None

def _matchUp(df):
    """
    Returns our best guess of the match between the tags and the
    words from the display text.
    This problem is difficult for the following reasons:
        * not all the words in the display name have associated tags
        * the quantity field is stored as a number, but it appears
          as a string in the display name
        * the comment is often a compilation of different comments in
          the display name
    """
    ret = []

    # strip parens from the token, since they often appear in the
    # display_name, but are removed from the comment.
    token = utils.normalizeToken(token)
    decimalToken = _parseNumbers(df["input"])

    # Iterate through the labels in descending order of label importance.
    for label_key in ['name', 'unit', 'qty', 'comment', 'range_end']:
        label_value = df[label_key]
        if isinstance(label_value, basestring):
            for n, vt in enumerate(tokenizer.tokenize(label_value)):
                if utils.normalizeToken(vt) == token:
                    ret.append(label_key.upper())

        elif decimalToken is not None:
            if label_value == decimalToken:
                ret.append(label_key.upper())

    return ret



def _addPrefixes(data):
    """
    We use BIO tagging/chunking to differentiate between tags
    at the start of a tag sequence and those in the middle. This
    is a common technique in entity recognition.

    Reference: http://www.kdd.cis.ksu.edu/Courses/Spring-2013/CIS798/Handouts/04-ramshaw95text.pdf
    """
    prevTag = None
    newData = []

    for n, (token, tag) in enumerate(data):

        newTag = ""

        p = "B" if ((prevTag is None) or (tag == prevTag)) else "I"
        newTag = "%s-%s" % (p, tag)

        newData.append((token, newTag))
        prevTag = tag

    return newData


def _bestTag(tags):

    if len(tags) == 1:
        return tags[0]

    # if there are multiple tags, pick the first which isn't COMMENT
    else:
        for t in tags:
            if (t != "B-COMMENT") and (t != "I-COMMENT"):
                return t

    # we have no idea what to guess
    return "OTHER"

def mergeFractions(sent):
    sent = sent.split()
    try:
        frac1 = Fraction(sent[0])
        try:
            frac2 = Fraction(sent[1])
            sent[0] = str(float(frac1 + frac2))
            del sent[1]
        except ValueError:
            sent[0] = str(float(frac1))
    except ValueError:
        pass
    return " ".join(sent)

In [4]:
# Load raw data and do some preprocessing
input_data = pd.read_csv("../data/processed/crf_training_input.csv")
input_data.drop(["index"], axis=1, inplace=True)
input_data.dropna(axis=0, subset=["input"], inplace=True)
input_data["input"] = input_data["input"].apply(cleanUnicodeFractions) 
# convert mixed and partial fractions at begining of string to float
input_data["input"] = input_data["input"].apply(mergeFractions)

In [5]:
input_data.head(10)

Unnamed: 0,input,name,qty,range_end,unit,comment
0,1.25 cups cooked and pureed fresh butternut sq...,butternut squash,1.25,0.0,cup,"cooked and pureed fresh, or 1 10-ounce package..."
1,1.0 cup peeled and cooked fresh chestnuts (abo...,chestnuts,1.0,0.0,cup,"peeled and cooked fresh (about 20), or 1 cup c..."
2,"1.0 medium-size onion, peeled and chopped",onion,1.0,0.0,,"medium-size, peeled and chopped"
3,"2.0 stalks celery, chopped coarse",celery,2.0,0.0,stalk,chopped coarse
4,1.5 tablespoons vegetable oil,vegetable oil,1.5,0.0,tablespoon,
6,"2.0 tablespoons unflavored gelatin, dissolved ...",gelatin,2.0,0.0,tablespoon,"unflavored, dissolved in 1/2 cup water"
7,Salt,Salt,0.0,0.0,,
8,1.0 cup canned plum tomatoes with juice,plum tomatoes,1.0,0.0,cup,"canned, with juice"
9,6.0 cups veal or beef stock,stock,6.0,0.0,cup,veal or beef
10,0.3333333333333333 cup Worcestershire sauce,Worcestershire sauce,0.33,0.0,cup,


In [6]:
# have spacy parse the input string with the full pipeline to generate features this will take some time
input_data["input"] = list(nlp.pipe(input_data["input"].astype('unicode').values, batch_size=50))


In [7]:
input_data.head()

Unnamed: 0,input,name,qty,range_end,unit,comment
0,"(1.25, cups, cooked, and, pureed, fresh, butte...",butternut squash,1.25,0.0,cup,"cooked and pureed fresh, or 1 10-ounce package..."
1,"(1.0, cup, peeled, and, cooked, fresh, chestnu...",chestnuts,1.0,0.0,cup,"peeled and cooked fresh (about 20), or 1 cup c..."
2,"(1.0, medium, -, size, onion, ,, peeled, and, ...",onion,1.0,0.0,,"medium-size, peeled and chopped"
3,"(2.0, stalks, celery, ,, chopped, coarse)",celery,2.0,0.0,stalk,chopped coarse
4,"(1.5, tablespoons, vegetable, oil)",vegetable oil,1.5,0.0,tablespoon,


In [8]:
# for assigining labels we only need lemmas
input_data["name"] = list(nlp.pipe(input_data["name"].astype('unicode').values, batch_size=50))
input_data["unit"] = list(nlp.pipe(input_data["unit"].astype('unicode').values, batch_size=50))
input_data["comment"] = list(nlp.pipe(input_data["comment"].astype('unicode').values, batch_size=50))


In [9]:
input_data.head()

Unnamed: 0,input,name,qty,range_end,unit,comment
0,"(1.25, cups, cooked, and, pureed, fresh, butte...","(butternut, squash)",1.25,0.0,(cup),"(cooked, and, pureed, fresh, ,, or, 1, 10-ounc..."
1,"(1.0, cup, peeled, and, cooked, fresh, chestnu...",(chestnuts),1.0,0.0,(cup),"(peeled, and, cooked, fresh, (, about, 20, ), ..."
2,"(1.0, medium, -, size, onion, ,, peeled, and, ...",(onion),1.0,0.0,(nan),"(medium, -, size, ,, peeled, and, chopped)"
3,"(2.0, stalks, celery, ,, chopped, coarse)",(celery),2.0,0.0,(stalk),"(chopped, coarse)"
4,"(1.5, tablespoons, vegetable, oil)","(vegetable, oil)",1.5,0.0,(tablespoon),(nan)


In [10]:
input_data["input"] = input_data["input"].apply(lambda doc: [token.lemma_ for token in doc])
input_data["name"] = input_data["name"].apply(lambda doc: [token.lemma_ for token in doc])
input_data["unit"] = input_data["unit"].apply(lambda doc: [token.lemma_ for token in doc])
input_data["comment"] = input_data["comment"].apply(lambda doc: [token.lemma_ for token in doc])


In [26]:
input_data.head()

Unnamed: 0,input,name,qty,range_end,unit,comment
0,"[1.25, cup, cook, and, puree, fresh, butternut...","[butternut, squash]",1.25,0.0,[cup],"[cook, and, puree, fresh, ,, or, 1, 10-ounce, ..."
1,"[1.0, cup, peel, and, cook, fresh, chestnut, (...",[chestnut],1.0,0.0,[cup],"[peel, and, cook, fresh, (, about, 20, ), ,, o..."
2,"[1.0, medium, -, size, onion, ,, peel, and, chop]",[onion],1.0,0.0,[nan],"[medium, -, size, ,, peel, and, chop]"
3,"[2.0, stalk, celery, ,, chop, coarse]",[celery],2.0,0.0,[stalk],"[chop, coarse]"
4,"[1.5, tablespoon, vegetable, oil]","[vegetable, oil]",1.5,0.0,[tablespoon],[nan]


0         [1.25, cups, cooked, and, pureed, fresh, butte...
1         [1.0, cup, peeled, and, cooked, fresh, chestnu...
2         [1.0, medium, -, size, onion, ,, peeled, and, ...
3                 [2.0, stalks, celery, ,, chopped, coarse]
4                        [1.5, tablespoons, vegetable, oil]
                                ...                        
179202                      [0.75, oz, ., pineapple, juice]
179203                   [1.0, tsp, ., fresh, lemon, juice]
179204                                 [Angostura, bitters]
179205                               [Wedge, of, pineapple]
179206                                   [Brandied, cherry]
Name: tokens, Length: 179063, dtype: object

0         [1.25, cups, cooked, and, pureed, fresh, butte...
1         [1.0, cup, peeled, and, cooked, fresh, chestnu...
2         [1.0, medium, -, size, onion, ,, peeled, and, ...
3                 [2.0, stalks, celery, ,, chopped, coarse]
4                        [1.5, tablespoons, vegetable, oil]
                                ...                        
179202                      [0.75, oz, ., pineapple, juice]
179203                   [1.0, tsp, ., fresh, lemon, juice]
179204                                 [Angostura, bitters]
179205                               [Wedge, of, pineapple]
179206                                   [Brandied, cherry]
Name: tokens, Length: 179063, dtype: object

In [59]:
labels = []
print(input_data.iloc[0]["input"])

for token in input_data.iloc[0]["input"]:
    decimalToken = None
    try:
        decimalToken = Decimal(token)
    except InvalidOperation:
        pass
    for label_key in ['name', 'unit', 'qty', 'comment', 'range_end']:
        if isinstance(input_data.iloc[0][label_key], list):
            if token in input_data.iloc[0][label_key]:
                labels.append(label_key.upper())
        elif decimalToken == input_data.iloc[0][label_key]:
            labels.append(label_key.upper())

_addPrefixes([(input_data.iloc[0]["input"], labels)])
    

['1.25', 'cup', 'cook', 'and', 'puree', 'fresh', 'butternut', 'squash', ',', 'or', '1', '10-ounce', 'package', 'freeze', 'squash', ',', 'defrost']


NameError: name 'newTags' is not defined