In [1]:
import csv
import unicodedata
import spacy
import pandas as pd
import re
import numpy as np
from decimal import Decimal, InvalidOperation
from fractions import Fraction

# Load the spaCy 'en_core_web_lg' model once for the whole notebook; the
# 'ner' and 'textcat' pipeline components are disabled at load time.
nlp = spacy.load('en_core_web_lg', disable=['ner','textcat'])

In [21]:
def word2features(sent, i):
    """
    Build the feature dict for the token at position ``i`` of ``sent``.

    The dict describes the token itself and, when they exist, its left
    neighbour (keys prefixed with ``-1:``) and its right neighbour (keys
    prefixed with ``+1:``).  A missing left neighbour is flagged with
    ``BOS`` and a missing right neighbour with ``EOS``.

    sent -- indexable sequence of tokens exposing the attributes read
            below (``lemma_``, ``pos_``, ``is_alpha``, ...); this notebook
            passes spaCy-parsed documents.
    i    -- index of the token to describe.

    Returns a dict mapping feature names to feature values.
    """
    token = sent[i]
    features = {
        'bias': 1.0,
        'lemma': token.lemma_,
        'pos': token.pos_,
        'tag': token.tag_,
        'dep': token.dep_,
        'shape': token.shape_,
        'is_alpha': token.is_alpha,
        'is_digit': token.is_digit,
        'is_stop': token.is_stop,
        'is_title': token.is_title,
        'is_punct': token.is_punct,
    }

    # Left context: the previous token, or a beginning-of-sentence marker.
    if i <= 0:
        features['BOS'] = True
    else:
        before = sent[i - 1]
        features['-1:bias'] = 1.0
        features['-1:lemma'] = before.lemma_
        features['-1:pos'] = before.pos_
        features['-1:tag'] = before.tag_
        features['-1:dep'] = before.dep_
        features['-1:shape'] = before.shape_
        features['-1:is_alpha'] = before.is_alpha
        features['-1:is_digit'] = before.is_digit
        features['-1:is_stop'] = before.is_stop
        features['-1:is_title'] = before.is_title
        # Note: the left neighbour records is_left_punct, not is_punct.
        features['-1:is_left_punct'] = before.is_left_punct

    # Right context: the next token, or an end-of-sentence marker.
    if len(sent) - 1 <= i:
        features['EOS'] = True
    else:
        after = sent[i + 1]
        features['+1:bias'] = 1.0
        features['+1:lemma'] = after.lemma_
        features['+1:pos'] = after.pos_
        features['+1:tag'] = after.tag_
        features['+1:dep'] = after.dep_
        features['+1:shape'] = after.shape_
        features['+1:is_alpha'] = after.is_alpha
        features['+1:is_digit'] = after.is_digit
        features['+1:is_stop'] = after.is_stop
        features['+1:is_title'] = after.is_title
        # Note: the right neighbour records is_right_punct, not is_punct.
        features['+1:is_right_punct'] = after.is_right_punct

    return features


def sent2features(sent):
    """Return the list of per-token feature dicts for ``sent``, in token order."""
    collected = []
    for position in range(len(sent)):
        collected.append(word2features(sent, position))
    return collected

def sent2labels(sent):
    """Return the label of every ``(token, postag, label)`` triple in ``sent``."""
    labels = []
    for _token, _postag, label in sent:
        labels.append(label)
    return labels

def sent2tokens(sent):
    """Return the token of every ``(token, postag, label)`` triple in ``sent``."""
    tokens = []
    for token, _postag, _label in sent:
        tokens.append(token)
    return tokens

In [3]:
# TODO: handle abbreviations like "100g" by treating them as "100 grams".
#       Maybe with:
#           s = re.sub(r"(\d+)g", r"\1 grams", s)
#           s = re.sub(r"(\d+)oz", r"\1 ounces", s)



def cleanUnicodeFractions(s):
    r"""
    Replace every unicode vulgar-fraction character in ``s`` with its
    decimal representation.

    When the fraction character directly follows a non-space character
    (as in a mixed number), a space is inserted before the decimal so the
    whole part and the fractional part stay separate tokens.

        "1\u215E cups" => "1 0.875 cups"
        "\u00BD cup"   => "0.5 cup"

    s -- the text to clean.

    Returns the cleaned string; text without fraction characters is
    returned unchanged.
    """
    def _toDecimal(match):
        # unicodedata.numeric gives the character's value as a float.
        value = str(float(Fraction(unicodedata.numeric(match.group(0)))))
        start = match.start()
        # Without the separator "1\u00BD" would turn into "10.5".
        glued = start > 0 and not match.string[start - 1].isspace()
        return " " + value if glued else value

    # re.sub scans the whole string; re.match only looked at position 0, so
    # fractions after the first character were never converted.
    return re.sub(r"[\u2150-\u215E\u00BC-\u00BE]", _toDecimal, s)

def _matchUp(df):
    """
    Returns our best guess of the match between the tags and the
    words from the display text.
    This problem is difficult for the following reasons:
        * not all the words in the display name have associated tags
        * the quantity field is stored as a number, but it appears
          as a string in the display name
        * the comment is often a compilation of different comments in
          the display name
    """
    labels = []

    for token in df["input"]:
        decimalToken = None
        try:
            decimalToken = Decimal(token)
        except InvalidOperation:
            pass
        if token in df['name']:
            labels.append("NAME")
        elif token in df['unit']:
            labels.append("UNIT")
        elif decimalToken is not None and decimalToken == df['qty']:
            labels.append("QTY")
        elif token in df['comment']:
            labels.append("COMMENT")
        elif decimalToken is not None and decimalToken == df['range_end']:
            labels.append("RANGE_END")
        else:
            labels.append("OTHER")
    return labels



def _addPrefixes(data):
    """
    We use BIO tagging/chunking to differentiate between tags
    at the start of a tag sequence and those in the middle. This
    is a common technique in entity recognition.

    Reference: http://www.kdd.cis.ksu.edu/Courses/Spring-2013/CIS798/Handouts/04-ramshaw95text.pdf
    """
    prevTag = None
    newData = []

    for token, tag in data:

        newTag = ""

        p = "B" if ((prevTag is None) or (tag != prevTag)) else "I"
        newTag = "%s-%s" % (p, tag)
        newData.append(newTag)
        prevTag = tag

    return newData



def _parseFraction(text):
    """Return ``Fraction(text)``, or None when ``text`` is not a usable fraction literal."""
    try:
        return Fraction(text)
    except (ValueError, ZeroDivisionError):
        # ValueError: not a number at all; ZeroDivisionError: e.g. "1/0".
        return None


def mergeFractions(sent):
    """
    Convert a leading fraction or mixed number in ``sent`` to a decimal.

    The string is split on whitespace.  If the first token is a number or
    fraction it is replaced by its float representation; if the second
    token is a proper fraction (strictly between 0 and 1) the two are
    merged into one value first:

        "1 1/2 cups flour" => "1.5 cups flour"
        "1/2 cup sugar"    => "0.5 cup sugar"
        "2 8 ounce cans"   => "2.0 8 ounce cans"

    sent -- the text to normalise.

    Returns the tokens re-joined with single spaces (so runs of whitespace
    are collapsed even when nothing is converted).  Empty input returns
    an empty string.
    """
    tokens = sent.split()
    if not tokens:
        return ""

    whole = _parseFraction(tokens[0])
    if whole is None:
        return " ".join(tokens)

    # A single numeric token has no second token to look at.
    part = _parseFraction(tokens[1]) if len(tokens) > 1 else None

    if part is not None and 0 < part < 1:
        tokens[0:2] = [str(float(whole + part))]
    else:
        # A second whole number is a separate quantity ("2 8 ounce cans"),
        # not the fractional part of a mixed number, so it is left alone.
        tokens[0] = str(float(whole))
    return " ".join(tokens)

In [14]:
# Load the raw data and preprocess the "input" text column in one chain.
input_data = (
    pd.read_csv("../data/processed/crf_training_input.csv", nrows=1000)
    .drop(columns=["index"])
    .dropna(axis=0, subset=["input"])
    .assign(
        input=lambda frame: (
            frame["input"]
            # replace unicode fraction characters first ...
            .apply(cleanUnicodeFractions)
            # ... then convert mixed and partial fractions at the beginning
            # of the string to float
            .apply(mergeFractions)
        )
    )
)

In [15]:
# Preview the first ten rows of the preprocessed frame.
input_data.head(10)

Unnamed: 0,input,name,qty,range_end,unit,comment
0,1.25 cups cooked and pureed fresh butternut sq...,butternut squash,1.25,0.0,cup,"cooked and pureed fresh, or 1 10-ounce package..."
1,1.0 cup peeled and cooked fresh chestnuts (abo...,chestnuts,1.0,0.0,cup,"peeled and cooked fresh (about 20), or 1 cup c..."
2,"1.0 medium-size onion, peeled and chopped",onion,1.0,0.0,,"medium-size, peeled and chopped"
3,"2.0 stalks celery, chopped coarse",celery,2.0,0.0,stalk,chopped coarse
4,1.5 tablespoons vegetable oil,vegetable oil,1.5,0.0,tablespoon,
6,"2.0 tablespoons unflavored gelatin, dissolved ...",gelatin,2.0,0.0,tablespoon,"unflavored, dissolved in 1/2 cup water"
7,Salt,Salt,0.0,0.0,,
8,1.0 cup canned plum tomatoes with juice,plum tomatoes,1.0,0.0,cup,"canned, with juice"
9,6.0 cups veal or beef stock,stock,6.0,0.0,cup,veal or beef
10,0.3333333333333333 cup Worcestershire sauce,Worcestershire sauce,0.33,0.0,cup,


In [16]:

# Have spaCy parse every input string with the pipeline loaded in `nlp`, so
# the parsed result can be used to generate features. The "input" column is
# overwritten with the parsed objects. This will take some time.
input_data["input"] = list(nlp.pipe(input_data["input"].astype('unicode').values, batch_size=50))

In [17]:
# Preview the first ten rows again, now that "input" holds the parsed result.
input_data.head(10)

Unnamed: 0,input,name,qty,range_end,unit,comment
0,"(1.25, cups, cooked, and, pureed, fresh, butte...",butternut squash,1.25,0.0,cup,"cooked and pureed fresh, or 1 10-ounce package..."
1,"(1.0, cup, peeled, and, cooked, fresh, chestnu...",chestnuts,1.0,0.0,cup,"peeled and cooked fresh (about 20), or 1 cup c..."
2,"(1.0, medium, -, size, onion, ,, peeled, and, ...",onion,1.0,0.0,,"medium-size, peeled and chopped"
3,"(2.0, stalks, celery, ,, chopped, coarse)",celery,2.0,0.0,stalk,chopped coarse
4,"(1.5, tablespoons, vegetable, oil)",vegetable oil,1.5,0.0,tablespoon,
6,"(2.0, tablespoons, unflavored, gelatin, ,, dis...",gelatin,2.0,0.0,tablespoon,"unflavored, dissolved in 1/2 cup water"
7,(Salt),Salt,0.0,0.0,,
8,"(1.0, cup, canned, plum, tomatoes, with, juice)",plum tomatoes,1.0,0.0,cup,"canned, with juice"
9,"(6.0, cups, veal, or, beef, stock)",stock,6.0,0.0,cup,veal or beef
10,"(0.3333333333333333, cup, Worcestershire, sauce)",Worcestershire sauce,0.33,0.0,cup,


In [25]:
# Show the per-token feature dicts for the row at position 20.
sent2features(input_data.iloc[20]["input"])
                                                  
                                                  

[{'bias': 1.0,
  'lemma': '12.0',
  'pos': 'NUM',
  'tag': 'CD',
  'dep': 'nummod',
  'shape': 'dd.d',
  'is_alpha': False,
  'is_digit': False,
  'is_stop': False,
  'is_title': False,
  'is_punct': False,
  'BOS': True,
  '+1:bias': 1.0,
  '+1:lemma': 'oyster',
  '+1:pos': 'NOUN',
  '+1:tag': 'NNS',
  '+1:dep': 'ROOT',
  '+1:shape': 'xxxx',
  '+1:is_alpha': True,
  '+1:is_digit': False,
  '+1:is_stop': False,
  '+1:is_title': False,
  '+1:is_right_punct': False},
 {'bias': 1.0,
  'lemma': 'oyster',
  'pos': 'NOUN',
  'tag': 'NNS',
  'dep': 'ROOT',
  'shape': 'xxxx',
  'is_alpha': True,
  'is_digit': False,
  'is_stop': False,
  'is_title': False,
  'is_punct': False,
  '-1:bias': 1.0,
  '-1:lemma': '12.0',
  '-1:pos': 'NUM',
  '-1:tag': 'CD',
  '-1:dep': 'nummod',
  '-1:shape': 'dd.d',
  '-1:is_alpha': False,
  '-1:is_digit': False,
  '-1:is_stop': False,
  '-1:is_title': False,
  '-1:is_left_punct': False,
  '+1:bias': 1.0,
  '+1:lemma': 'and',
  '+1:pos': 'CCONJ',
  '+1:tag': 'CC',

In [11]:
# Display the "input" column.
# NOTE(review): this cell's execution count (In [11]) is lower than that of
# the cells above it, so any saved output may come from an earlier run --
# confirm with Restart & Run All.
input_data["input"]

0      [(1.25, NUM, CD, nummod, d.dd, False, False), ...
1      [(1.0, NUM, CD, nummod, d.d, False, False), (c...
2      [(1.0, NUM, CD, nummod, d.d, False, False), (m...
3      [(2.0, NUM, CD, nummod, d.d, False, False), (s...
4      [(1.5, NUM, CD, nummod, d.d, False, False), (t...
                             ...                        
995    [(1.0, NUM, CD, nummod, d.d, False, False), (t...
996    [(0.25, NUM, CD, nummod, d.dd, False, False), ...
997    [(1.5, NUM, CD, nummod, d.d, False, False), (c...
998    [(0.75, NUM, CD, nummod, d.dd, False, False), ...
999    [(0.5, NUM, CD, nummod, d.d, False, False), (c...
Name: input, Length: 998, dtype: object

In [9]:
# Write the training features to the interim data folder as CSV (index=False).
# NOTE(review): `crf_training_data` is not assigned in any cell visible
# here -- confirm where it is built, otherwise this cell fails on a fresh
# kernel (Restart & Run All).
crf_training_data.to_csv("../data/interim/crf_training_features.csv", index=False)

  """Entry point for launching an IPython kernel.
