In [1]:
import spacy
import pandas as pd

In [2]:
# Load spacy NLP model
nlp = spacy.load("en_core_web_lg", disable=["ner", "textcat"])

In [3]:
def word2features(sent, i):

    features = {
        "bias": 1.0,
        "lemma": sent[i].lemma_,
        "pos": sent[i].pos_,
        "tag": sent[i].tag_,
        "dep": sent[i].dep_,
        "shape": sent[i].shape_,
        "is_alpha": sent[i].is_alpha,
        "is_stop": sent[i].is_stop,
        "is_title": sent[i].is_title,
        "like_num": sent[i].like_num,
        "is__left_punct": sent[i].is_left_punct,
        "is__right_punct": sent[i].is_right_punct,
    }
    if i > 0:
        features.update(
            {
                "-1:lemma": sent[i - 1].lemma_,
                "-1:pos": sent[i - 1].pos_,
                "-1:tag": sent[i - 1].tag_,
                "-1:dep": sent[i - 1].dep_,
                "-1:shape": sent[i - 1].shape_,
                "-1:is_alpha": sent[i - 1].is_alpha,
                "-1:is_stop": sent[i - 1].is_stop,
                "-1:is_title": sent[i - 1].is_title,
                "-1:like_num": sent[i - 1].like_num,
                "-1:is_left_punct": sent[i - 1].is_left_punct,
                "-1:is_right_punct": sent[i - 1].is_right_punct,
            }
        )
        if i > 1:
            features.update(
                {
                    "-2:lemma": sent[i - 2].lemma_,
                    "-2:pos": sent[i - 2].pos_,
                    "-2:tag": sent[i - 2].tag_,
                    "-2:dep": sent[i - 2].dep_,
                    "-2:shape": sent[i - 2].shape_,
                    "-2:is_alpha": sent[i - 2].is_alpha,
                    "-2:is_stop": sent[i - 2].is_stop,
                    "-2:is_title": sent[i - 2].is_title,
                    "-2:like_num": sent[i - 2].like_num,
                    "-2:is_left_punct": sent[i - 2].is_left_punct,
                    "-2:is_right_punct": sent[i - 2].is_right_punct,
                }
            )
    else:
        features["BOS"] = True

    if i < len(sent) - 1:
        features.update(
            {
                "+1:lemma": sent[i + 1].lemma_,
                "+1:pos": sent[i + 1].pos_,
                "+1:tag": sent[i + 1].tag_,
                "+1:dep": sent[i + 1].dep_,
                "+1:shape": sent[i + 1].shape_,
                "+1:is_alpha": sent[i + 1].is_alpha,
                "+1:is_stop": sent[i + 1].is_stop,
                "+1:is_title": sent[i + 1].is_title,
                "+1:like_num": sent[i + 1].like_num,
                "+1:is_left_punct": sent[i + 1].is_left_punct,
                "+1:is_right_punct": sent[i + 1].is_right_punct,
            }
        )
        if i < len(sent) - 2:
            features.update(
                {
                    "+2:lemma": sent[i + 2].lemma_,
                    "+2:pos": sent[i + 2].pos_,
                    "+2:tag": sent[i + 2].tag_,
                    "+2:dep": sent[i + 2].dep_,
                    "+2:shape": sent[i + 2].shape_,
                    "+2:is_alpha": sent[i + 2].is_alpha,
                    "+2:is_stop": sent[i + 2].is_stop,
                    "+2:is_title": sent[i + 2].is_title,
                    "+2:like_num": sent[i + 2].like_num,
                    "+2:is_right_punct": sent[i + 2].is_left_punct,
                    "+2:is_right_punct": sent[i + 2].is_right_punct,
                }
            )
    else:
        features["EOS"] = True

    return features

In [12]:
# Load cleaned data
training_data = pd.read_pickle("../data/interim/crf_training_data.pickle")
#test_data = pd.read_pickle("../data/interim/crf_test_data.pickle")

In [5]:
training_data.head()

Unnamed: 0_level_0,input,name,qty,unit,comment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
58015,1 teaspoon salt,salt,1.0,teaspoon,
13113,"0.50 cup bread crumbs, preferably fresh and li...",bread crumbs,0.5,cup,nan preferably fresh and lightly toasted
73732,1 teaspoon freshly ground pepper,freshly ground pepper,1.0,teaspoon,
23005,Salt to taste if desired,Salt,0.0,,to taste if desired
37293,0.50 teaspoon fennel seeds,fennel seeds,0.5,teaspoon,


In [6]:
test_data.head()

Unnamed: 0_level_0,input,name,qty,unit,comment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
53939,2 teaspoons Dijon mustard,Dijon mustard,2.0,teaspoon,
105577,"Leaves from 2 sprigs rosemary, chopped",rosemary,2.0,sprig,"Leaves from, chopped"
152379,Freshly ground pepper to taste,pepper,0.0,,
42496,0.13 teaspoon cayenne pepper,cayenne pepper,0.13,teaspoon,
100989,2 bay leaves,bay leaves,2.0,,


In [13]:

# have spacy parse the input string with the full pipeline to generate features this will take some time
training_data["input"] = list(nlp.pipe(training_data["input"].astype('unicode').values, batch_size=50))

#test_data["input"] = list(nlp.pipe(test_data["input"].astype('unicode').values, batch_size=50))


In [8]:
training_data.head()

Unnamed: 0_level_0,input,name,qty,unit,comment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
58015,"(1, teaspoon, salt)",salt,1.0,teaspoon,
13113,"(0.50, cup, bread, crumbs, ,, preferably, fres...",bread crumbs,0.5,cup,nan preferably fresh and lightly toasted
73732,"(1, teaspoon, freshly, ground, pepper)",freshly ground pepper,1.0,teaspoon,
23005,"(Salt, to, taste, if, desired)",Salt,0.0,,to taste if desired
37293,"(0.50, teaspoon, fennel, seeds)",fennel seeds,0.5,teaspoon,


In [14]:
crf_training_features = training_data["input"].apply(lambda doc: [word2features(doc,i) for i in range(len(doc))])
                                                  
#crf_test_features = test_data["input"].apply(lambda doc: [word2features(doc,i) for i in range(len(doc))])                                                  


In [10]:
crf_training_features[0]

[{'bias': 1.0,
  'lemma': '1.25',
  'pos': 'NUM',
  'tag': 'CD',
  'dep': 'nummod',
  'shape': 'd.dd',
  'is_alpha': False,
  'is_stop': False,
  'is_title': False,
  'like_num': True,
  'is__left_punct': False,
  'is__right_punct': False,
  'BOS': True,
  '+1:lemma': 'cup',
  '+1:pos': 'NOUN',
  '+1:tag': 'NNS',
  '+1:dep': 'nsubj',
  '+1:shape': 'xxxx',
  '+1:is_alpha': True,
  '+1:is_stop': False,
  '+1:is_title': False,
  '+1:like_num': False,
  '+1:is_left_punct': False,
  '+1:is_right_punct': False,
  '+2:lemma': 'cook',
  '+2:pos': 'VERB',
  '+2:tag': 'VBN',
  '+2:dep': 'acl',
  '+2:shape': 'xxxx',
  '+2:is_alpha': True,
  '+2:is_stop': False,
  '+2:is_title': False,
  '+2:like_num': False,
  '+2:is_right_punct': False},
 {'bias': 1.0,
  'lemma': 'cup',
  'pos': 'NOUN',
  'tag': 'NNS',
  'dep': 'nsubj',
  'shape': 'xxxx',
  'is_alpha': True,
  'is_stop': False,
  'is_title': False,
  'like_num': False,
  'is__left_punct': False,
  'is__right_punct': False,
  '-1:lemma': '1.25',


In [15]:
crf_training_features.to_pickle("../data/interim/crf_training_features.pickle")
#crf_test_features.to_pickle("../data/interim/crf_test_features.pickle")