In [11]:
import spacy
import pandas as pd

In [None]:
# Load spacy NLP model
nlp = spacy.load("en_core_web_lg", disable=["ner", "textcat"])

In [2]:
def word2features(sent, i):

    features = {
        "bias": 1.0,
        "lemma": sent[i].lemma_,
        "pos": sent[i].pos_,
        "tag": sent[i].tag_,
        "dep": sent[i].dep_,
        "shape": sent[i].shape_,
        "is_alpha": sent[i].is_alpha,
        "is_stop": sent[i].is_stop,
        "is_title": sent[i].is_title,
        "is_punct": sent[i].is_punct,
    }
    if i > 0:
        features.update(
            {
                "-1:lemma": sent[i - 1].lemma_,
                "-1:pos": sent[i - 1].pos_,
                "-1:tag": sent[i - 1].tag_,
                "-1:dep": sent[i - 1].dep_,
                "-1:shape": sent[i - 1].shape_,
                "-1:is_alpha": sent[i - 1].is_alpha,
                "-1:is_stop": sent[i - 1].is_stop,
                "-1:is_title": sent[i - 1].is_title,
                "-1:is_left_punct": sent[i - 1].is_left_punct,
            }
        )
        if i > 1:
            features.update(
                {
                    "-2:lemma": sent[i - 2].lemma_,
                    "-2:pos": sent[i - 2].pos_,
                    "-2:tag": sent[i - 2].tag_,
                    "-2:dep": sent[i - 2].dep_,
                    "-2:shape": sent[i - 2].shape_,
                    "-2:is_alpha": sent[i - 2].is_alpha,
                    "-2:is_stop": sent[i - 2].is_stop,
                    "-2:is_title": sent[i - 2].is_title,
                    "-2:is_left_punct": sent[i - 2].is_left_punct,
                }
            )
    else:
        features["BOS"] = True

    if i < len(sent) - 1:
        features.update(
            {
                "+1:lemma": sent[i + 1].lemma_,
                "+1:pos": sent[i + 1].pos_,
                "+1:tag": sent[i + 1].tag_,
                "+1:dep": sent[i + 1].dep_,
                "+1:shape": sent[i + 1].shape_,
                "+1:is_alpha": sent[i + 1].is_alpha,
                "+1:is_stop": sent[i + 1].is_stop,
                "+1:is_title": sent[i + 1].is_title,
                "+1:is_right_punct": sent[i + 1].is_right_punct,
            }
        )
        if i < len(sent) - 2:
            features.update(
                {
                    "+2:lemma": sent[i + 2].lemma_,
                    "+2:pos": sent[i + 2].pos_,
                    "+2:tag": sent[i + 2].tag_,
                    "+2:dep": sent[i + 2].dep_,
                    "+2:shape": sent[i + 2].shape_,
                    "+2:is_alpha": sent[i + 2].is_alpha,
                    "+2:is_stop": sent[i + 2].is_stop,
                    "+2:is_title": sent[i + 2].is_title,
                    "+2:is_right_punct": sent[i + 2].is_right_punct,
                }
            )
    else:
        features["EOS"] = True

    return features

In [3]:
# Load cleaned data
training_data = pd.read_pickle("../data/interim/crf_training_data.pickle")
test_data = pd.read_pickle("../data/interim/crf_test_data.pickle")

In [4]:
training_data.head()

Unnamed: 0_level_0,input,name,qty,range_end,unit,comment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
107273,"Freshly grated imported Parmesan cheese, prefe...",Parmesan cheese,0.0,0.0,,"Freshly grated imported, preferably parmigiano..."
58171,"1 large sweet potato, peeled and cut into 1/2-...",sweet potato,1.0,0.0,,"large, peeled and cut into 1/2-inch cubes"
6569,Freshly ground black pepper to taste,black pepper,0.0,0.0,,Freshly ground to taste
177211,0.25 cup all-purpose flour,all-purpose flour,0.25,0.0,cup,
33971,"2 2-ounce cans anchovy fillets, packed in oil",anchovy fillets,4.0,0.0,ounce,"2 2-ounce cans, packed in oil"


In [5]:
test_data.head()

Unnamed: 0_level_0,input,name,qty,range_end,unit,comment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
102201,1 cup raw rice,rice,1.0,0.0,cup,raw
149431,1/2 teaspoon salt,salt,0.5,0.0,teaspoon,
50733,1 cup heavy cream,heavy cream,1.0,0.0,cup,
66931,"2 cloves garlic, peeled and minced",garlic,2.0,0.0,clove,peeled and minced
78296,1/4 cup Marsala wine,Marsala wine,0.25,0.0,cup,


In [6]:

# have spacy parse the input string with the full pipeline to generate features this will take some time
training_data["input"] = list(nlp.pipe(training_data["input"].astype('unicode').values, batch_size=50))

test_data["input"] = list(nlp.pipe(test_data["input"].astype('unicode').values, batch_size=50))


In [7]:
training_data.head()

Unnamed: 0_level_0,input,name,qty,range_end,unit,comment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
107273,"(Freshly, grated, imported, Parmesan, cheese, ...",Parmesan cheese,0.0,0.0,,"Freshly grated imported, preferably parmigiano..."
58171,"(1, large, sweet, potato, ,, peeled, and, cut,...",sweet potato,1.0,0.0,,"large, peeled and cut into 1/2-inch cubes"
6569,"(Freshly, ground, black, pepper, to, taste)",black pepper,0.0,0.0,,Freshly ground to taste
177211,"(0.25, cup, all, -, purpose, flour)",all-purpose flour,0.25,0.0,cup,
33971,"(2, 2-ounce, cans, anchovy, fillets, ,, packed...",anchovy fillets,4.0,0.0,ounce,"2 2-ounce cans, packed in oil"


In [8]:
crf_training_features = training_data["input"].apply(lambda doc: [word2features(doc,i) for i in range(len(doc))])
                                                  
crf_test_features = test_data["input"].apply(lambda doc: [word2features(doc,i) for i in range(len(doc))])                                                  


In [9]:
crf_training_features[0]

[{'bias': 1.0,
  'lemma': '1.25',
  'pos': 'NUM',
  'tag': 'CD',
  'dep': 'nummod',
  'shape': 'd.dd',
  'is_alpha': False,
  'is_stop': False,
  'is_title': False,
  'is_punct': False,
  'BOS': True,
  '+1:lemma': 'cup',
  '+1:pos': 'NOUN',
  '+1:tag': 'NNS',
  '+1:dep': 'ROOT',
  '+1:shape': 'xxxx',
  '+1:is_alpha': True,
  '+1:is_stop': False,
  '+1:is_title': False,
  '+1:is_right_punct': False,
  '+2:lemma': 'cook',
  '+2:pos': 'VERB',
  '+2:tag': 'VBN',
  '+2:dep': 'acl',
  '+2:shape': 'xxxx',
  '+2:is_alpha': True,
  '+2:is_stop': False,
  '+2:is_title': False,
  '+2:is_right_punct': False},
 {'bias': 1.0,
  'lemma': 'cup',
  'pos': 'NOUN',
  'tag': 'NNS',
  'dep': 'ROOT',
  'shape': 'xxxx',
  'is_alpha': True,
  'is_stop': False,
  'is_title': False,
  'is_punct': False,
  '-1:lemma': '1.25',
  '-1:pos': 'NUM',
  '-1:tag': 'CD',
  '-1:dep': 'nummod',
  '-1:shape': 'd.dd',
  '-1:is_alpha': False,
  '-1:is_stop': False,
  '-1:is_title': False,
  '-1:is_left_punct': False,
  '+1:l

In [10]:
crf_training_features.to_pickle("../data/interim/crf_training_features.pickle")
crf_test_features.to_pickle("../data/interim/crf_test_features.pickle")