In [1]:
# TODO DELETE ME
%load_ext nb_black

<IPython.core.display.Javascript object>

In [2]:
# TODO

# find accuracy of Base model (using Spacy Root Verb Dependency)
# find accuracy using vanilla CRF Suite
# explore features and hyperparameters of CRF Suite Algo
# find accuracy using tweaked CRF Suite
# display results using Displacy
# interpret results using eli5

<IPython.core.display.Javascript object>

In [3]:
import warnings

warnings.filterwarnings("ignore")

<IPython.core.display.Javascript object>

In [4]:
import pandas as pd
import math

# to set random seed
import numpy as np

# used to create CRF model
from sklearn_crfsuite import CRF

# used to evaluate model
from sklearn_crfsuite import metrics

# for hyperparameter tuning parameter range
import scipy

# for specifying f1 metrics
from sklearn.metrics import make_scorer

# for cross validation of hyperparameters
from sklearn.model_selection import RandomizedSearchCV

# to visualize the weight of parameters of the fitted model
import eli5

<IPython.core.display.Javascript object>

In [5]:
pd.set_option("max_row", 600)

<IPython.core.display.Javascript object>

In [6]:
np.random.seed(42)

<IPython.core.display.Javascript object>

In [7]:
data = pd.read_csv("tagged_data.csv", index_col=0)

<IPython.core.display.Javascript object>

In [8]:
data.head(10)

Unnamed: 0,sentence#,word,pos,tag
0,0.0,Preheat,VB,U-Action
1,0.0,oven,NN,U-Utensil
2,0.0,to,IN,O
3,0.0,425,CD,O
4,0.0,degrees,NNS,O
5,0.0,F.,NN,O
6,1.0,Press,NN,U-Action
7,1.0,dough,NN,O
8,1.0,into,IN,O
9,1.0,the,DT,O


<IPython.core.display.Javascript object>

In [9]:
data["tag"].value_counts()

O               2498
U-Action         366
U-Ingredient     241
B-Ingredient      70
L-Ingredient      70
B-Utensil         60
L-Utensil         59
U-Utensil         59
I-Utensil         15
I-Ingredient       3
B-Action           2
L-Action           2
Name: tag, dtype: int64

<IPython.core.display.Javascript object>

In [10]:
print(len(data["word"].values))

3445


<IPython.core.display.Javascript object>

In [11]:
words = list(set(data["word"].values))
len(words)

869

<IPython.core.display.Javascript object>

In [12]:
agg_func = lambda s: [
    (w, p, t)
    for w, p, t in zip(
        s["word"].values.tolist(), s["pos"].values.tolist(), s["tag"].values.tolist()
    )
]

<IPython.core.display.Javascript object>

In [13]:
grouped = data.groupby("sentence#").apply(agg_func)

<IPython.core.display.Javascript object>

In [14]:
sentences = [s for s in grouped]

<IPython.core.display.Javascript object>

In [15]:
len(sentences)

264

<IPython.core.display.Javascript object>

In [16]:
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]

    features = {
        # set the bias for each word
        "bias": 1.0,
        # get the lower case form of the word
        "word.lower()": word.lower(),
        # get last 3 letters for the word
        "word[-3:]": word[-3:],
        # get last 2 letters for the word
        "word[-2:]": word[-2:],
        # check whether the word is uppercase or not
        "word.isupper()": word.isupper(),
        # check whether the word is title case or not
        "word.istitle()": word.istitle(),
        # check whether the word is digit or not, useful to identifying quantities which will be tagged as 'O'
        "word.isdigit()": word.isdigit(),
        # specifying the pos for word
        "postag": postag,
        # get first 2 letters for the POS tag
        "postag[:2]": postag[:2],
        # checking if the word is alphabet or not
        "isalpha()": not word.isalpha(),
    }

    # if word is starting of sentence
    if i > 0:

        # if word is not the beginning of sentence
        # then get the word before it i.e. i-1 index
        word1 = sent[i - 1][0]

        # then get the pos before it i.e. i-1 index
        postag1 = sent[i - 1][1]

        features.update(
            {
                # setting the lower form of word at index i-1
                "-1:word.lower()": word1.lower(),
                # checking if the word at index i-1 is titlecase
                "-1:word.istitle()": word1.istitle(),
                # checking if the word at index i-1 is uppercase
                "-1:word.isupper()": word1.isupper(),
                # setting the pos of word at index i-1
                "-1:postag": postag1,
                # get first 2 letters for the POS tag for i-1 indexed word
                "-1:postag[:2]": postag1[:2],
            }
        )
    else:
        # setting the BOS or Begining of sentence to True
        features["BOS"] = True

    # if word is at the end of sentence
    if i < len(sent) - 1:

        # if word is not the end of sentence
        # then get the word after it i.e. i+1 index
        word1 = sent[i + 1][0]

        # then get the pos after it i.e. i+1 index
        postag1 = sent[i + 1][1]

        features.update(
            {
                # setting the lower form of word at index i+1
                "+1:word.lower()": word1.lower(),
                # checking if the word at index i+1 is titlecase
                "+1:word.istitle()": word1.istitle(),
                # checking if the word at index i+1 is titlecase
                "+1:word.isupper()": word1.isupper(),
                # setting the pos of word at index i+1
                "+1:postag": postag1,
                # get first 2 letters for the POS tag for i+1 indexed word
                "+1:postag[:2]": postag1[:2],
            }
        )
    else:
        # setting the EOS or End of sentence to True
        features["EOS"] = True

    return features

<IPython.core.display.Javascript object>

In [17]:
def sent2features(sent):
    """Convert sentences which are lists containing (w, p, t) into features"""
    return [word2features(sent, i) for i in range(len(sent))]

<IPython.core.display.Javascript object>

In [18]:
def sent2labels(sent):
    """Retrieve all the labels from sentences which are lists containing (w, p, t)"""
    return [label for token, postag, label in sent]

<IPython.core.display.Javascript object>

In [19]:
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

<IPython.core.display.Javascript object>

In [20]:
# split into train and test
boundary = math.ceil(len(X) * 0.8)

<IPython.core.display.Javascript object>

In [21]:
boundary

212

<IPython.core.display.Javascript object>

In [22]:
# train data
x_train = X[:boundary]
y_train = y[:boundary]

# test data
x_test = X[boundary:]
y_test = y[boundary:]

<IPython.core.display.Javascript object>

In [23]:
data[data["word"] == "("]

Unnamed: 0,sentence#,word,pos,tag
150,14.0,(,-LRB-,O
284,28.0,(,-LRB-,O
383,37.0,(,-LRB-,O
411,38.0,(,-LRB-,O
423,38.0,(,-LRB-,O
764,63.0,(,-LRB-,O
829,68.0,(,-LRB-,O
1123,95.0,(,-LRB-,O
1309,111.0,(,-LRB-,O
1367,117.0,(,-LRB-,O


<IPython.core.display.Javascript object>

In [24]:
print(len(x_train))
print(len(x_test))

212
52


<IPython.core.display.Javascript object>

In [25]:
# creating CRF model with Gradient Descent
crf = CRF(
    algorithm="lbfgs",
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=False,
)

<IPython.core.display.Javascript object>

In [26]:
# fitting the model using train data
crf.fit(x_train, y_train)

CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=False,
    averaging=None, c=None, c1=0.1, c2=0.1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

<IPython.core.display.Javascript object>

In [27]:
# to get all the labels/tags of data
labels = list(crf.classes_)

<IPython.core.display.Javascript object>

In [28]:
labels

['U-Action',
 'U-Utensil',
 'O',
 'B-Utensil',
 'L-Utensil',
 'U-Ingredient',
 'B-Ingredient',
 'L-Ingredient',
 'I-Utensil',
 'I-Ingredient',
 'B-Action',
 'L-Action']

<IPython.core.display.Javascript object>

Since we are not interested in 'O' tags we will check the performance of the CRF model using f1 scores for every tag except O tags.

In [29]:
labels.remove("O")
labels

['U-Action',
 'U-Utensil',
 'B-Utensil',
 'L-Utensil',
 'U-Ingredient',
 'B-Ingredient',
 'L-Ingredient',
 'I-Utensil',
 'I-Ingredient',
 'B-Action',
 'L-Action']

<IPython.core.display.Javascript object>

In [30]:
# performing predictions based on the fitted model
y_pred = crf.predict(x_test)

<IPython.core.display.Javascript object>

In [31]:
# finding the f1 score
metrics.flat_f1_score(y_test, y_pred, average="weighted", labels=labels)

0.719570446804729

<IPython.core.display.Javascript object>

In [32]:
# finding the f1 score
print(metrics.flat_classification_report(y_test, y_pred, labels=labels, digits=3))

              precision    recall  f1-score   support

    U-Action      0.911     0.878     0.894        82
   U-Utensil      1.000     0.412     0.583        17
   B-Utensil      1.000     0.231     0.375        13
   L-Utensil      1.000     0.308     0.471        13
U-Ingredient      0.738     0.689     0.713        45
B-Ingredient      0.474     0.750     0.581        12
L-Ingredient      0.474     0.750     0.581        12
   I-Utensil      1.000     0.333     0.500         3
I-Ingredient      0.000     0.000     0.000         0
    B-Action      0.000     0.000     0.000         0
    L-Action      0.000     0.000     0.000         0

   micro avg      0.782     0.690     0.733       197
   macro avg      0.600     0.395     0.427       197
weighted avg      0.839     0.690     0.720       197



<IPython.core.display.Javascript object>

The model overfits!!

In [33]:
len(data[((data["word"] == "Boil") | (data["word"] == "boil"))])

13

<IPython.core.display.Javascript object>

In [34]:
len(data[((data["word"] == "Cover") | (data["word"] == "cover"))])

17

<IPython.core.display.Javascript object>

In [35]:
len(data[((data["word"] == "Preheat") | (data["word"] == "preheat"))])

9

<IPython.core.display.Javascript object>

In [36]:
eli5.show_weights(crf, top=30)

From \ To,O,B-Action,L-Action,U-Action,B-Ingredient,I-Ingredient,L-Ingredient,U-Ingredient,B-Utensil,I-Utensil,L-Utensil,U-Utensil
O,3.158,0.295,0.0,1.464,1.237,0.0,0.0,2.113,2.03,0.0,0.0,1.397
B-Action,0.0,0.0,2.485,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
L-Action,0.247,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U-Action,1.561,0.0,0.0,0.0,0.837,0.0,0.0,1.543,0.0,0.0,0.0,1.573
B-Ingredient,0.0,0.0,0.0,0.0,0.0,1.419,5.739,0.0,0.0,0.0,0.0,0.0
I-Ingredient,-0.141,0.0,0.0,0.0,0.0,0.0,1.432,0.0,0.0,0.0,0.0,0.0
L-Ingredient,1.034,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U-Ingredient,1.712,0.0,0.0,0.191,0.655,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-Utensil,-1.289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.349,4.245,0.0
I-Utensil,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.556,3.695,0.0

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11
+3.189,bias,,,,,,,,,,
+2.513,isalpha(),,,,,,,,,,
+2.074,+1:word.lower():of,,,,,,,,,,
+1.948,word.lower():hour,,,,,,,,,,
+1.819,-1:word.lower():salted,,,,,,,,,,
+1.783,word.lower():let,,,,,,,,,,
+1.751,postag:VBZ,,,,,,,,,,
+1.743,word.lower():depending,,,,,,,,,,
+1.684,+1:word.lower()::,,,,,,,,,,
+1.624,word.lower():boil,,,,,,,,,,

Weight?,Feature
+3.189,bias
+2.513,isalpha()
+2.074,+1:word.lower():of
+1.948,word.lower():hour
+1.819,-1:word.lower():salted
+1.783,word.lower():let
+1.751,postag:VBZ
+1.743,word.lower():depending
+1.684,+1:word.lower()::
+1.624,word.lower():boil

Weight?,Feature
1.024,-1:word.lower():and
0.865,-1:postag:CC
0.865,-1:postag[:2]:CC
0.719,+1:word.lower():up
0.715,word.lower():roll
0.715,word[-3:]:oll
0.713,+1:word.lower():lightly
0.683,+1:postag:RP
0.683,+1:postag[:2]:RP
0.668,word[-2:]:ll

Weight?,Feature
0.915,-1:word.lower():brown
0.856,word.lower():lightly
0.798,-1:word.lower():roll
0.763,word[-3:]:tly
0.716,word[-2:]:ly
0.695,postag[:2]:RB
0.666,postag:RB
0.653,word[-3:]:up
0.653,word.lower():up
0.628,postag[:2]:RP

Weight?,Feature
+2.646,postag[:2]:VB
+2.369,word.istitle()
+2.156,-1:word.lower():before
+2.075,-1:word.lower():teaspoonful
+2.043,word.lower():boil
+1.970,word[-3:]:oil
+1.828,postag:VB
+1.810,word.lower():stirring
+1.791,+1:word.lower():ground
+1.749,-1:word.lower():continue

Weight?,Feature
+1.647,+1:postag[:2]:NN
+1.397,+1:word.lower():cream
+0.992,word[-2:]:ed
+0.965,+1:word.lower():cheese
+0.961,-1:word.lower():with
+0.929,word[-3:]:und
+0.929,word.lower():ground
+0.920,+1:word.lower():sugar
+0.857,+1:word.lower():roll
+0.835,+1:postag:NNS

Weight?,Feature
1.892,word.lower():water
1.372,word[-3:]:ter
1.178,+1:word.lower():has
1.087,-1:word.lower():of
1.013,+1:postag:VBZ
0.981,+1:word.lower():wrapper
0.98,word.lower():roll
0.98,word[-3:]:oll
0.949,word[-2:]:ll
0.94,-1:word.lower():egg

Weight?,Feature
+0.966,-1:word.lower():ground
+0.904,word[-2:]:am
+0.872,word[-3:]:eam
+0.872,word.lower():cream
+0.825,word[-2:]:ck
+0.766,word.lower():sugar
+0.766,word[-3:]:gar
+0.761,-1:postag[:2]:NN
+0.748,word.lower():chiles
+0.747,-1:word.lower():seasoned

Weight?,Feature
+3.075,word.lower():oil
+2.571,-1:word.lower():the
+2.174,word.lower():chicken
+2.101,word.lower():flour
+2.037,word.lower():potatoes
+2.037,word[-3:]:oes
+1.911,word.lower():butter
+1.904,word[-3:]:ken
+1.861,word.lower():loaves
+1.747,postag[:2]:NN

Weight?,Feature
+1.753,-1:word.lower():a
+1.201,word[-2:]:um
+1.128,-1:word.lower():or
+1.054,+1:word.lower():low
+1.033,+1:word.lower():pot
+0.978,+1:postag[:2]:JJ
+0.976,+1:postag:JJ
+0.878,word[-3:]:low
+0.878,word[-2:]:ow
+0.779,word.lower():paper

Weight?,Feature
+1.635,+1:postag[:2]:NN
+0.974,-1:word.lower():glass
+0.963,-1:postag:JJ
+0.906,-1:postag[:2]:JJ
+0.904,+1:word.lower():sauce
+0.528,word[-2:]:up
+0.467,word.lower():sauce
+0.456,word[-3:]:uce
+0.393,+1:word.lower():dish
+0.376,word.lower():baking

Weight?,Feature
+1.286,word[-3:]:pan
+1.189,-1:word.lower():large
+1.099,word[-2:]:an
+0.951,-1:word.lower():soup
+0.927,word[-2:]:ls
+0.850,postag[:2]:NN
+0.838,word[-3:]:pot
+0.838,word.lower():pot
+0.823,word[-2:]:ot
+0.782,-1:postag:NN

Weight?,Feature
+2.383,word.lower():blender
+1.993,word.lower():pans
+1.962,word.lower():jars
+1.962,word[-3:]:ars
+1.617,word.lower():oven
+1.579,word[-3:]:ven
+1.578,-1:word.lower():serving
+1.556,-1:word.lower():between
+1.536,+1:word.lower():wrap
+1.484,-1:word.lower():1


<IPython.core.display.Javascript object>

**Hyperparameter Optimization**

In [37]:
# creating a CRF Hyperparameter tuned model
crf_hp = CRF(
    algorithm="lbfgs", c1=10, c2=0.1, max_iterations=100, all_possible_transitions=False
)

# parameters to hypertune
params_space = {"c1": scipy.stats.expon(scale=0.5), "c2": scipy.stats.expon(scale=0.05)}

# metric for evaluation
f1_scorer = make_scorer(metrics.flat_f1_score, average="weighted", labels=labels)

# search using 3 fold cross validation
rs = RandomizedSearchCV(
    crf_hp, params_space, cv=3, verbose=1, n_jobs=-1, n_iter=50, scoring=f1_scorer
)

<IPython.core.display.Javascript object>

In [38]:
# fitting the hyperparameters
rs.fit(x_train, y_train)

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   14.1s
[Parallel(n_jobs=-1)]: Done 150 out of 150 | elapsed:  1.0min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=CRF(algorithm='lbfgs', all_possible_states=None,
                                 all_possible_transitions=False, averaging=None,
                                 c=None, c1=10, c2=0.1,
                                 calibration_candidates=None,
                                 calibration_eta=None,
                                 calibration_max_trials=None,
                                 calibration_rate=None,
                                 calibration_samples=None, delta=None,
                                 epsilon=None, error_sensitive=None, gamma=None,
                                 keep_te...
                                        'c2': <scipy.stats._distn_infrastructure.rv_frozen object at 0x000001995A591288>},
                   pre_dispatch='2*n_jobs', random_state=None, refit=True,
                   return_train_score=False,
                   scoring=make_scorer(flat_f1_score, average=weighte

<IPython.core.display.Javascript object>

Best Parameters:

In [39]:
print("best params:", rs.best_params_)
print("best CV score:", rs.best_score_)

best params: {'c1': 0.010399653999569311, 'c2': 0.1751778737579156}
best CV score: 0.7246940731524202


<IPython.core.display.Javascript object>

In [40]:
# set the best estimator
crf_hp_tuned = rs.best_estimator_

<IPython.core.display.Javascript object>

In [41]:
# predict using the best CRF model
y_pred = crf_hp_tuned.predict(x_test)

<IPython.core.display.Javascript object>

In [42]:
# print the f1 evaluation metric
print(metrics.flat_classification_report(y_test, y_pred, labels=labels, digits=3))

              precision    recall  f1-score   support

    U-Action      0.911     0.878     0.894        82
   U-Utensil      1.000     0.412     0.583        17
   B-Utensil      1.000     0.385     0.556        13
   L-Utensil      1.000     0.462     0.632        13
U-Ingredient      0.750     0.733     0.742        45
B-Ingredient      0.500     0.667     0.571        12
L-Ingredient      0.500     0.667     0.571        12
   I-Utensil      1.000     0.333     0.500         3
I-Ingredient      0.000     0.000     0.000         0
    B-Action      0.000     0.000     0.000         0
    L-Action      0.000     0.000     0.000         0

   micro avg      0.805     0.711     0.755       197
   macro avg      0.606     0.412     0.459       197
weighted avg      0.845     0.711     0.748       197



<IPython.core.display.Javascript object>

In [43]:
# transition features: given B-Utensil then what is score for I-Utensil

<IPython.core.display.Javascript object>

In [44]:
# state features: what property of feature describes the tag

<IPython.core.display.Javascript object>

In [45]:
eli5.show_weights(crf_hp_tuned, top=30)

From \ To,O,B-Action,L-Action,U-Action,B-Ingredient,I-Ingredient,L-Ingredient,U-Ingredient,B-Utensil,I-Utensil,L-Utensil,U-Utensil
O,3.052,0.358,0.0,1.413,1.157,0.0,0.0,1.882,1.928,0.0,0.0,1.172
B-Action,0.0,0.0,1.876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
L-Action,0.245,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U-Action,1.364,0.0,0.0,0.0,0.845,0.0,0.0,1.217,0.0,0.0,0.0,1.315
B-Ingredient,0.0,0.0,0.0,0.0,0.0,1.128,5.164,0.0,0.0,0.0,0.0,0.0
I-Ingredient,-0.347,0.0,0.0,0.0,0.0,0.0,1.103,0.0,0.0,0.0,0.0,0.0
L-Ingredient,0.982,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
U-Ingredient,1.614,0.0,0.0,0.358,0.647,0.0,0.0,0.0,0.0,0.0,0.0,0.0
B-Utensil,-1.28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.566,3.548,0.0
I-Utensil,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.993,2.986,0.0

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11
+2.788,bias,,,,,,,,,,
+1.910,isalpha(),,,,,,,,,,
+1.828,+1:word.lower():of,,,,,,,,,,
+1.670,word.lower():hour,,,,,,,,,,
+1.473,word.lower():boil,,,,,,,,,,
+1.440,word.lower():depending,,,,,,,,,,
+1.399,-1:word.lower():salted,,,,,,,,,,
+1.387,word.lower():let,,,,,,,,,,
+1.379,postag[:2]:IN,,,,,,,,,,
+1.379,postag:IN,,,,,,,,,,

Weight?,Feature
+2.788,bias
+1.910,isalpha()
+1.828,+1:word.lower():of
+1.670,word.lower():hour
+1.473,word.lower():boil
+1.440,word.lower():depending
+1.399,-1:word.lower():salted
+1.387,word.lower():let
+1.379,postag[:2]:IN
+1.379,postag:IN

Weight?,Feature
0.834,-1:word.lower():and
0.767,-1:postag:CC
0.767,-1:postag[:2]:CC
0.71,+1:word.lower():lightly
0.634,word.lower():roll
0.634,word[-3:]:oll
0.631,+1:word.lower():up
0.623,word[-2:]:ll
0.622,word.lower():brown
0.617,word[-2:]:wn

Weight?,Feature
0.869,-1:word.lower():brown
0.809,word.lower():lightly
0.771,word[-3:]:tly
0.715,word[-2:]:ly
0.69,-1:word.lower():roll
0.648,postag[:2]:RB
0.642,postag:RB
0.608,word.lower():up
0.608,word[-3:]:up
0.585,postag[:2]:RP

Weight?,Feature
+2.250,postag[:2]:VB
+1.958,word.istitle()
+1.809,postag:VB
+1.763,-1:word.lower():before
+1.749,word.lower():boil
+1.623,-1:word.lower():teaspoonful
+1.547,word[-3:]:oil
+1.536,word.lower():cover
+1.476,BOS
+1.462,+1:word.lower():ground

Weight?,Feature
+1.528,+1:postag[:2]:NN
+1.105,+1:word.lower():cream
+0.957,word[-2:]:ed
+0.828,+1:word.lower():cheese
+0.792,+1:word.lower():sugar
+0.779,word[-3:]:und
+0.779,word.lower():ground
+0.756,-1:word.lower():with
+0.747,+1:word.lower():roll
+0.732,+1:postag:NNS

Weight?,Feature
1.554,word.lower():water
1.2,word[-3:]:ter
1.073,-1:word.lower():of
0.884,+1:word.lower():has
0.835,word[-2:]:ll
0.835,+1:word.lower():wrapper
0.829,word.lower():roll
0.829,word[-3:]:oll
0.824,+1:postag:VBZ
0.823,word[-2:]:er

Weight?,Feature
+0.841,-1:word.lower():ground
+0.835,word[-3:]:eam
+0.835,word.lower():cream
+0.808,word[-2:]:am
+0.771,-1:postag[:2]:NN
+0.720,word[-3:]:gar
+0.720,word.lower():sugar
+0.705,word.lower():wrapper
+0.703,word[-2:]:ar
+0.690,-1:word.lower():roll

Weight?,Feature
+2.500,word.lower():oil
+2.198,-1:word.lower():the
+1.725,word.lower():potatoes
+1.725,word[-3:]:oes
+1.683,word.lower():butter
+1.682,word.lower():flour
+1.649,word.lower():chicken
+1.533,word[-3:]:ken
+1.492,postag[:2]:NN
+1.440,word.lower():pepper

Weight?,Feature
+1.250,-1:word.lower():a
+1.052,word[-2:]:um
+0.966,+1:word.lower():low
+0.952,+1:postag[:2]:JJ
+0.932,+1:postag:JJ
+0.915,+1:word.lower():pot
+0.811,-1:word.lower():or
+0.797,word[-2:]:ow
+0.788,word[-3:]:low
+0.758,+1:postag[:2]:NN

Weight?,Feature
+1.142,+1:postag[:2]:NN
+0.876,-1:postag:JJ
+0.846,-1:postag[:2]:JJ
+0.810,-1:word.lower():glass
+0.728,+1:word.lower():sauce
+0.507,word[-2:]:up
+0.506,+1:postag:NNS
+0.473,word.lower():sauce
+0.463,word[-3:]:uce
+0.454,+1:word.lower():dish

Weight?,Feature
+1.015,word[-3:]:pan
+1.000,-1:word.lower():large
+0.947,word[-2:]:an
+0.855,-1:postag:NN
+0.829,-1:word.lower():soup
+0.813,word.lower():pot
+0.813,word[-3:]:pot
+0.797,word[-2:]:ot
+0.768,word[-2:]:ls
+0.668,-1:word.lower():paper

Weight?,Feature
+1.920,word.lower():blender
+1.642,word.lower():pans
+1.551,word[-3:]:ars
+1.551,word.lower():jars
+1.444,word.lower():oven
+1.415,word[-3:]:ven
+1.359,+1:word.lower():wrap
+1.333,word[-3:]:der
+1.321,word[-2:]:ag
+1.321,word[-3:]:bag


<IPython.core.display.Javascript object>

## Untagged Data Stuff

In [46]:
untagged_test_data = pd.read_csv("./Untagged Test Data/untagged_test_data.csv")

<IPython.core.display.Javascript object>

In [47]:
untagged_test_data.head()

Unnamed: 0,recipe_name,Step#,word,pos
0,homemade vegetable soup from a can,0.0,Combine,VB
1,homemade vegetable soup from a can,0.0,all,DT
2,homemade vegetable soup from a can,0.0,ingredients,NNS
3,homemade vegetable soup from a can,0.0,in,IN
4,homemade vegetable soup from a can,0.0,large,JJ


<IPython.core.display.Javascript object>

In [48]:
len(untagged_test_data)

3364

<IPython.core.display.Javascript object>

In [49]:
# 30:5:30

<IPython.core.display.Javascript object>

In [50]:
agg_func_test = lambda s: [
    (w, p) for w, p in zip(s["word"].values.tolist(), s["pos"].values.tolist())
]

<IPython.core.display.Javascript object>

In [51]:
grouped_test = untagged_test_data.groupby(["recipe_name", "Step#"]).apply(agg_func_test)

<IPython.core.display.Javascript object>

In [52]:
test_sentences = [s for s in grouped_test]

<IPython.core.display.Javascript object>

In [53]:
len(test_sentences)

244

<IPython.core.display.Javascript object>

In [54]:
X_untagged_test = [sent2features(s) for s in test_sentences]

<IPython.core.display.Javascript object>

In [55]:
len(X_untagged_test)

244

<IPython.core.display.Javascript object>

In [56]:
pred_test = crf_hp_tuned.predict(X_untagged_test)

<IPython.core.display.Javascript object>

In [57]:
len(pred_test)

244

<IPython.core.display.Javascript object>

In [58]:
count = 0
for l in pred_test:
    for tag in l:
        if tag == "U-Action":
            count += 1

<IPython.core.display.Javascript object>

In [59]:
count

318

<IPython.core.display.Javascript object>

In [60]:
len(list(untagged_test_data["word"].values))

3364

<IPython.core.display.Javascript object>

In [61]:
len(list(set(untagged_test_data["word"].values)))

824

<IPython.core.display.Javascript object>

Boostrap output to the untagged data df

In [62]:
flat_list = []
for sublist in pred_test:
    for item in sublist:
        flat_list.append(item)

<IPython.core.display.Javascript object>

In [63]:
len(flat_list)

3364

<IPython.core.display.Javascript object>

In [64]:
untagged_test_data["Predicted Output"] = flat_list

<IPython.core.display.Javascript object>

In [65]:
len(untagged_test_data[untagged_test_data["Predicted Output"] == "U-Action"])

318

<IPython.core.display.Javascript object>

In [66]:
untagged_test_data[untagged_test_data["Predicted Output"] == "U-Action"]

Unnamed: 0,recipe_name,Step#,word,pos,Predicted Output
0,homemade vegetable soup from a can,0.0,Combine,VB,U-Action
7,homemade vegetable soup from a can,1.0,Bring,VB,U-Action
10,homemade vegetable soup from a can,1.0,boil,NN,U-Action
12,homemade vegetable soup from a can,1.0,simmer,VB,U-Action
17,homemade vegetable soup from a can,2.0,Serve,VB,U-Action
52,homemade vegetable soup from a can,4.0,add,VB,U-Action
64,how i got my family to eat spinach spinach cas...,0.0,Preheat,VB,U-Action
70,how i got my family to eat spinach spinach cas...,1.0,Place,NN,U-Action
75,how i got my family to eat spinach spinach cas...,1.0,squeeze,VB,U-Action
87,how i got my family to eat spinach spinach cas...,2.0,combine,VB,U-Action


<IPython.core.display.Javascript object>

In [67]:
data.head()

Unnamed: 0,sentence#,word,pos,tag
0,0.0,Preheat,VB,U-Action
1,0.0,oven,NN,U-Utensil
2,0.0,to,IN,O
3,0.0,425,CD,O
4,0.0,degrees,NNS,O


<IPython.core.display.Javascript object>