In [55]:
from typing import Tuple, List
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
import random
import nltk
nltk.download('treebank')
from nltk.corpus import treebank
from collections import defaultdict

import os

[nltk_data] Downloading package treebank to /home/quontas/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


In [56]:
pos_tags = [
    tag for sentence in treebank.tagged_sents() 
    for _, tag in sentence
]
pos_tag_map = defaultdict(lambda: len(pos_tag_map))
for tag in pos_tags:
    pos_tag_map[tag]
print(pos_tag_map)

defaultdict(<function <lambda> at 0x7f2d46f29730>,
            {'#': 45,
             '$': 34,
             "''": 29,
             ',': 1,
             '-LRB-': 37,
             '-NONE-': 16,
             '-RRB-': 38,
             '.': 10,
             ':': 31,
             'CC': 13,
             'CD': 2,
             'DT': 7,
             'EX': 28,
             'FW': 41,
             'IN': 9,
             'JJ': 4,
             'JJR': 32,
             'JJS': 25,
             'LS': 44,
             'MD': 5,
             'NN': 8,
             'NNP': 0,
             'NNPS': 35,
             'NNS': 3,
             'PDT': 39,
             'POS': 26,
             'PRP': 19,
             'PRP$': 24,
             'RB': 17,
             'RBR': 20,
             'RBS': 40,
             'RP': 23,
             'SYM': 43,
             'TO': 18,
             'UH': 42,
             'VB': 6,
             'VBD': 14,
             'VBG': 12,
             'VBN': 15,
             'VBP': 22,
             'VB

In [57]:
TAG_MAP = {
    "OTHER": -1,
    "B-NAME": 0,
    "I-NAME": 1,
    "B-COMMENT": 2,
    "I-COMMENT": 3,
    "B-RANGE_END": 4,
    "B-UNIT": 5,
    "I-UNIT": 6,
    "B-QTY": 7,
}

REVERSE_MAP = {v: k for k, v in TAG_MAP.items()}


def parse_recipe(recipe: str) -> Tuple[List[str], List[int]]:
    """Given a CRF-tagged recipe string, converts it into a token/tag sequence.

    Args:
        recipe: A newline-delimited CRF recipe.
    Returns:
        A tuple of (tokens, tags) where tokens are List[str], and tags are List[int]
    """
    rows = recipe.split("\n")
    tokens = []
    tags = []
    for row in rows:
        if not row:
            continue
        token, _, _, _, _, tag = row.split("\t")
        tokens.append(token)
        tags.append(TAG_MAP[tag])
    return tokens, tags


def read_crf_file(filename):
    
    with open(filename) as f:
        lines = f.read()
        recipes = lines.split("\n\n")

        tokens = []
        tags = []
        for recipe in recipes:
            recipe_tokens, recipe_tags = parse_recipe(recipe)
            tokens.append(recipe_tokens)
            tags.append(recipe_tags)
        return tokens, tags

In [58]:
def get_feature(token, token_index, sent, pos_tags):
    """Extract features of given word(token)"""
    token_feature = {    
        'token'             : token,                                    # Token itself
        'is_first'          : token_index == 0,                         # Is token at the beginning of the sentence
        'is_last'           : token_index == len(sent)-1,               # Is token at the end of the sentence

        'is_capitalized'    : token[0].upper() == token[0],             # Is first letter of token a capital letter
        'is_all_capitalized': token.upper() == token,                   # Are all letters of token capital letters
        'is_capitals_inside': token[1:].lower() != token[1:],           # Is there any capital letters in the token
        'is_numeric'        : token.isdigit(),                          # Is there any digits in the token

        'prefix-1'          : token[0],                                 # Token prefix containing only one letter
        'prefix-2'          : '' if len(token) < 2  else token[:1],     # Token prefix containing two letters

        'suffix-1'          : token[-1],                                # Token suffix containing only one letter
        'suffix-2'          : '' if len(token) < 2  else token[-2:],    # Token suffix containing two letters

        'prev-token'        : '' if token_index == 0     else sent[token_index - 1][0],     # Previous token in the sentence
        '2-prev-token'      : '' if token_index <= 1     else sent[token_index - 2][0],     # Two previous token in the sentence

        'next-token'        : '' if token_index == len(sent) - 1     else sent[token_index + 1][0],     # Next token in the sentence
        '2-next-token'      : '' if token_index >= len(sent) - 2     else sent[token_index + 2][0],      # Two next token in the sentence
        'pos-tag'           : pos_tag_map[pos_tags[token_index]]
    }
    return token_feature

In [None]:
def construct_data(filename):
    tokens, tags = read_crf_file(filename)
    X = []
    y = []
    for token_list, tag_list in zip(tokens, tags):
        sentence_pos_tags = nltk.pos_tag(token_list)
        sentence = ' '.join(token_list)
        
        for i, token in enumerate(token_list):
            X.append(get_feature(token, i, sentence, sentence_pos_tags))
            y.append(tag_list[i])
    return X, y

In [None]:
X, y = construct_data('../data/train.crftags')
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3)

In [None]:
pipeline = make_pipeline(DictVectorizer(), LogisticRegression(class_weight='balanced'))

pipeline = pipeline.fit(X_train, y_train)



In [None]:
y_predicted = pipeline.predict(X_valid)
print(f'Accuracy: {accuracy_score(y_valid, y_predicted)*100}%')

'Accuracy: 72.84365964101912%'


In [None]:
X_test, y_test = construct_data('../data/test.crftags')

y_test_predictions = pipeline.predict(X_test)
print(f'Accuracy: {accuracy_score(y_test, y_test_predictions)*100}%')
test_tokens = [x['token'] for x in X_test]

'Accuracy: 74.57286432160805%'
