In [2]:
import sklearn
import scipy.stats
import sklearn_crfsuite
import pandas as pd
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

import format_tagged_sentences as fts

In [3]:
def word2features(sent, i):
    word = sent[i][0]
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit()
    }
    if i > 0:
        word1 = sent[i-1][0]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper()
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        word1 = sent[i+1][0]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper()
        })
    else:
        features['EOS'] = True

    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, label in sent]

def sent2tokens(sent):
    return [token for token, label in sent]

In [4]:
to_train = pd.read_csv('data/processed/NER/crf/training_set.csv', names = ['text'])
to_test = pd.read_csv('data/processed/NER/crf/test_set.csv', names = ['text'])

train = fts.bilou_to_crfsuite(fts.tagged_to_bilou(to_train.text.to_list()))
test = fts.bilou_to_crfsuite(fts.tagged_to_bilou(to_test.text.to_list()))

X_train = [sent2features(s) for s in train]
y_train = [sent2labels(s) for s in train]

X_test = [sent2features(s) for s in test]
y_test = [sent2labels(s) for s in test]

In [5]:
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0,
    c2=1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)



CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=True,
    averaging=None, c=None, c1=0, c2=1, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [6]:
labels = list(crf.classes_)
labels.remove('O')

y_pred = crf.predict(X_test)
metrics.flat_f1_score(y_test, y_pred, average='weighted', labels=labels)

sorted_labels = sorted(
    labels,
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

              precision    recall  f1-score   support

      B-dish      0.664     0.500     0.570       170
      I-dish      0.631     0.467     0.536       150
      L-dish      0.648     0.488     0.557       170
      U-dish      0.560     0.156     0.243        90
  U-occasion      0.667     0.429     0.522        14
B-restaurant      0.400     0.261     0.316        23
I-restaurant      0.000     0.000     0.000         7
L-restaurant      0.533     0.348     0.421        23
U-restaurant      0.000     0.000     0.000         6

   micro avg      0.630     0.417     0.501       653
   macro avg      0.456     0.294     0.352       653
weighted avg      0.611     0.417     0.487       653



  _warn_prf(average, modifier, msg_start, len(result))


In [7]:
%%timeit

crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs',
    c1=0,
    c2=1,
    max_iterations=100,
    all_possible_transitions=True
)
crf.fit(X_train, y_train)

1 loop, best of 3: 20.4 s per loop


In [8]:
!pip install memory_profiler
%load_ext memory_profiler

Collecting memory_profiler
  Downloading https://files.pythonhosted.org/packages/8f/fd/d92b3295657f8837e0177e7b48b32d6651436f0293af42b76d134c3bb489/memory_profiler-0.58.0.tar.gz
Building wheels for collected packages: memory-profiler
  Building wheel for memory-profiler (setup.py) ... [?25l[?25hdone
  Created wheel for memory-profiler: filename=memory_profiler-0.58.0-cp36-none-any.whl size=30181 sha256=1bcc495268aeb5736494cd01f7dad33731832df70d0903c2ed82d6cfaeaf8df1
  Stored in directory: /root/.cache/pip/wheels/02/e4/0b/aaab481fc5dd2a4ea59e78bc7231bb6aae7635ca7ee79f8ae5
Successfully built memory-profiler
Installing collected packages: memory-profiler
Successfully installed memory-profiler-0.58.0


In [10]:
%%timeit
sentence = [sent2features("Everybody was very friendly here. My kids loved the beef tacos and I had a chicken salad. I would probably order tacos or enchiladas next time instead. One of my sons didn't like what he got first and they were happy to exchange it for something else.  My kids had so much fun playing at the playground. I'm so glad to have somewhere close by where my kids can eat and play.")]

1000 loops, best of 3: 1.5 ms per loop


In [11]:
print(crf.predict([sent2features("Everybody was very friendly here. My kids loved the beef tacos and I had a chicken salad. I would probably order tacos or enchiladas next time instead. One of my sons didn't like what he got first and they were happy to exchange it for something else.  My kids had so much fun playing at the playground. I'm so glad to have somewhere close by where my kids can eat and play.")]))

[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'