In [4]:
import pandas as pd
import numpy as np

# Load the token-level NER table; the file is decoded as latin1 instead of
# pandas' default UTF-8.
df_crf = pd.read_csv("ner_dataset.csv", encoding="latin1")

In [5]:
# Forward-fill missing values so every token row carries a value in each
# column (presumably "Sentence #" is only populated on the first row of each
# sentence in the raw CSV -- confirm against the file).
# DataFrame.ffill() is the supported spelling; fillna(method="ffill") is
# deprecated in current pandas releases.
df_crf = df_crf.ffill()
df_crf.tail()

Unnamed: 0,Sentence #,Word,POS,Tag
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O
1048574,Sentence: 47959,attack,NN,O


In [6]:
class SentenceGetter(object):
    """Group a token-level DataFrame into sentences of (word, POS, tag) triples.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns "Sentence #", "Word", "POS" and "Tag".

    Attributes
    ----------
    n_sent : int
        Number used to build the key ("Sentence: N") of the sentence that the
        next ``get_next()`` call returns; starts at 1.
    data : pandas.DataFrame
        The frame passed to the constructor.
    empty : bool
        Initialised to False and never read inside this class.
    grouped : pandas.Series
        One list of (word, POS, tag) tuples per distinct "Sentence #" value.
    sentences : list
        The values of ``grouped`` as a plain list.
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False

        def to_triples(group):
            # Zip the three columns of one sentence into (word, POS, tag) tuples.
            return list(zip(group["Word"].values.tolist(),
                            group["POS"].values.tolist(),
                            group["Tag"].values.tolist()))

        # NOTE: groupby sorts its keys by default, so the string labels come
        # out in lexicographic order ("Sentence: 1", "Sentence: 10", ...),
        # not in numeric/document order.
        self.grouped = self.data.groupby("Sentence #").apply(to_triples)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        """Return the next sentence by number, or None when there is none.

        Looks up the key "Sentence: <n_sent>" and advances ``n_sent`` only
        when the lookup succeeds.
        """
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
        except KeyError:
            # Only a missing sentence label means "no more sentences"; any
            # other error is a real problem and must not be hidden.
            return None
        self.n_sent += 1
        return s

In [7]:
# Wrap the forward-filled token table (df_crf) and peek at the first sentence.
getter = SentenceGetter(df_crf)
sent = getter.get_next()
print(sent)

[('Thousands', 'NNS', 'O'), ('of', 'IN', 'O'), ('demonstrators', 'NNS', 'O'), ('have', 'VBP', 'O'), ('marched', 'VBN', 'O'), ('through', 'IN', 'O'), ('London', 'NNP', 'B-geo'), ('to', 'TO', 'O'), ('protest', 'VB', 'O'), ('the', 'DT', 'O'), ('war', 'NN', 'O'), ('in', 'IN', 'O'), ('Iraq', 'NNP', 'B-geo'), ('and', 'CC', 'O'), ('demand', 'VB', 'O'), ('the', 'DT', 'O'), ('withdrawal', 'NN', 'O'), ('of', 'IN', 'O'), ('British', 'JJ', 'B-gpe'), ('troops', 'NNS', 'O'), ('from', 'IN', 'O'), ('that', 'DT', 'O'), ('country', 'NN', 'O'), ('.', '.', 'O')]


In [8]:
def word2features(sent, i):
    """Build the feature dict for the token at index ``i`` of ``sent``.

    ``sent`` is a sequence of items whose element 0 is the word and element 1
    is the POS tag; any further elements are ignored here.  The result holds
    features of the token itself, the same kind of features for the previous
    and next tokens when they exist, and the flags 'BOS' / 'EOS' when the
    token has no left / right neighbour.
    """
    token = sent[i][0]
    pos = sent[i][1]

    # Features describing the current token.
    feats = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
        'postag': pos,
        'postag[:2]': pos[:2],
    }

    # Left neighbour, or the begin-of-sentence flag when there is none.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token = sent[i - 1][0]
        prev_pos = sent[i - 1][1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_pos
        feats['-1:postag[:2]'] = prev_pos[:2]

    # Right neighbour, or the end-of-sentence flag when there is none.
    last_index = len(sent) - 1
    if i >= last_index:
        feats['EOS'] = True
    else:
        next_token = sent[i + 1][0]
        next_pos = sent[i + 1][1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_pos
        feats['+1:postag[:2]'] = next_pos[:2]

    return feats


def sent2features(sent):
    """Return one feature dict per token of ``sent``, in token order."""
    positions = range(len(sent))
    return [word2features(sent, pos) for pos in positions]

def sent2labels(sent):
    """Return the third element (the label) of every 3-item entry in ``sent``."""
    return [tag for _word, _pos, tag in sent]

def sent2tokens(sent):
    """Return the first element (the token) of every 3-item entry in ``sent``."""
    return [word for word, _pos, _tag in sent]

In [10]:
# One entry per sentence: X holds the per-token feature dicts, y the labels.
sentences = getter.sentences
X = list(map(sent2features, sentences))
y = list(map(sent2labels, sentences))

In [14]:
# Inspect the feature dict built for the second token of the first sentence.
X[0][1]

{'+1:postag': 'NNS',
 '+1:postag[:2]': 'NN',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:word.lower()': 'demonstrators',
 '-1:postag': 'NNS',
 '-1:postag[:2]': 'NN',
 '-1:word.istitle()': True,
 '-1:word.isupper()': False,
 '-1:word.lower()': 'thousands',
 'bias': 1.0,
 'postag': 'IN',
 'postag[:2]': 'IN',
 'word.isdigit()': False,
 'word.istitle()': False,
 'word.isupper()': False,
 'word.lower()': 'of',
 'word[-2:]': 'of',
 'word[-3:]': 'of'}

In [17]:

from sklearn_crfsuite import CRF

# CRF trained with the L-BFGS algorithm for at most 100 iterations.
# Per the sklearn-crfsuite API docs, c1 and c2 are the L1 and L2
# regularisation coefficients, and all_possible_transitions=False means no
# transition features are generated for label pairs absent from the
# training data.
crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.1,
          max_iterations=100,
          all_possible_transitions=False)

In [18]:
# cross_val_predict lives in sklearn.model_selection; the older
# sklearn.cross_validation module no longer exists in current scikit-learn.
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report



In [19]:
# Out-of-fold predictions from 5-fold cross-validation; each element of X/y
# is one whole sentence.
pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)

In [20]:
# Per-tag precision / recall / F1 of the cross-validated predictions against
# the gold labels.
report = flat_classification_report(y_pred=pred, y_true=y)
print(report)

             precision    recall  f1-score   support

      B-art       0.37      0.11      0.17       402
      B-eve       0.52      0.35      0.42       308
      B-geo       0.85      0.90      0.88     37644
      B-gpe       0.97      0.94      0.95     15870
      B-nat       0.66      0.37      0.47       201
      B-org       0.78      0.72      0.75     20143
      B-per       0.84      0.81      0.82     16990
      B-tim       0.93      0.88      0.90     20333
      I-art       0.11      0.03      0.04       297
      I-eve       0.34      0.21      0.26       253
      I-geo       0.82      0.79      0.80      7414
      I-gpe       0.92      0.55      0.69       198
      I-nat       0.61      0.27      0.38        51
      I-org       0.81      0.79      0.80     16784
      I-per       0.84      0.89      0.87     17251
      I-tim       0.83      0.76      0.80      6528
          O       0.99      0.99      0.99    887908

avg / total       0.97      0.97      0.97  

In [21]:
# Fit the CRF on every sentence in X / y.
crf.fit(X, y)

CRF(algorithm='lbfgs', all_possible_states=None,
  all_possible_transitions=False, averaging=None, c=None, c1=0.1, c2=0.1,
  calibration_candidates=None, calibration_eta=None,
  calibration_max_trials=None, calibration_rate=None,
  calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
  gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
  max_linesearch=None, min_freq=None, model_filename=None,
  num_memories=None, pa_type=None, period=None, trainer_cls=None,
  variance=None, verbose=False)

In [24]:
import eli5
# Display the learned transition weights and, for each tag, the 10
# highest-weighted state features.
eli5.show_weights(crf, top=10)

From \ To,O,B-art,I-art,B-eve,I-eve,B-geo,I-geo,B-gpe,I-gpe,B-nat,I-nat,B-org,I-org,B-per,I-per,B-tim,I-tim
O,4.29,0.879,0.0,1.575,0.0,2.092,0.0,1.387,0.0,1.605,0.0,2.497,0.0,4.17,0.0,2.986,0.0
B-art,-0.014,0.0,8.442,0.0,0.0,-0.398,0.0,0.0,0.0,0.0,0.0,0.516,0.0,-0.844,0.0,0.336,0.0
I-art,-0.651,0.0,8.04,0.0,0.0,-0.702,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.016,0.0,-0.684,0.0
B-eve,-0.753,0.0,0.0,0.0,7.956,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.572,0.0
I-eve,-0.324,0.0,0.0,0.0,7.341,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.621,0.0
B-geo,0.677,0.752,0.0,0.545,0.0,0.0,8.752,0.579,0.0,0.0,0.0,1.155,0.0,1.143,0.0,2.344,0.0
I-geo,-0.469,0.822,0.0,0.0,0.0,0.0,7.424,-1.366,0.0,0.0,0.0,-0.074,0.0,1.331,0.0,1.033,0.0
B-gpe,0.679,-1.609,0.0,-0.32,0.0,0.681,0.0,0.0,7.485,0.0,0.0,2.05,0.0,1.459,0.0,0.767,0.0
I-gpe,-0.298,0.0,0.0,0.0,0.0,-1.087,0.0,0.0,6.337,0.0,0.0,0.0,0.0,0.148,0.0,0.0,0.0
B-nat,-1.108,0.0,0.0,0.0,0.0,0.625,0.0,0.0,0.0,0.0,7.067,0.0,0.0,-0.305,0.0,-0.413,0.0

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6,Unnamed: 16_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8,Unnamed: 16_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9,Unnamed: 13_level_9,Unnamed: 14_level_9,Unnamed: 15_level_9,Unnamed: 16_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10,Unnamed: 14_level_10,Unnamed: 15_level_10,Unnamed: 16_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11,Unnamed: 13_level_11,Unnamed: 14_level_11,Unnamed: 15_level_11,Unnamed: 16_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12,Unnamed: 13_level_12,Unnamed: 14_level_12,Unnamed: 15_level_12,Unnamed: 16_level_12
Weight?,Feature,Unnamed: 2_level_13,Unnamed: 3_level_13,Unnamed: 4_level_13,Unnamed: 5_level_13,Unnamed: 6_level_13,Unnamed: 7_level_13,Unnamed: 8_level_13,Unnamed: 9_level_13,Unnamed: 10_level_13,Unnamed: 11_level_13,Unnamed: 12_level_13,Unnamed: 13_level_13,Unnamed: 14_level_13,Unnamed: 15_level_13,Unnamed: 16_level_13
Weight?,Feature,Unnamed: 2_level_14,Unnamed: 3_level_14,Unnamed: 4_level_14,Unnamed: 5_level_14,Unnamed: 6_level_14,Unnamed: 7_level_14,Unnamed: 8_level_14,Unnamed: 9_level_14,Unnamed: 10_level_14,Unnamed: 11_level_14,Unnamed: 12_level_14,Unnamed: 13_level_14,Unnamed: 14_level_14,Unnamed: 15_level_14,Unnamed: 16_level_14
Weight?,Feature,Unnamed: 2_level_15,Unnamed: 3_level_15,Unnamed: 4_level_15,Unnamed: 5_level_15,Unnamed: 6_level_15,Unnamed: 7_level_15,Unnamed: 8_level_15,Unnamed: 9_level_15,Unnamed: 10_level_15,Unnamed: 11_level_15,Unnamed: 12_level_15,Unnamed: 13_level_15,Unnamed: 14_level_15,Unnamed: 15_level_15,Unnamed: 16_level_15
Weight?,Feature,Unnamed: 2_level_16,Unnamed: 3_level_16,Unnamed: 4_level_16,Unnamed: 5_level_16,Unnamed: 6_level_16,Unnamed: 7_level_16,Unnamed: 8_level_16,Unnamed: 9_level_16,Unnamed: 10_level_16,Unnamed: 11_level_16,Unnamed: 12_level_16,Unnamed: 13_level_16,Unnamed: 14_level_16,Unnamed: 15_level_16,Unnamed: 16_level_16
+8.012,word.lower():last,,,,,,,,,,,,,,,
+7.999,word.lower():month,,,,,,,,,,,,,,,
+5.813,word.lower():chairman,,,,,,,,,,,,,,,
+5.612,word.lower():columbia,,,,,,,,,,,,,,,
+5.555,word.lower():year,,,,,,,,,,,,,,,
+5.232,word.lower():week,,,,,,,,,,,,,,,
+5.146,word.lower():months,,,,,,,,,,,,,,,
+5.067,word.lower():internet,,,,,,,,,,,,,,,
+4.833,word.lower():weeks,,,,,,,,,,,,,,,
… 9219 more positive …,… 9219 more positive …,,,,,,,,,,,,,,,

Weight?,Feature
+8.012,word.lower():last
+7.999,word.lower():month
+5.813,word.lower():chairman
+5.612,word.lower():columbia
+5.555,word.lower():year
+5.232,word.lower():week
+5.146,word.lower():months
+5.067,word.lower():internet
+4.833,word.lower():weeks
… 9219 more positive …,… 9219 more positive …

Weight?,Feature
+5.369,word.lower():twitter
+4.858,word.lower():spaceshipone
+4.294,word.lower():nevirapine
+4.271,+1:word.lower():enkhbayar
+4.263,+1:word.lower():boots
+3.893,word.lower():english
+3.802,-1:word.lower():engine
+3.655,word[-3:]:One
+3.588,-1:word.lower():film
+3.540,word.lower():russian

Weight?,Feature
+3.025,-1:word.lower():boeing
+2.553,+1:word.lower():gained
+2.473,+1:word.lower():came
+2.418,-1:word.lower():cajun
+2.297,word.lower():notice
+2.260,word.lower():constitution
+2.112,word.lower():flowers
+2.109,+1:word.lower():times
+2.072,+1:word.lower():marks
+2.056,word.lower():a

Weight?,Feature
+4.333,word.lower():games
+4.263,word.lower():ramadan
+4.160,-1:word.lower():falklands
+3.501,-1:word.lower():typhoon
+3.484,word[-3:]:mes
+3.050,+1:word.lower():dean
+3.046,+1:word.lower():men
+3.028,-1:word.lower():wars
+2.942,-1:word.lower():happy
+2.938,-1:word.lower():solemn

Weight?,Feature
+4.329,+1:word.lower():mascots
+3.603,word.lower():games
+3.022,+1:word.lower():era
+2.756,word.lower():series
+2.577,word.lower():dean
+2.509,+1:word.lower():rally
+2.508,+1:word.lower():caused
+2.504,+1:word.lower():disaster
+2.441,word.lower():sabbath
+2.426,+1:word.lower():tore

Weight?,Feature
+6.238,word.lower():mid-march
+6.002,word.lower():caribbean
+5.503,word.lower():martian
+5.446,word.lower():beijing
+5.086,word.lower():persian
+4.737,-1:word.lower():hamas
+4.521,-1:word.lower():mr.
+4.509,word.lower():balkans
+4.362,-1:word.lower():serb
… 5969 more positive …,… 5969 more positive …

Weight?,Feature
+4.211,word.lower():led-invasion
+4.151,word.lower():holiday
+4.065,word.lower():caribbean
+3.651,+1:word.lower():possessions
+3.446,+1:word.lower():regional
+3.430,+1:word.lower():french
+3.374,-1:word.lower():nahr
+3.296,word.lower():shogunate
+3.296,-1:word.lower():tokugawa
+3.232,word.lower():restaurant

Weight?,Feature
+6.735,word.lower():afghan
+6.602,word.lower():niger
+6.219,word.lower():nepal
+5.432,word.lower():spaniard
+5.391,word.lower():azerbaijan
+5.138,word.lower():iranian
+5.127,word.lower():mexican
+5.080,word.lower():argentine
+4.926,word.lower():gibraltar
+4.829,word.lower():iraqi

Weight?,Feature
+5.622,+1:word.lower():mayor
+4.073,-1:word.lower():democratic
+3.844,-1:word.lower():bosnian
+3.602,+1:word.lower():developed
+3.543,word.lower():korean
+3.308,word[-3:]:can
+3.226,-1:word.lower():soviet
+3.217,word.lower():city
+3.179,+1:word.lower():health
+3.172,word.lower():cypriots

Weight?,Feature
+6.149,word.lower():katrina
+5.371,word.lower():marburg
+4.334,word.lower():rita
+3.535,+1:word.lower():shot
+2.959,word[-3:]:ita
+2.791,word.lower():leukemia
+2.769,word[-3:]:urg
+2.759,word[-3:]:mia
+2.665,word.lower():paul
+2.647,+1:word.lower():strain

Weight?,Feature
+2.681,word.lower():rita
+2.327,word[-3:]:ita
+2.315,+1:word.lower():outbreak
+1.944,-1:word.lower():hurricanes
+1.909,word[-2:]:ta
+1.747,word.lower():flu
+1.670,word[-2:]:lu
+1.654,-1:word.lower():type
+1.624,+1:word.lower():relief
+1.613,-1:postag:NN

Weight?,Feature
+7.344,word.lower():philippine
+6.075,word.lower():mid-march
+5.812,word.lower():hamas
+5.779,-1:word.lower():rice
+5.629,word.lower():al-qaida
+5.071,word.lower():taleban
+4.756,word.lower():taliban
+4.729,-1:word.lower():senator
+4.723,word.lower():reuters
+4.662,word.lower():hezbollah

Weight?,Feature
+3.981,+1:word.lower():attained
+3.785,+1:word.lower():reporter
+3.486,-1:word.lower():associated
+3.463,word.lower():singapore
+3.400,word.lower():member-countries
+3.365,-1:word.lower():decathlon
+3.360,+1:word.lower():ohlmert
… 6766 more positive …,… 6766 more positive …
… 1548 more negative …,… 1548 more negative …
-3.909,word[-2:]:lf

Weight?,Feature
+7.301,word.lower():president
+6.125,word.lower():obama
+5.647,word.lower():senator
+5.367,word.lower():greenspan
+5.325,word.lower():vice
+4.824,word.lower():western
+4.721,word.lower():hall
+4.600,word.lower():prime
+4.541,word.lower():clinton
+4.510,word.lower():frank

Weight?,Feature
+4.163,word.lower():obama
+3.625,+1:word.lower():advisor
+3.517,word.lower():pressewednesday
+3.464,+1:word.lower():timothy
+3.230,+1:word.lower():gao
+3.191,+1:word.lower():fighters
+3.102,-1:word.lower():michael
+3.079,word.lower():gates
… 5573 more positive …,… 5573 more positive …
… 1380 more negative …,… 1380 more negative …

Weight?,Feature
+7.226,word.lower():multi-candidate
+6.381,word.lower():february
+6.335,word.lower():january
+6.181,word.lower():2000
+6.126,word.lower():one-year
+5.950,word.lower():weekend
+5.557,+1:word.lower():week
+5.225,word.lower():august
+5.199,word.lower():december
+4.961,word.lower():september

Weight?,Feature
+4.467,+1:word.lower():stocky
+4.098,+1:word.lower():old
+4.080,word.lower():working-age
+3.831,word.lower():2000
+3.821,word.lower():april
+3.654,+1:word.lower():jose
+3.597,-1:word.lower():this
+3.468,+1:word.lower():reflected
+3.407,+1:word.lower():month
+3.403,-1:word.lower():past
