In [2]:
import pandas as pd
import numpy as np

# Load the token-level NER table from a path relative to the notebook; the file is decoded as latin1.
data = pd.read_csv("../../data/ner_dataset.csv", encoding="latin1")

In [3]:
# Forward-fill missing values down each column.
# NOTE(review): presumably only "Sentence #" has gaps (set on a sentence's first row only) - confirm.
# DataFrame.ffill() replaces fillna(method="ffill"), which is deprecated in pandas 2.x.
data = data.ffill()

In [4]:
# Inspect the last 10 rows to confirm the forward-fill populated "Sentence #" on every row.
data.tail(10)

Unnamed: 0,Sentence #,Word,POS,Tag
1048565,Sentence: 47958,impact,NN,O
1048566,Sentence: 47958,.,.,O
1048567,Sentence: 47959,Indian,JJ,B-gpe
1048568,Sentence: 47959,forces,NNS,O
1048569,Sentence: 47959,said,VBD,O
1048570,Sentence: 47959,they,PRP,O
1048571,Sentence: 47959,responded,VBD,O
1048572,Sentence: 47959,to,TO,O
1048573,Sentence: 47959,the,DT,O
1048574,Sentence: 47959,attack,NN,O


In [5]:
# Vocabulary: the distinct values of the "Word" column. Built from a set, so the list order is arbitrary.
words = list(set(data["Word"].values))

In [6]:
# Vocabulary size; the trailing bare expression makes the cell display the value.
n_words = len(words); n_words

35178

In [7]:
class SentenceGetter(object):
    """Group a token-level frame into sentences of ``(word, pos, tag)`` triples.

    ``data`` must provide the columns "Sentence #", "Word", "POS" and "Tag".
    Rows are grouped on "Sentence #", and each group is turned into a list of
    ``(word, pos, tag)`` tuples in row order.

    Attributes:
        n_sent: number used to build the next "Sentence: N" key in ``get_next``.
        data: the frame passed to the constructor.
        empty: initialised to ``False``; not updated by any method of this class.
        grouped: Series indexed by "Sentence #" whose values are the triple lists.
        sentences: the values of ``grouped`` as a plain list, in ``groupby`` order
            (sorted by group label, so lexicographic for string labels).
    """

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        # Select only the columns the aggregation reads, so apply() never touches
        # the grouping column (doing so is deprecated in recent pandas releases).
        self.grouped = (
            self.data.groupby("Sentence #")[["Word", "POS", "Tag"]]
            .apply(self._to_triples)
        )
        self.sentences = list(self.grouped)

    @staticmethod
    def _to_triples(group):
        """Turn one sentence's rows into a list of ``(word, pos, tag)`` tuples."""
        return list(zip(group["Word"].values.tolist(),
                        group["POS"].values.tolist(),
                        group["Tag"].values.tolist()))

    def get_next(self):
        """Return the sentence keyed "Sentence: <n_sent>" and advance the counter.

        Returns ``None`` (leaving the counter unchanged) when no such key exists.
        Any other error is propagated instead of being silently swallowed.
        """
        key = "Sentence: {}".format(self.n_sent)
        try:
            s = self.grouped[key]
        except KeyError:
            return None
        self.n_sent += 1
        return s

In [8]:
# Group the whole frame into sentences once; the grouping happens in the constructor.
getter = SentenceGetter(data)

In [9]:
# Fetch the next sentence; on a freshly built getter this is the one keyed "Sentence: 1".
sent = getter.get_next()

In [10]:
# Show the fetched sentence as a list of (word, POS, tag) tuples.
print(sent)

[('Thousands', 'NNS', 'O'), ('of', 'IN', 'O'), ('demonstrators', 'NNS', 'O'), ('have', 'VBP', 'O'), ('marched', 'VBN', 'O'), ('through', 'IN', 'O'), ('London', 'NNP', 'B-geo'), ('to', 'TO', 'O'), ('protest', 'VB', 'O'), ('the', 'DT', 'O'), ('war', 'NN', 'O'), ('in', 'IN', 'O'), ('Iraq', 'NNP', 'B-geo'), ('and', 'CC', 'O'), ('demand', 'VB', 'O'), ('the', 'DT', 'O'), ('withdrawal', 'NN', 'O'), ('of', 'IN', 'O'), ('British', 'JJ', 'B-gpe'), ('troops', 'NNS', 'O'), ('from', 'IN', 'O'), ('that', 'DT', 'O'), ('country', 'NN', 'O'), ('.', '.', 'O')]


In [12]:
# All sentences, each a list of (word, POS, tag) tuples.
sentences = getter.sentences

In [14]:
# Display the first grouped sentence.
sentences[0]

[('Thousands', 'NNS', 'O'),
 ('of', 'IN', 'O'),
 ('demonstrators', 'NNS', 'O'),
 ('have', 'VBP', 'O'),
 ('marched', 'VBN', 'O'),
 ('through', 'IN', 'O'),
 ('London', 'NNP', 'B-geo'),
 ('to', 'TO', 'O'),
 ('protest', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('war', 'NN', 'O'),
 ('in', 'IN', 'O'),
 ('Iraq', 'NNP', 'B-geo'),
 ('and', 'CC', 'O'),
 ('demand', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('withdrawal', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('British', 'JJ', 'B-gpe'),
 ('troops', 'NNS', 'O'),
 ('from', 'IN', 'O'),
 ('that', 'DT', 'O'),
 ('country', 'NN', 'O'),
 ('.', '.', 'O')]

## Feature preparation 
- Craft a set of features and prepare the dataset

In [23]:
def word2features(sent, i):
    """Build the feature dict for the token at position ``i`` of ``sent``.

    ``sent`` is a sequence whose items expose the word at index 0 and the POS
    tag at index 1. The result describes the token itself and, when they exist,
    its left (``-1:``) and right (``+1:``) neighbours; a missing left neighbour
    is flagged with ``'BOS'`` and a missing right neighbour with ``'EOS'``.

    Note: the key ``'word[-2]'`` stores the last *two* characters
    (``word[-2:]``). The key name is kept as-is so feature names stay stable.
    """
    token, tag = sent[i][0], sent[i][1]

    # Features of the current token.
    feats = {}
    feats['bias'] = 1.0
    feats['word.lower()'] = token.lower()
    feats['word[-3:]'] = token[-3:]
    feats['word[-2]'] = token[-2:]
    feats['word.isupper()'] = token.isupper()
    feats['word.istitle()'] = token.istitle()
    feats['word.isdigit()'] = token.isdigit()
    feats['postag'] = tag
    feats['postag[:2]'] = tag[:2]

    # Left context, or the beginning-of-sentence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        prev_token, prev_tag = sent[i - 1][0], sent[i - 1][1]
        feats['-1:word.lower()'] = prev_token.lower()
        feats['-1:word.istitle()'] = prev_token.istitle()
        feats['-1:word.isupper()'] = prev_token.isupper()
        feats['-1:postag'] = prev_tag
        feats['-1:postag[:2]'] = prev_tag[:2]

    # Right context, or the end-of-sentence marker.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        next_token, next_tag = sent[i + 1][0], sent[i + 1][1]
        feats['+1:word.lower()'] = next_token.lower()
        feats['+1:word.istitle()'] = next_token.istitle()
        feats['+1:word.isupper()'] = next_token.isupper()
        feats['+1:postag'] = next_tag
        feats['+1:postag[:2]'] = next_tag[:2]

    return feats


def sent2features(sent):
    """Return one feature dict per token of ``sent``, in token order."""
    feature_rows = []
    for position in range(len(sent)):
        feature_rows.append(word2features(sent, position))
    return feature_rows


def sent2labels(sent):
    """Return the label (third field) of every ``(token, postag, label)`` item in ``sent``."""
    labels = []
    for _token, _postag, tag in sent:
        labels.append(tag)
    return labels


def sent2tokens(sent):
    """Return the token (first field) of every ``(token, postag, label)`` item in ``sent``."""
    tokens = []
    for word, _postag, _label in sent:
        tokens.append(word)
    return tokens

In [24]:
# Per-sentence feature sequences (X) and the matching label sequences (y), aligned by index.
X = list(map(sent2features, sentences))
y = list(map(sent2labels, sentences))

In [25]:
# Sanity check: X and y must contain the same number of sentences.
print(len(X), len(y))

47959 47959


In [26]:
# Re-display the first sentence so its tokens can be compared with the feature dict shown next.
sentences[0]

[('Thousands', 'NNS', 'O'),
 ('of', 'IN', 'O'),
 ('demonstrators', 'NNS', 'O'),
 ('have', 'VBP', 'O'),
 ('marched', 'VBN', 'O'),
 ('through', 'IN', 'O'),
 ('London', 'NNP', 'B-geo'),
 ('to', 'TO', 'O'),
 ('protest', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('war', 'NN', 'O'),
 ('in', 'IN', 'O'),
 ('Iraq', 'NNP', 'B-geo'),
 ('and', 'CC', 'O'),
 ('demand', 'VB', 'O'),
 ('the', 'DT', 'O'),
 ('withdrawal', 'NN', 'O'),
 ('of', 'IN', 'O'),
 ('British', 'JJ', 'B-gpe'),
 ('troops', 'NNS', 'O'),
 ('from', 'IN', 'O'),
 ('that', 'DT', 'O'),
 ('country', 'NN', 'O'),
 ('.', '.', 'O')]

In [28]:
# Feature dict of the second token of the first sentence (has both -1: and +1: context features).
X[0][1]

{'bias': 1.0,
 'word.lower()': 'of',
 'word[-3:]': 'of',
 'word[-2]': 'of',
 'word.isupper()': False,
 'word.istitle()': False,
 'word.isdigit()': False,
 'postag': 'IN',
 'postag[:2]': 'IN',
 '-1:word.lower()': 'thousands',
 '-1:word.istitle()': True,
 '-1:word.isupper()': False,
 '-1:postag': 'NNS',
 '-1:postag[:2]': 'NN',
 '+1:word.lower()': 'demonstrators',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:postag': 'NNS',
 '+1:postag[:2]': 'NN'}

## Fit the CRF model 

In [30]:
# !pip install sklearn-crfsuite

In [31]:
from sklearn_crfsuite import CRF

# Linear-chain CRF trained with L-BFGS.
# c1 and c2 are the L1 and L2 regularisation coefficients (per the sklearn-crfsuite docs).
# all_possible_transitions=False: do not generate transition features for label pairs
# that never occur in the training data.
crf = CRF(algorithm='lbfgs',
          c1=0.1,
          c2=0.2,
          max_iterations=100,
          all_possible_transitions=False)

In [32]:
from sklearn.model_selection import cross_val_predict
from sklearn_crfsuite.metrics import flat_classification_report

In [33]:
%%time
# Out-of-fold predictions from 5-fold cross-validation; slow (the recorded run took about 20 minutes).
pred = cross_val_predict(estimator=crf, X=X, y=y, cv=5)



CPU times: user 20min 9s, sys: 6.33 s, total: 20min 15s
Wall time: 20min 15s


## Evaluate the model

In [35]:
# Per-label precision/recall/F1 computed over the flattened token sequences.
# The cross-validated predictions are compared against the gold labels.
report = flat_classification_report(y_pred=pred, y_true=y)
print(report)

              precision    recall  f1-score   support

       B-art       0.38      0.10      0.15       402
       B-eve       0.53      0.35      0.42       308
       B-geo       0.85      0.91      0.88     37644
       B-gpe       0.97      0.94      0.95     15870
       B-nat       0.67      0.37      0.47       201
       B-org       0.79      0.72      0.75     20143
       B-per       0.83      0.82      0.83     16990
       B-tim       0.93      0.88      0.90     20333
       I-art       0.12      0.03      0.04       297
       I-eve       0.36      0.22      0.27       253
       I-geo       0.82      0.80      0.81      7414
       I-gpe       0.92      0.55      0.69       198
       I-nat       0.54      0.27      0.36        51
       I-org       0.81      0.79      0.80     16784
       I-per       0.84      0.90      0.87     17251
       I-tim       0.84      0.76      0.80      6528
           O       0.99      0.99      0.99    887908

    accuracy              

In [36]:
%%time
# Refit on the full dataset so the learned weights can be inspected below.
crf.fit(X, y)

CPU times: user 4min 58s, sys: 1.1 s, total: 4min 59s
Wall time: 4min 59s




CRF(algorithm='lbfgs', all_possible_states=None, all_possible_transitions=False,
    averaging=None, c=None, c1=0.1, c2=0.2, calibration_candidates=None,
    calibration_eta=None, calibration_max_trials=None, calibration_rate=None,
    calibration_samples=None, delta=None, epsilon=None, error_sensitive=None,
    gamma=None, keep_tempfiles=None, linesearch=None, max_iterations=100,
    max_linesearch=None, min_freq=None, model_filename=None, num_memories=None,
    pa_type=None, period=None, trainer_cls=None, variance=None, verbose=False)

In [37]:
!pip install eli5

Collecting eli5
[?25l  Downloading https://files.pythonhosted.org/packages/97/2f/c85c7d8f8548e460829971785347e14e45fa5c6617da374711dec8cb38cc/eli5-0.10.1-py2.py3-none-any.whl (105kB)
[K     |████████████████████████████████| 112kB 965kB/s eta 0:00:01
Collecting graphviz (from eli5)
  Downloading https://files.pythonhosted.org/packages/f5/74/dbed754c0abd63768d3a7a7b472da35b08ac442cf87d73d5850a6f32391e/graphviz-0.13.2-py2.py3-none-any.whl
Installing collected packages: graphviz, eli5
Successfully installed eli5-0.10.1 graphviz-0.13.2


In [38]:
import eli5

Using TensorFlow backend.


In [39]:
# Render the fitted CRF's transition weights and the highest-weighted features for each label (top=30).
eli5.show_weights(crf, top=30)



From \ To,O,B-art,I-art,B-eve,I-eve,B-geo,I-geo,B-gpe,I-gpe,B-nat,I-nat,B-org,I-org,B-per,I-per,B-tim,I-tim
O,4.315,0.91,0.0,1.906,0.0,2.06,0.0,1.414,0.0,1.646,0.0,2.176,0.0,4.351,0.0,2.925,0.0
B-art,-0.185,0.0,7.402,0.0,0.0,-0.518,0.0,0.0,0.0,0.0,0.0,0.509,0.0,-0.669,0.0,0.024,0.0
I-art,-0.659,0.0,6.905,0.0,0.0,-1.144,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.115,0.0,-0.735,0.0
B-eve,-0.696,0.0,0.0,0.0,7.473,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.276,0.0
I-eve,-0.301,0.0,0.0,0.0,7.056,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.763,0.0
B-geo,0.556,0.438,0.0,0.518,0.0,0.0,8.494,0.611,0.0,0.0,0.0,0.808,0.0,1.084,0.0,2.173,0.0
I-geo,-0.547,0.747,0.0,0.0,0.0,0.0,6.952,-1.17,0.0,0.0,0.0,-0.284,0.0,0.054,0.0,0.54,0.0
B-gpe,1.065,-1.02,0.0,-0.372,0.0,0.87,0.0,0.0,7.173,0.0,0.0,1.931,0.0,1.528,0.0,0.718,0.0
I-gpe,-0.43,0.0,0.0,0.0,0.0,-0.558,0.0,0.0,6.262,0.0,0.0,0.0,0.0,0.539,0.0,0.0,0.0
B-nat,-1.034,0.0,0.0,0.0,0.0,0.48,0.0,0.0,0.0,0.0,6.517,0.0,0.0,-0.017,0.0,-0.1,0.0

Weight?,Feature,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,Unnamed: 16_level_0
Weight?,Feature,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Weight?,Feature,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Weight?,Feature,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3,Unnamed: 14_level_3,Unnamed: 15_level_3,Unnamed: 16_level_3
Weight?,Feature,Unnamed: 2_level_4,Unnamed: 3_level_4,Unnamed: 4_level_4,Unnamed: 5_level_4,Unnamed: 6_level_4,Unnamed: 7_level_4,Unnamed: 8_level_4,Unnamed: 9_level_4,Unnamed: 10_level_4,Unnamed: 11_level_4,Unnamed: 12_level_4,Unnamed: 13_level_4,Unnamed: 14_level_4,Unnamed: 15_level_4,Unnamed: 16_level_4
Weight?,Feature,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5
Weight?,Feature,Unnamed: 2_level_6,Unnamed: 3_level_6,Unnamed: 4_level_6,Unnamed: 5_level_6,Unnamed: 6_level_6,Unnamed: 7_level_6,Unnamed: 8_level_6,Unnamed: 9_level_6,Unnamed: 10_level_6,Unnamed: 11_level_6,Unnamed: 12_level_6,Unnamed: 13_level_6,Unnamed: 14_level_6,Unnamed: 15_level_6,Unnamed: 16_level_6
Weight?,Feature,Unnamed: 2_level_7,Unnamed: 3_level_7,Unnamed: 4_level_7,Unnamed: 5_level_7,Unnamed: 6_level_7,Unnamed: 7_level_7,Unnamed: 8_level_7,Unnamed: 9_level_7,Unnamed: 10_level_7,Unnamed: 11_level_7,Unnamed: 12_level_7,Unnamed: 13_level_7,Unnamed: 14_level_7,Unnamed: 15_level_7,Unnamed: 16_level_7
Weight?,Feature,Unnamed: 2_level_8,Unnamed: 3_level_8,Unnamed: 4_level_8,Unnamed: 5_level_8,Unnamed: 6_level_8,Unnamed: 7_level_8,Unnamed: 8_level_8,Unnamed: 9_level_8,Unnamed: 10_level_8,Unnamed: 11_level_8,Unnamed: 12_level_8,Unnamed: 13_level_8,Unnamed: 14_level_8,Unnamed: 15_level_8,Unnamed: 16_level_8
Weight?,Feature,Unnamed: 2_level_9,Unnamed: 3_level_9,Unnamed: 4_level_9,Unnamed: 5_level_9,Unnamed: 6_level_9,Unnamed: 7_level_9,Unnamed: 8_level_9,Unnamed: 9_level_9,Unnamed: 10_level_9,Unnamed: 11_level_9,Unnamed: 12_level_9,Unnamed: 13_level_9,Unnamed: 14_level_9,Unnamed: 15_level_9,Unnamed: 16_level_9
Weight?,Feature,Unnamed: 2_level_10,Unnamed: 3_level_10,Unnamed: 4_level_10,Unnamed: 5_level_10,Unnamed: 6_level_10,Unnamed: 7_level_10,Unnamed: 8_level_10,Unnamed: 9_level_10,Unnamed: 10_level_10,Unnamed: 11_level_10,Unnamed: 12_level_10,Unnamed: 13_level_10,Unnamed: 14_level_10,Unnamed: 15_level_10,Unnamed: 16_level_10
Weight?,Feature,Unnamed: 2_level_11,Unnamed: 3_level_11,Unnamed: 4_level_11,Unnamed: 5_level_11,Unnamed: 6_level_11,Unnamed: 7_level_11,Unnamed: 8_level_11,Unnamed: 9_level_11,Unnamed: 10_level_11,Unnamed: 11_level_11,Unnamed: 12_level_11,Unnamed: 13_level_11,Unnamed: 14_level_11,Unnamed: 15_level_11,Unnamed: 16_level_11
Weight?,Feature,Unnamed: 2_level_12,Unnamed: 3_level_12,Unnamed: 4_level_12,Unnamed: 5_level_12,Unnamed: 6_level_12,Unnamed: 7_level_12,Unnamed: 8_level_12,Unnamed: 9_level_12,Unnamed: 10_level_12,Unnamed: 11_level_12,Unnamed: 12_level_12,Unnamed: 13_level_12,Unnamed: 14_level_12,Unnamed: 15_level_12,Unnamed: 16_level_12
Weight?,Feature,Unnamed: 2_level_13,Unnamed: 3_level_13,Unnamed: 4_level_13,Unnamed: 5_level_13,Unnamed: 6_level_13,Unnamed: 7_level_13,Unnamed: 8_level_13,Unnamed: 9_level_13,Unnamed: 10_level_13,Unnamed: 11_level_13,Unnamed: 12_level_13,Unnamed: 13_level_13,Unnamed: 14_level_13,Unnamed: 15_level_13,Unnamed: 16_level_13
Weight?,Feature,Unnamed: 2_level_14,Unnamed: 3_level_14,Unnamed: 4_level_14,Unnamed: 5_level_14,Unnamed: 6_level_14,Unnamed: 7_level_14,Unnamed: 8_level_14,Unnamed: 9_level_14,Unnamed: 10_level_14,Unnamed: 11_level_14,Unnamed: 12_level_14,Unnamed: 13_level_14,Unnamed: 14_level_14,Unnamed: 15_level_14,Unnamed: 16_level_14
Weight?,Feature,Unnamed: 2_level_15,Unnamed: 3_level_15,Unnamed: 4_level_15,Unnamed: 5_level_15,Unnamed: 6_level_15,Unnamed: 7_level_15,Unnamed: 8_level_15,Unnamed: 9_level_15,Unnamed: 10_level_15,Unnamed: 11_level_15,Unnamed: 12_level_15,Unnamed: 13_level_15,Unnamed: 14_level_15,Unnamed: 15_level_15,Unnamed: 16_level_15
Weight?,Feature,Unnamed: 2_level_16,Unnamed: 3_level_16,Unnamed: 4_level_16,Unnamed: 5_level_16,Unnamed: 6_level_16,Unnamed: 7_level_16,Unnamed: 8_level_16,Unnamed: 9_level_16,Unnamed: 10_level_16,Unnamed: 11_level_16,Unnamed: 12_level_16,Unnamed: 13_level_16,Unnamed: 14_level_16,Unnamed: 15_level_16,Unnamed: 16_level_16
+7.491,word.lower():month,,,,,,,,,,,,,,,
+7.118,word.lower():last,,,,,,,,,,,,,,,
+5.276,word.lower():year,,,,,,,,,,,,,,,
+5.074,word.lower():chairman,,,,,,,,,,,,,,,
+4.747,word.lower():columbia,,,,,,,,,,,,,,,
+4.721,word[-2]:N1,,,,,,,,,,,,,,,
+4.591,word.lower():week,,,,,,,,,,,,,,,
+4.487,word.lower():internet,,,,,,,,,,,,,,,
+4.298,word.lower():weeks,,,,,,,,,,,,,,,
+4.291,-1:word.lower():iraqi,,,,,,,,,,,,,,,

Weight?,Feature
+7.491,word.lower():month
+7.118,word.lower():last
+5.276,word.lower():year
+5.074,word.lower():chairman
+4.747,word.lower():columbia
+4.721,word[-2]:N1
+4.591,word.lower():week
+4.487,word.lower():internet
+4.298,word.lower():weeks
+4.291,-1:word.lower():iraqi

Weight?,Feature
+4.417,word.lower():twitter
+3.717,word.lower():spaceshipone
+3.421,word.lower():english
+3.295,word[-3:]:One
+3.293,word.lower():canal
+3.140,word.lower():nevirapine
+2.843,-1:word.lower():film
+2.841,-1:word.lower():engine
+2.520,word.lower():spanish
+2.436,word[-2]:00

Weight?,Feature
+2.428,-1:word.lower():boeing
+1.870,+1:word.lower():came
+1.813,+1:word.lower():gained
+1.686,-1:word.lower():cajun
+1.658,word.lower():flowers
+1.595,word.lower():constitution
+1.564,+1:word.lower():times
+1.559,word.lower():notice
+1.543,word.lower():declaration
+1.486,word.lower():monument

Weight?,Feature
+3.494,word.lower():ramadan
+3.247,word.lower():games
+3.067,word[-3:]:mes
+2.866,-1:word.lower():war
+2.824,-1:word.lower():wars
+2.813,-1:word.lower():typhoon
+2.360,word.lower():hopman
+2.228,-1:word.lower():falklands
+2.210,word.lower():olympic
+2.207,word[-3:]:pic

Weight?,Feature
+2.846,word.lower():games
+2.257,+1:word.lower():mascots
+2.071,word.lower():series
+1.978,word.lower():dean
+1.817,+1:word.lower():without
+1.781,+1:word.lower():era
+1.753,+1:word.lower():rally
+1.745,-1:word.lower():jewish
+1.731,+1:word.lower():caused
+1.698,word.lower():sabbath

Weight?,Feature
+5.073,word.lower():caribbean
+4.809,-1:word.lower():mr.
+4.682,word.lower():martian
+4.398,word.lower():beijing
+4.322,word.lower():mid-march
+4.180,word.lower():persian
+3.980,-1:word.lower():hamas
+3.889,-1:word.lower():serb
+3.782,word.lower():paris
+3.506,word.lower():europe

Weight?,Feature
+2.882,word.lower():caribbean
+2.665,+1:word.lower():regional
+2.641,word.lower():east
+2.618,word.lower():island
+2.562,-1:word.lower():nahr
+2.497,word.lower():ocean
+2.420,word.lower():marine
+2.271,-1:word.lower():gulf
+2.238,word.lower():airport
+2.224,word.lower():led-invasion

Weight?,Feature
+6.997,word.lower():niger
+6.226,word.lower():nepal
+5.588,word.lower():afghan
+4.445,word.lower():azerbaijan
+4.279,word.lower():iranian
+4.254,word.lower():korean
+4.249,word.lower():spaniard
+4.150,word.lower():jordan
+4.035,word.lower():argentine
+4.008,word.lower():mexican

Weight?,Feature
+4.659,+1:word.lower():mayor
+3.343,-1:word.lower():bosnian
+2.783,word.lower():cypriots
+2.542,-1:word.lower():democratic
+2.307,word[-3:]:can
+2.263,word.lower():korean
+2.159,+1:word.lower():developed
+2.083,word.lower():indians
+2.055,-1:word.lower():soviet
+2.054,word[-2]:bs

Weight?,Feature
+5.069,word.lower():katrina
+4.591,word.lower():marburg
+3.370,word.lower():rita
+2.938,word[-2]:N1
+2.814,word[-3:]:5N1
+2.814,word.lower():h5n1
+2.636,word[-3:]:ita
+2.586,+1:word.lower():strain
+2.438,word.lower():paul
+2.224,word[-3:]:urg

Weight?,Feature
+1.924,word.lower():rita
+1.917,word[-3:]:ita
+1.729,-1:word.lower():hurricanes
+1.649,word[-2]:ta
+1.614,+1:word.lower():outbreak
+1.573,-1:word.lower():type
+1.510,word[-2]:lu
+1.476,word.lower():flu
+1.471,-1:postag:NN
+1.468,-1:word.istitle()

Weight?,Feature
+6.317,word.lower():philippine
+5.101,word.lower():al-qaida
+4.757,word.lower():hamas
+4.682,-1:word.lower():senator
+4.555,-1:word.lower():rice
+4.546,word.lower():hezbollah
+4.491,word.lower():taleban
+4.306,word.lower():mid-march
+4.230,word.lower():university
+4.138,word.lower():congress

Weight?,Feature
+3.104,+1:word.lower():reporter
+2.632,word.lower():department
+2.621,-1:word.lower():associated
+2.513,word.lower():singapore
+2.488,-1:word.lower():&
+2.483,word.lower():ministry
+2.436,word.lower():airlines
+2.412,+1:word.lower():ms.
+2.406,word.lower():times
+2.401,-1:word.lower():people

Weight?,Feature
+5.775,word.lower():president
+4.828,word.lower():obama
+4.794,word.lower():vice
+4.639,word.lower():prime
+4.594,word.lower():senator
+4.355,word.lower():greenspan
+3.907,word.lower():hall
+3.699,word[-2]:r.
+3.674,word.lower():milosevic
+3.628,word.lower():clinton

Weight?,Feature
+2.983,word.lower():obama
+2.721,+1:word.lower():advisor
+2.609,+1:word.lower():timothy
+2.524,word.lower():peter
+2.471,-1:word.lower():'
+2.366,+1:word.lower():fighters
+2.323,-1:word.lower():michael
+2.293,word.lower():gates
+2.217,-1:word.lower():david
+2.197,-1:word.lower():davis

Weight?,Feature
+5.742,word.lower():multi-candidate
+5.402,+1:word.lower():week
+4.981,word.lower():weekend
+4.951,word.lower():one-year
+4.780,word.lower():2000
+4.742,word.lower():august
+4.715,word.lower():february
+4.701,word.lower():january
+4.418,word.lower():december
+4.249,word[-3:]:Day

Weight?,Feature
+3.361,+1:word.lower():month
+3.141,-1:word.lower():this
+3.136,word.lower():evening
+2.939,-1:word.lower():past
+2.753,word[-2]:m.
+2.753,word[-3:]:.m.
+2.685,word.lower():march
+2.646,word.lower():morning
+2.645,word.lower():august
+2.590,+1:word.lower():early
