In [30]:
import nltk
import pycrfsuite
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
random_seed = 123

## Load Data

In [31]:
def loadData(filename):
    sentences = []
    with open(filename, encoding='iso8859-15') as f:
        sent = []
        for line in f:
            line = line.strip()
            if (len(line) == 0 ):
                if len(sent) != 0:
                    sentences.append(sent)
                    sent = []
            else:
                ls = line.split(' ')
                word, tag = ls[0],ls[-1]
                sent.append((word,tag))
    return sentences

def writeData(filename,sentences):
    with open(filename,"w") as f:
        for sent in sentences:
            for tuple in sent:
                f.write(tuple[0] + " "+ tuple[1]+"\n")
            f.write("\n")
            
def writeData1(filename,sentences):
    with open(filename,"w") as f:
        for sent in sentences:
            for tuple in sent:
                f.write(tuple[0] + "\n")
            f.write("\n")

In [32]:
def pos_tag_sentence(sentences):
    pos_tagged_sentences = []
    for i, sent in enumerate(sentences):
        # Obtain the list of tokens in the document
        tokens = [token for token, label in sent]
        # Perform POS tagging
        tagged = nltk.pos_tag(tokens)
        # Take the word, POS tag, and its label
        pos_tagged_sentences.append([(w, pos, label) for (w, label), (word, pos) in zip(sent, tagged)])
    return pos_tagged_sentences

def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    features = [
        'bias',
        'word.lower=' + word.lower(),
        'word[-3:]=' + word[-3:],
        'word[-2:]=' + word[-2:],
        'word.isupper=%s' % word.isupper(),
        'word.istitle=%s' % word.istitle(),
        'word.isdigit=%s' % word.isdigit(),
        'postag=' + postag,
        'postag[:2]=' + postag[:2],
    ]
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.extend([
            '-1:word.lower=' + word1.lower(),
            '-1:word.istitle=%s' % word1.istitle(),
            '-1:word.isupper=%s' % word1.isupper(),
            '-1:postag=' + postag1,
            '-1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('BOS')
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.extend([
            '+1:word.lower=' + word1.lower(),
            '+1:word.istitle=%s' % word1.istitle(),
            '+1:word.isupper=%s' % word1.isupper(),
            '+1:postag=' + postag1,
            '+1:postag[:2]=' + postag1[:2],
        ])
    else:
        features.append('EOS')
                
    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [33]:
sentences = loadData('ner.txt')
sentences = pos_tag_sentence(sentences)
X = [sent2features(s) for s in sentences]
y = [sent2labels(s) for s in sentences]

# X = sentences
# y=sentences

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=random_seed)
X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size=0.10, random_state=random_seed)

In [35]:
print(X_train[0])
# writeData("ner.train",X_train)
# writeData("ner.dev",X_dev)
# writeData("ner.test",X_test)

[['bias', 'word.lower=studies', 'word[-3:]=ies', 'word[-2:]=es', 'word.isupper=False', 'word.istitle=True', 'word.isdigit=False', 'postag=NNS', 'postag[:2]=NN', 'BOS', '+1:word.lower=on', '+1:word.istitle=False', '+1:word.isupper=False', '+1:postag=IN', '+1:postag[:2]=IN'], ['bias', 'word.lower=on', 'word[-3:]=on', 'word[-2:]=on', 'word.isupper=False', 'word.istitle=False', 'word.isdigit=False', 'postag=IN', 'postag[:2]=IN', '-1:word.lower=studies', '-1:word.istitle=True', '-1:word.isupper=False', '-1:postag=NNS', '-1:postag[:2]=NN', '+1:word.lower=the', '+1:word.istitle=False', '+1:word.isupper=False', '+1:postag=DT', '+1:postag[:2]=DT'], ['bias', 'word.lower=the', 'word[-3:]=the', 'word[-2:]=he', 'word.isupper=False', 'word.istitle=False', 'word.isdigit=False', 'postag=DT', 'postag[:2]=DT', '-1:word.lower=on', '-1:word.istitle=False', '-1:word.isupper=False', '-1:postag=IN', '-1:postag[:2]=IN', '+1:word.lower=radioimmunoassay', '+1:word.istitle=False', '+1:word.isupper=False', '+1:po

## Training the model

In [36]:
trainer = pycrfsuite.Trainer(verbose=False)
for xseq, yseq in zip(X_train, y_train):
    trainer.append(xseq, yseq)
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 1000,  # stop earlier
    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})
trainer.train('ner.model')

In [37]:
trainer.params()
trainer.logparser.last_iteration
print(len(trainer.logparser.iterations), trainer.logparser.iterations[-1])

969 {'num': 969, 'scores': {}, 'loss': 5206.987555, 'feature_norm': 52.912217, 'error_norm': 17.438782, 'active_features': 1860, 'linesearch_trials': 2, 'linesearch_step': 0.5, 'time': 0.067}


## Evaluating the model on Dev/Test set

In [38]:
tagger = pycrfsuite.Tagger()
tagger.open('ner.model')

# Create a mapping of labels to indices
labels = {"D": 1, "T": 2, "O":0}

print("Evaluating on Dev Set")
#Evaluating on Dev Set
y_pred_dev = [tagger.tag(xseq) for xseq in X_dev]
# for i in range(len(X_test)):
#     for x, y,yp in zip([x[1].split("=")[1] for x in X_test[i]],y_test[i],y_pred_test[i]):
#         print("%s (%s) (%s)" % (x, y, yp))
# Convert the sequences of tags into a 1-dimensional array
predictions = np.array([labels[tag] for row in y_pred_dev for tag in row])
truths = np.array([labels[tag] for row in y_dev for tag in row])

# Print out the classification report
print(classification_report(
    truths, predictions,
    labels = [0,1,2],
    target_names=["O", "D","T"]))


print("Evaluating on Train Set")
#Evaluating on Train Set
y_pred_test = [tagger.tag(xseq) for xseq in X_test]

# Convert the sequences of tags into a 1-dimensional array
predictions = np.array([labels[tag] for row in y_pred_test for tag in row])
truths = np.array([labels[tag] for row in y_test for tag in row])

# Print out the classification report
print(classification_report(
    truths, predictions,
    labels = [0,1,2],
    target_names=["O", "D","T"]))

# for i in range(len(X_dev)):
#     for x, y,yp in zip([x[1].split("=")[1] for x in X_dev[i]],y_dev[i],y_pred_dev[i]):
#         print("%s (%s) (%s)" % (x, y, yp))

Evaluating on Dev Set
             precision    recall  f1-score   support

          O       0.93      0.97      0.95      4593
          D       0.83      0.69      0.75       384
          T       0.47      0.32      0.38       302

avg / total       0.90      0.91      0.90      5279

Evaluating on Train Set
             precision    recall  f1-score   support

          O       0.94      0.97      0.96     10874
          D       0.80      0.65      0.71      1007
          T       0.75      0.58      0.66       796

avg / total       0.92      0.92      0.92     12677



In [39]:
from collections import Counter
info = tagger.info()

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(info.transitions).most_common(9))

print("\nTop unlikely transitions:")
print_transitions(Counter(info.transitions).most_common()[-9:])

Top likely transitions:
D      -> D       2.735692
O      -> O       2.035593
T      -> T       1.833928
D      -> O       -1.411420
T      -> O       -1.456108
O      -> D       -1.473607
O      -> T       -2.462634
D      -> T       -2.854172
T      -> D       -4.945421

Top unlikely transitions:
D      -> D       2.735692
O      -> O       2.035593
T      -> T       1.833928
D      -> O       -1.411420
T      -> O       -1.456108
O      -> D       -1.473607
O      -> T       -2.462634
D      -> T       -2.854172
T      -> D       -4.945421


In [40]:
def print_state_features(state_features):
    for (attr, label), weight in state_features:
        print("%0.6f %-6s %s" % (weight, label, attr))    

print("Top positive:")
print_state_features(Counter(info.state_features).most_common(20))

print("\nTop negative:")
print_state_features(Counter(info.state_features).most_common()[-20:])

Top positive:
6.404469 D      word.lower=diabetes
6.397222 D      word.lower=incontinence
6.053253 D      word.lower=tumors
5.649193 D      word.lower=cancers
5.511970 D      -1:word.lower=baclofen
5.439216 T      word.lower=fenfluramines
5.116713 T      word.lower=resection
4.974191 T      word.lower=antibiotics
4.907409 T      word.lower=vaccination
4.814898 T      word[-3:]=xel
4.740379 D      word.lower=bleeding
4.680517 D      word.lower=depression
4.622520 T      word.lower=alteplase
4.600431 T      +1:word.lower=yag
4.461216 O      word.lower=versus
4.392960 D      word[-2:]=lc
4.378241 D      word.lower=infection
4.340741 T      word.lower=ventilation
4.239456 D      word.lower=strokes
4.198719 D      word.lower=hypertension

Top negative:
-1.693500 O      postag[:2]=NN
-1.723514 T      word[-2:]=as
-1.790795 O      word[-3:]=oot
-1.820180 O      word[-3:]=che
-1.829894 D      +1:word.lower=for
-1.832364 O      word.lower=lung
-1.846011 O      word.lower=pulmonary
-1.860870 O  