# Saving lives with AI#Extract disease names form given set of 20000 paragraphs#AUTHOR-NIKHIL MUGGANAWAR

In [1]:
# import libaries
import numpy as np
import pandas as pd
import nltk, pprint
import matplotlib.pyplot as plt
import random

import gzip, os, pickle # gzip for reading the gz files, pickle to save/dump trained model 
import _pickle as cPickle

import sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.grid_search import RandomizedSearchCV

# supress warnings
import warnings
warnings.filterwarnings('ignore')



# IMPORT DATA

In [2]:
df=pd.read_csv("train.csv",nrows=10000)
df_actual_test=pd.read_csv('test.csv',nrows=5000)

In [3]:
df_actual_test.shape

(5000, 4)

In [4]:
df.columns

Index(['id', 'Doc_ID', 'Sent_ID', 'Word', 'tag'], dtype='object')

In [5]:
from sklearn.model_selection import train_test_split
df_train, df_test= train_test_split(df,train_size=0.50, random_state=101)

In [6]:
disease_indicators=[]
for i in df_train['Word']:
    for j in df_train['tag']:
        if j[0]=='B' or j[0]=='I':
            if i.lower() not in disease_indicators:
                disease_indicators.append(i.lower())
    #print (i)

In [7]:
# extracts features for the word at index i in a sentence 
def npchunk_features(sentence, i, history):
    word, pos = sentence[i]
    
    # the first word has both previous word and previous tag undefined
    if i == 0:
        prevword, prevpos = "<START>", "<START>"
    else:
        prevword, prevpos = sentence[i-1]

    # gazetteer lookup features (see section below)
    #gazetteer = gazetteer_lookup(word)
    if word in disease_indicators:
        return {"pos": pos, "prevpos": prevpos, 'word':word,
           'word_is_disease_indicator': word,
            } 
    else:
        return {"pos": pos, "prevpos": prevpos, 'word':word,
            }
    

In [8]:
train_postag=df_train['Word']
train_label=df_train['tag']
test_postag=df_test['Word']
test_label=df_test['tag']
test_postag_actual=df_actual_test['Word']

In [9]:
test_postag_actual.shape

(5000,)

In [10]:
train_postag=train_postag.str.lower()
test_postag=test_postag.str.lower()
test_postag_actual=test_postag_actual.str.lower()

In [11]:
alphabets=['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z']

In [12]:
#filter only those strings which start with a character
def removenumbersandspecialcharacters(list):
    new_list=[]    
    for i in list:
        try:
            if i[0] in alphabets:
                new_list.append(i)
                
        except:
            continue
    return new_list         

In [13]:
train_postag_new=removenumbersandspecialcharacters(train_postag)
test_postag_new=removenumbersandspecialcharacters(test_postag)
test_postag_actual_new=removenumbersandspecialcharacters(test_postag_actual)

In [14]:
# POS tagging sentences
# takes in a list of sentences and returns a list of POS-tagged sentences
# in the form (word, tag)
def pos_tag(sent_list):
    pos_tags = []    
    tagged_words = nltk.pos_tag([sent for sent in sent_list if sent])
    pos_tags.append(tagged_words)
    return pos_tags

In [15]:
train_pos = pos_tag(train_postag_new)
test_pos = pos_tag(test_postag_new)
test_pos_actual = pos_tag(test_postag_actual_new)

In [16]:
#train_label=list(map(lambda x:x.split('-')[0],train_label))
#test_label=list(map(lambda x:x.split('-')[0],test_label))
train_label=list(map(lambda x:x.split('indications')[0],train_label))
test_label=list(map(lambda x:x.split('indications')[0],test_label))

In [17]:
#train_label=train_label.tolist()
#test_label=test_label.tolist()

In [18]:
print(train_label)

['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'I-', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O

In [19]:
newtrain=[]
for x in train_pos:
  for y in x:
    for n in y:
        newtrain.append(n.split(" "))
        #print(new, end=' ')

In [20]:
newtest=[]
for x in test_pos:
  for y in x:
    for n in y:
        newtest.append(n.split(" "))
        #print(new, end=' ')

In [21]:
newtestactual=[]
for x in test_pos_actual:
  for y in x:
    for n in y:
        newtestactual.append(n.split(" "))

In [22]:
#subset = test_pos_actual[['data_date', 'data_1', 'data_2']]
#tuples = [tuple(x) for x in subset.values]

In [23]:
wordtrain=newtrain[::2]
postrain=newtrain[1::2]
wordtest=newtest[::2]
postest=newtest[1::2]

In [24]:
wordtestactual=newtestactual[::2]
postestactual=newtestactual[1::2]

In [25]:
wordtrain = [item for sublist in wordtrain for item in sublist]
postrain = [item for sublist in postrain for item in sublist]
wordtest = [item for sublist in wordtest for item in sublist]
postest = [item for sublist in postest for item in sublist]

In [26]:
postestactual = [item for sublist in postestactual for item in sublist]

In [27]:
resulttrain=list(zip(wordtrain,postrain,train_label))
resulttest=list(zip(wordtest,postest,test_label))

In [28]:
output = []
for x in train_label:
    if x not in output:
        output.append(x)
print (output)

['O', 'I-', 'B-']


In [29]:
from nltk.corpus import conll2000
from nltk import conlltags2tree, tree2conlltags

In [30]:
train_trees = [conlltags2tree(resulttrain)]
test_trees = [conlltags2tree(resulttest)]

In [31]:
# unigram chunker

from nltk import ChunkParserI

class UnigramChunker(ChunkParserI):    
    def __init__(self, train_sents):
        # convert train sents from tree format to tags
        train_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)] 
                      for sent in train_sents]
        self.tagger = nltk.UnigramTagger(train_data)
        
    def parse(self, sentence):
        pos_tags = [pos for (word, pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        
        # convert to tree again
        conlltags = [(word, pos, chunktag) for ((word, pos), chunktag) in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)
        

In [32]:
# unigram chunker 
unigram_chunker = UnigramChunker(train_trees)
print(unigram_chunker.evaluate(test_trees))

ChunkParse score:
    IOB Accuracy:  98.6%%
    Precision:      0.0%%
    Recall:         0.0%%
    F-Measure:      0.0%%


In [33]:
# extract the list of pos tags
postags = sorted(set([pos for sent in train_trees for (word, pos) in sent.leaves()]))

# for each tag, assign the most likely IOB label
print(unigram_chunker.tagger.tag(postags))

[('CC', 'O'), ('CD', 'O'), ('DT', 'O'), ('EX', 'O'), ('FW', 'O'), ('IN', 'O'), ('JJ', 'O'), ('JJR', 'O'), ('JJS', 'O'), ('MD', 'O'), ('NN', 'O'), ('NNP', 'O'), ('NNS', 'O'), ('PDT', 'O'), ('PRP', 'O'), ('PRP$', 'O'), ('RB', 'O'), ('RBR', 'O'), ('RBS', 'O'), ('TO', 'O'), ('VB', 'O'), ('VBD', 'O'), ('VBG', 'O'), ('VBN', 'O'), ('VBP', 'O'), ('VBZ', 'O'), ('WDT', 'O'), ('WP', 'O'), ('WP$', 'O'), ('WRB', 'O')]


In [34]:
# bigram tagger

class BigramChunker(ChunkParserI):    
    def __init__(self, train_sents):
        # convert train sents from tree format to tags
        train_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)] 
                      for sent in train_sents]
        self.tagger = nltk.BigramTagger(train_data)
        
    def parse(self, sentence):
        pos_tags = [pos for (word, pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        
        # convert to tree again
        conlltags = [(word, pos, chunktag) for ((word, pos), chunktag) in zip(sentence, chunktags)]
        return nltk.chunk.conlltags2tree(conlltags)

In [35]:
# biigram chunker 
bigram_chunker = BigramChunker(train_trees)
print(bigram_chunker.evaluate(test_trees))

ChunkParse score:
    IOB Accuracy:  98.6%%
    Precision:      0.0%%
    Recall:         0.0%%
    F-Measure:      0.0%%


In [36]:
# example sentence
sent_pos = train_pos[0]
sent_pos

[('empresas', 'NN'),
 ('ligand', 'NN'),
 ('identified', 'VBN'),
 ('regulation', 'NN'),
 ('the', 'DT'),
 ('dónde', 'NN'),
 ('proteolysis', 'NN'),
 ('to', 'TO'),
 ('agbr', 'VB'),
 ('temperature', 'NN'),
 ('to', 'TO'),
 ('on', 'IN'),
 ('the', 'DT'),
 ('phenotypic', 'NN'),
 ('of', 'IN'),
 ('in', 'IN'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('quantitative', 'NN'),
 ('a', 'DT'),
 ('stable', 'JJ'),
 ('imaging', 'NN'),
 ('graphite', 'JJ'),
 ('hence', 'NN'),
 ('testing', 'VBG'),
 ('we', 'PRP'),
 ('los', 'JJ'),
 ('results', 'NNS'),
 ('período', 'VBP'),
 ('the', 'DT'),
 ('tumor', 'NN'),
 ('speech', 'NN'),
 ('brønsted', 'VBN'),
 ('powerful', 'JJ'),
 ('world', 'NN'),
 ('material', 'NN'),
 ('coincidental', 'VBP'),
 ('the', 'DT'),
 ('of', 'IN'),
 ('of', 'IN'),
 ('that', 'DT'),
 ('followed', 'VBD'),
 ('to', 'TO'),
 ('to', 'TO'),
 ('in', 'IN'),
 ('to', 'TO'),
 ('injury', 'VB'),
 ('and', 'CC'),
 ('failure', 'VB'),
 ('release', 'NN'),
 ('frontal', 'JJ'),
 ('however', 'RB'),
 ('is', 'VBZ'),
 ('applications', 'N

In [37]:
# features for sentence sent_pos
# each word's features are stored in a dict
for i in range(len(sent_pos)):
    print(npchunk_features(sent_pos, i, history=[]))
    print(' ')

{'pos': 'NN', 'prevpos': '<START>', 'word': 'empresas', 'word_is_disease_indicator': 'empresas'}
 
{'pos': 'NN', 'prevpos': 'NN', 'word': 'ligand', 'word_is_disease_indicator': 'ligand'}
 
{'pos': 'VBN', 'prevpos': 'NN', 'word': 'identified', 'word_is_disease_indicator': 'identified'}
 
{'pos': 'NN', 'prevpos': 'VBN', 'word': 'regulation', 'word_is_disease_indicator': 'regulation'}
 
{'pos': 'DT', 'prevpos': 'NN', 'word': 'the', 'word_is_disease_indicator': 'the'}
 
{'pos': 'NN', 'prevpos': 'DT', 'word': 'dónde', 'word_is_disease_indicator': 'dónde'}
 
{'pos': 'NN', 'prevpos': 'NN', 'word': 'proteolysis', 'word_is_disease_indicator': 'proteolysis'}
 
{'pos': 'TO', 'prevpos': 'NN', 'word': 'to', 'word_is_disease_indicator': 'to'}
 
{'pos': 'VB', 'prevpos': 'TO', 'word': 'agbr', 'word_is_disease_indicator': 'agbr'}
 
{'pos': 'NN', 'prevpos': 'VB', 'word': 'temperature', 'word_is_disease_indicator': 'temperature'}
 
{'pos': 'TO', 'prevpos': 'NN', 'word': 'to', 'word_is_disease_indicator':

 
{'pos': 'VBN', 'prevpos': 'VBD', 'word': 'study', 'word_is_disease_indicator': 'study'}
 
{'pos': 'DT', 'prevpos': 'VBN', 'word': 'the', 'word_is_disease_indicator': 'the'}
 
{'pos': 'NN', 'prevpos': 'DT', 'word': 'memory', 'word_is_disease_indicator': 'memory'}
 
{'pos': 'IN', 'prevpos': 'NN', 'word': 'of', 'word_is_disease_indicator': 'of'}
 
{'pos': 'NNS', 'prevpos': 'IN', 'word': 'abortions', 'word_is_disease_indicator': 'abortions'}
 
{'pos': 'VBP', 'prevpos': 'NNS', 'word': 'sawdust', 'word_is_disease_indicator': 'sawdust'}
 
{'pos': 'DT', 'prevpos': 'VBP', 'word': 'a', 'word_is_disease_indicator': 'a'}
 
{'pos': 'NN', 'prevpos': 'DT', 'word': 'lecturer', 'word_is_disease_indicator': 'lecturer'}
 
{'pos': 'DT', 'prevpos': 'NN', 'word': 'the', 'word_is_disease_indicator': 'the'}
 
{'pos': 'CC', 'prevpos': 'DT', 'word': 'and', 'word_is_disease_indicator': 'and'}
 
{'pos': 'JJR', 'prevpos': 'CC', 'word': 'less', 'word_is_disease_indicator': 'less'}
 
{'pos': 'JJ', 'prevpos': 'JJR'

{'pos': 'NNS', 'prevpos': 'NN', 'word': 'results', 'word_is_disease_indicator': 'results'}
 
{'pos': 'JJ', 'prevpos': 'NNS', 'word': 'impairment', 'word_is_disease_indicator': 'impairment'}
 
{'pos': 'NN', 'prevpos': 'JJ', 'word': 'effect', 'word_is_disease_indicator': 'effect'}
 
{'pos': 'VBD', 'prevpos': 'NN', 'word': 'observed', 'word_is_disease_indicator': 'observed'}
 
{'pos': 'IN', 'prevpos': 'VBD', 'word': 'that', 'word_is_disease_indicator': 'that'}
 
{'pos': 'IN', 'prevpos': 'IN', 'word': 'of', 'word_is_disease_indicator': 'of'}
 
{'pos': 'DT', 'prevpos': 'IN', 'word': 'the', 'word_is_disease_indicator': 'the'}
 
{'pos': 'JJ', 'prevpos': 'DT', 'word': 'potential', 'word_is_disease_indicator': 'potential'}
 
{'pos': 'RB', 'prevpos': 'JJ', 'word': 'unfortunately', 'word_is_disease_indicator': 'unfortunately'}
 
{'pos': 'DT', 'prevpos': 'RB', 'word': 'the', 'word_is_disease_indicator': 'the'}
 
{'pos': 'NN', 'prevpos': 'DT', 'word': 'cl', 'word_is_disease_indicator': 'cl'}
 
{'po

 
{'pos': 'VB', 'prevpos': 'TO', 'word': 'judgment', 'word_is_disease_indicator': 'judgment'}
 
{'pos': 'NN', 'prevpos': 'VB', 'word': 'p37', 'word_is_disease_indicator': 'p37'}
 
{'pos': 'JJ', 'prevpos': 'NN', 'word': 'unexplained', 'word_is_disease_indicator': 'unexplained'}
 
{'pos': 'WDT', 'prevpos': 'JJ', 'word': 'which', 'word_is_disease_indicator': 'which'}
 
{'pos': 'VBP', 'prevpos': 'WDT', 'word': 'born', 'word_is_disease_indicator': 'born'}
 
{'pos': 'NN', 'prevpos': 'VBP', 'word': 'plasma', 'word_is_disease_indicator': 'plasma'}
 
{'pos': 'NN', 'prevpos': 'NN', 'word': 'median', 'word_is_disease_indicator': 'median'}
 
{'pos': 'PRP', 'prevpos': 'NN', 'word': 'they', 'word_is_disease_indicator': 'they'}
 
{'pos': 'NN', 'prevpos': 'PRP', 'word': 'treatment', 'word_is_disease_indicator': 'treatment'}
 
{'pos': 'IN', 'prevpos': 'NN', 'word': 'in', 'word_is_disease_indicator': 'in'}
 
{'pos': 'JJ', 'prevpos': 'IN', 'word': 'duration-matched', 'word_is_disease_indicator': 'duratio

 
{'pos': 'VBP', 'prevpos': 'NNS', 'word': 'hypothalamic', 'word_is_disease_indicator': 'hypothalamic'}
 
{'pos': 'VBP', 'prevpos': 'VBP', 'word': 'have', 'word_is_disease_indicator': 'have'}
 
{'pos': 'TO', 'prevpos': 'VBP', 'word': 'to', 'word_is_disease_indicator': 'to'}
 
{'pos': 'IN', 'prevpos': 'TO', 'word': 'on', 'word_is_disease_indicator': 'on'}
 
{'pos': 'CC', 'prevpos': 'IN', 'word': 'and', 'word_is_disease_indicator': 'and'}
 
{'pos': 'VB', 'prevpos': 'CC', 'word': 'tests..', 'word_is_disease_indicator': 'tests..'}
 
{'pos': 'DT', 'prevpos': 'VB', 'word': 'the', 'word_is_disease_indicator': 'the'}
 
{'pos': 'VBZ', 'prevpos': 'DT', 'word': 'is', 'word_is_disease_indicator': 'is'}
 
{'pos': 'NNS', 'prevpos': 'VBZ', 'word': 'investigations', 'word_is_disease_indicator': 'investigations'}
 
{'pos': 'JJ', 'prevpos': 'NNS', 'word': 'simplex', 'word_is_disease_indicator': 'simplex'}
 
{'pos': 'IN', 'prevpos': 'JJ', 'word': 'against', 'word_is_disease_indicator': 'against'}
 
{'pos

{'pos': 'CD', 'prevpos': 'VBP', 'word': 'five', 'word_is_disease_indicator': 'five'}
 
{'pos': 'NN', 'prevpos': 'CD', 'word': 'column', 'word_is_disease_indicator': 'column'}
 
{'pos': 'RBR', 'prevpos': 'NN', 'word': 'further', 'word_is_disease_indicator': 'further'}
 
{'pos': 'NN', 'prevpos': 'RBR', 'word': 'lo', 'word_is_disease_indicator': 'lo'}
 
{'pos': 'NNS', 'prevpos': 'NN', 'word': 'lsd', 'word_is_disease_indicator': 'lsd'}
 
{'pos': 'VBN', 'prevpos': 'NNS', 'word': 'proposed', 'word_is_disease_indicator': 'proposed'}
 
{'pos': 'TO', 'prevpos': 'VBN', 'word': 'to', 'word_is_disease_indicator': 'to'}
 
{'pos': 'DT', 'prevpos': 'TO', 'word': 'no', 'word_is_disease_indicator': 'no'}
 
{'pos': 'JJ', 'prevpos': 'DT', 'word': 'practical', 'word_is_disease_indicator': 'practical'}
 
{'pos': 'IN', 'prevpos': 'JJ', 'word': 'in', 'word_is_disease_indicator': 'in'}
 
{'pos': 'DT', 'prevpos': 'IN', 'word': 'the', 'word_is_disease_indicator': 'the'}
 
{'pos': 'JJ', 'prevpos': 'DT', 'word': 

 
{'pos': 'NN', 'prevpos': 'JJ', 'word': 'order', 'word_is_disease_indicator': 'order'}
 
{'pos': 'NN', 'prevpos': 'NN', 'word': 'design', 'word_is_disease_indicator': 'design'}
 
{'pos': 'NN', 'prevpos': 'NN', 'word': 'concentration', 'word_is_disease_indicator': 'concentration'}
 
{'pos': 'NNS', 'prevpos': 'NN', 'word': 'patients', 'word_is_disease_indicator': 'patients'}
 
{'pos': 'JJ', 'prevpos': 'NNS', 'word': 'north', 'word_is_disease_indicator': 'north'}
 
{'pos': 'VBP', 'prevpos': 'JJ', 'word': 'gender', 'word_is_disease_indicator': 'gender'}
 
{'pos': 'DT', 'prevpos': 'VBP', 'word': 'the', 'word_is_disease_indicator': 'the'}
 
{'pos': 'IN', 'prevpos': 'DT', 'word': 'between', 'word_is_disease_indicator': 'between'}
 
{'pos': 'JJ', 'prevpos': 'IN', 'word': 'methodological', 'word_is_disease_indicator': 'methodological'}
 
{'pos': 'VBN', 'prevpos': 'JJ', 'word': 'treated', 'word_is_disease_indicator': 'treated'}
 
{'pos': 'JJ', 'prevpos': 'VBN', 'word': 'important', 'word_is_dis

 
{'pos': 'IN', 'prevpos': 'JJ', 'word': 'under', 'word_is_disease_indicator': 'under'}
 
{'pos': 'IN', 'prevpos': 'IN', 'word': 'in', 'word_is_disease_indicator': 'in'}
 
{'pos': 'CC', 'prevpos': 'IN', 'word': 'and', 'word_is_disease_indicator': 'and'}
 
{'pos': 'IN', 'prevpos': 'CC', 'word': 'for', 'word_is_disease_indicator': 'for'}
 
{'pos': 'IN', 'prevpos': 'IN', 'word': 'with', 'word_is_disease_indicator': 'with'}
 
{'pos': 'NNS', 'prevpos': 'IN', 'word': 'monomers', 'word_is_disease_indicator': 'monomers'}
 
{'pos': 'JJ', 'prevpos': 'NNS', 'word': 'post-label', 'word_is_disease_indicator': 'post-label'}
 
{'pos': 'JJ', 'prevpos': 'JJ', 'word': 'available', 'word_is_disease_indicator': 'available'}
 
{'pos': 'JJ', 'prevpos': 'JJ', 'word': 'logistic', 'word_is_disease_indicator': 'logistic'}
 
{'pos': 'TO', 'prevpos': 'JJ', 'word': 'to', 'word_is_disease_indicator': 'to'}
 
{'pos': 'VBD', 'prevpos': 'TO', 'word': 'were', 'word_is_disease_indicator': 'were'}
 
{'pos': 'JJ', 'prevpo

{'pos': 'VBD', 'prevpos': 'NN', 'word': 'increased', 'word_is_disease_indicator': 'increased'}
 
{'pos': 'VBN', 'prevpos': 'VBD', 'word': 'shown', 'word_is_disease_indicator': 'shown'}
 
{'pos': 'JJ', 'prevpos': 'VBN', 'word': 'due', 'word_is_disease_indicator': 'due'}
 
{'pos': 'IN', 'prevpos': 'JJ', 'word': 'after', 'word_is_disease_indicator': 'after'}
 
{'pos': 'JJ', 'prevpos': 'IN', 'word': 'successful', 'word_is_disease_indicator': 'successful'}
 
{'pos': 'NNS', 'prevpos': 'JJ', 'word': 'cells', 'word_is_disease_indicator': 'cells'}
 
{'pos': 'NNS', 'prevpos': 'NNS', 'word': 'patients', 'word_is_disease_indicator': 'patients'}
 
{'pos': 'DT', 'prevpos': 'NNS', 'word': 'an', 'word_is_disease_indicator': 'an'}
 
{'pos': 'IN', 'prevpos': 'DT', 'word': 'of', 'word_is_disease_indicator': 'of'}
 
{'pos': 'JJ', 'prevpos': 'IN', 'word': 'novel', 'word_is_disease_indicator': 'novel'}
 
{'pos': 'JJ', 'prevpos': 'JJ', 'word': 'nps', 'word_is_disease_indicator': 'nps'}
 
{'pos': 'NN', 'prevp

In [38]:
class ConsecutiveNPChunkTagger(nltk.TaggerI): 

    def __init__(self, train_sents):
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            # compute features for each word
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = npchunk_features(untagged_sent, i, history) 
                train_set.append( (featureset, tag) )
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = npchunk_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)

class ConsecutiveNPChunker(nltk.ChunkParserI): 
    def __init__(self, train_sents):
        tagged_sents = [[((w,t),c) for (w,t,c) in
                         nltk.chunk.tree2conlltags(sent)]
                        for sent in train_sents]
        self.tagger = ConsecutiveNPChunkTagger(tagged_sents)

    def parse(self, sentence):
        tagged_sents = self.tagger.tag(sentence)
        conlltags = [(w,t,c) for ((w,t),c) in tagged_sents]
        return nltk.chunk.conlltags2tree(conlltags)

In [39]:
# training the chunker 
chunker = ConsecutiveNPChunker(train_trees)

In [40]:
# evaluate the chunker

print(chunker.evaluate(test_trees))

ChunkParse score:
    IOB Accuracy:  98.3%%
    Precision:      5.9%%
    Recall:         1.8%%
    F-Measure:      2.7%%


In [41]:
print(unigram_chunker.tagger.tag(postestactual))

[('NN', 'O'), ('VB', 'O'), ('PRP$', 'O'), ('JJ', 'O'), ('NN', 'O'), ('NNS', 'O'), ('IN', 'O'), ('JJ', 'O'), ('NNS', 'O'), ('JJ', 'O'), ('NN', 'O'), ('NN', 'O'), ('IN', 'O'), ('JJ', 'O'), ('NNS', 'O'), ('TO', 'O'), ('DT', 'O'), ('NN', 'O'), ('IN', 'O'), ('NN', 'O'), ('NN', 'O'), ('IN', 'O'), ('DT', 'O'), ('NN', 'O'), ('NN', 'O'), ('IN', 'O'), ('DT', 'O'), ('JJ', 'O'), ('NN', 'O'), ('IN', 'O'), ('NN', 'O'), ('JJ', 'O'), ('NNS', 'O'), ('IN', 'O'), ('NN', 'O'), ('JJ', 'O'), ('NNS', 'O'), ('PRP', 'O'), ('VBP', 'O'), ('JJ', 'O'), ('NNS', 'O'), ('WDT', 'O'), ('MD', 'O'), ('VB', 'O'), ('VBN', 'O'), ('IN', 'O'), ('DT', 'O'), ('NN', 'O'), ('IN', 'O'), ('CD', 'O'), ('NN', 'O'), ('JJ', 'O'), ('NNS', 'O'), ('PRP', 'O'), ('VBP', 'O'), ('IN', 'O'), ('DT', 'O'), ('JJ', 'O'), ('NNS', 'O'), ('VBG', 'O'), ('JJ', 'O'), ('DT', 'O'), ('NN', 'O'), ('VBP', 'O'), ('DT', 'O'), ('NNS', 'O'), ('VBG', 'O'), ('IN', 'O'), ('DT', 'O'), ('NN', 'O'), ('IN', 'O'), ('IN', 'O'), ('JJS', 'O'), ('CD', 'O'), ('JJ', 'O'), ('N

In [42]:
outputfinal=unigram_chunker.tagger.tag(postestactual)

In [43]:
dfoutput = pd.DataFrame (outputfinal)

In [44]:
dfoutput.shape

(4087, 2)

In [45]:
filepath = 'IOBPrediction.xlsx'
dfoutput.to_excel(filepath, index=False)