In [None]:
# Training our own chunker
# conll2000 corpus - CoNLL-2000 (Conference on Computational Natural Language Learning) Wall Street Journal data with POS and chunk tags

In [2]:
from nltk.corpus import conll2000
# Take the first sentence of the corpus as a chunk tree and print it for inspection.
chunked_sent = conll2000.chunked_sents()[0]
print(chunked_sent)

(S
  (NP Confidence/NN)
  (PP in/IN)
  (NP the/DT pound/NN)
  (VP is/VBZ widely/RB expected/VBN to/TO take/VB)
  (NP another/DT sharp/JJ dive/NN)
  if/IN
  (NP trade/NN figures/NNS)
  (PP for/IN)
  (NP September/NNP)
  ,/,
  due/JJ
  (PP for/IN)
  (NP release/NN)
  (NP tomorrow/NN)
  ,/,
  (VP fail/VB to/TO show/VB)
  (NP a/DT substantial/JJ improvement/NN)
  (PP from/IN)
  (NP July/NNP and/CC August/NNP)
  (NP 's/POS near-record/JJ deficits/NNS)
  ./.)


In [3]:
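# IOB format: each token is tagged B-<type> (begins a chunk), I-<type> (inside a chunk) or O (outside any chunk)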
from nltk.chunk import tree2conlltags, conlltags2tree
# Flatten the chunk tree into (word, POS tag, IOB chunk tag) triples and print them.
iob_tagged = tree2conlltags(chunked_sent)
print(iob_tagged)

[('Confidence', 'NN', 'B-NP'), ('in', 'IN', 'B-PP'), ('the', 'DT', 'B-NP'), ('pound', 'NN', 'I-NP'), ('is', 'VBZ', 'B-VP'), ('widely', 'RB', 'I-VP'), ('expected', 'VBN', 'I-VP'), ('to', 'TO', 'I-VP'), ('take', 'VB', 'I-VP'), ('another', 'DT', 'B-NP'), ('sharp', 'JJ', 'I-NP'), ('dive', 'NN', 'I-NP'), ('if', 'IN', 'O'), ('trade', 'NN', 'B-NP'), ('figures', 'NNS', 'I-NP'), ('for', 'IN', 'B-PP'), ('September', 'NNP', 'B-NP'), (',', ',', 'O'), ('due', 'JJ', 'O'), ('for', 'IN', 'B-PP'), ('release', 'NN', 'B-NP'), ('tomorrow', 'NN', 'B-NP'), (',', ',', 'O'), ('fail', 'VB', 'B-VP'), ('to', 'TO', 'I-VP'), ('show', 'VB', 'I-VP'), ('a', 'DT', 'B-NP'), ('substantial', 'JJ', 'I-NP'), ('improvement', 'NN', 'I-NP'), ('from', 'IN', 'B-PP'), ('July', 'NNP', 'B-NP'), ('and', 'CC', 'I-NP'), ('August', 'NNP', 'I-NP'), ("'s", 'POS', 'B-NP'), ('near-record', 'JJ', 'I-NP'), ('deficits', 'NNS', 'I-NP'), ('.', '.', 'O')]


In [5]:
# Corpus size as reported by chunked_words() and chunked_sents(), shown as a pair.
len(conll2000.chunked_words()), len(conll2000.chunked_sents())

(166433, 10948)

In [7]:
import random
from sklearn.model_selection import train_test_split
# Hold out 10% of the sentences for evaluation. Despite its name, the list
# below is in corpus order; train_test_split does the shuffling. Pinning
# random_state makes the split (and therefore the evaluation scores)
# reproducible across kernel restarts.
shuffled_conll_sents = list(conll2000.chunked_sents())
train_set, test_set = train_test_split(shuffled_conll_sents, test_size=0.1, random_state=42)

In [8]:
from nltk import ChunkParserI, TrigramTagger

In [9]:
class TrigramChunkParser(ChunkParserI):
    """Chunker that predicts IOB chunk tags from part-of-speech tags alone.

    A ``TrigramTagger`` is trained on sequences of ``(pos_tag, chunk_tag)``
    pairs obtained by flattening each training tree with ``tree2conlltags``;
    the words themselves are never passed to the tagger.
    """

    def __init__(self, train_set):
        """Fit the underlying tagger.

        train_set: iterable of chunk trees accepted by ``tree2conlltags``.
        """
        pos_chunk_sequences = []
        for tree in train_set:
            # Drop the word from each (word, pos, chunk) triple.
            pos_chunk_sequences.append(
                [(pos, chunk) for _, pos, chunk in tree2conlltags(tree)]
            )
        self.tagger = TrigramTagger(pos_chunk_sequences)

    def parse(self, sentence):
        """Chunk a POS-tagged sentence.

        sentence: sequence of ``(word, pos_tag)`` pairs.
        Returns the tree that ``conlltags2tree`` builds from the predicted
        ``(word, pos_tag, chunk_tag)`` triples.
        """
        words = [word for word, _ in sentence]
        predictions = self.tagger.tag([pos for _, pos in sentence])
        # The tagger echoes each POS tag back alongside its predicted chunk
        # tag, so the POS tag is taken from the tagger's output pair.
        triples = [
            (word, pos, chunk)
            for word, (pos, chunk) in zip(words, predictions)
        ]
        return conlltags2tree(triples)

In [10]:
# Train the chunker on the training split.
trigram_chunker = TrigramChunkParser(train_set)

In [11]:
# Score the chunker against the held-out trees and print the summary.
# NOTE(review): newer NLTK releases appear to deprecate ChunkParserI.evaluate
# in favour of accuracy() — confirm against the installed version.
print(trigram_chunker.evaluate(test_set))

ChunkParse score:
    IOB Accuracy:  87.5%%
    Precision:     80.2%%
    Recall:        83.8%%
    F-Measure:     81.9%%
