# Examples of supervised classification

## Sentence Segmentation

Sentence segmentation can be viewed as a classification task for punctuation: whenever we encounter a symbol that could possibly end a sentence, such as a period or a question mark, we have to decide whether it terminates the preceding sentence.

In [None]:
import nltk
# Fetch the 'treebank' corpus package into the local nltk_data directory;
# the cells below read sentences from it via nltk.corpus.treebank_raw.
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

In [None]:
# Fetch the 'punkt' tokenizer package.
# NOTE(review): presumably required by the raw-corpus sentence reader used below — confirm.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Sentences of the raw Treebank corpus; each sentence is consumed below as a
# sequence of tokens (it is passed to list.extend and len).
sents = nltk.corpus.treebank_raw.sents()

In [None]:
# Flatten the corpus sentences into a single token stream and record the
# index of the last token of every sentence.
tokens = []
boundaries = set()  # indices into `tokens` where a sentence ends
for sent in sents:
    tokens.extend(sent)
    # The sentence just appended ends at the current tail of the stream.
    boundaries.add(len(tokens) - 1)
offset = len(tokens)  # total number of tokens seen

In [None]:
def punct_features(tokens, i):
    """Build the feature dict describing the punctuation token at index ``i``.

    Args:
        tokens: Sequence of token strings.
        i: Index of the punctuation token; ``tokens[i - 1]`` and
            ``tokens[i + 1]`` are both read.

    Returns:
        A dict with four features: whether the following token starts with
        an uppercase letter, the lowercased preceding token, the punctuation
        token itself, and whether the preceding token is a single character.
    """
    features = {}
    following = tokens[i + 1]
    features['next-word-capitalized'] = following[0].isupper()
    preceding = tokens[i - 1]
    features['prev-word'] = preceding.lower()
    features['punct'] = tokens[i]
    features['prev-word-is-one-char'] = len(preceding) == 1
    return features

In [None]:
# One (features, label) pair per candidate punctuation token. The label is
# True when the token's index is a recorded sentence boundary. The first and
# last tokens are excluded because punct_features reads both neighbours.
featuresets = [
    (punct_features(tokens, idx), idx in boundaries)
    for idx in range(1, len(tokens) - 1)
    if tokens[idx] in '.?!'
]

In [None]:
# 10% of the number of feature sets, truncated to an integer.
size = int(len(featuresets) * 0.1)

In [None]:
# Hold out the first `size` feature sets as the test set; train on the rest.
train_set , test_set = featuresets[size:],featuresets[:size]

In [None]:
# Fit a Naive Bayes classifier on the (features, label) training pairs.
classifier =  nltk.NaiveBayesClassifier.train(train_set)

In [None]:
# Score the trained classifier against the held-out test pairs.
nltk.classify.accuracy(classifier,test_set)

0.936026936026936

In [None]:
def segment_sentences(words):
    """Split a flat token sequence into sentences using the trained classifier.

    Every token that passes the ``in '.?!'`` check is a candidate sentence
    end. The module-level ``classifier`` decides, from the features built by
    ``punct_features``, whether it actually terminates the sentence.

    Args:
        words: Sequence of token strings.

    Returns:
        A list of slices of ``words``, one per sentence, in order. Tokens
        after the last detected boundary form the final sentence. An empty
        input yields an empty list.
    """
    start = 0
    sents = []
    last = len(words) - 1
    for i, word in enumerate(words):
        # The final token has no successor for punct_features to inspect
        # (it reads words[i + 1]), so it is never classified; the trailing
        # slice below puts it in the last sentence anyway.
        if i == last or word not in '.?!':
            continue
        # NOTE: at i == 0 punct_features reads words[-1], which wraps around
        # to the last token because of Python's negative indexing.
        if classifier.classify(punct_features(words, i)):
            sents.append(words[start:i + 1])
            start = i + 1
    # These two steps must run after the loop: inside it, the function
    # returned after examining only the first token.
    if start < len(words):
        sents.append(words[start:])
    return sents

## Identifying Dialogue Act Types

In [None]:
nltk.download