# Import useful libraries

In [1]:
import nltk
import re, pprint
# import sys
import os
import csv
# import argparse
from nltk.stem.snowball import SnowballStemmer
import random
from nltk.classify import SklearnClassifier
# from sklearn.naive_bayes import BernoulliNB
# from sklearn.svm import SVC
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import numpy as np

## Preprocess

In [2]:
# Example input used by the tokenizing, filtering and tagging cells below.
sentence = "The big brown fox jumped over a lazy dog."
# Second example with the same word in mixed capitalisation; it is not
# referenced again in the visible cells.
sentence2 = "Buffalo buffalo Buffalo buffalo buffalo buffalo Buffalo buffalo"

In [3]:
# Lower-casing demo: string comparison is case-sensitive, so text is
# normalised to lower case before any word matching.
# The comparison's result is discarded (only a cell's last expression is shown).
'This' == 'this'
print('AbcdEFgH'.lower())
# Displays a lower-cased copy; `sentence` itself is not reassigned here.
sentence.lower()

abcdefgh


'the big brown fox jumped over a lazy dog.'

### Tokenize - extract individual words

In [4]:
# Split the sentence into tokens matching the regex \w+ (runs of word
# characters); the trailing full stop is therefore absent from the output.
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(sentence)
tokens

['The', 'big', 'brown', 'fox', 'jumped', 'over', 'a', 'lazy', 'dog']

### Filter out stopwords (common words that carry little meaning)

In [5]:
# Keep only tokens that are not in NLTK's English stopword list.
# The membership test is case-sensitive: 'The' stays in the displayed result
# because `tokens` has not been lower-cased at this point.
filtered_words = [w for w in tokens if not w in stopwords.words('english')]
filtered_words

['The', 'big', 'brown', 'fox', 'jumped', 'lazy', 'dog']

In [6]:
def preprocess(sentence):
    """Lower-case a sentence, tokenize it and drop English stopwords.

    Args:
        sentence: Text to process (a string).

    Returns:
        A list of the lower-cased word-character tokens that are not in
        NLTK's English stopword list, in their original order.
    """
    # Build the stopword collection once per call. The original evaluated
    # stopwords.words('english') again for every token and searched the
    # resulting list linearly; a set is built once and looked up in O(1).
    english_stopwords = frozenset(stopwords.words('english'))
    tokens = RegexpTokenizer(r'\w+').tokenize(sentence.lower())
    return [token for token in tokens if token not in english_stopwords]

In [7]:
# Run the full preprocessing helper on the example sentence; compared with the
# raw text, 'the', 'over' and 'a' are gone and everything is lower case.
preprocessed_sentence = preprocess(sentence)
print(preprocessed_sentence)

['big', 'brown', 'fox', 'jumped', 'lazy', 'dog']


## Tagging

In [8]:
# Part-of-speech tagging: the printed result is a list of (word, tag) pairs.
tags = nltk.pos_tag(preprocessed_sentence)
print(tags)

[('big', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumped', 'VBD'), ('lazy', 'JJ'), ('dog', 'NN')]


## Extracting words by part-of-speech tag (nouns, verb forms, adjectives, adverbs, pronouns)

In [9]:
def extract_features(sentences):
    """Select the words whose part-of-speech tag is on the keep-list.

    Args:
        sentences: Iterable of ``(word, tag)`` pairs, e.g. the output of
            ``nltk.pos_tag``.

    Returns:
        A list of the words (in input order) tagged with one of
        NN, NNS, VBN, VBP, VBZ, VBG, RB, PRP or JJ.
    """
    kept_tags = ('NN', 'VBN', 'NNS', 'VBP', 'RB', 'VBZ', 'VBG', 'PRP', 'JJ')
    return [word for word, tag in sentences if tag in kept_tags]

In [10]:
# 'jumped' is tagged VBD, which is not on the keep-list, so it is dropped.
extract_features(tags)

['big', 'brown', 'fox', 'lazy', 'dog']

## Lemmatize words

In [11]:
# Lemmatization demo. print() is called with parentheses and a single argument
# so the cell runs identically under Python 2 and Python 3, matching the
# print(...) calls used in the other cells.
lmtzr = WordNetLemmatizer()
print(lmtzr.lemmatize('cacti'))
# NOTE(review): 'willing' and 'stemmed' come back unchanged in the saved
# output, presumably because lemmatize() treats words as nouns unless a part
# of speech is passed - confirm against the NLTK documentation.
print(lmtzr.lemmatize('willing'))
print(lmtzr.lemmatize('feet'))
print(lmtzr.lemmatize('stemmed'))

cactus
willing
foot
stemmed


## Stem words

In [12]:
# Inflected forms of 'stem', plus 'feet' and 'willing', which were also run
# through the lemmatizer, so the two techniques can be compared.
words_for_stemming = ['stem', 'stemming', 'stemmed', 'stemmer', 'stems','feet','willing']

In [13]:
# English Snowball stemmer. `stemmer` is a notebook-level name that
# extract_feature() reads later, so this cell must run before that function
# is called.
stemmer = SnowballStemmer("english")
[stemmer.stem(x) for x in words_for_stemming]

[u'stem', u'stem', u'stem', u'stemmer', u'stem', u'feet', u'will']

## Putting it all together

In [14]:
def extract_feature(text):
    """Turn raw text into a list of stemmed keyword tokens.

    The text goes through preprocess(), nltk.pos_tag() and
    extract_features(); each surviving word is then stemmed with the
    notebook-level ``stemmer``.

    Args:
        text: Raw input string.

    Returns:
        A list of stemmed words.
    """
    tagged_tokens = nltk.pos_tag(preprocess(text))
    # TODO (carried over from the original author): rename extract_features
    # to say that it extracts keys based on tags.
    keywords = extract_features(tagged_tokens)
    return [stemmer.stem(keyword) for keyword in keywords]

In [15]:
# Full pipeline on the example sentence. print() is called with parentheses so
# the cell is valid under both Python 2 and Python 3.
words = extract_feature(sentence)
print(words)

[u'big', u'brown', u'fox', u'lazi', u'dog']


## Implementing bag of words

In [16]:
def word_feats(words):
    """Build a bag-of-words feature dict.

    Args:
        words: Iterable of hashable tokens.

    Returns:
        A dict mapping every distinct token to ``True``.
    """
    return {token: True for token in words}

In [17]:
# Each stemmed token becomes a key mapped to True.
word_feats(words)

{u'big': True, u'brown': True, u'dog': True, u'fox': True, u'lazi': True}

## Parsing the whole document

In [30]:
def extract_feature_from_doc(data):
    """Build labelled feature sets and a flat token list from raw rows.

    Args:
        data: Iterable of 3-item rows ``(text, category, answer)``. The
            ``answer`` item is unpacked but not used here.

    Returns:
        A tuple ``(result, corpus)``: ``result`` is a list with one
        ``(word_feats(features), category)`` pair per row, and ``corpus`` is
        one flat list holding every row's extracted tokens in row order.
    """
    result = []
    corpus = []
    for (text, category, answer) in data:
        features = extract_feature(text)
        # Grow the flat list directly. The original stored per-row lists and
        # flattened them with sum(corpus, []), which copies the accumulated
        # list on every addition (quadratic in the number of tokens).
        corpus.extend(features)
        result.append((word_feats(features), category))
    return (result, corpus)

In [27]:
# Smoke test with a single hand-written (text, category, answer) row.
# NOTE(review): the saved output also contains a ('\n', [...]) line, which
# matches the debug print that is commented out in extract_feature_from_doc;
# the output looks like it was produced by an earlier version of the function.
extract_feature_from_doc([['this is text','category','answer to give']])

('\n', [u'text'])


([({u'text': True}, 'category')], [u'text'])

In [28]:
def get_content(filename):
    """Read a pipe-delimited text file and return its well-formed rows.

    Args:
        filename: Path of the file to read.

    Returns:
        A list of rows, each a list of exactly three string fields. Rows
        with any other number of fields (including blank lines) are
        skipped without warning.
    """
    # The original passed filename through os.path.join() with a single
    # argument, which returns the path unchanged, so the file is opened
    # directly.
    with open(filename, 'r') as content_file:
        rows = csv.reader(content_file, delimiter='|')
        return [row for row in rows if len(row) == 3]

In [29]:
# Load the pipe-delimited dataset. The path is relative, so the file is looked
# up in the notebook's working directory. Each kept row has three fields, which
# extract_feature_from_doc() unpacks as (text, category, answer).
filename = 'leaves.txt'
data = get_content(filename)

In [31]:
# features_data: list of (feature dict, category) pairs, one per row.
# corpus: flat list of every extracted token across all rows.
features_data, corpus = extract_feature_from_doc(data)

In [33]:
# Spot-check a single example (index 10 appears to be an arbitrary pick).
print(features_data[10])

({u'good': True, u'morn': True}, 'Morning')


# Train a model using these features

In [46]:
## split data into train and test sets
# Fraction of the examples that goes into the training set; the remainder
# becomes the test set.
split_ratio = 0.8

In [47]:
def split_dataset(data, split_ratio, seed=None):
    """Shuffle the examples and split them into training and test sets.

    Args:
        data: Sequence (or other iterable) of examples. It is copied before
            shuffling, so the caller's object is left untouched.
        split_ratio: Fraction of the examples to place in the training set.
        seed: Optional seed. When given, the shuffle (and therefore the
            split) is reproducible; when ``None`` the module-level ``random``
            generator is used, as before.

    Returns:
        A tuple ``(train, test)`` of lists. ``train`` holds the first
        ``int(len(data) * split_ratio)`` shuffled examples and ``test`` the
        rest.
    """
    # Shuffle a copy: the original shuffled `data` in place, silently
    # reordering the list owned by the caller.
    shuffled = list(data)
    rng = random.Random(seed) if seed is not None else random
    rng.shuffle(shuffled)
    train_split = int(len(shuffled) * split_ratio)
    return shuffled[:train_split], shuffled[train_split:]

In [48]:
# No random seed is set anywhere in the visible cells, so the shuffle, and
# with it the train/test membership, differs from run to run.
training_data, test_data = split_dataset(features_data, split_ratio)

In [49]:
def train_using_decision_tree(training_data, test_data):
    """Train an NLTK decision tree classifier and report its accuracy.

    Args:
        training_data: List of ``(feature dict, label)`` pairs to train on.
        test_data: List of ``(feature dict, label)`` pairs held out for
            evaluation.

    Returns:
        A tuple ``(classifier, classifier_name, test_set_accuracy,
        training_set_accuracy)``, where ``classifier_name`` is the class name
        of the trained classifier.
    """
    # Author's note: entropy_cutoff=0.1, support_cutoff=0.7 reportedly give
    # very good results; this call does not pass them and uses the defaults.
    classifier = nltk.classify.DecisionTreeClassifier.train(training_data, verbose=True)
    classifier_name = type(classifier).__name__
    training_set_accuracy = nltk.classify.accuracy(classifier, training_data)
    # Print one formatted string: print('label', value) shows a tuple repr
    # under Python 2, which this notebook's other cells are written for.
    print('training set accuracy: {}'.format(training_set_accuracy))
    test_set_accuracy = nltk.classify.accuracy(classifier, test_data)
    print('test set accuracy: {}'.format(test_set_accuracy))
    return classifier, classifier_name, test_set_accuracy, training_set_accuracy

In [58]:
# Train and evaluate on the split. The helper trains with verbose=True, which
# is what produces the 'best stump' lines in the output.
classifier, classifier_name, test_set_accuracy, training_set_accuracy = train_using_decision_tree(training_data, test_data)

best stump for    116 toks uses option               err=0.6983
best stump for     76 toks uses forward              err=0.6579
best stump for     62 toks uses taken                err=0.6613
best stump for     51 toks uses annual               err=0.6078
best stump for     36 toks uses leav                 err=0.6111
best stump for     24 toks uses help                 err=0.6250
best stump for     18 toks uses name                 err=0.5000
best stump for     15 toks uses great                err=0.4667
best stump for     13 toks uses thank                err=0.3846
best stump for     11 toks uses even                 err=0.3636
best stump for     10 toks uses good                 err=0.3000
best stump for      6 toks uses thank                err=0.3333
best stump for     12 toks uses type                 err=0.0000
best stump for     15 toks uses None                 err=0.2000
best stump for     11 toks uses annual               err=0.0000
best stump for     40 toks uses taken   

In [53]:
# save the data
np.save('training_data', training_data)
np.save('test_data', test_data)

In [57]:
# training_data = np.load('training_data.npy')

In [60]:
# Text rendering of the learned tree; in the output each line shows a feature
# test (e.g. option=None?) followed by a label.
print(classifier.pretty_format())

option=None? .......................................... Default-Balance-Annual-Leaves
  forward=None? ....................................... Default-Balance-Annual-Leaves
    taken=None? ....................................... Default-Balance-Annual-Leaves
      annual=None? .................................... Default-Balance-Annual-Leaves
      annual=True? .................................... Balance-Annual-Leaves
    taken=True? ....................................... Utilized-Annual-Leaves
      annual=None? .................................... Default-Utilized-Annual-Leaves
      annual=True? .................................... Utilized-Annual-Leaves
  forward=True? ....................................... CF
option=True? .......................................... Utilized-Optional-Leaves
  taken=None? ......................................... Utilized-Optional-Leaves
    use=None? ......................................... Balance-Optional-Leaves
    use=True? ..................