In [1]:
import pandas as pd
import nltk
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import nltk.collocations as collocations
import re
from sklearn.model_selection import KFold
from sklearn.metrics import precision_score, recall_score, f1_score
import pickle

In [3]:
#f = open('./movie_reviews/train.tsv', 'r')
f = open('train.tsv', 'r')
# loop over lines in the file and use the first limit of them
phrasedata = []
for line in f:
  # ignore the first line starting with Phrase and read all lines
  if (not line.startswith('Phrase')):
    # remove final end of line character
    line = line.strip()
    # each line has 4 items separated by tabs
    # ignore the phrase and sentence ids, and keep the phrase and sentiment
    phrasedata.append(line.split('\t')[2:4])
f.close()

In [4]:
import random

In [5]:
random.shuffle(phrasedata)
phraselist = phrasedata
for phrase in phraselist[:10]:
    print(phrase)

['The somber pacing and lack', '1']
['in her most charmless', '1']
['have their kids', '2']
['feeling this movie', '3']
['the action looks fake', '1']
['the kind of film that should be the target of something', '2']
['thrills when it should be most in the mind of the killer', '1']
['a biting satire that has no teeth', '1']
['much as it is for Angelique', '2']
['absorbing characters', '3']


In [6]:
phrasedocs = []
for phrase in phraselist:
    tokens = nltk.word_tokenize(phrase[0])
    phrasedocs.append((tokens, int(phrase[1])))

In [7]:
for phrase in phrasedocs[:10]:
    print(phrase)

(['The', 'somber', 'pacing', 'and', 'lack'], 1)
(['in', 'her', 'most', 'charmless'], 1)
(['have', 'their', 'kids'], 2)
(['feeling', 'this', 'movie'], 3)
(['the', 'action', 'looks', 'fake'], 1)
(['the', 'kind', 'of', 'film', 'that', 'should', 'be', 'the', 'target', 'of', 'something'], 2)
(['thrills', 'when', 'it', 'should', 'be', 'most', 'in', 'the', 'mind', 'of', 'the', 'killer'], 1)
(['a', 'biting', 'satire', 'that', 'has', 'no', 'teeth'], 1)
(['much', 'as', 'it', 'is', 'for', 'Angelique'], 2)
(['absorbing', 'characters'], 3)


In [8]:
docs = []
for phrase in phrasedocs:
    lowerphrase = ([w.lower() for w in phrase[0]], phrase[1])
    docs.append(lowerphrase)
for phrase in docs[:10]:
    print(phrase)

(['the', 'somber', 'pacing', 'and', 'lack'], 1)
(['in', 'her', 'most', 'charmless'], 1)
(['have', 'their', 'kids'], 2)
(['feeling', 'this', 'movie'], 3)
(['the', 'action', 'looks', 'fake'], 1)
(['the', 'kind', 'of', 'film', 'that', 'should', 'be', 'the', 'target', 'of', 'something'], 2)
(['thrills', 'when', 'it', 'should', 'be', 'most', 'in', 'the', 'mind', 'of', 'the', 'killer'], 1)
(['a', 'biting', 'satire', 'that', 'has', 'no', 'teeth'], 1)
(['much', 'as', 'it', 'is', 'for', 'angelique'], 2)
(['absorbing', 'characters'], 3)


In [9]:
all_words_list = [word for (sent, cat) in docs for word in sent]
all_words = nltk.FreqDist(all_words_list)
print(len(all_words))

16537


In [10]:
word_items = all_words.most_common(1000)
word_features = [word for (word, count) in word_items]

In [11]:
import numpy as np
doc_chunks = np.array_split(np.array(docs), 100)

  doc_chunks = np.array_split(np.array(docs), 100)


In [12]:
len(doc_chunks)

100

In [13]:
doc_chunks[0][0][0]

['the', 'somber', 'pacing', 'and', 'lack']

In [14]:
docs[0]

(['the', 'somber', 'pacing', 'and', 'lack'], 1)

In [15]:
def kfold_naive_bayes(feature_sets, splits=10):
    kf = KFold(n_splits=splits)
    scores = []

    for train, test in kf.split(feature_sets):
        classifier = nltk.NaiveBayesClassifier.train(
            np.array(feature_sets)[train]
        )
        scores.append(
            nltk.classify.accuracy(classifier, 
                                   np.array(feature_sets)[test])
        )
        
    return classifier, scores

In [16]:
def score_model(true_labels, predictions):
    precision = precision_score(true_labels, predictions, average='macro').round(4)
    recall = recall_score(true_labels, predictions, average='macro').round(4)
    f_measure = f1_score(true_labels, predictions, average='macro').round(4)
    print("Precision: ", precision)
    print("Recall: ", recall)
    print("F Measure: ", f_measure)
    return precision, recall, f_measure

In [17]:
# this function takes a document list of words and returns a feature dictionary
# it runs the default pos tagger (the Stanford tagger) on the document
#   and counts 4 types of pos tags to use as features
def POS_features(document, word_features):
    document_words = set(document)
    tagged_words = nltk.pos_tag(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    numNoun = 0
    numVerb = 0
    numAdj = 0
    numAdverb = 0
    for (word, tag) in tagged_words:
        if tag.startswith('N'): numNoun += 1
        if tag.startswith('V'): numVerb += 1
        if tag.startswith('J'): numAdj += 1
        if tag.startswith('R'): numAdverb += 1
    features['nouns'] = numNoun
    features['verbs'] = numVerb
    features['adjectives'] = numAdj
    features['adverbs'] = numAdverb
    return features

In [18]:
%%time
POS_chunks = []
for chunk in doc_chunks:
    POS_chunks.append(
        [(POS_features(chunk_sent[0], word_features), chunk_sent[1]) 
         for chunk_sent in chunk]
    )

Wall time: 3min 21s


In [19]:
len(POS_chunks)

100

In [20]:
import itertools
len(list(itertools.chain.from_iterable(POS_chunks)))

156060

In [21]:
%%time
featuresets3 = list(itertools.chain.from_iterable(POS_chunks))

Wall time: 8.98 ms


In [22]:
len(featuresets3)

156060

In [23]:
featuresets3[0]

({'contains(the)': True,
  'contains(,)': False,
  'contains(a)': False,
  'contains(of)': False,
  'contains(and)': True,
  'contains(to)': False,
  'contains(.)': False,
  "contains('s)": False,
  'contains(in)': False,
  'contains(is)': False,
  'contains(that)': False,
  'contains(it)': False,
  'contains(as)': False,
  'contains(with)': False,
  'contains(for)': False,
  'contains(its)': False,
  'contains(film)': False,
  'contains(an)': False,
  'contains(movie)': False,
  'contains(this)': False,
  'contains(but)': False,
  'contains(be)': False,
  'contains(you)': False,
  'contains(on)': False,
  "contains(n't)": False,
  'contains(by)': False,
  'contains(more)': False,
  'contains(his)': False,
  'contains(about)': False,
  'contains(one)': False,
  'contains(``)': False,
  'contains(not)': False,
  'contains(at)': False,
  'contains(or)': False,
  'contains(from)': False,
  'contains(than)': False,
  'contains(--)': False,
  'contains(all)': False,
  'contains(have)': Fals

In [24]:
%%time
uni_10fold_results_pos = kfold_naive_bayes(featuresets3, splits=10)

Wall time: 41min 44s


In [28]:
np.mean(uni_10fold_results_pos[1])

0.5359669357939254

In [25]:
with open('uni_model_5fold.pickle', 'wb') as f:
#with open("./models/uni_model_5fold.pickle", 'wb') as f:
    pickle.dump(uni_10fold_results_pos[0], f)

In [26]:
%%time
uni_10fold_preds_labs_pos = [(uni_10fold_results_pos[0].classify(features), label) 
                        for features, label in featuresets3] 

Wall time: 13min 5s


In [27]:
preds = []
trues = []
for pred, true in uni_10fold_preds_labs_pos:
    preds.append(pred)
    trues.append(true)

uni_10fold_preds_labs_pos = [trues, preds]
uni_10fold_scores = score_model(uni_10fold_preds_labs_pos[0], uni_10fold_preds_labs_pos[1])

Precision:  0.4085
Recall:  0.3955
F Measure:  0.3866
