In [44]:
import nltk.classify
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.tokenize import word_tokenize
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from collections import defaultdict
from nltk.metrics import precision, recall

In [1]:
import csv
import random
import sys
from string import punctuation
import pickle

from nltk import download, word_tokenize
from nltk.corpus import stopwords

# Download the nltk stopwords corpus if needed
# NOTE(review): word_tokenize (used in create_dataset below) also relies on
# NLTK's tokenizer models, which are not downloaded here - confirm they are
# installed in the target environment.
download('stopwords')


def _rating_to_label(rating):
    ''' Maps a 1-5 star rating onto a three-way sentiment label '''
    if rating < 3:
        return 'negative'
    if rating == 3:
        return 'indifferent'
    return 'positive'


def create_dataset(filepath):
    ''' Reads and cleans the csv file and structures the datapoints

    The first row of the file is treated as a header and skipped, blank
    lines are ignored. Expected columns per row:
        0 -> review id
        1 -> rating between 1-5
        2 -> year and month
        3 -> location of reviewer
        4 -> review text
        5 -> Disneyland location

    Returns a list with one tuple per review:
        (tokens, (bag_of_words, label), label, year_month,
         reviewer_location, disneyland_location)
    where tokens is the lowercased review text without punctuation and
    stopwords, bag_of_words is a {token: True} dict and label is one of
    'negative' (rating 1-2), 'indifferent' (rating 3) or 'positive'.

    Raises ValueError when a rating is not an integer and IndexError when
    a non-empty row has fewer than six columns.
    '''
    dataset = []

    # The translation and set are both a lot faster O(1)
    # when compared to checking a list or string O(n).
    punct_translation = str.maketrans('', '', punctuation)
    stoplist = set(stopwords.words('english'))

    # newline='' leaves newline handling to the csv module, as its
    # documentation requires; otherwise line breaks inside quoted review
    # texts can be misinterpreted.
    with open(filepath, 'r', encoding='latin-1', newline='') as f:
        reader = csv.reader(f, delimiter=",")

        # Skip the header row
        next(reader, None)

        for row in reader:
            # csv.reader yields an empty list for a blank line; there is
            # nothing to index in such a row, so skip it.
            if not row:
                continue

            rating_label = _rating_to_label(int(row[1]))

            review_text = row[4].translate(punct_translation).lower().strip()

            tokenized = [
                token for token in word_tokenize(review_text)
                if token not in stoplist
            ]

            # The (featureset, label) pair is the shape the classifiers
            # further down are trained and evaluated on.
            bag_of_words = ({t: True for t in tokenized}, rating_label)

            dataset.append(
                (tokenized, bag_of_words, rating_label, row[2], row[3], row[5])
            )

    return dataset


def split_train_test(feats, split=0.8, seed=1):
    ''' Creates train, test and dev splits from the dataset

    feats: the datapoints to split; the input itself is left untouched.
    split: fraction of the datapoints that goes into the training set.
    seed:  seed for the shuffle, so the same input gives the same splits.

    Whatever is left after the training set is divided in half between the
    test set and the development set; with an odd remainder the extra
    datapoint ends up in the development set. The sizes of the three sets
    are printed.

    Returns (train_feats, test_feats, dev_feats) as lists.
    '''
    # Shuffle a copy, so the caller's list keeps its original order
    shuffled = list(feats)
    random.Random(seed).shuffle(shuffled)

    train_end = int(len(shuffled) * split)
    test_size = int((len(shuffled) - train_end) / 2)
    test_end = train_end + test_size

    train_feats = shuffled[:train_end]
    test_feats = shuffled[train_end:test_end]
    dev_feats = shuffled[test_end:]

    print("  Training set: %i" % len(train_feats))
    print("  Test set: %i" % len(test_feats))
    print("  Development set: %i" % len(dev_feats))

    return train_feats, test_feats, dev_feats

# Build the dataset from the Disneyland reviews csv and split it into
# train, test and development sets (the set sizes are printed by the split).
train_feats, test_feats, dev_feats = split_train_test(
    create_dataset('../data/DisneylandReviews.csv')
)

[nltk_data] Downloading package stopwords to /home/wessel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  Training set: 34108
  Test set: 4264
  Development set: 4264


In [42]:
def precision_recall(classifier, testfeats):
    ''' Calculates precision and recall per label for a classifier

    classifier: object with labels() and classify_many() and/or classify().
    testfeats:  iterable of (featureset, gold_label) pairs.

    Returns (precisions, recalls): two dicts keyed by the labels reported
    by classifier.labels(). A value is whatever the precision / recall
    helpers return for that label (NLTK's versions return None when a
    score is undefined, e.g. when the label was never predicted).
    '''
    labelled_feats = list(testfeats)
    featuresets = [feats for feats, _ in labelled_feats]

    # Classify everything in one batch when the classifier supports it;
    # for wrapped scikit-learn models that is far cheaper than one
    # prediction call per datapoint.
    classify_many = getattr(classifier, 'classify_many', None)
    if classify_many is not None:
        observed_labels = classify_many(featuresets)
    else:
        observed_labels = [classifier.classify(feats) for feats in featuresets]

    # Per label: the indices of the datapoints that carry it in the gold
    # standard (refsets) and in the predictions (testsets).
    refsets = defaultdict(set)
    testsets = defaultdict(set)

    for i, ((_, label), observed) in enumerate(zip(labelled_feats, observed_labels)):
        refsets[label].add(i)
        testsets[observed].add(i)

    precisions = {}
    recalls = {}

    for label in classifier.labels():
        precisions[label] = precision(refsets[label], testsets[label])
        recalls[label] = recall(refsets[label], testsets[label])

    return precisions, recalls

In [35]:
def calculate_f(precisions, recalls):
    ''' Calculates the F-measure (harmonic mean) per category

    precisions, recalls: dicts mapping a category to its score, or to None
    when that score is not available.

    Returns a dict with, for every category in precisions:
        None  when the precision or the recall is missing,
        0.0   when both scores are zero (the harmonic mean would otherwise
              divide by zero),
        otherwise 2 * p * r / (p + r), rounded to 6 decimals.
    '''
    f_measures = {}

    for category, precision_value in precisions.items():
        recall_value = recalls.get(category)

        # Only a missing measure counts as "not available"; a score of 0.0
        # is a real result and must not be reported as None.
        if precision_value is None or recall_value is None:
            f_measures[category] = None
        elif precision_value + recall_value == 0:
            f_measures[category] = 0.0
        else:
            f_measures[category] = round(
                2 * ((precision_value * recall_value) /
                     (precision_value + recall_value)), 6)

    return f_measures

In [36]:
def _format_measure(value):
    """ Formats one table cell: six decimals, or "NA" when the measure is missing """
    return "NA" if value is None else "%f" % value


def evaluation(classifier, test_feats, categories):
    """ Taken from assignment 1, calculates and prints evaluation measures

    classifier: trained classifier, passed on to nltk.classify.accuracy
                and precision_recall.
    test_feats: (featureset, label) pairs to evaluate on.
    categories: the labels to print a table row for, in this order.

    Prints the accuracy followed by a table with precision, recall and
    F-measure per category. Every cell is formatted on its own: a measure
    that is None, or a category for which no measures were calculated, is
    shown as "NA" instead of raising an error.
    """
    print("\n##### Evaluation...")
    print("  Accuracy: %f" % nltk.classify.accuracy(classifier, test_feats))
    precisions, recalls = precision_recall(classifier, test_feats)
    f_measures = calculate_f(precisions, recalls)

    row_format = " |%-11s|%-11s|%-11s|%-11s|"
    separator = " |-----------|-----------|-----------|-----------|"

    print(separator)
    print(row_format % ("category", "precision", "recall", "F-measure"))
    print(separator)
    for category in categories:
        # .get() so that a category without measures gives an "NA" row
        print(row_format % (
            category,
            _format_measure(precisions.get(category)),
            _format_measure(recalls.get(category)),
            _format_measure(f_measures.get(category)),
        ))
    print(separator)

In [46]:
def train_svm(train_feats):
    ''' Fits a linear support vector machine on the training datapoints

    A LinearSVC (created with dual=False) is wrapped in a SklearnClassifier;
    the result of calling train(train_feats) on that wrapper is returned.
    '''
    estimator = LinearSVC(dual=False)
    wrapped = SklearnClassifier(estimator)
    return wrapped.train(train_feats)

def train_knn(train_feats):
    ''' Fits a k-nearest-neighbours classifier on the training datapoints

    A KNeighborsClassifier with its default settings is wrapped in a
    SklearnClassifier; the result of calling train(train_feats) on that
    wrapper is returned.
    '''
    estimator = KNeighborsClassifier()
    wrapped = SklearnClassifier(estimator)
    return wrapped.train(train_feats)

In [47]:
# Every datapoint is a tuple whose second element is the
# (bag-of-words, label) pair; keep just that pair for the classifiers.
only_bow_train = [labelled_bow for _tokens, labelled_bow, *_meta in train_feats]
only_bow_test = [labelled_bow for _tokens, labelled_bow, *_meta in test_feats]

In [50]:
# Train the linear SVM on the (bag-of-words, label) pairs of the training set
svm_classifier = train_svm(only_bow_train)

In [51]:
# Train the KNN classifier on the same (bag-of-words, label) pairs as the SVM
knn_classifier = train_knn(only_bow_train)

In [52]:
# Evaluate both classifiers on the test set, with one table row per sentiment label
evaluation(svm_classifier, only_bow_test, ['positive', 'indifferent', 'negative'])
evaluation(knn_classifier, only_bow_test, ['positive', 'indifferent', 'negative'])


##### Evaluation...
  Accuracy: 0.809334
 |-----------|-----------|-----------|-----------|
 |category   |precision  |recall     |F-measure  |
 |-----------|-----------|-----------|-----------|
 |positive   |0.891508   |0.931178   |0.910911   |
 |indifferent|0.338747   |0.275992   |0.304167   |
 |negative   |0.532051   |0.456044   |0.491124   |
 |-----------|-----------|-----------|-----------|

##### Evaluation...
  Accuracy: 0.787523
 |-----------|-----------|-----------|-----------|
 |category   |precision  |recall     |F-measure  |
 |-----------|-----------|-----------|-----------|
 |positive   |0.795530   |0.992584   |0.883199   |
 |indifferent|0.088235   |0.005671   |0.010657   |
 |negative   |0.375000   |0.024725   |0.046392   |
 |-----------|-----------|-----------|-----------|


In [None]:
# import spacy
# spacy.load('en_core_web_md')
# 
# TODO: add word embeddings to the dataset.
# This is probably best done while reading the data, so that we only have to
# loop over everything once. Still need to figure out how the embedding vectors
# should be passed along (a dict like the BOW, or a tuple of (<list of vectors>, <label>)).