In [20]:
import nltk.classify
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.tokenize import word_tokenize
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from collections import defaultdict
from nltk.metrics import precision, recall

In [21]:
import spacy

# We only need the sentence splitting and word embedding features; disabling the
# unneeded pipeline components makes loading and processing a lot faster.
nlp = spacy.load('en_core_web_md', disable=['tagger', 'ner'])

'''
TODO: word embeddings aan de dataset toevoegen
Denk dat bij het inlezen dit het beste is om te doen, dan hoeven we maar 1x door alles
heen te loopen. Even uitzoeken hoe de embedding vectors meegegeven moeten worden 
(dict net als BOW of tuple met (<lijst met vectors>, <label>))

Even experimenteren met wat beter werkt: 
    - soort bow van individuele token vectors per review
    - of een gecombineerde methode gebruiken 
        -> gemiddelde van alle token vectors
        -> doc vector
        -> ???

'''

In [56]:
import csv
import random
import sys
from string import punctuation
import pickle

from nltk import download, word_tokenize
from nltk.corpus import stopwords

# Download the nltk stopwords corpus if needed; create_dataset reads it through
# stopwords.words('english').
download('stopwords')


def create_dataset(filepath):
    ''' Reads and cleans the csv file and structures the datapoints.

    The first row of the file is treated as a header and skipped. Every other
    non-empty row is expected to hold, in order: review id, rating (1-5),
    year and month, reviewer location, review text, Disneyland location.

    Args:
        filepath: path of the csv file to read (decoded as latin-1).

    Returns:
        A list with one dict per review, with the keys 'tokenized',
        'bag_of_words', 'rating_label', 'year_month', 'reviewer_location',
        'review_text', 'disneyland_location' and 'doc_vector'.
        'bag_of_words' is a ({token: True}, rating_label) tuple and
        'doc_vector' is the spaCy vector of the cleaned, re-joined tokens.

    Raises:
        ValueError: if a rating cannot be converted to an int.
        IndexError: if a non-empty row has fewer than six columns.
    '''
    dataset = []

    # Build the translation table and the stopword set once, outside the loop;
    # set membership is O(1) whereas checking a list or string is O(n).
    punct_translation = str.maketrans('', '', punctuation)
    stoplist = set(stopwords.words('english'))

    # newline='' leaves newline handling to the csv module, as its
    # documentation requires; otherwise line breaks embedded in quoted review
    # texts are not interpreted correctly.
    with open(filepath, 'r', encoding='latin-1', newline='') as f:
        reader = csv.reader(f, delimiter=",")

        # Skip the header row
        next(reader, None)

        # Items per row:
        #   0 -> review id
        #   1 -> rating between 1-5
        #   2 -> year and month
        #   3 -> location of reviewer
        #   4 -> review text
        #   5 -> Disneyland location
        for row in reader:
            # csv.reader yields an empty list for a blank line (e.g. a
            # trailing one); there is nothing to parse in that case.
            if not row:
                continue

            rating = int(row[1])

            # Collapse the five-point rating into three sentiment classes.
            if rating < 3:
                rating_label = 'negative'
            elif rating == 3:
                rating_label = 'indifferent'
            else:
                rating_label = 'positive'

            # Strip punctuation, lowercase and trim before tokenizing.
            review_text = row[4] \
                .translate(punct_translation) \
                .lower() \
                .strip()

            tokenized = [
                token for token in word_tokenize(review_text)
                if token not in stoplist
            ]

            # (featureset, label) tuple; only token presence is recorded.
            bag_of_words = ({t: True for t in tokenized}, rating_label)

            dataset.append(
                {'tokenized': tokenized,
                 'bag_of_words': bag_of_words,
                 'rating_label': rating_label,
                 'year_month': row[2],
                 'reviewer_location': row[3],
                 'review_text': row[4],
                 'disneyland_location': row[5],
                 'doc_vector': nlp(' '.join(tokenized)).vector
                }
            )

    return dataset


def split_train_test(feats, split=0.8):
    ''' Creates train, test and dev splits from the dataset.

    The items are shuffled with a fixed seed, so the same input always yields
    the same split. The shuffle is done on a copy: the caller's list is left
    untouched, so calling this function again (e.g. re-running the notebook
    cell) reproduces the exact same split instead of re-shuffling already
    shuffled data.

    Args:
        feats: the datapoints to split.
        split: fraction of the items that goes to the training set; the
            remainder is divided between the test and development sets.

    Returns:
        A (train_feats, test_feats, dev_feats) tuple of lists.
    '''
    shuffled = list(feats)
    random.Random(1).shuffle(shuffled)

    cutoff = int(len(shuffled) * split)
    # Half of what is left after the training part goes to the test set; the
    # dev set takes the rest (including the odd item, if any).
    half_of_rest = int((len(shuffled) - cutoff) / 2)
    test_end = cutoff + half_of_rest

    train_feats = shuffled[:cutoff]
    test_feats = shuffled[cutoff:test_end]
    dev_feats = shuffled[test_end:]

    print("  Training set: %i" % len(train_feats))
    print("  Test set: %i" % len(test_feats))
    print("  Development set: %i" % len(dev_feats))

    return train_feats, test_feats, dev_feats
# Read and preprocess every review from the csv file (relative path).
dataset = create_dataset('../data/DisneylandReviews.csv')

# Split with the default split=0.8; split_train_test prints the set sizes.
train_feats, test_feats, dev_feats = split_train_test(dataset)

[nltk_data] Downloading package stopwords to /home/wessel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  Training set: 34124
  Test set: 4266
  Development set: 4266


In [23]:
def precision_recall(classifier, testfeats):
    """ Computes per-label precision and recall of a classifier.

    Args:
        classifier: object providing classify(feats) and labels().
        testfeats: sequence of (feats, label) pairs.

    Returns:
        A (precisions, recalls) tuple of dicts, both keyed by the labels
        reported by classifier.labels(). Values are whatever nltk.metrics'
        precision/recall return for the collected index sets.
    """
    gold_by_label = defaultdict(set)
    predicted_by_label = defaultdict(set)

    # Collect, per label, the indices of the items that carry it (gold) and
    # of the items the classifier assigned it to (predicted).
    for index, (feats, label) in enumerate(testfeats):
        gold_by_label[label].add(index)
        predicted_by_label[classifier.classify(feats)].add(index)

    precisions, recalls = {}, {}
    for label in classifier.labels():
        reference = gold_by_label[label]
        predicted = predicted_by_label[label]
        precisions[label] = precision(reference, predicted)
        recalls[label] = recall(reference, predicted)

    return precisions, recalls

In [24]:
def calculate_f(precisions, recalls):
    """ Computes the F-measure (harmonic mean) per category.

    Args:
        precisions: dict mapping category -> precision (float or None).
        recalls: dict mapping category -> recall (float or None).

    Returns:
        A dict mapping every category in `precisions` to its F-measure,
        rounded to 6 decimals. The value is None when precision or recall is
        undefined (None, or absent from `recalls`) and 0.0 when both are
        zero.
    """
    f_measures = {}

    for category, p in precisions.items():
        r = recalls.get(category)

        if p is None or r is None:
            # No measure available for this category, so F is undefined too.
            f_measures[category] = None
        elif p + r == 0:
            # Both measures are zero: the score is 0, not "missing". Handled
            # separately because the formula below would divide by zero.
            f_measures[category] = 0.0
        else:
            f_measures[category] = round(2 * ((p * r) / (p + r)), 6)

    return f_measures

In [25]:
def evaluation(classifier, test_feats, categories):
    """ Taken from assignment 1, calculates and prints evaluation measures.

    Prints the accuracy followed by a table with the precision, recall and
    F-measure of every requested category. A measure that is undefined
    (None) or unavailable for a category is shown as "NA".

    Args:
        classifier: trained classifier, passed on to nltk.classify.accuracy
            and precision_recall.
        test_feats: sequence of (feats, label) pairs to evaluate on.
        categories: labels to print a table row for, in this order.
    """
    print("\n##### Evaluation...")
    print("  Accuracy: %f" % nltk.classify.accuracy(classifier, test_feats))
    precisions, recalls = precision_recall(classifier, test_feats)
    f_measures = calculate_f(precisions, recalls)

    def cell(value, fmt):
        # Each measure can be None on its own (e.g. recall is undefined for a
        # label that never occurs in the test data while its precision is
        # not); formatting None with "%f" would raise a TypeError.
        return "NA" if value is None else fmt % value

    rule = " |-----------|-----------|-----------|-----------|"
    row = " |%-11s|%-11s|%-11s|%-11s|"

    print(rule)
    print(row % ("category", "precision", "recall", "F-measure"))
    print(rule)
    for category in categories:
        # .get(): a requested category the classifier does not know has no
        # entry in the measure dicts and is reported as "NA" as well.
        print(row % (category,
                     cell(precisions.get(category), "%f"),
                     cell(recalls.get(category), "%f"),
                     cell(f_measures.get(category), "%s")))
    print(rule)

In [26]:
def train_svm(train_feats):
    ''' Fits a LinearSVC (dual=False) wrapped in an NLTK SklearnClassifier
    on train_feats and returns the result of the training call. '''
    wrapped_svm = SklearnClassifier(LinearSVC(dual=False))
    return wrapped_svm.train(train_feats)

def train_knn(train_feats):
    ''' Fits a default KNeighborsClassifier wrapped in an NLTK
    SklearnClassifier on train_feats and returns the result of the
    training call. '''
    wrapped_knn = SklearnClassifier(KNeighborsClassifier())
    return wrapped_knn.train(train_feats)

In [27]:
# Keep only the ({token: True}, label) tuples; these are the featuresets the
# bag-of-words classifiers are trained and evaluated on.
only_bow_test = [item['bag_of_words'] for item in test_feats]
only_bow_train = [item['bag_of_words'] for item in train_feats]

In [28]:
# Linear SVM on the bag-of-words features.
svm_classifier = train_svm(only_bow_train)

In [29]:
# KNN on the bag-of-words features.
knn_classifier = train_knn(only_bow_train)

In [30]:
# Report accuracy and per-category measures on the test split: SVM first,
# then KNN.
evaluation(svm_classifier, only_bow_test, ['positive', 'indifferent', 'negative'])
evaluation(knn_classifier, only_bow_test, ['positive', 'indifferent', 'negative'])


##### Evaluation...
  Accuracy: 0.812705
 |-----------|-----------|-----------|-----------|
 |category   |precision  |recall     |F-measure  |
 |-----------|-----------|-----------|-----------|
 |positive   |0.889932   |0.932638   |0.910784   |
 |indifferent|0.364425   |0.321224   |0.341463   |
 |negative   |0.588235   |0.438144   |0.502216   |
 |-----------|-----------|-----------|-----------|

##### Evaluation...
  Accuracy: 0.786451
 |-----------|-----------|-----------|-----------|
 |category   |precision  |recall     |F-measure  |
 |-----------|-----------|-----------|-----------|
 |positive   |0.791835   |0.994337   |0.881607   |
 |indifferent|0.263158   |0.019120   |0.035651   |
 |negative   |0.600000   |0.023196   |0.044665   |
 |-----------|-----------|-----------|-----------|


In [57]:
# (document vector, label) pairs per split.
only_we_test = [
    (review['doc_vector'], review['rating_label']) for review in test_feats
]
only_we_train = [
    (review['doc_vector'], review['rating_label']) for review in train_feats
]

# Separate lists of vectors and labels, in the same order as the pairs above.
only_vec_train = [vector for vector, _ in only_we_train]
only_label_train = [label for _, label in only_we_train]

only_vec_test = [vector for vector, _ in only_we_test]
only_label_test = [label for _, label in only_we_test]

In [58]:
# Linear SVM fitted directly on the document vectors.
# NOTE: this rebinds svm_classifier, which the bag-of-words cells use for an
# NLTK SklearnClassifier; after this cell runs it is a plain sklearn LinearSVC.
svm_classifier = LinearSVC().fit(only_vec_train, only_label_train)

In [61]:
# Score of the document-vector SVM on the test split (shown as cell output).
svm_classifier.score(only_vec_test, only_label_test)

0.8211439287388654

# Acc met onbewerkte, ruwe review text als doc vector (spacy schoont en splitst)
- SVM: 0.8218471636193155
- KNN: 0.7740271917487107

# Acc met tokenized en geschoonde tokens als doc vector 
- SVM: 0.8211439287388654
- KNN: 0.7805907172995781

# TODO
- Uitproberen met individuele token vectors en niet de 'platgeslagen' doc vector.
- Andere items uit de dataset als features toevoegen.

In [60]:
# KNN with default settings fitted directly on the document vectors, followed
# by its score on the test split (shown as cell output).
# NOTE: this rebinds knn_classifier, which the bag-of-words cells use for an
# NLTK SklearnClassifier.
knn_classifier = KNeighborsClassifier().fit(only_vec_train, only_label_train)
knn_classifier.score(only_vec_test, only_label_test)

0.7805907172995781

In [55]:
import numpy as np

def cosine_similarity(a, b):
    """ Returns the cosine of the angle between the vectors a and b.

    Args:
        a, b: numpy vectors of equal length.

    Returns:
        a.b / (|a| * |b|), or 0.0 when either vector has zero length; the
        similarity is undefined in that case and the plain formula would
        evaluate 0/0 to nan with a RuntimeWarning.
    """
    denominator = np.sqrt(a.dot(a) * b.dot(b))

    if np.all(denominator == 0):
        return 0.0

    return a.dot(b) / denominator