In [1]:
import nltk.classify
from nltk.classify.scikitlearn import SklearnClassifier
from nltk.tokenize import word_tokenize
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from collections import defaultdict
from nltk.metrics import precision, recall

In [2]:
import spacy

# We only need the sentence splitting and word embedding features, disabling the
# unneed features makes this a lot faster.
nlp = spacy.load('en_core_web_md', disable=['tagger', 'ner'])

'''
TODO: word embeddings aan de dataset toevoegen
Denk dat bij het inlezen dit het beste is om te doen, dan hoeven we maar 1x door alles
heen te loopen. Even uitzoeken hoe de embedding vectors meegegeven moeten worden 
(dict net als BOW of tuple met (<lijst met vectors>, <label>))

Even experimenteren met wat beter werkt: 
    - soort bow van individuele token vectors per review
    - of een gecombineerde methode gebruiken 
        -> gemiddelde van alle token vectors
        -> doc vector
        -> ???

'''

'\nTODO: word embeddings aan de dataset toevoegen\nDenk dat bij het inlezen dit het beste is om te doen, dan hoeven we maar 1x door alles\nheen te loopen. Even uitzoeken hoe de embedding vectors meegegeven moeten worden \n(dict net als BOW of tuple met (<lijst met vectors>, <label>))\n\nEven experimenteren met wat beter werkt: \n    - soort bow van individuele token vectors per review\n    - of een gecombineerde methode gebruiken \n        -> gemiddelde van alle token vectors\n        -> doc vector\n        -> ???\n\n'

In [181]:
import csv
import random
import sys
from string import punctuation
import pickle

from nltk import download, word_tokenize
from nltk.corpus import stopwords

# Download the nltk stopwords corpus if needed
download('stopwords')


def create_dataset(filepath):
    ''' Reads and cleans the csv file and structures the datapoints ''' 
    dataset = []

    # The translation and set are both a lot faster O(1) 
    # when compared to checking a list or string O(n).
    punct_translation = str.maketrans('', '', punctuation)
    stoplist = set(stopwords.words('english'))

    with open(filepath, 'r', encoding='latin-1') as f:
        reader = csv.reader(f, delimiter=",", )
        
        # Skip the header row
        next(reader, None)

        # Items per row:
        #   0 -> review id
        #   1 -> rating between 1-5
        #   2 -> year and month
        #   3 -> location of reviewer
        #   4 -> review text
        #   5 -> Disneyland location
        for row in reader:
            rating = int(row[1])

            if rating < 3:
                rating_label = 'negative'
            elif rating == 3:
                rating_label = 'indifferent'
            else:
                rating_label = 'positive'

            review_text = row[4] \
                .translate(punct_translation) \
                .lower() \
                .strip() + f"{row[5].replace('Disneyland_', '')} {row[3]}"

            tokenized = [
                token for token in word_tokenize(review_text)
                if token not in stoplist
            ]

            bag_of_words = ({t: True for t in tokenized}, rating)

            dataset.append(
                {
                    'tokenized': tokenized, 
                    'bag_of_words': bag_of_words, 
                    'rating_label': rating, 
                    'year_month': row[2], 
                    'reviewer_location': row[3], 
                    'review_text': row[4], 
                    'disneyland_location': row[5],
                    'doc_vector': nlp(' '.join(tokenized)).vector
                }
            )


    return dataset


def split_train_test(feats, split=0.8):
    ''' Creates test, train and dev splits from the dataset ''' 
    random.Random(1).shuffle(feats)

    cutoff = int(len(feats) * split)
    tenpercent = int((len(feats) - cutoff) / 2)
    split = cutoff + tenpercent

    train_feats = feats[:cutoff]
    test_feats = feats[cutoff:split]
    dev_feats = feats[split:]

    print("  Training set: %i" % len(train_feats))
    print("  Test set: %i" % len(test_feats))
    print("  Development set: %i" % len(dev_feats))

    return train_feats, test_feats, dev_feats
dataset = create_dataset('../data/DisneylandReviews.csv')

train_feats, test_feats, dev_feats = split_train_test(dataset)

[nltk_data] Downloading package stopwords to /home/wessel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  Training set: 34124
  Test set: 4266
  Development set: 4266


In [141]:
def precision_recall(classifier, testfeats):
	refsets = defaultdict(set)
	testsets = defaultdict(set)
	
	for i, (feats, label) in enumerate(testfeats):
		refsets[label].add(i)
		observed = classifier.classify(feats)
		testsets[observed].add(i)
	
	precisions = {}
	recalls = {}
	
	for label in classifier.labels():
		precisions[label] = precision(refsets[label], testsets[label])
		recalls[label] = recall(refsets[label], testsets[label])
	
	return precisions, recalls

In [142]:
def calculate_f(precisions, recalls):
    f_measures = {}

    for category in precisions.keys():
        # This is done to prevent the program from crashing when 
        # no measure is provided for a particular category
        if not precisions[category] or not recalls[category]:
            f_measures[category] = None
            continue

        f_measures[category] = round(
            2 * ((precisions[category] * recalls[category]) /
                 (precisions[category] + recalls[category])), 6)

    return f_measures

In [143]:
def evaluation(classifier, test_feats, categories):
    """ Taken from assignment 1, calculates and prints evaluation measures """
    print("\n##### Evaluation...")
    print("  Accuracy: %f" % nltk.classify.accuracy(classifier, test_feats))
    precisions, recalls = precision_recall(classifier, test_feats)
    f_measures = calculate_f(precisions, recalls)

    print(" |-----------|-----------|-----------|-----------|")
    print(" |%-11s|%-11s|%-11s|%-11s|" %
          ("category", "precision", "recall", "F-measure"))
    print(" |-----------|-----------|-----------|-----------|")
    for category in categories:
        if precisions[category] is None:
            print(" |%-11s|%-11s|%-11s|%-11s|" % (category, "NA", "NA", "NA"))
        else:
            print(" |%-11s|%-11f|%-11f|%-11s|" %
                  (category,
                   precisions[category],
                   recalls[category],
                   f_measures[category])
                  )
    print(" |-----------|-----------|-----------|-----------|")

In [None]:
def train_svm(train_feats):
    ''' Trains and returns a linear SVM classifier '''
    return SklearnClassifier(LinearSVC(dual=False)).train(train_feats)

def train_knn(train_feats):
    ''' Trains and returns a KNN classifier '''
    return SklearnClassifier(KNeighborsClassifier()).train(train_feats)

In [None]:
only_bow_test = [item['bag_of_words'] for item in test_feats]
only_bow_train = [item['bag_of_words'] for item in train_feats]

In [None]:
svm_classifier = train_svm(only_bow_train)

In [None]:
knn_classifier = train_knn(only_bow_train)

In [None]:
evaluation(svm_classifier, only_bow_test, ['positive', 'indifferent', 'negative'])
evaluation(knn_classifier, only_bow_test, ['positive', 'indifferent', 'negative'])

In [182]:
only_we_test = [(item['doc_vector'], item['rating_label']) for item in test_feats]
only_we_train = [(item['doc_vector'], item['rating_label']) for item in train_feats]

only_vec_train = [i[0] for i in only_we_train]
only_label_train = [j[1] for j in only_we_train]

only_vec_test = [i[0] for i in only_we_test]
only_label_test = [j[1] for j in only_we_test]

In [183]:
svm_classifier = LinearSVC().fit(only_vec_train, only_label_train)

In [184]:
svm_classifier.score(only_vec_test, only_label_test)

0.5977496483825597

In [154]:
from sklearn.tree import DecisionTreeClassifier

a = DecisionTreeClassifier().fit(only_vec_train, only_label_train)
a.score(only_vec_test, only_label_test)

0.43694327238631037

In [175]:
countries = {i['reviewer_location'] for i in dataset}
disneylands = {i['disneyland_location'].replace('Disneyland_', '') for i in dataset}

disneyland_lookup = { disneyland: nlp(disneyland).vector for disneyland in disneylands }
country_lookup = { country: nlp(country).vector for country in countries }

In [176]:
combined_features_train = []
labels_train = []

for row in train_feats:
    # We know these are always present in de dataset
    features = row['doc_vector']

    # if row['year_month'] == 'missing':
    #     features = np.concatenate((features, np.array([0, 0])))
    # else:
    #     # example format: 2019-4
    #     year, month = row['year_month'].split('-')
    #     features = np.concatenate((features, np.array([int(year), int(month)])))
    features = np.append(
        features, 
        disneyland_lookup[row['disneyland_location'].replace('Disneyland_', '')]
    )
    features = np.append(features, country_lookup[row['reviewer_location']])

    combined_features_train.append(features)
    labels_train.append(row['rating_label'])

In [177]:
combined_features_test = []
labels_test = []
for row in test_feats:
    # We know these are always present in de dataset
    features = row['doc_vector']
    # features = np.array([])

    # The feature arrays need to have the same shape, that is why we
    # insert 0s in here
    # if row['year_month'] == 'missing':
    #     features = np.concatenate((features, np.array([0, 0])))
    # else:
    #     # example format: 2019-4
    #     year, month = row['year_month'].split('-')
    #     features = np.concatenate((features, np.array([int(year), int(month)])))

    features = np.append(
        features, 
        disneyland_lookup[row['disneyland_location'].replace('Disneyland_', '')]
    )
    features = np.append(features, country_lookup[row['reviewer_location']])

    combined_features_test.append(features)
    labels_test.append(row['rating_label'])

In [178]:
svm_extra_features = LinearSVC().fit(combined_features_train, labels_train)
svm_extra_features.score(combined_features_test, labels_test)



0.5925925925925926

[1. 1.]


In [118]:
ex = dev_feats[9]

print(ex)

f = np.array([disneyland_lookup[ex['disneyland_location']], country_lookup[ex['reviewer_location']]])

print(f)

svm_extra_features.predict(f.reshape(1,-1))

{'tokenized': ['convenience', 'staffs', 'friendly', 'many', 'rude', 'mainland', 'ppl', 'didnt', 'need', 'line', 'long', 'time', 'rides', 'several', 'restaurants', 'provide', 'nice', 'food'], 'bag_of_words': ({'convenience': True, 'staffs': True, 'friendly': True, 'many': True, 'rude': True, 'mainland': True, 'ppl': True, 'didnt': True, 'need': True, 'line': True, 'long': True, 'time': True, 'rides': True, 'several': True, 'restaurants': True, 'provide': True, 'nice': True, 'food': True}, 'positive'), 'rating_label': 'positive', 'year_month': '2016-12', 'reviewer_location': 'Hong Kong', 'review_text': 'Convenience, Staffs are friendly ,, there are not many rude mainland ppl,, didnt need to line up for long time for rides,   There are several restaurants provide nice food.', 'disneyland_location': 'Disneyland_HongKong', 'doc_vector': array([ 8.26215744e-03,  2.48817932e-02, -1.40969366e-01, -2.98998266e-01,
        2.03529805e-01,  1.41831204e-01,  7.97833502e-02, -1.58968836e-01,
      

array(['positive'], dtype='<U11')

# Acc met onbewerkte, ruwe review text als doc vector (spacy schoont en splitst)
- SVM: 0.8218471636193155
- KNN: 0.7740271917487107

# Acc met tokenized en geschoonde tokens als doc vector 
- SVM: 0.8211439287388654
- KNN: 0.7805907172995781

# TODO
- Uitproberen met individuele token vectors en niet de 'platgeslagen' doc vector.
- Andere items uit de dataset als features toevoegen.

In [155]:
knn_classifier = KNeighborsClassifier().fit(only_vec_train, only_label_train)
knn_classifier.score(only_vec_test, only_label_test)

0.4997655883731833

In [16]:
import numpy as np

def cosine_similarity(a, b):
    return a.dot(b)/np.sqrt(a.dot(a) * b.dot(b))

In [17]:
from sklearn.metrics import precision_score, recall_score

def sklearn_model_evaluation(model, x_test, y_test):
    # y_pred = model.predict(x_test)
    # precision = precision_score(y_test, y_pred, average='micro')
    # recall = recall_score(y_test, y_pred, average='micro')
    accuracy = model.score(x_test, y_test)

    print("\n##### Evaluation...")
    print(f"  Accuracy: {accuracy}") 
    print(f"Precision: {precision}\nRecall: {recall}\nF-score: ")


In [18]:
print(prec svm_classifier, only_we_train)
print(prec knn_classifier, only_we_train)

SyntaxError: invalid syntax (<ipython-input-18-cc24e98a8505>, line 1)

In [67]:
sklearn_model_evaluation(svm_classifier, only_vec_test, only_label_test)
sklearn_model_evaluation(knn_classifier, only_vec_test, only_label_test)


##### Evaluation...
  Accuracy: 0.8211439287388654
Precision: 0.8211439287388654
Recall: 0.8211439287388654
F-score: 

##### Evaluation...
  Accuracy: 0.7805907172995781
Precision: 0.7805907172995781
Recall: 0.7805907172995781
F-score: 


In [None]:
individuel token vectors als feature toevoegen
extra punten uit dataset toevoegen, kijken of daar regression of correlation mee gedaan kan worden

In [10]:
sorted([item['year_month'] for item in dataset if item['year_month'] != 'missing'])[-1]

'2019-5'

In [159]:
from sklearn.dummy import DummyClassifier

baseline = DummyClassifier(strategy="most_frequent")
baseline.fit(only_vec_train, only_label_train)
baseline.score(only_vec_test, only_label_test)

0.540084388185654

In [156]:
from sklearn.metrics import accuracy_score, confusion_matrix

confusion_matrix(labels_test, svm_classifier.predict(only_vec_test))

ValueError: Mix of label input types (string and number)