# Import Libraries

In [10]:
# add more if needed

# data loading
from pandas.io.json import json_normalize
import json
import pandas as pd
import os, random
from os import listdir

# data inspection
from collections import Counter

# preprocessing
import string
import nltk
from string import punctuation
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import dok_matrix

# Word embeddings
from gensim.models import Word2Vec


# clustering
from sklearn import cluster
from sklearn import metrics

# computations
import numpy as np
import math
from numpy import asarray
from numpy import zeros
from keras.utils import np_utils

# visualization
import matplotlib
from matplotlib import pyplot as plt
from pprint import pprint
import time
from datetime import datetime

# data saving
import pickle

# progress
from tqdm import tqdm_notebook as tqdm

# classification
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection
from sklearn.svm import SVC

# Load Data

In [2]:
file_path = "data/winemag-data-130k-v2.json"

# JSON text is UTF-8 by specification; pass the encoding explicitly so that
# loading does not depend on the platform's default locale encoding.
with open(file_path, encoding="utf-8") as f:
    data = json.load(f)

# Flatten the parsed JSON into a table (presumably one row per review —
# verify against the data file).
dataset = pd.DataFrame.from_dict(json_normalize(data), orient='columns')

# Load Pickle Files

In [4]:
# Use a context manager so the file handle is closed after loading.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open("pickle_files/tokens.p", "rb") as tokens_file:
    tokens = pickle.load(tokens_file)

In [5]:
# Use a context manager so the file handle is closed after loading.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open("pickle_files/pos_tags.p", "rb") as pos_tags_file:
    pos_tags = pickle.load(pos_tags_file)

In [6]:
# Attach the unpickled token lists and POS-tag lists as new columns.
# assumes both lists hold one entry per dataset row, in row order — TODO confirm
dataset['tokens'] = tokens
dataset['pos_tags'] = pos_tags

# Compute Content Words

IMPORTANT TO NOTE: while Ilja stated that they filtered on content words, they did not specify which content words, only that they filtered on adjectives, nouns and verbs. The POS tags below are the ones we judged to correspond to those categories.

In [7]:
def isContentWord(pos_tag):
    """Tell whether a POS tag is one of the content-word tags.

    The accepted tags are the adjective tags (JJ, JJR, JJS), the noun tags
    (NN, NNP, NNS) and the verb tags (VB, VBD, VBG, VBN, VBP, VBZ).

    Parameters
    ----------
    pos_tag : str
        The tag to check, e.g. "NN".

    Returns
    -------
    bool
        True when ``pos_tag`` is one of the accepted tags, False otherwise.
    """
    adjective_tags = ("JJ", "JJR", "JJS")
    noun_tags = ("NN", "NNP", "NNS")
    verb_tags = ("VB", "VBD", "VBG", "VBN", "VBP", "VBZ")
    return pos_tag in adjective_tags + noun_tags + verb_tags

The next piece of code collects all content words in the dataset, duplicates included.

In [8]:
# Walk each review's tokens alongside the POS tag at the same position and
# keep every token whose tag is a content tag. Repeated words are kept.
content_words = []
for review_idx, review_tokens in enumerate(tokens):
    review_tags = pos_tags[review_idx]
    content_words.extend(
        token
        for position, token in enumerate(review_tokens)
        if isContentWord(review_tags[position])
    )

Next, the Counter counts the content word occurrences. Keys are content words, values are counts.

In [9]:
# Tally how many times each collected content word occurs.
content_counts = Counter(content_words)

Here we keep only the content words that occur more than 2 times, and append each of them to filtered_content_words only once.

In [11]:
# Keep each content word that occurs more than twice, exactly once, in order
# of first appearance. A companion set makes the "already kept?" check O(1);
# testing membership against the growing list made this loop quadratic.
filtered_content_words = []
seen_content_words = set()
for word in tqdm(content_words):
    if content_counts[word] > 2 and word not in seen_content_words:
        seen_content_words.add(word)
        filtered_content_words.append(word)




In [12]:
# Number of distinct words in filtered_content_words.
CONTENT_COUNT = len(np.unique(filtered_content_words))

content_word_dict maps each content word to a unique index. Keys are content words, values are their indices.

In [13]:
# Map every retained content word to its position in filtered_content_words.
content_word_dict = {
    content_word: position
    for position, content_word in enumerate(filtered_content_words)
}

Finally, content_tokens is a list similar to tokens. It contains N lists, where N is equal to the number of reviews in the dataset. Each list corresponds to a review, but this time it only contains the content words.

In [14]:
# For every review, keep only the tokens that are keys of content_word_dict.
# The result has one (possibly empty) list per review, in review order.
content_tokens = [
    [token for token in review_tokens if token in content_word_dict]
    for review_tokens in tqdm(tokens)
]




In [15]:
# Store the per-review content-word lists as a new column.
dataset['content_tokens'] = content_tokens

# Compute Categories

In [16]:
# Pull the 'points' column out as a plain Python list, one score per review.
scores = dataset['points'].tolist()

In [17]:
def getCategory(scores):
    """Map point scores to a category name and an integer label.

    Each score is converted with ``int()`` and placed in the first band
    whose inclusive upper bound it does not exceed:

        below 80 -> "unacceptable" (0)    80-82  -> "acceptable" (1)
        83-86    -> "good"         (2)    87-89  -> "very good"  (3)
        90-93    -> "excellent"    (4)    94-97  -> "superb"     (5)
        98-100   -> "classic"      (6)

    A score above 100 matches no band and adds nothing to either list, so
    the outputs can be shorter than the input.

    Parameters
    ----------
    scores : iterable
        Values accepted by ``int()``.

    Returns
    -------
    tuple of (list of str, list of int)
        Category names and the matching integer labels, in input order.
    """
    # (inclusive upper bound, category name, integer label), ascending.
    bands = (
        (79, "unacceptable", 0),
        (82, "acceptable", 1),
        (86, "good", 2),
        (89, "very good", 3),
        (93, "excellent", 4),
        (97, "superb", 5),
        (100, "classic", 6),
    )
    category_string = []
    category_int = []
    for raw_score in scores:
        points = int(raw_score)
        for upper_bound, name, label in bands:
            if points <= upper_bound:
                category_string.append(name)
                category_int.append(label)
                break
    return category_string, category_int

In [18]:
# Category names and integer labels derived from the scores.
categories, labels = getCategory(scores)

In [19]:
# Store the category names and integer labels as new columns.
# NOTE(review): getCategory skips scores above 100, so these lists only line up
# with the rows if every score is at most 100 — confirm for this data.
dataset['category'] = categories
dataset['labels'] = labels

# Compute BoW Feature Vectors

In [20]:
def createCorpus(tokens):
    """Turn token lists into space-joined documents.

    Only tokens that are keys of the module-level ``content_counts`` are
    kept; all other tokens are dropped before joining.

    Parameters
    ----------
    tokens : iterable of iterable of str
        One token sequence per document.

    Returns
    -------
    list of str
        One string per input sequence, with the kept tokens separated by
        single spaces (an empty string when nothing is kept).
    """
    corpus = []
    for review_tokens in tqdm(tokens):
        kept_tokens = [token for token in review_tokens if token in content_counts]
        corpus.append(" ".join(kept_tokens))
    return corpus

In [21]:
# One space-joined document per review, built from all reviews.
corpus = createCorpus(tokens)




In [22]:
# Learn the bag-of-words vocabulary (min_df=2) and count terms per document.
# NOTE(review): the vectorizer is fit on the full corpus, i.e. before the
# variety filter and the train/test split below — confirm this is intended.
bag_of_words_vectorizer = CountVectorizer(min_df=2)
bow_feature_vector = bag_of_words_vectorizer.fit_transform(corpus)

# Filter Data

In [23]:
# Count reviews per grape variety and set the minimum count used for filtering.
varieties = dataset['variety'].tolist()
wine_count = Counter(varieties)  # variety -> number of reviews
nr_reviews = len(varieties)
threshold = 200  # varieties with fewer reviews than this are collected for removal

In [24]:
# Varieties whose review count is strictly below the threshold.
filtered_keys = [
    variety
    for variety, review_count in wine_count.items()
    if review_count < threshold
]

In [25]:
# Row positions of the reviews whose variety is in filtered_keys.
# A set gives O(1) membership tests instead of scanning the list of rare
# varieties once per review; enumerate replaces the hand-maintained counter.
rare_varieties = set(filtered_keys)
indices = [
    position
    for position, variety in enumerate(varieties)
    if variety in rare_varieties
]

In [26]:
# Drop the rows at the collected positions, then renumber the remaining rows
# from 0. reset_index() without drop=True also keeps the previous row labels
# in a new "index" column.
dataset_filtered = (
    dataset
    .drop(dataset.index[indices])
    .copy()
    .reset_index()
)

print("New Total reviews: %s"%(len(dataset_filtered)))

New Total reviews: 118263


# Split Data

In [27]:
# Sorted array of the distinct varieties remaining after filtering.
wines = np.unique(dataset_filtered['variety'].tolist())

In [28]:
# For every variety, the row labels of its reviews in dataset_filtered.
wine_indices = {
    wine: dataset_filtered.index[dataset_filtered['variety'] == wine].tolist()
    for wine in wines
}

In [29]:
# Per-variety 80/20 split: the first 80% of a variety's rows (in table order,
# no shuffling) go to train, the remainder to test.
train_indices = {}
test_indices = {}
for wine in wines:
    indices = wine_indices[wine]
    nr_indices = len(indices)
    split_at = round(nr_indices*0.8)

    train_part, test_part = indices[:split_at], indices[split_at:]
    train_indices[wine] = train_part
    test_indices[wine] = test_part

    nr_train_indices = len(train_part)
    nr_test_indices = len(test_part)

In [30]:
# Flatten the per-variety training rows into a single list, variety by variety.
tr_indices = [
    row
    for variety_rows in train_indices.values()
    for row in variety_rows
]

In [31]:
# Flatten the per-variety test rows into a single list, variety by variety.
t_indices = [
    row
    for variety_rows in test_indices.values()
    for row in variety_rows
]

In [32]:
# Materialise the two splits. iloc selects by position; the row numbers were
# read from dataset_filtered.index after reset_index(), so labels and
# positions coincide here.
trainset = dataset_filtered.iloc[tr_indices,:].copy()
testset = dataset_filtered.iloc[t_indices,:].copy()

# Define SVM

In [33]:
def classify(train_features,train_labels,test_features, **svc_params):
    """Fit a support vector classifier and predict labels for the test data.

    Parameters
    ----------
    train_features : array-like or sparse matrix
        Feature matrix used for fitting.
    train_labels : sequence
        Target label for each training row.
    test_features : array-like or sparse matrix
        Feature matrix to predict on.
    **svc_params
        Optional keyword arguments forwarded to ``SVC``; they override the
        defaults ``kernel='rbf'`` and ``verbose=True``. This lets tuned
        settings be applied, e.g. ``classify(..., **clf.best_params_)``.
        With no extra arguments the behaviour is unchanged.

    Returns
    -------
    The predictions returned by ``SVC.predict`` for ``test_features``.
    """
    svc_settings = {'kernel': 'rbf', 'verbose': True}
    svc_settings.update(svc_params)

    clf = SVC(**svc_settings)
    clf.fit(train_features, train_labels)
    print("\ndone fitting classifier\n")
    return clf.predict(test_features)

In [34]:
def evaluate(y_true,y_pred):
    """Print and return macro-averaged recall, precision and F1.

    Parameters
    ----------
    y_true : sequence
        Reference labels.
    y_pred : sequence
        Predicted labels.

    Returns
    -------
    tuple
        ``(recall, precision, f1_score)``, each computed with
        ``average='macro'``.
    """
    # (print template, scoring function), in the order they are reported.
    reports = (
        ("Recall: %f", sklearn.metrics.recall_score),
        ("Precision: %f", sklearn.metrics.precision_score),
        ("F1-score: %f", sklearn.metrics.f1_score),
    )
    values = []
    for template, scorer in reports:
        value = scorer(y_true, y_pred, average='macro')
        print(template % value)
        values.append(value)

    recall, precision, f1_score = values
    return recall, precision, f1_score

In [35]:
def main(train_features,train_data,test_features,test_data):
    """Train on the training split, predict the test split and report metrics.

    Parameters
    ----------
    train_features, test_features
        Feature matrices passed on to ``classify``.
    train_data, test_data
        Tables with a 'labels' column holding the target labels.

    Returns
    -------
    tuple
        ``(recall, precision, f1_score)`` as returned by ``evaluate``.
    """
    train_labels = train_data['labels'].tolist()
    test_labels = test_data['labels'].tolist()

    y_pred = classify(train_features, train_labels, test_features)
    metric_values = evaluate(test_labels, y_pred)
    recall, precision, f1_score = metric_values

    # evaluate() already prints the metrics; they are echoed once more here.
    templates = ("recall: %s", "precision: %s", "f1 score: %s")
    for template, value in zip(templates, metric_values):
        print(template%(value))

    return recall, precision, f1_score

# Obtain features

In [36]:
# Build one space-joined document per training review.
train_tokens = trainset['tokens'].tolist()
train_corpus = createCorpus(train_tokens)




In [37]:
# Vectorise the training documents with the already-fitted vocabulary.
train_features = bag_of_words_vectorizer.transform(train_corpus)

In [38]:
# Build one space-joined document per test review.
test_tokens = testset['tokens'].tolist()
test_corpus = createCorpus(test_tokens)




In [39]:
# Vectorise the test documents with the same fitted vocabulary as the training set.
test_features = bag_of_words_vectorizer.transform(test_corpus)

In [None]:
# Run training + evaluation on the full split, printing wall-clock start and end times.
start = datetime.now()
print(start)
recall, precision, f1_score = main(train_features,trainset,test_features,testset)
end = datetime.now()
print(end)

2018-06-20 21:19:00.742128
[LibSVM]

# Perform Gridsearch

In [40]:
# Draw 5000 training rows for the grid search. The fixed random_state makes
# the subset, and therefore the selected hyper-parameters, reproducible
# across kernel restarts.
random_sample = trainset.sample(5000, random_state=42)

In [43]:
# Build one space-joined document per sampled review.
random_tokens = random_sample['tokens'].tolist()
random_corpus = createCorpus(random_tokens)




In [44]:
# Vectorise the sampled documents with the fitted vocabulary.
random_features = bag_of_words_vectorizer.transform(random_corpus)

In [45]:
# Integer category labels for the sampled reviews.
random_labels = random_sample['labels'].tolist()

In [83]:
# Search grid: odd C values 1..39 crossed with 11 evenly spaced gamma values in [0, 0.2].
# NOTE(review): the gamma grid includes 0.0, for which the RBF kernel
# exp(-gamma * ||x - x'||^2) is constant; some scikit-learn versions may also
# reject it — consider starting the grid at a positive value.
parameters = {'kernel':['rbf'], 'C': np.arange(1,40,2), 'gamma': np.linspace(0.0, 0.2, 11)}

In [84]:
# Base estimator for the grid search; hyper-parameters come from the `parameters` grid.
svc = SVC()

In [85]:
# GridSearchCV is never imported by name in this notebook; only the
# sklearn.model_selection module is. Use the qualified name so the cell
# does not raise NameError on a fresh kernel.
clf = sklearn.model_selection.GridSearchCV(svc, parameters, verbose=10, n_jobs=4)

In [86]:
# Run the cross-validated search on the sampled subset.
clf.fit(random_features,random_labels)

Fitting 3 folds for each of 220 candidates, totalling 660 fits


[Parallel(n_jobs=4)]: Done   5 tasks      | elapsed:   29.0s
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:   46.8s
[Parallel(n_jobs=4)]: Done  17 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:  1.7min
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  2.6min
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  3.1min
[Parallel(n_jobs=4)]: Done  53 tasks      | elapsed:  4.0min
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:  4.8min
[Parallel(n_jobs=4)]: Done  77 tasks      | elapsed:  5.8min
[Parallel(n_jobs=4)]: Done  90 tasks      | elapsed:  6.8min
[Parallel(n_jobs=4)]: Done 105 tasks      | elapsed:  7.9min
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:  9.0min
[Parallel(n_jobs=4)]: Done 137 tasks      | elapsed: 10.2min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed: 11.5min
[Parallel(n_jobs=4)]: Done 173 tasks      | elapsed: 12.8min
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed: 14.3min
[Parallel(n_jobs=4)]: Do

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'kernel': ['rbf'], 'gamma': array([0.  , 0.02, 0.04, 0.06, 0.08, 0.1 , 0.12, 0.14, 0.16, 0.18, 0.2 ]), 'C': array([ 1,  3,  5,  7,  9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33,
       35, 37, 39])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=10)

In [87]:
# Parameter combination selected by the grid search.
params = clf.best_params_

In [88]:
# Show the selected parameter combination.
print(params)

{'kernel': 'rbf', 'gamma': 0.02, 'C': 5}
