# Import Libraries

In [1]:
# add more if needed

# data loading
from pandas.io.json import json_normalize
import json
import pandas as pd
import os, random
from os import listdir

# data inspection
from collections import Counter

# preprocessing
import string
import nltk
from string import punctuation
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from scipy.sparse import dok_matrix
from scipy.sparse import csc_matrix
import scipy.sparse as sparse
from gensim import corpora, models
import gensim

# Word embeddings
from gensim.models import Word2Vec


# clustering
from sklearn import cluster
from sklearn import metrics

# computations
import numpy as np
import math
from numpy import asarray
from numpy import zeros
from keras.utils import np_utils

# visualization
import matplotlib
from matplotlib import pyplot as plt
from pprint import pprint
import time
from datetime import datetime

# data saving
import pickle

# progress
from tqdm import tqdm_notebook as tqdm

# classification
import sklearn.datasets
import sklearn.metrics
import sklearn.model_selection
from sklearn.svm import SVC

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Load Data

In [2]:
file_path = "data/winemag-data-130k-v2.json"

with open(file_path) as f:
    data = json.load(f)
    
dataset = pd.DataFrame.from_dict(json_normalize(data), orient='columns')

# Load Pickle Files

In [3]:
tokens = pickle.load(open("pickle_files/tokens.p","rb"))

In [4]:
pos_tags = pickle.load(open("pickle_files/pos_tags.p","rb"))

In [5]:
dataset['tokens'] = tokens
dataset['pos_tags'] = pos_tags

In [6]:
lda_model = pickle.load(open("pickle_files/lda_model.p","rb"))

# Compute Content Words

In [7]:
def isContentWord(pos_tag):
    content_tags = ["JJ", "JJR", "JJS", "NN", "NNP", "NNS", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ"]
    if pos_tag in content_tags:
        return True
    else:
        return False

In [8]:
content_words = []
for i, token_list in enumerate(tokens):
    pos_tag_list = pos_tags[i]
    for j, word in enumerate(token_list):
        pos_tag = pos_tag_list[j]
        if isContentWord(pos_tag):
            content_words.append(word)

In [9]:
content_counts = Counter(content_words)

In [10]:
filtered_content_words = []
for word in tqdm(content_words):
    if content_counts[word] > 2:
        if not word in filtered_content_words:
            filtered_content_words.append(word)




In [11]:
CONTENT_COUNT = len(np.unique(filtered_content_words))

In [12]:
content_word_dict = {}
for i, word in enumerate(filtered_content_words):
    content_word_dict[word] = i

In [13]:
content_tokens = []
for token_list in tqdm(tokens):
    filtered_tokens = []
    for token in token_list:
        if token in content_word_dict:
            filtered_tokens.append(token)
    content_tokens.append(filtered_tokens)




In [14]:
dataset['content_tokens'] = content_tokens

# Compute Labels

In [15]:
scores = dataset['points'].tolist()

In [16]:
def getCategory(scores):
    category_string = []
    category_int = []
    for score in scores:
        score = int(score)
        if score < 80:
            category_string.append("unacceptable")
            category_int.append(0)
        elif score >= 80 and score <= 82:
            category_string.append("acceptable")
            category_int.append(1)
        elif score >= 83 and score <= 86:
            category_string.append("good")
            category_int.append(2)
        elif score >= 87 and score <= 89:
            category_string.append("very good")
            category_int.append(3)
        elif score >= 90 and score <= 93:
            category_string.append("excellent")
            category_int.append(4)
        elif score >= 94 and score <= 97:
            category_string.append("superb")
            category_int.append(5)
        elif score >= 98 and score <= 100:
            category_string.append("classic")
            category_int.append(6)
    return category_string, category_int

In [17]:
categories, labels = getCategory(scores)

In [18]:
dataset['category'] = categories
dataset['labels'] = labels

# Compute BoW Corpus

In [19]:
def createCorpus(tokens):
    corpus = []
    for token_list in tqdm(tokens):
        content_tokens = []
        for token in token_list:
            if token not in content_counts:
                continue
            else:
                content_tokens.append(token)
        doc = " ".join(content_tokens)
        corpus.append(doc)
    return corpus

In [20]:
corpus = createCorpus(tokens)




In [21]:
bag_of_words_vectorizer = CountVectorizer(min_df=2)
bow_feature_vector = bag_of_words_vectorizer.fit_transform(corpus)

# Filter Data

In [22]:
varieties = dataset['variety'].tolist()
wine_count = Counter(varieties)
nr_reviews = len(varieties)
threshold = 200

In [23]:
filtered_keys = []
for key, item in wine_count.items():
    if item < threshold:
        filtered_keys.append(key)

In [24]:
idx = 0
indices = []
for v in varieties:
    if v in filtered_keys:
        indices.append(idx)
    idx += 1

In [25]:
dataset_filtered = dataset.drop(dataset.index[indices]).copy()
dataset_filtered = dataset_filtered.reset_index()

print("New Total reviews: %s"%(len(dataset_filtered)))

New Total reviews: 118263


# Split Data

In [26]:
wines = np.unique(dataset_filtered['variety'].tolist())

In [27]:
wine_indices = {}
for wine in wines:
    indices = dataset_filtered.index[dataset_filtered['variety'] == wine].tolist()
    wine_indices[wine] = indices

In [28]:
train_indices = {}
test_indices = {}
for wine in wines:
    indices = wine_indices[wine]
    nr_indices = len(indices)
    train_indices[wine] = indices[:round(nr_indices*0.8)]
    test_indices[wine] = indices[round(nr_indices*0.8):]
    
    nr_train_indices = len(indices[:round(nr_indices*0.8)])
    nr_test_indices = len(indices[round(nr_indices*0.8):])

In [29]:
tr_indices = []
for _, indices in train_indices.items():
    tr_indices = tr_indices + indices

In [30]:
t_indices = []
for _, indices in test_indices.items():
    t_indices = t_indices + indices

In [31]:
trainset = dataset_filtered.iloc[tr_indices,:].copy()
testset = dataset_filtered.iloc[t_indices,:].copy()

# Define SVM

In [32]:
def classify(train_features,train_labels,test_features):
    clf = SVC(kernel='rbf', C=5, gamma=0.02, verbose=True)
    clf.fit(train_features, train_labels)
    print("\ndone fitting classifier\n")
    return clf.predict(test_features)

In [33]:
def evaluate(y_true,y_pred):
    recall = sklearn.metrics.recall_score(y_true, y_pred, average='macro')
    print("Recall: %f" % recall)

    precision = sklearn.metrics.precision_score(y_true, y_pred, average='macro')
    print("Precision: %f" % precision)

    f1_score = sklearn.metrics.f1_score(y_true, y_pred, average='macro')
    print("F1-score: %f" % f1_score)

    return recall, precision, f1_score

In [34]:
def main(train_features,train_data,test_features,test_data):
    train_labels = train_data['labels'].tolist()

    test_labels = test_data['labels'].tolist()
        
    y_pred = classify(train_features,train_labels,test_features)
        
    recall, precision, f1_score = evaluate(test_labels, y_pred)
    
    print("recall: %s"%(recall))
    print("precision: %s"%(precision))
    print("f1 score: %s"%(f1_score))
    
    return recall, precision, f1_score

# Obtain BoW Features

In [35]:
train_tokens = trainset['tokens'].tolist()
train_corpus = createCorpus(train_tokens)
train_bow_features = bag_of_words_vectorizer.transform(train_corpus)




In [36]:
test_tokens = testset['tokens'].tolist()
test_corpus = createCorpus(test_tokens)
test_bow_features = bag_of_words_vectorizer.transform(test_corpus)




# Obtain LDA Features

In [37]:
def createCorpusLDA(tokens):
    dictionary = corpora.Dictionary(tokens)
    corpus = [dictionary.doc2bow(token_list) for token_list in tokens]
    return (dictionary,corpus)

In [38]:
train_lda_dictionary, train_lda_corpus = createCorpusLDA(train_tokens)

In [39]:
train_lda_features = dok_matrix((len(train_lda_corpus),100))

In [40]:
for i in tqdm(range(len(train_lda_corpus))):
    topic_distribution = lda_model[train_lda_corpus[i]]
    for (topic_nr,prob) in topic_distribution:
        train_lda_features[i, topic_nr] = prob




In [41]:
train_lda_features_csr = train_lda_features.tocsr()

In [42]:
print(type(train_lda_features_csr))

<class 'scipy.sparse.csr.csr_matrix'>


In [43]:
test_lda_dictionary, test_lda_corpus = createCorpusLDA(test_tokens)

In [44]:
test_lda_features = dok_matrix((len(test_lda_corpus),100))

In [45]:
for i in tqdm(range(len(test_lda_corpus))):
    topic_distribution = lda_model[test_lda_corpus[i]]
    for (topic_nr,prob) in topic_distribution:
        test_lda_features[i, topic_nr] = prob




In [46]:
test_lda_features_csr = test_lda_features.tocsr()

# Combine Features

In [47]:
train_combined_features = sparse.hstack((train_bow_features,train_lda_features_csr))

In [48]:
test_combined_features = sparse.hstack((test_bow_features,test_lda_features_csr))

# Run SVM

In [50]:
start = datetime.now()
print(start)
recall, precision, f1_score = main(train_combined_features,trainset,test_combined_features,testset)
end = datetime.now()
print(end)

2018-06-21 20:26:05.974046
[LibSVM]
done fitting classifier

Recall: 0.490655


  'precision', 'predicted', average, warn_for)


Precision: 0.590583


  'precision', 'predicted', average, warn_for)


F1-score: 0.523059
recall: 0.4906549730631818
precision: 0.5905828271353354
f1 score: 0.5230588081226927
2018-06-22 02:20:56.856677


# Save Results

In [51]:
with open("output/bow_lda_experiment.txt","w") as outfile:
    outfile.write("recall: %s\n"%(recall))
    outfile.write("precision: %s\n"%(precision))
    outfile.write("f1_score: %s\n"%(f1_score))