# This is for testing only

# Import Libraries

In [29]:
# add more if needed

# data loading
from pandas.io.json import json_normalize
import json
import pandas as pd
import os, random
from os import listdir

# data inspection
from collections import Counter

# preprocessing
import string
import nltk
from string import punctuation
from nltk.corpus import stopwords

# computations
import numpy as np
import math

# visualization
import matplotlib
from matplotlib import pyplot as plt
from pprint import pprint

# data saving
import pickle

# Load Data

In [4]:
file_path = "data/winemag-data-130k-v2.json"

# Read the file as UTF-8 explicitly: the reviews contain non-ASCII text
# (accented names such as "Vulkà" or "Rosé") and the platform default
# encoding is not guaranteed to be UTF-8.
with open(file_path, encoding="utf-8") as f:
    data = json.load(f)

# json_normalize already returns a DataFrame, so no extra wrapping is needed.
dataset = json_normalize(data)

In [19]:
# Preview the first six rows of the loaded DataFrame
dataset.head(6)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks
5,Spain,Blackberry and raspberry aromas show a typical...,Ars In Vitro,87,15.0,Northern Spain,Navarra,,Michael Schachner,@wineschach,Tandem 2011 Ars In Vitro Tempranillo-Merlot (N...,Tempranillo-Merlot,Tandem


# Inspect Data

In [6]:
# Pull the 'variety' column out as a plain Python list
varieties = dataset['variety'].tolist()

In [24]:
# Tally how often each variety occurs
wine_count = Counter(varieties)

In [23]:
# List only the varieties that occur more than 1300 times
for variety, count in wine_count.items():
    if count > 1300:
        print(variety, count)

Rosé 3564
Sparkling Blend 2153
Nebbiolo 2804
Rhône-style Red Blend 1471
Sangiovese 2707
Pinot Gris 1455
Malbec 2652
Zinfandel 2714
Cabernet Franc 1353
Bordeaux-style Red Blend 6915
Pinot Noir 13272
Cabernet Sauvignon 9472
White Blend 2360
Red Blend 8946
Sauvignon Blanc 4967
Riesling 5189
Portuguese Red 2466
Grüner Veltliner 1345
Syrah 4142
Merlot 3102
Tempranillo 1810
Champagne Blend 1396
Chardonnay 11753


# Define Training and Test Set

# Preprocess Data

## Example Code

In [9]:
# Collect the 'description' column as a plain Python list
descriptions = dataset['description'].tolist()

In [10]:
# Show the first description as a sample
print(descriptions[0])

Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity.


In [11]:
# Use the first description as the running example for the steps below
test_string = descriptions[0]

### Remove Punctuation

In [12]:
# Translation table that deletes every character in string.punctuation
# (ASCII punctuation only). The same table is reused by the later
# preprocessing cells.
translator = str.maketrans('', '', string.punctuation)
# Apply it to the sample description
stripped_test_string = test_string.translate(translator)

### Make Lowercase

In [13]:
# Lowercase the punctuation-free sample so tokens compare case-insensitively
stripped_test_string = stripped_test_string.lower()

### Tokenize

In [14]:
# Split the cleaned sample into word tokens with NLTK's tokenizer
tokens = nltk.word_tokenize(stripped_test_string)

In [15]:
# Show the tokens produced for the sample description
print(tokens)

['aromas', 'include', 'tropical', 'fruit', 'broom', 'brimstone', 'and', 'dried', 'herb', 'the', 'palate', 'isnt', 'overly', 'expressive', 'offering', 'unripened', 'apple', 'citrus', 'and', 'dried', 'sage', 'alongside', 'brisk', 'acidity']


## Real Deal

In [37]:
# Build the stop-word set once: it is the same for every description, so
# rebuilding it inside the loop only repeats the same corpus lookup.
stop_words = set(stopwords.words('english'))

# NOTE(review): punctuation is stripped before the stop-word filter runs, so
# contractions lose their apostrophe first (e.g. "isn't" becomes "isnt").
# Confirm whether such tokens are meant to survive the stop-word filter.
tokens = []
for description in descriptions:
    # remove punctuation, then make lowercase
    description_lower = description.translate(translator).lower()
    # tokenize
    description_tokens = nltk.word_tokenize(description_lower)
    # keep tokens that are alphabetic, not stop words, and longer than one
    # character (same three filters as before, applied in a single pass)
    description_tokens = [
        word for word in description_tokens
        if word.isalpha() and word not in stop_words and len(word) > 1
    ]
    tokens.append(description_tokens)

In [38]:
# Store each description's token list in a new 'tokens' column
dataset['tokens'] = tokens

In [39]:
# Preview the first five rows, now including the 'tokens' column
dataset.head(5)

Unnamed: 0,country,description,designation,points,price,province,region_1,region_2,taster_name,taster_twitter_handle,title,variety,winery,tokens
0,Italy,"Aromas include tropical fruit, broom, brimston...",Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia,"[aromas, include, tropical, fruit, broom, brim..."
1,Portugal,"This is ripe and fruity, a wine that is smooth...",Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos,"[ripe, fruity, wine, smooth, still, structured..."
2,US,"Tart and snappy, the flavors of lime flesh and...",,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm,"[tart, snappy, flavors, lime, flesh, rind, dom..."
3,US,"Pineapple rind, lemon pith and orange blossom ...",Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian,"[pineapple, rind, lemon, pith, orange, blossom..."
4,US,"Much like the regular bottling from 2012, this...",Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks,"[much, like, regular, bottling, comes, across,..."


In [27]:
# Total number of tokens kept across all descriptions
nr_words = sum(len(token_list) for token_list in tokens)

In [28]:
# Report the total token count
print(nr_words)

5250112


# Define Vocabulary

In [40]:
def process_reviews(dataset,vocab,is_train):
    """Add the tokens of every review in ``dataset`` to ``vocab``.

    Args:
        dataset: DataFrame with a 'tokens' column holding one iterable of
            tokens per row.
        vocab: Counter (or any object with a compatible ``update``) that is
            updated in place with the token counts.
        is_train: Unused here; kept so existing calls keep working.

    Returns:
        None. The result is the in-place update of ``vocab``.
    """
    # Iterate the column directly: iterrows() would build a full Series for
    # every row only to read this one field.
    for tokens in dataset['tokens']:
        vocab.update(tokens)

In [41]:
# start from an empty frequency table
vocab = Counter()
# count the tokens of every review into it
process_reviews(dataset, vocab, is_train=True)

In [42]:
# Show the 50 most frequent tokens with their counts
vocab.most_common(50)

[('wine', 78035),
 ('flavors', 62678),
 ('fruit', 45016),
 ('aromas', 39613),
 ('palate', 38083),
 ('acidity', 34958),
 ('finish', 34943),
 ('tannins', 30854),
 ('drink', 29966),
 ('cherry', 27381),
 ('ripe', 26990),
 ('black', 25392),
 ('notes', 19018),
 ('red', 18864),
 ('spice', 18778),
 ('rich', 17275),
 ('fresh', 16957),
 ('nose', 16910),
 ('oak', 16645),
 ('berry', 15488),
 ('dry', 15448),
 ('plum', 14117),
 ('soft', 13541),
 ('fruits', 13506),
 ('blend', 13058),
 ('apple', 12842),
 ('crisp', 12793),
 ('blackberry', 12703),
 ('offers', 12663),
 ('sweet', 12395),
 ('texture', 12382),
 ('white', 12307),
 ('shows', 11515),
 ('light', 11440),
 ('citrus', 11375),
 ('dark', 11329),
 ('bright', 10968),
 ('vanilla', 10689),
 ('well', 10606),
 ('cabernet', 10503),
 ('full', 10096),
 ('pepper', 9905),
 ('juicy', 9751),
 ('fruity', 9468),
 ('good', 9443),
 ('raspberry', 9297),
 ('firm', 9161),
 ('green', 9049),
 ('peach', 8520),
 ('touch', 8518)]

In [44]:
# keep only tokens that reach a minimum number of occurrences
# NOTE: this rebinds `tokens` (previously the per-description token lists)
# to the flat list of vocabulary words.
min_occurance = 2
tokens = [word for word, count in vocab.items() if count >= min_occurance]
print(len(tokens))

24453


In [45]:
def save_list(lines, filename):
    """Write ``lines`` to ``filename``, one entry per line.

    Entries are joined with '\\n'; no trailing newline is written. An
    existing file is overwritten.

    Args:
        lines: Iterable of strings to write.
        filename: Path of the file to create or overwrite.
    """
    # convert lines to a single blob of text
    data = '\n'.join(lines)
    # the context manager closes the file even if the write fails
    with open(filename, 'w') as out:
        out.write(data)
    
# save tokens to a vocabulary file, one token per line
# (save_list joins the entries with '\n')
save_list(tokens,'vocab.txt')

## Encode Reviews as Sequence of Integers

In [46]:
def load_doc(filename):
    """Return the entire contents of the text file ``filename`` as a string.

    Args:
        filename: Path of the file to read.

    Returns:
        The file's text, unmodified.
    """
    # the context manager closes the file even if the read fails
    with open(filename, 'r') as source:
        return source.read()

# load the vocabulary
vocab_filename = 'vocab.txt'
# split the file on whitespace and keep the entries as a set for fast
# membership tests
vocab = set(load_doc(vocab_filename).split())

In [52]:
def clean_review(description,vocab):
    """Turn a review into a space-separated string of in-vocabulary tokens.

    The text has its punctuation removed (using the module-level
    ``translator`` table), is lowercased and tokenized with NLTK; tokens not
    contained in ``vocab`` are dropped.

    Args:
        description: Raw review text.
        vocab: Container of allowed tokens (membership is tested with ``in``).

    Returns:
        The surviving tokens joined by single spaces.
    """
    normalized = description.translate(translator).lower()
    kept = [token for token in nltk.word_tokenize(normalized) if token in vocab]
    return ' '.join(kept)

In [53]:
def process_reviews(dataset,vocab,is_train):
    """Clean every review description and return the results as a list.

    NOTE: this redefines the earlier ``process_reviews`` that only updated
    the vocabulary counter; from this cell on the name refers to this version.

    Args:
        dataset: DataFrame with a 'description' column of review texts.
        vocab: Container of allowed tokens, passed through to ``clean_review``.
        is_train: Unused here; kept so existing calls keep working.

    Returns:
        A list with one cleaned, space-joined token string per row, in row
        order.
    """
    # Iterate the column directly: iterrows() would build a full Series for
    # every row only to read this one field.
    return [clean_review(description, vocab) for description in dataset['description']]

In [54]:
# Clean every description, keeping only tokens found in the loaded vocabulary
docs = process_reviews(dataset,vocab,True)

In [60]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [57]:
# create the tokenizer (default settings)
tokenizer = Tokenizer()
# fit the tokenizer on the documents so it builds its word index from them
tokenizer.fit_on_texts(docs)

In [58]:
# sequence encode: turn each document into a list of integers using the
# fitted tokenizer
encoded_docs = tokenizer.texts_to_sequences(docs)

In [61]:
# pad sequences
# length of the longest document, measured in whitespace-separated tokens
max_length = max(len(doc.split()) for doc in docs)
# shorter sequences are padded at the end (padding='post') up to max_length
Xdata = pad_sequences(
    encoded_docs,
    maxlen=max_length,
    padding='post',
)