# Exploring Wine Review Data

In [1]:
# Import dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import ComplementNB
import pickle
import nltk
from nltk.corpus import wordnet as wn

In [2]:
# Read data
df = pd.read_csv("../Resources/winemag-data-130k-cleanedPunctuation.csv")

In [3]:
del df['Unnamed: 0']
df.head()

Unnamed: 0,country,description,designation,points,price,province,region_one,region_two,taster_name,taster_twitter_handle,title,variety,winery
0,Italy,Aromas include tropical fruit broom brimstone ...,Vulkà Bianco,87,,Sicily & Sardinia,Etna,,Kerin O’Keefe,@kerinokeefe,Nicosia 2013 Vulkà Bianco (Etna),White Blend,Nicosia
1,Portugal,This is ripe and fruity a wine that is smooth ...,Avidagos,87,15.0,Douro,,,Roger Voss,@vossroger,Quinta dos Avidagos 2011 Avidagos Red (Douro),Portuguese Red,Quinta dos Avidagos
2,US,Tart and snappy the flavors of lime flesh and ...,,87,14.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Rainstorm 2013 Pinot Gris (Willamette Valley),Pinot Gris,Rainstorm
3,US,Pineapple rind lemon pith and orange blossom s...,Reserve Late Harvest,87,13.0,Michigan,Lake Michigan Shore,,Alexander Peartree,,St. Julian 2013 Reserve Late Harvest Riesling ...,Riesling,St. Julian
4,US,Much like the regular bottling from 2012 this ...,Vintner's Reserve Wild Child Block,87,65.0,Oregon,Willamette Valley,Willamette Valley,Paul Gregutt,@paulgwine,Sweet Cheeks 2012 Vintner's Reserve Wild Child...,Pinot Noir,Sweet Cheeks


In [4]:
df.describe().round()

Unnamed: 0,points,price
count,129970.0,120974.0
mean,88.0,35.0
std,3.0,41.0
min,80.0,4.0
25%,86.0,17.0
50%,88.0,25.0
75%,91.0,42.0
max,100.0,3300.0


# Data Pre-processing

In [5]:
# Filter needed columns
df_filtered = df[['description', 'variety','country','winery']]

In [6]:
# Drop NAs and duplicates
df_filtered = df_filtered.dropna(how='any')
df_filtered=df_filtered.drop_duplicates()
df_filtered.head()

Unnamed: 0,description,variety,country,winery
0,Aromas include tropical fruit broom brimstone ...,White Blend,Italy,Nicosia
1,This is ripe and fruity a wine that is smooth ...,Portuguese Red,Portugal,Quinta dos Avidagos
2,Tart and snappy the flavors of lime flesh and ...,Pinot Gris,US,Rainstorm
3,Pineapple rind lemon pith and orange blossom s...,Riesling,US,St. Julian
4,Much like the regular bottling from 2012 this ...,Pinot Noir,US,Sweet Cheeks


In [7]:
# Identify x and y
X = df_filtered['description']
y = df_filtered['variety']

In [8]:
# label encoder for y
from sklearn import preprocessing
from tensorflow.keras.utils import to_categorical
label_encoder = preprocessing.LabelEncoder()
label_encoder.fit(y)
encoded_y = label_encoder.transform(y)

# Convert encoded labels to one-hot-encoding
y_categorical = to_categorical(encoded_y)
y_list=list(zip(y,encoded_y))
y_list=pd.DataFrame(y_list, columns=['Label','Class'])

In [9]:
# Text preprocessing
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(X)
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_counts)

In [10]:
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_categorical, test_size=0.2, random_state=100)

In [11]:
X_train.shape

(95935, 40858)

In [12]:
X_test.shape

(23984, 40858)

In [13]:
 y_train.shape

(95935, 701)

In [14]:
 y_test.shape

(23984, 701)

# Deep learning 

In [15]:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential()
model.add(Dense(units=100, activation='relu', input_dim=40858))
model.add(Dense(units=701, activation='softmax'))

In [16]:
# Compile the model
model.compile(loss="categorical_crossentropy",
              optimizer="adam", metrics=['accuracy'])

In [17]:
# Use the training data to fit (train) the model
model.fit(
    X_train,
    y_train,
    epochs=3,
    shuffle=True,
    verbose=2
)

InvalidArgumentError: indices[1] = [0,39651] is out of order. Many sparse ops require sorted indices.
    Use `tf.sparse.reorder` to create a correctly ordered copy.

 [Op:SerializeManySparse]

In [None]:
# Save the model
model.save("dl_v2.h5")

In [18]:
# Evaluate the model using the training data
model_loss, model_accuracy = model.evaluate(X_test, y_test, verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

InvalidArgumentError: indices[1] = [0,38917] is out of order. Many sparse ops require sorted indices.
    Use `tf.sparse.reorder` to create a correctly ordered copy.

 [Op:SerializeManySparse]

In [1]:
import pickle
vectorizer_file = "tokenizer.sklearn"
pickle.dump(count_vect, open(vectorizer_file,'wb'))

tokenizer_file = "vectorizer.sklearn"
pickle.dump(tfidf_transformer, open(tokenizer_file,'wb'))

NameError: name 'count_vect' is not defined

In [None]:
model = load_model("dl_v2.h5")

In [None]:
# Reusing model
import pickle
from tensorflow.keras.models import load_model
vectorizer = pickle.load(open(vectorizer_file, 'rb'))
tokenizer = pickle.load(open(tokenizer_file, 'rb'))


In [None]:
user_input=['semisweetchocolate']
X_new = vectorizer.transform(user_input)
X_new = tokenizer.transform(X_new)
result = model.predict(X_new)
result

In [None]:
# Decode the result
predicted_class=model.predict_classes(X_new)
predicted_class=pd.DataFrame(predicted_class,columns=['Class'])
print(f"Predicted class: {model.predict_classes(X_new)}")

result = predicted_class.merge(y_list,on='Class',how='left').drop_duplicates()
result = result.Label
result