In [12]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, LSTM, Dense, Dropout, Concatenate, Input, Reshape 
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
from keras.models import Model

data = pd.read_csv('OSX_DS_assignment.csv')
X_title = data['review_title']
X_desc = data['review_description']
X_winery = data['winery']
X_points = data['points']

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['variety'])
num_classes = len(label_encoder.classes_)
y = to_categorical(y, num_classes=num_classes)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(np.concatenate((X_title, X_desc, X_winery)))

X_title = tokenizer.texts_to_sequences(X_title)
X_desc = tokenizer.texts_to_sequences(X_desc)
X_winery = tokenizer.texts_to_sequences(X_winery)

max_len = 100
X_title = pad_sequences(X_title, maxlen=max_len)
X_desc = pad_sequences(X_desc, maxlen=max_len)
X_winery = pad_sequences(X_winery, maxlen=max_len)

max_title_len = 1
max_desc_len = 1
title_input = Input(shape=(max_title_len,), dtype='int32')
desc_input = Input(shape=(max_desc_len,), dtype='int32')
winery_input = Input(shape=(1,), dtype='int32')
points_input = Input(shape=(1,), dtype='float32')

title_embed = Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=128, input_length=max_title_len)(title_input)
desc_embed = Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=128, input_length=max_desc_len)(desc_input)
winery_embed = Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=16, input_length=1)(winery_input)

title_lstm = LSTM(64, dropout=0.2, recurrent_dropout=0.2)(title_embed)
desc_lstm = LSTM(64, dropout=0.2, recurrent_dropout=0.2)(desc_embed)
winery_lstm = LSTM(64, dropout=0.2, recurrent_dropout=0.2)(winery_embed)

merged = Concatenate()([title_lstm, desc_lstm, winery_lstm, points_input])
merged = Dense(64, activation='relu')(merged)
merged = Dense(32, activation='relu')(merged)

output = Dense(num_classes, activation='softmax')(merged)

model = Model(inputs=[title_input, desc_input, winery_input, points_input], outputs=output)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit([X_title, X_desc, X_winery, X_points], y, validation_split=0.2, epochs=10, batch_size=32)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [13]:
new_title = 'Nicosia 2013 VulkÃ  Bianco  (Etna)'
new_desc = "Aromas include tropical fruit, broom, brimstone and dried herb. The palate isn't overly expressive, offering unripened apple, citrus and dried sage alongside brisk acidity."
new_winery = 'Nicosia'
new_points = 87

new_title_seq = tokenizer.texts_to_sequences([new_title])
new_desc_seq = tokenizer.texts_to_sequences([new_desc])
new_winery_seq = tokenizer.texts_to_sequences([new_winery])

new_title_padded = pad_sequences(new_title_seq, maxlen=max_len)
new_desc_padded = pad_sequences(new_desc_seq, maxlen=max_len)
new_winery_padded = pad_sequences(new_winery_seq, maxlen=1)

prediction = model.predict([new_title_padded, new_desc_padded, new_winery_padded, np.array([[new_points]])])

predicted_class = label_encoder.inverse_transform([np.argmax(prediction)])
print('Predicted variety:', predicted_class[0])

Predicted variety: White Blend


In [15]:
loss, accuracy = model.evaluate([X_title, X_desc, X_winery, X_points], y)
print(accuracy*100)

99.41542148590088


In [16]:
model.save('Wine_Variety.h5')