In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import numpy as np
import pandas as pd
import os
import time
from tqdm import tqdm
import math

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, CuDNNGRU, Conv1D, MaxPooling1D, Flatten, GlobalAveragePooling1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras.models import load_model
from keras.callbacks import ModelCheckpoint

from keras import initializers, regularizers, constraints, optimizers, layers \

    
import matplotlib.pyplot as plt
plt.switch_backend('agg')

Using TensorFlow backend.


In [2]:
# Load the Scotch whisky review dataset into a DataFrame.
# The path is relative, so the notebook must be run from the directory that contains `resources/`.
df = pd.read_csv('./resources/scotch_review.csv')

In [3]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0.1,Unnamed: 0,name,category,review.point,price,currency,description
0,1,"Johnnie Walker Blue Label, 40%",Blended Scotch Whisky,97,225.0,$,"Magnificently powerful and intense. Caramels, ..."
1,2,"Black Bowmore, 1964 vintage, 42 year old, 40.5%",Single Malt Scotch,97,4500.0,$,What impresses me most is how this whisky evol...
2,3,"Bowmore 46 year old (distilled 1964), 42.9%",Single Malt Scotch,97,13500.0,$,There have been some legendary Bowmores from t...
3,4,"Compass Box The General, 53.4%",Blended Malt Scotch Whisky,96,325.0,$,With a name inspired by a 1926 Buster Keaton m...
4,5,"Chivas Regal Ultis, 40%",Blended Malt Scotch Whisky,96,160.0,$,"Captivating, enticing, and wonderfully charmin..."


In [4]:
# Print the column dtypes and non-null counts for the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2247 entries, 0 to 2246
Data columns (total 7 columns):
Unnamed: 0      2247 non-null int64
name            2247 non-null object
category        2247 non-null object
review.point    2247 non-null int64
price           2247 non-null object
currency        2247 non-null object
description     2247 non-null object
dtypes: int64(2), object(5)
memory usage: 123.0+ KB


In [5]:
# Show one full review text (row label 8) as a sanity check of the raw descriptions.
df.loc[8, 'description']

'The Dalmore is one of a handful of whiskies that seem to be able to age in the cask for many decades and still improve. This one is incredibly viscous on the nose and palate (and very heavy on the tongue), with chewy toffee and old pot still rum. The classic Dalmore marmalade note shines throughout, along with vanilla cream, an array of dried spices (especially cinnamon and evergreen), juicy oak, forest bedding, rancio, old armagnac, polished leather, tobacco, maple syrup, dark chocolate, almond macaroon, and subtle espresso. Long, mouth-coating finish. The flavors evolve like waves lapping on the palate -- especially the interplay with the oak. I can’t drink this whisky slowly enough. A rare experience for the lucky few who can afford it. (Price is per 100ml).'

In [6]:
# Coerce every description to a plain Python string; `str` is applied element-wise.
df['description'] = df['description'].apply(str)
# Keep a handle on the description column on its own.
data = df['description']

In [10]:
# Small sample passage used as the training text for the tokenizer and the
# next-word model in the following cells.
# NOTE: the text is kept verbatim (including typos such as "explaing") because it
# is tokenized exactly as written; editing it changes the learned vocabulary.
data_phd = """Before explaing how bound states can be naturally embedded in the framework of the
S-matrix, we shall first of all derive a generic expression for the bound state in terms of
the microscopic degrees of freedom of the underlying theory which is independent of any
asymptotic construction. Indeed, when computing observables connected to the interior of
the black hole we will make heavy use of this non-asymptotic construction.
As explained before, at the kinematical level all quantum states are identified by their
quantum numbers. In particular, from the point of view of representing a bound state in
terms of Fock eigenstates constructed from the weakly coupled degrees of freedom appearing in the microscopic Lagrangian, it is clear that only those Fock states have non-vanishing
overlap with the bound state which carry the same quantum numbers as the latter. In
other words, these states should have quantum numbers in accordance with the intrinsic
symmetries at work (such as gauge symmetries), and with the isometries characterizing
bound states in Minkowski space–time. Furthermore, the state has to be characterized
according to the Casimir operators of Minkowski, i.e. mass squared and spin. Including all these quantum numbers, collectively denoted as L, leads to a complete kinematic
characterization of the bound state in question.
"""

In [16]:
from numpy import array
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

# Fit a word-level tokenizer on the sample passage and integer-encode the text.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data_phd])
encoded = tokenizer.texts_to_sequences([data_phd])[0]

word_index = tokenizer.word_index
# Peek at both ends of the index -> word mapping (the mapping starts at index 1).
index_word_items = list(tokenizer.index_word.items())
print(index_word_items[:10])
print(index_word_items[-10:])

# +1 because word indices start at 1, so index 0 needs its own slot in the
# embedding / one-hot dimension.
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)

# encode 4 words -> 1 word: every training sample is a sliding window of
# `context_size` input words followed by the single word to predict.
context_size = 4
sequences = [
    encoded[i - context_size:i + 1]
    for i in range(context_size, len(encoded))
]
print('Total Sequences: %d' % len(sequences))

# All windows already share one length; padding only guarantees a rectangular array.
max_length = max([len(seq) for seq in sequences])
sequences = pad_sequences(sequences, maxlen=max_length, padding='pre')
print(sequences[0])
print('Max Sequence Length: %d' % max_length)

# Split each window into inputs (all but the last word) and target (the last word),
# then one-hot encode the target over the vocabulary.
sequences = array(sequences)
X, y = sequences[:, :-1], sequences[:, -1]
y = to_categorical(y, num_classes=vocab_size)

[(1, 'the'), (2, 'of'), (3, 'in'), (4, 'bound'), (5, 'states'), (6, 'state'), (7, 'quantum'), (8, 'to'), (9, 'as'), (10, 'numbers')]
[(113, 'spin'), (114, 'including'), (115, 'collectively'), (116, 'denoted'), (117, 'l'), (118, 'leads'), (119, 'complete'), (120, 'kinematic'), (121, 'characterization'), (122, 'question')]
Vocabulary Size: 123
Total Sequences: 209
[14 34 35  4  5]
Max Sequence Length: 5


In [14]:
# Next-word model: embedding -> LSTM -> softmax over the vocabulary.
# The input length is max_length-1 because the last word of each window is the target.
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=max_length-1))
model.add(LSTM(50))
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

# Targets are one-hot encoded, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# verbose=2 logs one line per epoch.
model.fit(X, y, epochs=300, verbose=2)



# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
    """Greedily extend ``seed_text`` by ``n_words`` predicted words.

    At every step the text generated so far is re-encoded with ``tokenizer``,
    pre-padded to ``max_length`` tokens, and fed to ``model``; the word with
    the highest predicted probability is appended to the text.

    Args:
        model: model exposing ``predict(x, verbose=0)`` that returns one row of
            per-word scores for each input row.
        tokenizer: fitted tokenizer exposing ``texts_to_sequences`` and
            ``index_word``.
        max_length: number of tokens passed to the model per prediction.
        seed_text: text used to start generation.
        n_words: number of words to append.

    Returns:
        ``seed_text`` followed by the generated words, separated by single
        spaces. A predicted index with no word attached contributes an empty
        word, so only the separating space is added for that step.
    """
    in_text = seed_text

    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # pre-pad sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')
        # predict_classes() no longer exists on current Keras models; taking the
        # argmax over predict() selects the same (most probable) class index.
        probabilities = model.predict(encoded, verbose=0)
        predicted_index = int(np.argmax(probabilities, axis=-1)[0])
        # map predicted word index to word with a direct lookup instead of
        # scanning the whole word_index dictionary
        out_word = tokenizer.index_word.get(predicted_index, '')
        # append to input
        in_text += ' ' + out_word
    return in_text

# evaluate model: generate 40 words from a four-word seed, using the same
# input length the model was trained with (max_length-1)
print(generate_seq(
    model=model,
    tokenizer=tokenizer,
    max_length=max_length-1,
    seed_text='how bound states can',
    n_words=40,
))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 4, 10)             1230      
_________________________________________________________________
lstm_4 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dense_4 (Dense)              (None, 123)               6273      
Total params: 19,703
Trainable params: 19,703
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/300
 - 1s - loss: 4.8115 - acc: 0.0478
Epoch 2/300
 - 0s - loss: 4.8055 - acc: 0.0909
Epoch 3/300
 - 0s - loss: 4.7994 - acc: 0.0909
Epoch 4/300
 - 0s - loss: 4.7913 - acc: 0.0909
Epoch 5/300
 - 0s - loss: 4.7809 - acc: 0.0909
Epoch 6/300
 - 0s - loss: 4.7666 - acc: 0.0909
Epoch 7/300
 - 0s - loss: 4.7433 - acc: 0.0909
Epoch 8/300
 - 0s - loss: 4.7091 - acc: 0.0909
Epoch 9/300
 - 0s - l

Epoch 156/300
 - 0s - loss: 1.1198 - acc: 0.8086
Epoch 157/300
 - 0s - loss: 1.1044 - acc: 0.7990
Epoch 158/300
 - 0s - loss: 1.0933 - acc: 0.8134
Epoch 159/300
 - 0s - loss: 1.0746 - acc: 0.8086
Epoch 160/300
 - 0s - loss: 1.0635 - acc: 0.8134
Epoch 161/300
 - 0s - loss: 1.0435 - acc: 0.8325
Epoch 162/300
 - 0s - loss: 1.0298 - acc: 0.8325
Epoch 163/300
 - 0s - loss: 1.0144 - acc: 0.8278
Epoch 164/300
 - 0s - loss: 1.0028 - acc: 0.8278
Epoch 165/300
 - 0s - loss: 0.9878 - acc: 0.8278
Epoch 166/300
 - 0s - loss: 0.9771 - acc: 0.8373
Epoch 167/300
 - 0s - loss: 0.9624 - acc: 0.8373
Epoch 168/300
 - 0s - loss: 0.9479 - acc: 0.8373
Epoch 169/300
 - 0s - loss: 0.9366 - acc: 0.8373
Epoch 170/300
 - 0s - loss: 0.9242 - acc: 0.8565
Epoch 171/300
 - 0s - loss: 0.9113 - acc: 0.8469
Epoch 172/300
 - 0s - loss: 0.8993 - acc: 0.8517
Epoch 173/300
 - 0s - loss: 0.8869 - acc: 0.8469
Epoch 174/300
 - 0s - loss: 0.8777 - acc: 0.8612
Epoch 175/300
 - 0s - loss: 0.8659 - acc: 0.8612
Epoch 176/300
 - 0s 