In [1]:
import tensorflow as tf
import keras
import numpy as np
np.random.seed(0)

  from ._conv import register_converters as _register_converters
  (fname, cnt))
  (fname, cnt))
Using TensorFlow backend.


In [27]:
whitelist = '0123456789abcdefghijklmnopqrstuvwxyz '

T = 20
T_min = 10


def process(x):
    # Strip and lower
    x = x.strip().lower()
    
    # Only keep chars in white list
    x_ = ""
    for char in x:
        if char in whitelist:
            x_ += char
    x = x_
    
    # Only keep non-space tokens
    x_ = []
    for word in x.split(" "):
        if word != " ":
            x_.append(word)
    x = " ".join(x_)
    
    return x

def section(line):
    chunk_size = lambda x: len(" ".join(x))
    
    chunk = []
    for word in line.split(" "):
        word = process(word)
        # Word too big!
        if chunk_size(chunk + [word]) > T - 1:
            if chunk_size(chunk) >= T_min:
                yield(pad(" ".join(chunk)))
            if len(word) <= T - 1:
                chunk = [word]
        else:
            if len(word) <= T - 1:
                chunk.append(word)
    # Capture the last stuff
    if chunk:
        if chunk_size(chunk) >= T_min and chunk_size(chunk) <= T - 1:
            yield(pad(" ".join(chunk)))
        

def pad(x):
    x += "."
    assert(len(x) <= T)
    if len(x) < T:
        x += " " * (T - len(x))
    return x

In [28]:
import pickle

beer = []
with open("beer_names.txt", "r") as f:
    for line in f:
        for x in section(line):
            beer.append(x)

normal = []
with open("tokenized.pkl", "rb") as f:
    dump = pickle.load(f)
    for line in dump:
        line = " ".join(line)
        for x in section(line):
            normal.append(x)

In [31]:
def init_idify(data):
    ids = {}
    i = 0
    for line in data:
        for char in line:
            if char in ids:
                continue
            else:
                ids[char] = i
                i += 1
    return ids

ids = init_idify(normal + beer)

def idify(line):
    res = []
    for char in line:
        res.append(ids[char])
    return res

reverse_ids = {}
for key, value in ids.items():
    reverse_ids[value] = key

def unidify(line):
    res = []
    for id_ in line:
        res.append(reverse_ids[id_])
    return res
    

pos = list(map(idify, beer))
neg = list(map(idify, normal))

In [32]:
import numpy as np
X = np.array(pos, dtype=np.uint8)
Y = np.repeat(np.array([[0, 1]]), len(X), axis=0)

X_ = np.array(neg, dtype=np.uint8)[:len(X)*10]
Y_ = np.repeat(np.array([[1, 0]]), len(X_), axis=0)

X = np.concatenate((X, X_), axis=0)
Y = np.concatenate((Y, Y_), axis=0)

p = np.random.permutation(len(X))
X = X[p]
Y = Y[p]

In [33]:
import keras
from keras.optimizers import SGD, Adam, RMSprop
import tensorflow as tf
import matplotlib.pyplot as plt
import timeit
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, CuDNNGRU, CuDNNLSTM, \
    TimeDistributed, Reshape, Input, Dropout, Embedding, Bidirectional
from keras.activations import relu

In [34]:
TD = TimeDistributed
VOCAB = np.max(X)

inputs = Input(shape=(T,), dtype='int8', name='inputs')
emb = Embedding(VOCAB, 256, input_length=T)(inputs)

rnn_x = Reshape((T, -1))(emb)
rnn_x = Bidirectional(CuDNNLSTM(512, return_state=False, return_sequences=False))(rnn_x)

fc_x = Dropout(0.5)(rnn_x)
fc_x = Dense(128)(fc_x)
fc_x = Activation("relu")(fc_x)
fc_x = Dropout(0.5)(rnn_x)
fc_x = Dense(2)(fc_x)
predictions = Activation("softmax")(fc_x)

model = Model(inputs=inputs, outputs=predictions)

In [35]:
import h5py

BATCH = 128
# model.load_weights("model1_weights.h5")
model.compile(optimizer='adam', loss='binary_crossentropy')
model.fit(X, Y, batch_size=BATCH, epochs=4)
# model.save_weights("model1_weights.h5")

Epoch 1/4
Epoch 2/4
Epoch 3/4
  2688/221254 [..............................] - ETA: 1:12 - loss: 0.0428

KeyboardInterrupt: 

In [36]:
query = "Chocolate Weltenburger"
print(model.predict(np.array([idify(x) for x in section(query)])))

query = "Hello there my friend"
print(model.predict(np.array([idify(x) for x in section(query)])))

[[0.03237325 0.96762675]]
[[0.9961719  0.00382817]]


In [38]:
while True:
    query = input("Here: ")
    print(model.predict(np.array([idify(x) for x in section(query)])))

Here: a sweet stout
[[0.02629532 0.9737047 ]]
Here: an elder's place
[[0.9614792  0.03852081]]
Here: pliny's elder beer
[[0.02291322 0.97708684]]
Here: pliny is a beer
[[0.99631727 0.00368269]]
Here: pliny elder elder
[[0.38409457 0.6159054 ]]


KeyboardInterrupt: 