In [1]:
from __future__ import print_function
import json
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import Adam
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import tensorflow as tf

In [2]:
%%capture
# Download the training corpus, the two vocabulary mappings (char -> index and
# index -> char) and the pre-trained weights from Google Drive.
# TLS certificate verification is left ON (no --no-check-certificate): the
# weights file is loaded straight into the model, so the download must not be
# open to tampering in transit.
# %%capture hides wget's output, including any error text, so check the
# directory listing afterwards to confirm all four files arrived.
! wget 'https://docs.google.com/uc?export=download&id=1m5rdL_vkZr7JCo_-8g4v9fh2jfU-sX-A' -O yelp_100_3.txt
! wget 'https://docs.google.com/uc?export=download&id=1qfMZCaANhTR6b1NM4jXGNQQP5Z3eSyq8' -O indices_char.txt
! wget 'https://docs.google.com/uc?export=download&id=1a1CVCwKQWxyHBpTR3sVgsf7eRtZHjUqx' -O char_indices.txt
! wget 'https://docs.google.com/uc?export=download&id=1MZEUyaqT48D858zwEVzlTfjgXJvmY-BI' -O transfer_weights

In [3]:
# List the working directory with sizes to confirm that yelp_100_3.txt,
# indices_char.txt, char_indices.txt and transfer_weights were written.
!ls -l

total 29896
-rw-r--r-- 1 root root      541 May  5 08:17 char_indices.txt
-rw-r--r-- 1 root root      663 May  5 08:17 indices_char.txt
drwxr-xr-x 1 root root     4096 Apr 21 13:39 sample_data
-rw-r--r-- 1 root root 30524436 May  5 08:17 transfer_weights
-rw-r--r-- 1 root root    71250 May  5 08:17 yelp_100_3.txt


In [4]:
# Load the review corpus, lower-cased so it matches the lower-case vocabulary.
path = 'yelp_100_3.txt'
with open(path) as corpus_file:
    text = corpus_file.read().lower()
print('corpus length:', len(text))

# The vocabulary is read from the saved JSON mappings instead of being rebuilt
# from this corpus (i.e. not `sorted(set(text))`).
# NOTE(review): presumably this keeps character indices aligned with the
# pre-trained weights -- confirm.
# JSON object keys are always strings, so `indices_char` is keyed by '0', '1',
# ... rather than by ints.
with open('char_indices.txt') as mapping_file:
    char_indices = json.load(mapping_file)
with open('indices_char.txt') as mapping_file:
    indices_char = json.load(mapping_file)
chars = sorted(char_indices.keys())
print(indices_char)
print('total chars:', len(chars))

# Slide a window of `maxlen` characters over the corpus, advancing `step`
# characters each time. Every window becomes one training input and the single
# character that follows it becomes the matching target.
maxlen = 256
step = 3
window_starts = range(0, len(text) - maxlen, step)
sentences = [text[start: start + maxlen] for start in window_starts]
next_chars = [text[start + maxlen] for start in window_starts]
print('nb sequences:', len(sentences))

print('Vectorization...')
# One-hot encode the data:
#   X[i, t, c] is True when character t of window i has vocabulary index c
#   y[i, c]    is True when the character following window i has index c
# The builtin `bool` is used as the dtype because the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in 1.24.
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = True
    y[i, char_indices[next_chars[i]]] = True




corpus length: 71250
{'0': '\n', '1': ' ', '2': '!', '3': '"', '4': '#', '5': '$', '6': '%', '7': '&', '8': "'", '9': '(', '10': ')', '11': '*', '12': '+', '13': ',', '14': '-', '15': '.', '16': '/', '17': '0', '18': '1', '19': '2', '20': '3', '21': '4', '22': '5', '23': '6', '24': '7', '25': '8', '26': '9', '27': ':', '28': ';', '29': '=', '30': '?', '31': '[', '32': ']', '33': 'a', '34': 'b', '35': 'c', '36': 'd', '37': 'e', '38': 'f', '39': 'g', '40': 'h', '41': 'i', '42': 'j', '43': 'k', '44': 'l', '45': 'm', '46': 'n', '47': 'o', '48': 'p', '49': 'q', '50': 'r', '51': 's', '52': 't', '53': 'u', '54': 'v', '55': 'w', '56': 'x', '57': 'y', '58': 'z', '59': '{', '60': '}'}
total chars: 61
nb sequences: 23665
Vectorization...


In [5]:

# build the model: two stacked LSTM layers feeding a softmax over the vocabulary
print('Build model...')
model = Sequential()
# The first LSTM returns its output at every timestep so the second LSTM
# receives a full sequence of length `maxlen`.
model.add(LSTM(1024, return_sequences=True, input_shape=(maxlen, len(chars))))
# The second LSTM returns only its last output, one vector per input window.
model.add(LSTM(512, return_sequences=False))
# Dense + softmax turn that vector into a distribution over the next character.
model.add(Dense(len(chars)))
model.add(Activation('softmax'))
# `learning_rate` is the current keyword; the old `lr` alias is deprecated and
# rejected by recent Keras releases.
optimizer = Adam(learning_rate=0.002)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

Build model...


In [6]:
# `summary()` prints the table itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, 256, 1024)         4448256   
_________________________________________________________________
lstm_1 (LSTM)                (None, 512)               3147776   
_________________________________________________________________
dense (Dense)                (None, 61)                31293     
_________________________________________________________________
activation (Activation)      (None, 61)                0         
Total params: 7,627,325
Trainable params: 7,627,325
Non-trainable params: 0
_________________________________________________________________
None


In [7]:
# Initialise the network from the `transfer_weights` file fetched by the
# download cell, so the training loop that follows continues from those
# weights instead of from a random initialisation.
# NOTE(review): presumably the checkpoint was saved from a model with this
# exact layer stack -- confirm before changing the architecture.
model.load_weights("transfer_weights")
print('model loaded...')
def sample(preds, temperature=.6):
    """Draw one index from a probability array after temperature re-scaling.

    The probabilities are re-weighted as ``p ** (1 / temperature)`` and
    renormalised. Values below 1 sharpen the distribution towards its most
    likely entry; values above 1 flatten it.

    Args:
        preds: 1-D array-like of probabilities (e.g. a softmax output).
        temperature: Re-scaling factor; must be non-zero.

    Returns:
        The sampled index as a NumPy integer.
    """
    # float64 keeps the renormalised probabilities summing close enough to 1
    # for `np.random.multinomial`'s validation.
    preds = np.asarray(preds).astype('float64')
    # Zero probabilities legitimately map to -inf logits; silence only that
    # specific divide-by-zero warning.
    with np.errstate(divide='ignore'):
        logits = np.log(preds) / temperature
    # Shift so the largest logit is 0 before exponentiating. Mathematically a
    # no-op for the softmax, but it stops every term underflowing to 0 (and the
    # normalisation becoming 0/0 = NaN) at small temperatures.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

# Fine-tune the model for four single-epoch passes over (X, y). After each
# pass, print 400 generated characters per diversity (sampling temperature),
# all continuing the same randomly chosen seed window from the corpus.
for iteration in range(1, 5):
    print()
    print('-' * 50)
    print('Iteration', iteration)

    model.fit(X, y, batch_size=128, epochs=1)

    # One seed per iteration, shared by every diversity so the outputs are
    # directly comparable.
    start_index = random.randint(0, len(text) - maxlen - 1)

    for diversity in [0.2, 0.4, 0.6, 0.8]:
        print()
        print('----- diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)
        for i in range(400):
            # One-hot encode the current window as a batch of one.
            x = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x[0, t, char_indices[char]] = 1.

            preds = model.predict(x, verbose=0)[0]
            next_index = sample(preds, diversity)
            # `indices_char` was loaded from JSON, so its keys are strings.
            next_char = indices_char[str(next_index)]

            generated += next_char
            # Drop the oldest character and append the new one so the window
            # stays exactly `maxlen` characters long.
            sentence = sentence[1:] + next_char

            # Stream each character as it is produced.
            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

model loaded...

--------------------------------------------------
Iteration 1

----- diversity: 0.2
----- Generating with seed: "crap.  boo. also ordered fries, and a chicago dog.. the chicago dog was overpriced and forgettable, and the crinkle cut fries were reminiscent of del taco's.. which isn't exactly a bad thing, but considering that lobby's is supposed to be a tasty, quality "
crap.  boo. also ordered fries, and a chicago dog.. the chicago dog was overpriced and forgettable, and the crinkle cut fries were reminiscent of del taco's.. which isn't exactly a bad thing, but considering that lobby's is supposed to be a tasty, quality of the conner for the conners are so the bottor was we were the other dinner a breakfast was so i did the self was all do the other was sorn the most was a little short and the conner was all a sprimply chili breakfast was pretty drink that we were the other w there were the most of the self was the other were served with the food and the completely for

  


sant was pre abs, as wowning incoused to me back wotht got tod a little lack.}{i was all as we had wed.  atchore.  i got ma freshlearly fived...

--------------------------------------------------
Iteration 3

----- diversity: 0.2
----- Generating with seed: "e of the best fresh cheeses i have ever had, smooth and soft yet very flavourful. the tortilla is not what you might be used to either, they are corn tortillas but they are not the course chip like corn tortillas so prevalent in americanized hispanic foods"
e of the best fresh cheeses i have ever had, smooth and soft yet very flavourful. the tortilla is not what you might be used to either, they are corn tortillas but they are not the course chip like corn tortillas so prevalent in americanized hispanic foods for the tomito food. the buth the burger and the salsa and the chef was so you can the best say i will be beck food.}{i was a little like the food as the best fast surerish, but the worth the burger as i keep to get a salad. 