In [14]:
#perplexity
import nltk
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.lm import MLE
from nltk.lm import Vocabulary

def tokenize_lower(sentences):
    """Tokenize each sentence with NLTK's word tokenizer and lower-case every token.

    Returns one list of tokens per input sentence.
    """
    return [[token.lower() for token in nltk.tokenize.word_tokenize(sent)] for sent in sentences]


def padded_ngrams(tokens, order):
    """Return the `order`-grams of `tokens` as a list, padded on both sides.

    "<s>" is used as the left pad symbol and "</s>" as the right pad symbol.
    A list (not a generator) is returned so the n-grams can be iterated more than once.
    """
    return list(nltk.ngrams(tokens, order, pad_left=True, pad_right=True,
                            left_pad_symbol="<s>", right_pad_symbol="</s>"))


train_sentences = ['an apple', 'an orange', 'the green apple is sweet']
tokenized_text = tokenize_lower(train_sentences)

# Order of the language model; the same value drives n-gram extraction and the MLE model,
# so the two cannot drift apart when it is changed.
n = 2
train_data = [padded_ngrams(t, n) for t in tokenized_text]
# Vocabulary: every training token plus the two padding symbols.
words = [word for sent in tokenized_text for word in sent]
words.extend(["<s>", "</s>"])
padded_vocab = Vocabulary(words)
model = MLE(n)
model.fit(train_data, padded_vocab)

test_sentences = ['an apple', 'an ant']
tokenized_text = tokenize_lower(test_sentences)

# Built once: the n-grams are lists, so they can be scored and then reused for perplexity.
test_data = [padded_ngrams(t, n) for t in tokenized_text]
for test in test_data:
    # For each n-gram show (word, context) together with the model's score for it.
    print("MLE Estimates:", [((ngram[-1], ngram[:-1]), model.score(ngram[-1], ngram[:-1])) for ngram in test])

for test_sentence, test in zip(test_sentences, test_data):
    print("PP({0}):{1}".format(test_sentence, model.perplexity(test)))

MLE Estimates: [(('an', ('<s>',)), 0.6666666666666666), (('apple', ('an',)), 0.5), (('</s>', ('apple',)), 0.5)]
MLE Estimates: [(('an', ('<s>',)), 0.6666666666666666), (('ant', ('an',)), 0.0), (('</s>', ('ant',)), 0)]
PP(an apple):1.8171205928321397
PP(an ant):inf


In [None]:
# Clone the IBM deep-learning-language-model repository into the working directory.
# Later cells read the corpus, the character/index maps and a weights file from it.
!git clone https://github.com/IBM/deep-learning-language-model

Cloning into 'deep-learning-language-model'...
remote: Enumerating objects: 395, done.[K
remote: Counting objects: 100% (2/2), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 395 (delta 0), reused 0 (delta 0), pack-reused 393[K
Receiving objects: 100% (395/395), 37.74 MiB | 24.97 MiB/s, done.
Resolving deltas: 100% (185/185), done.


In [None]:
from __future__ import print_function
import json
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.optimizers import Adam
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import requests
import pandas as pd

In [None]:
path = 'deep-learning-language-model/yelp_100_3.txt'
# Context managers close each file handle as soon as its content has been read.
with open(path) as corpus_file:
    text = corpus_file.read().lower()
print('corpus length:', len(text))

# The char <-> index maps are loaded from the repository instead of being rebuilt from
# `text` (presumably so the indices line up with the weights loaded later; TODO confirm).
with open('deep-learning-language-model/char_indices.txt') as char_indices_file:
    char_indices = json.load(char_indices_file)
with open('deep-learning-language-model/indices_char.txt') as indices_char_file:
    indices_char = json.load(indices_char_file)
chars = sorted(char_indices.keys())
print(indices_char)
print('total chars:', len(chars))

# cut the text in semi-redundant sequences of maxlen characters:
# each window of `maxlen` characters is paired with the character that follows it.
maxlen = 256
step = 3
sentences = []
next_chars = []
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('nb sequences:', len(sentences))

print('Vectorization...')
# One-hot encode inputs (sequence, position, character) and targets (sequence, character).
# The builtin `bool` replaces the deprecated `np.bool` alias.
X = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        X[i, t, char_indices[char]] = 1
    y[i, char_indices[next_chars[i]]] = 1


# build the model: two stacked LSTM layers followed by a softmax over the character set
print('Build model...')
model = Sequential()
# The first LSTM returns its full output sequence so the second LSTM receives one input per timestep.
model.add(LSTM(1024, return_sequences=True, input_shape=(maxlen, len(chars))))
model.add(LSTM(512, return_sequences=False))
model.add(Dense(len(chars)))
model.add(Activation('softmax'))

# `learning_rate` replaces the deprecated `lr` keyword.
optimizer = Adam(learning_rate=0.002)
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

# Start from the weights shipped with the cloned repository rather than a random initialisation.
model.load_weights("deep-learning-language-model/transfer_weights")

def sample(preds, temperature=.6):
    """Draw a single index from the distribution `preds`, re-weighted by `temperature`.

    The probabilities are moved to log space, divided by `temperature`,
    exponentiated and renormalised; one multinomial draw is then taken from
    the result and the position of the drawn outcome is returned.
    """
    # float64 keeps the renormalised probabilities precise enough for the multinomial draw
    scaled_logits = np.log(np.asarray(preds).astype('float64')) / temperature
    weights = np.exp(scaled_logits)
    distribution = weights / np.sum(weights)
    draw = np.random.multinomial(1, distribution, 1)
    return np.argmax(draw)

# train the model, output generated text after each iteration
for iteration in range(1, 60):
    print()
    print('-' * 50)
    print('Iteration', iteration)

    model.fit(X, y, batch_size=128, epochs=1)
    # Checkpoint after every pass so an interrupted run keeps the progress made so far;
    # the file left after the last iteration holds the final weights.
    model.save_weights("transfer_weights")

    # One random corpus window is used as the seed for every diversity value below.
    start_index = random.randint(0, len(text) - maxlen - 1)

    for diversity in [0.2, 0.4, 0.6, 0.8]:
        print()
        print('----- diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)
        for i in range(400):
            # One-hot encode the current window of `maxlen` characters.
            x = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x[0, t, char_indices[char]] = 1.

            preds = model.predict(x, verbose=0)[0]
            next_index = sample(preds, diversity)
            # indices_char was loaded from JSON, so its keys are strings.
            next_char = indices_char[str(next_index)]

            generated += next_char
            # Slide the window: drop the oldest character, append the new one.
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()

corpus length: 71250
{'0': '\n', '1': ' ', '2': '!', '3': '"', '4': '#', '5': '$', '6': '%', '7': '&', '8': "'", '9': '(', '10': ')', '11': '*', '12': '+', '13': ',', '14': '-', '15': '.', '16': '/', '17': '0', '18': '1', '19': '2', '20': '3', '21': '4', '22': '5', '23': '6', '24': '7', '25': '8', '26': '9', '27': ':', '28': ';', '29': '=', '30': '?', '31': '[', '32': ']', '33': 'a', '34': 'b', '35': 'c', '36': 'd', '37': 'e', '38': 'f', '39': 'g', '40': 'h', '41': 'i', '42': 'j', '43': 'k', '44': 'l', '45': 'm', '46': 'n', '47': 'o', '48': 'p', '49': 'q', '50': 'r', '51': 's', '52': 't', '53': 'u', '54': 'v', '55': 'w', '56': 'x', '57': 'y', '58': 'z', '59': '{', '60': '}'}
total chars: 61
nb sequences: 23665
Vectorization...
Build model...


  "The `lr` argument is deprecated, use `learning_rate` instead.")



--------------------------------------------------
Iteration 1

----- diversity: 0.2
----- Generating with seed: "e this place, or at least like it.  we walked out of there terribly disappointed, i doubt we'll go back.}{this is a nothing but pies place...so i was expecting a little bit more oomph...

dispointing small selection of pie by the slices...also you gotta pu"
e this place, or at least like it.  we walked out of there terribly disappointed, i doubt we'll go back.}{this is a nothing but pies place...so i was expecting a little bit more oomph...

dispointing small selection of pie by the slices...also you gotta pup and the service is no the compery to spe is a little shopp of the salad, and i have to compery to seated a need a bit of meal for me some cheeses of the compery chicken was a bit of perfect for me hearthele for delicious are or meal with out the cook torato salad.  i would be back and salad and the service was so it was awayone.  i have been at the porking and the pa



r was potato for a biers and difen anout the best it wasn't and shoppen and the corner of that is a girling that that is that the staff with cashes and the borther and she then the best it wasn't that the best pista was got the best past on the menu and she is a great place

----- diversity: 0.4
----- Generating with seed: "m has was way better.  if you go here.... order a burger.  they are big, juicy and super duper good.  it's also super duper messy.  it's a "fork and knife" type of burger. 

their bloody mary is pretty good... a little too tomato-y.... needed more seasonin"
m has was way better.  if you go here.... order a burger.  they are big, juicy and super duper good.  it's also super duper messy.  it's a "fork and knife" type of burger. 

their bloody mary is pretty good... a little too tomato-y.... needed more seasoning that when i saided tort out the change that is a great staff.

the salad and was so my friends and the most of the copped food coffee of pretis souss of peryo

In [3]:
!pip install nltk==3.5

Collecting nltk==3.5
[?25l  Downloading https://files.pythonhosted.org/packages/92/75/ce35194d8e3022203cca0d2f896dbb88689f9b3fce8e9f9cff942913519d/nltk-3.5.zip (1.4MB)
[K     |▎                               | 10kB 20.4MB/s eta 0:00:01[K     |▌                               | 20kB 28.7MB/s eta 0:00:01[K     |▊                               | 30kB 31.1MB/s eta 0:00:01[K     |█                               | 40kB 24.7MB/s eta 0:00:01[K     |█▏                              | 51kB 15.4MB/s eta 0:00:01[K     |█▍                              | 61kB 12.5MB/s eta 0:00:01[K     |█▋                              | 71kB 13.8MB/s eta 0:00:01[K     |█▉                              | 81kB 15.2MB/s eta 0:00:01[K     |██                              | 92kB 15.9MB/s eta 0:00:01[K     |██▎                             | 102kB 16.1MB/s eta 0:00:01[K     |██▌                             | 112kB 16.1MB/s eta 0:00:01[K     |██▊                             | 122kB 16.1MB/s eta 0:00:01

In [1]:
import nltk

# Download the complete 'all' collection of NLTK data packages.
# NOTE(review): the visible cells only call nltk.tokenize.word_tokenize, which presumably
# needs just the 'punkt' package; confirm no other cell needs more before narrowing this.
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_cat.zip.
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_esp.zip.
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Unzipp

True