In [2]:
from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM, Embedding
from keras.optimizers import RMSprop, Adam
from keras.utils.data_utils import get_file
import numpy as np
import random
import sys
import pickle
import string
from collections import Counter

Using TensorFlow backend.


In [3]:
def pretty_join(word_list):
    """Join tokens into a readable string.

    Tokens are separated by a single space, except that no space is put in
    front of a token that is found in ``string.punctuation`` (so
    ``['a', ',', 'b']`` becomes ``'a, b'``).

    Args:
        word_list: sequence of string tokens; may be empty.

    Returns:
        The joined string, or ``''`` for an empty sequence.
    """
    # Guard: the original indexed word_list[-1] unconditionally and raised
    # IndexError on empty input.
    if len(word_list) == 0:
        return ''
    pieces = [word_list[0]]
    for word in word_list[1:]:
        # Same membership test as before (substring of string.punctuation).
        if word not in string.punctuation:
            pieces.append(' ')
        pieces.append(word)
    # Single join instead of repeated string concatenation.
    return ''.join(pieces)

# ---------------------------------------------------------------------------
# Load the corpus, pick the most frequent words that have a pretrained
# vector, and build the embedding table plus word <-> index lookups.
# ---------------------------------------------------------------------------
path = 'hindi_corpus.txt'
# Decode explicitly instead of relying on the platform's default encoding.
# NOTE(review): assumes the corpus file is UTF-8 -- confirm.
with open(path, encoding='utf-8') as f:
    # Surround full stops with spaces so '.' becomes a token of its own.
    all_words = f.read().replace('.',' . ').split()


# Only the first 90% of the tokens are used to build the vocabulary.
train_len = int(0.9*len(all_words))
print(train_len)
list_words = all_words[:train_len]

words = sorted(set(list_words))

# SECURITY: pickle.load can execute arbitrary code; only load files you trust.
# The files are opened in `with` blocks so the handles are closed promptly.
with open('fastTextHindi_word2id.pkl','rb') as f:
    all_word_indices = pickle.load(f)
with open('fastTextHindi_id2word.pkl','rb') as f:
    all_indices_word = pickle.load(f)

# Words of the training portion for which a pretrained vector exists.
words_with_vecs = set(words).intersection(all_word_indices.keys())

# The 9999 most frequent such words form the vocabulary.
top_words, _ = zip(*Counter(word for word in list_words if word in words_with_vecs).most_common(9999))

with open('fastTextHindi.vec','rb') as f:
    all_vectors = pickle.load(f)

# NOTE(review): the -1 suggests the stored ids are 1-based -- confirm.
vectors = all_vectors[np.array(list(map(all_word_indices.get, top_words)))-1]

# del all_vectors

# Prepend an all-zero row: row 0 is the vector used for unknown words.
vectors = np.concatenate([np.zeros(vectors[0:1].shape), vectors],axis=0)

from collections import defaultdict

# Vocabulary word i (0-based rank in top_words) lives at index i+1;
# any other word falls back to index 0 via the defaultdict.
word_indices = defaultdict(int)
word_indices.update(dict([(word, i+1) for i, word in enumerate(top_words)]))
indices_word = dict(zip(word_indices.values(), word_indices.keys()))
indices_word[0] = '</unk>'

print("word_indices", type(word_indices), "length:",len(word_indices) )
print("indices_words", type(indices_word), "length", len(indices_word))

print('Num vectors', len(vectors))

embed_dim = vectors.shape[1]
# cut the text in semi-redundant sequences of maxlen words
maxlen = 30
step = 3

def get_word_vec(word):
    """Return the embedding row for ``word``.

    Words that are not in ``word_indices`` map to index 0, the row reserved
    for unknown words.

    Args:
        word: token string.

    Returns:
        The row of ``vectors`` at the word's index.
    """
    # Use .get() rather than []: word_indices is a defaultdict, and a plain
    # subscript would silently insert every unknown word into the mapping,
    # growing the vocabulary dict as a side effect of a lookup.
    return vectors[word_indices.get(word, 0)]


2718137
word_indices <class 'collections.defaultdict'> length: 9999
indices_words <class 'dict'> length 10000
Num vectors 10000


In [4]:
# Persist the vocabulary lookups together with the rows of the full vector
# table that were selected for the vocabulary.
vec_pickle = {'word_indices':word_indices, 'indices_word':indices_word, 
              'vector_rows':np.array(list(map(all_word_indices.get, top_words)))-1}
# Write through a context manager so the file is flushed and closed; the
# original passed a bare open() handle that was never closed.
with open('vec_pickle.pkl','wb') as f:
    pickle.dump(vec_pickle, f)

In [5]:
# Number of entries in word_indices (index 0 for unknown words has no entry here).
len(word_indices)

9999

In [6]:
# Sanity check: every vocabulary word must map to exactly the row it was
# taken from in the full vector table, both by direct indexing and through
# get_word_vec().
for word in top_words:
    expected = all_vectors[all_word_indices[word]-1]
    # Element-wise equality; comparing the *sum* of the differences to zero
    # could pass by accident when positive and negative errors cancel.
    assert np.array_equal(vectors[word_indices[word]], expected)
    assert np.array_equal(get_word_vec(word), expected)
print('Seems ok')

Seems ok


In [7]:
import os

In [8]:
# Slide a window of `maxlen` words (stride `step`) over each corpus file and
# record the window text together with the word that follows it.
sentences = []
next_words = []
corpus_dir = 'hin_corp_unicode'
# os.listdir returns names in arbitrary order; sort so the same files (and
# therefore the same windows) are selected on every run.
files = sorted(os.listdir(corpus_dir))
for file_ in files:
    # NOTE(review): assumes the corpus files are UTF-8 -- confirm.
    with open(os.path.join(corpus_dir, file_), encoding='utf-8') as f:
        # The first two lines of every file are skipped
        # (presumably a header -- confirm).
        corpus_text = f.readlines()[2:]
    # Same tokenisation as the main corpus: '.' becomes its own token.
    corpus_words = ' '.join(corpus_text).replace('.',' . ').split()
    for i in range(0, len(corpus_words) - maxlen, step):
        sentence = ' '.join(corpus_words[i : i + maxlen])
        sentences.append(sentence)
        next_words.append(corpus_words[i + maxlen])
    # Stop reading further files once enough windows have been collected.
    if len(sentences) > 60000:
        break
print('nb sequences(length of sentences):', len(sentences))
print('length of next word', len(next_words))


nb sequences(length of sentences): 60040
length of next word 60040


In [9]:
import tensorflow as tf

In [10]:
def perplexity(y_true, y_pred):
    """Per-example perplexity, i.e. exp(cross-entropy).

    Args:
        y_true: one-hot (or probability) target tensor.
        y_pred: tensor passed to TensorFlow as *logits*.
            NOTE(review): the model in this notebook ends in a softmax
            layer, so its outputs are probabilities, not logits; feeding
            them here applies softmax twice. Pass pre-softmax scores.

    Returns:
        Tensor with one perplexity value per example.
    """
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits = y_pred, labels = y_true)
    # The cross-entropy is computed with the natural logarithm, so the
    # matching base is e; the original `2 ** loss` mixed nats with base 2.
    return tf.exp(cross_entropy)

In [11]:
from keras.callbacks import History

In [12]:
# Build the network: two stacked 512-unit LSTM layers, each followed by
# dropout, feeding a softmax over the vocabulary plus the unknown-word slot.
print('Build model...')
model = Sequential([
    LSTM(512, return_sequences=True, input_shape=(maxlen, embed_dim)),
    Dropout(0.2),
    LSTM(512, return_sequences=False),
    Dropout(0.2),
    Dense(len(top_words)+1),
    Activation('softmax'),
])
# Callback object that records the per-epoch losses during fitting.
history = History()
optimizer = RMSprop(0.00001)
model.compile(optimizer=optimizer,
              loss='categorical_crossentropy')

class Examples(object):
    """Endless mini-batch iterator over (sentence, next word) examples.

    Each call to ``next()`` yields a tuple ``(X,)`` or ``(X, y)``:

    * ``X`` has shape ``(batch_size, maxlen, embed_dim)`` and holds the word
      vector of every word of each sentence.
    * ``y`` has shape ``(batch_size, len(top_words)+1)`` and is the one-hot
      index of the next word (column 0 for unknown words).

    The data is reshuffled each time the cursor wraps around to 0; only whole
    batches are served, so up to ``batch_size - 1`` examples at the end of the
    (shuffled) data are skipped in each pass.

    Args:
        batch_size: number of examples per batch.
        sentences: sequence of whitespace-separated sentence strings.
        next_words: sequence of target words aligned with ``sentences``;
            required when ``labels`` is true.
        labels: whether to produce the one-hot target array.

    Raises:
        ValueError: if labels are requested without ``next_words``, or if
            there are fewer examples than ``batch_size``.
    """

    def __init__(self, batch_size, sentences, next_words=None, labels=True):
        if labels and next_words is None:
            # Previously this silently used characters of the sentence as
            # the "next word".
            raise ValueError('labels=True requires next_words')
        self.cursor = 0
        self.sentences = sentences
        self.next_words = next_words
        # Remember whether data items are (sentence, next_word) pairs or
        # bare sentence strings, so __next__ can unpack them correctly.
        self._paired = next_words is not None
        self.data = list(zip(sentences, next_words)) if self._paired else sentences
        self.batch_size = batch_size
        self.X = np.zeros((batch_size, maxlen, embed_dim), dtype=np.float32)
        self.labels = labels
        if labels:
            self.y = np.zeros((batch_size, len(top_words)+1), dtype=np.int32)
        self.examples_per_epoch = (len(self.data)//self.batch_size)*self.batch_size
        if self.examples_per_epoch == 0:
            # Fail early with a clear message instead of a ZeroDivisionError
            # from the cursor arithmetic on the first batch.
            raise ValueError('need at least batch_size (%d) examples, got %d'
                             % (self.batch_size, len(self.data)))

    def _shuffle(self):
        """Reorder ``self.data`` with a fresh random permutation."""
        shuffle = np.random.permutation(len(self.data))
        self.data = [self.data[i] for i in shuffle]

    def __iter__(self):
        """Make the object usable wherever an iterator is expected."""
        return self

    def __next__(self):
        """Build and return the next batch as ``(X,)`` or ``(X, y)``."""
        # Fresh arrays for every batch, so a batch that was already handed
        # out is never overwritten while the next one is being filled.
        self.X = np.zeros_like(self.X)
        if self.labels:
            # Only touch y when it exists (labels=False used to crash here).
            self.y = np.zeros_like(self.y)
        for i in range(self.batch_size):
            if self.cursor == 0:
                self._shuffle()
            datum = self.data[self.cursor]
            sentence = datum[0] if self._paired else datum

            for t, word in enumerate(sentence.split()):
                self.X[i, t] = get_word_vec(word)

            if self.labels:
                # .get() avoids inserting unknown words into the defaultdict.
                self.y[i, word_indices.get(datum[1], 0)] = 1

            self.cursor = (self.cursor + 1)%(self.examples_per_epoch)

        return  (self.X,) +  ((self.y,) if self.labels else ())

    # Python 2 spelling of the iterator protocol.
    next = __next__



def sample(preds, temperature=1.0):
    """Sample an index from a probability array, reshaped by a temperature.

    The probabilities are raised to the power ``1/temperature`` and
    renormalised; one index is then drawn from the resulting distribution.

    Args:
        preds: array-like of probabilities.
        temperature: values below 1 sharpen the distribution, values above 1
            flatten it. ``0`` selects the most probable index directly.

    Returns:
        The sampled index (as returned by ``np.argmax``).
    """
    preds = np.asarray(preds).astype('float64')
    if temperature == 0:
        # Limit of the tempered distribution; avoids dividing by zero.
        return np.argmax(preds)
    # log(0) = -inf is the intended result for zero-probability entries;
    # silence the accompanying RuntimeWarning.
    with np.errstate(divide='ignore'):
        scaled = np.log(preds) / temperature
    # Shift so the largest exponent is 0. Without this, a small temperature
    # makes every exp() underflow to 0 and the normalisation becomes 0/0.
    scaled = scaled - np.max(scaled)
    exp_preds = np.exp(scaled)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)



Build model...


In [13]:
# Training split: the first 50000 (sentence, next word) pairs.
sents = sentences[:50000]
nxt_wds = next_words[:50000]

In [14]:
batch_size = 128  # examples per batch produced by the generators
train_gen = Examples(batch_size, sents, nxt_wds)
num_iters = 20  # exclusive upper bound of the iteration index in the training loop
gen_length = 250  # number of words generated per diversity after each iteration

In [None]:
# Validation split: the last 5000 pairs (disjoint from the first 50000 used
# for training as long as at least 55000 pairs were collected).
val_gen = Examples(batch_size, sentences[-5000:] , next_words[-5000:])

In [None]:
# Train the model one epoch per iteration, checkpoint the weights, and print
# text generated from a random seed window at several sampling diversities.
# `start` is the iteration to resume from: the checkpoint written by the
# previous iteration is loaded when its file exists.
start = 14
for iteration in range(start, num_iters):
    print()
    print('-' * 50)
    print('Iteration', iteration)

    try:
        model.load_weights('weights_hindi_epoch_%i.pkl'%(iteration-1))
        print('Loading weights')
    except OSError:
        print('Starting from scratch')

    model.fit_generator(train_gen, 
              validation_data = val_gen,
              validation_steps = 5000//batch_size,
              steps_per_epoch = len(sents)//batch_size,
              callbacks = [history],
              verbose = 1,
              epochs=1)
    model.save_weights('weights_hindi_epoch_%i.pkl'%iteration)       
    # The reported losses are exponentiated to obtain perplexities.
    print('Train Perplexity', np.exp(history.history['loss'][0]))
    print('Val Perplexity', np.exp(history.history['val_loss'][0]))
    
    # The same seed window is reused for every diversity value below.
    start_index = random.randint(0, len(list_words) - maxlen - 1)

    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print()
        print('----- diversity:', diversity)
        generated = ''
        # Slicing makes a new list, so the in-place edits below never
        # touch list_words.
        sentence = list_words[start_index: start_index + maxlen]
        generated += ' '.join(sentence)
        print('----- Generating with seed: "' , sentence , '"')
        sys.stdout.write(generated)
        print()

        for i in range(gen_length):
            x = np.zeros((1, maxlen,embed_dim))
            for t, word in enumerate(sentence):
                x[0, t] = get_word_vec(word)

            preds = model.predict(x, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_word = indices_word[next_index]
            # Keep words separated; the original appended the bare word and
            # glued it onto the previous one.
            generated += ' ' + next_word

            # Slide the context window forward by one word.
            sentence.append(next_word)
            del sentence[0]
            sys.stdout.write(' ')
            sys.stdout.write(next_word)
            sys.stdout.flush()
        print()


--------------------------------------------------
Iteration 14
Loading weights
Epoch 1/1
Train Perplexity 189.211266944
Val Perplexity 265.029819843

----- diversity: 0.2
----- Generating with seed: " ['अपने', 'परिवार', 'के', 'लिए', 'भोजन', 'पकाने', 'में', 'क्या', 'सामग्री', 'प्रयोग', 'की', 'थी', 'और', 'उसके', 'अलावा', 'बाहर', 'से', 'किसने', 'क्या', 'खाया', 'था', '.', 'यदि', 'पर्याप्त', 'संख्या', 'में', 'हर', 'आय-वर्ग', 'के', 'लोगों'] "
अपने परिवार के लिए भोजन पकाने में क्या सामग्री प्रयोग की थी और उसके अलावा बाहर से किसने क्या खाया था . यदि पर्याप्त संख्या में हर आय-वर्ग के लोगों
 में </unk> </unk> की </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </unk> </u

In [None]:
# One input window filled with the unknown-word vector, shaped
# (1, maxlen, embed_dim). Uses `maxlen` instead of a literal 30 so it keeps
# matching the model's input length if that setting changes.
blank = np.tile(get_word_vec('</unk>'),[1,maxlen,1])

In [None]:
# Display the array built in the previous cell.
blank

In [None]:
# NOTE(review): this cell originally read `from keras import to`, which looks
# like an unfinished import and fails when the cell is run. Nothing else in
# the notebook uses a name `to`, so the statement is disabled; restore the
# intended import here if one is needed.

In [None]:
def get_perplexities(text):
    """Score a text with the model and return running perplexities.

    The first ``maxlen`` tokens seed the context window. Each following token
    is predicted from the window, then pushed into it (dropping the oldest).

    Args:
        text: whitespace-separated text; needs more than ``maxlen`` tokens to
            produce any score.

    Returns:
        A tuple ``(perplexities, probs)``:
        ``probs[k]`` is the probability the model assigned to the k-th scored
        token (index 0 for unknown tokens); ``perplexities[k]`` is
        ``exp(mean(-log probs[:k+1]))``. Both are empty when the text has at
        most ``maxlen`` tokens.
    """
    tokens = text.split()
    # Window length follows the model's input length rather than a literal 30.
    window = [get_word_vec(token) for token in tokens[:maxlen]]
    probs = []
    for token in tokens[maxlen:]:
        x = np.expand_dims(window, axis=0)
        # One forward pass per token: the loss is derived from the predicted
        # probability below instead of a second model.evaluate() call.
        preds = model.predict(x,verbose=False)
        # .get() avoids inserting unknown tokens into the defaultdict.
        probs.append(preds[0, word_indices.get(token, 0)])
        window.append(get_word_vec(token))
        del window[0]
    # Cross-entropy of a one-hot target is -log(p_target). The 1e-7 floor
    # guards against log(0); it mirrors the clipping Keras applies in its
    # categorical cross-entropy (default epsilon 1e-7).
    token_losses = -np.log(np.maximum(np.asarray(probs, dtype=np.float64), 1e-7))
    perplexities = np.exp(np.cumsum(token_losses)/np.arange(1,len(probs)+1))
    return perplexities, probs

In [None]:
# Evaluation text: the first 1830 tokens of one corpus file.
# NOTE: `file_` is whatever file the window-building loop processed last, so
# that cell must have been run first.
# NOTE(review): assumes the file is UTF-8 -- confirm. Unlike the training
# loop, the first two lines of the file are not skipped here.
with open('hin_corp_unicode/'+file_, encoding='utf-8') as f:
    text = f.read()
# Tokenise the same way as the training data ('.' as its own token);
# otherwise every sentence-final word carries a trailing '.' and is scored
# as an unknown word.
text = ' '.join(text.replace('.',' . ').split()[:1830])

In [None]:
# p: running perplexities, q: probability assigned to each scored token.
p, q = get_perplexities(text)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Running perplexity (y) against the position of the scored token (x).
plt.figure(figsize=(12,12))
plt.plot(p,'o')

In [None]:
# Last running value, i.e. the perplexity over all scored tokens.
p[-1]

In [None]:
# Probability the model assigned to each scored token, by token position.
plt.figure(figsize=(12,12))
plt.plot(q,'o')