In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import collections

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['warandpeace40', 'glove-global-vectors-for-word-representation']


In [2]:
import collections

lower_case = True

if lower_case:
    train_file = '../input/warandpeace40/war-and-peace-trimmed-1-12.txt'
    test_file = '../input/warandpeace40/war-and-peace-trimmed-13-15.txt'
else:
    train_file = '../input/warandpeace40/war-and-peace-trimmed-1-12_original.txt'
    test_file = '../input/warandpeace40/war-and-peace-trimmed-13-15_original.txt'
    
with open (train_file) as f:
    counter = collections.Counter([token for line in f for token in line.split()])
    counts = counter.most_common()
    rare = set()
    train_words = set()
    for i in range(len(counts)-1, -1, -1):
        if counts[i][1] == 1:
            rare.add(counts[i][0])
        else:  
            train_words.add(counts[i][0])
            
train_words.add('<na>')
train_words.add('<sos>')
train_words.add('<eos>')
train_words.add('<unk>')

In [3]:
with open(train_file) as f:      
    train_data = np.array([ '<sos> ' + ' '.join([token if token not in rare else '<unk>' for token in line.split()]) + ' <eos>' for line in f])

In [4]:
train_data[0:5]

array(['<sos> book one : 1805 <eos>', '<sos> chapter i <eos>',
       '<sos> " well , prince , so genoa and <unk> are now just family estates of the <unk> . <eos>',
       "<sos> but i warn you , if you do n't tell me that this means war , if you still try to defend the <unk> and horrors <unk> by that antichrist <eos>",
       "<sos> - i really believe he is antichrist - i will have nothing more to do with you and you are no longer my friend , no longer my ' faithful slave , ' as you call yourself ! <eos>"],
      dtype='<U254')

In [5]:
with open(test_file) as f:
    test_data = np.array([ '<sos> ' + ' '.join([token if token in train_words else '<unk>' for token in line.split()]) + ' <eos>' for line in f])
test_words = {word for sentence in test_data for word in sentence.split()}

# Question 1: Report the number of unique tokens in each split of the data.

In [6]:
print('unique words in train: {}, test: {}'.format(len(train_words), len(test_words)))

unique words in train: 8473, test: 4645


# Question 2: Read the following paper:
`Zaremba, Wojciech, Ilya Sutskever, and Oriol Vinyals. ”Recurrent neural
network regularization.” arXiv preprint arXiv:1409.2329 (2014)`

Explain briefly (about 1-2 paragraphs at most) the motivation of the paper and its
contribution.

```
The Recurrent Neural Network sequence model achieves state of the art performance on important tasks that include language modeling Mikolov (2012), speech recognition Graves et al. (2013), and machine translation Kalchbrenner & Blunsom (2013). Most deep learning models require usage of regularization to genneralize well and to not overfit on data, and this problem effects RNN based models. Dropout, the most popular form of regularization in neural networks does not seem to work as well with RNNs because dropping sequential information can stack up over time stamps and greatly damage the hidden state when applied on recurrent connections.

This paper suggests using dropout in deep RNN blocks by only using dropout in the stacked connections and not using dropout on the recurrent connections. As a result, the flow of information is only effected L+1 times given L stacked layers. By not using dropout on the recurrent connections, the LSTM can benefit from dropout regularization without sacrificing its valuable memorization ability.
```

# Question 3: Implement an LSTM language model (LOWERCASE):

In [23]:
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout, Input, TimeDistributed, Activation
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.models import Sequential
import keras.utils as ku 
import keras.backend as K
import keras.losses as losses
from keras.models import load_model

In [8]:
vocabulary_size = len(train_words)


vocabulary = dict(zip(sorted(train_words), np.arange(vocabulary_size)+1))
vocabulary['<na>'] = 0
vocabulary_size += 1 # because we haven't encountered <na> yet
reverse_vocabulary = dict(zip(vocabulary.values(), vocabulary.keys()))

In [9]:
train = np.array([np.array([vocabulary[token] for token in sentence.split()]) for sentence in train_data])
test = np.array([np.array([vocabulary[token] for token in sentence.split()]) for sentence in test_data])

In [10]:
print(train[3])
print(test[3])

[  49 1063 3702 8169 8452   16 3717 8452 2223 4858 7476 4586 7518 7561
 4592 8159   16 3717 8452 7153 7780 7632 1915 7521   50  317 3620   50
 1075 7518  355   47]
[  49 7536 4012   16  317 1109  691   16 4958 1179 5076  311 3569 2595
 2631 7521 5107 1179 5076  249 1181   19   47]


In [11]:
def generate_padded_sequences(input_sequences, max_len=None):
    lengths = np.array([len(x) for x in input_sequences])
    if max_len:
        max_sequence_len = max_len
    else:
        max_sequence_len = np.max(lengths)
    
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='post'))
    empty_col = np.zeros(input_sequences.shape[0])[...,None]
    
    X = input_sequences.copy()
    X[np.arange(input_sequences.shape[0]), lengths-1] = 0
    
    Y = input_sequences.copy()[np.arange(input_sequences.shape[0]),1:]
    Y = np.append(Y, empty_col, 1)
    Y = Y.astype(int)
    
    return X, Y, lengths, max_sequence_len

X, Y, lengths, max_sequence_len = generate_padded_sequences(train)

In [12]:
for n in range(3):
    print('input: ', X[n, 0:lengths[n]-1])
    print('output: ', Y[n, 0:lengths[n]-1])
    print()

input:  [  49  911 5107   45   28]
output:  [ 911 5107   45   28   47]

input:  [  49 1232 3702]
output:  [1232 3702   47]

input:  [  49    2 8229   16 5673   16 6905 3179  317   50  426 5005 4082 2752
 2581 5076 7521   50   19]
output:  [   2 8229   16 5673   16 6905 3179  317   50  426 5005 4082 2752 2581
 5076 7521   50   19   47]



In [13]:
embedding_matrix = np.zeros((vocabulary_size, 100))
found = 0
with open('../input/glove-global-vectors-for-word-representation/glove.6B.100d.txt') as f:
    for line in f:
        values = line.split()
        word = values[0]
        if word in vocabulary:
            found += 1
            embedding_matrix[vocabulary[word]] = np.asarray(values[1:], dtype='float32')
print(f'found {found} word entries')

found 7987 word entries


In [14]:
vocabulary_size

8474

In [21]:
def sparse_loss(yTrue,yPred):
    return K.sparse_categorical_crossentropy(yTrue,yPred,from_logits=True)

Instructions for updating:
Use tf.cast instead.


In [None]:
def create_model(vocab_size, max_length, embedding_matrix, keep_prob=0.5):
    model = Sequential()
    # Add Input Embedding Layer
    model.add(Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_length))
    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(200, dropout=keep_prob, return_sequences=True))
    # Add Hidden Layer 2 - LSTM Layer
    model.add(LSTM(200, dropout=keep_prob, return_sequences=True))
    # model.add(LSTM(200, dropout= 0.5, return_sequences=True))
    
    # Add Output Layer
    model.add(TimeDistributed(Dense(vocabulary_size, activation='linear')))
    model.compile(loss=sparse_loss,  optimizer='adam')
    
    return model

`Have been using incremental epoch training`

In [53]:
# if 'model.h5' in os.listdir("../input/warandpeace40/"):
#    losses.sparse_loss = sparse_loss
#    model = load_model("../input/warandpeace40/model.h5")
#    model.summary()
model = create_model(vocabulary_size, max_sequence_len, embedding_matrix)
model.summary()
history = model.fit(X, Y, epochs=50)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 42, 100)           847400    
_________________________________________________________________
lstm_1 (LSTM)                (None, 42, 200)           240800    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 42, 8474)          1703274   
Total params: 2,791,474
Trainable params: 2,791,474
Non-trainable params: 0
_________________________________________________________________


ValueError: Error when checking target: expected time_distributed_1 to have 3 dimensions, but got array with shape (24112, 42)

In [27]:
pred = model.predict(X[0:100])
np.argmax(pred, axis=2).shape

(100, 42)

In [28]:
for col in np.argmax(pred, axis=2):
    print(' '.join([reverse_vocabulary[token] for token in col if token != 0 ]))

" the of 1812 <eos>
" iii <eos>
" i , what , " genoa , <unk> , <unk> , , , , the vistula of <eos>
" the am you , " i know n't know me that i is of is " i know , to get the army of the of of the antichrist <eos>
" i'll am , it is not - i am tell been to than him . you . have will not longer . fault . " longer , father . slave . " i you please him . "
" the is you think ? <eos>
" am you am been you , you down . have me to . same . <eos> <eos>
" was evident the , and , and the rost'ovs was a same - known grandee p'avlovna sch'erer , who , the . the of the rost'ovs . f"edorovna . <eos>
" the words he had her andrew and , who footman of the - , <unk> . and had not same time the at the . . <eos>
" p'avlovna was come come moment , her time , <eos>
" was not and if had to and , her grippe , grippe <unk> <unk> a <unk> <unk> of the p'avlovna . and to to the <unk> of <eos>
" the invitations she exception , and , the , and the by the smile , liveried footman , had , and out far : <eos>
" i ! <eos>

In [29]:
X_test, Y_test, lengths_test, _ = generate_padded_sequences(test)

In [30]:
pred = model.predict(X_test)

In [31]:
print(X[0])
print(np.argmax(pred[0], axis=1))
print(Y[0])

[  49  911 5107   45   28    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0]
[   2 7521   16   35   47    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0]
[ 911 5107   45   28   47    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0]


In [32]:
def softmax(s):
    shift_scores = s - np.max(s)
    return np.exp(shift_scores)/np.sum(np.exp(shift_scores))

In [33]:
def perplexity(pred, labels, sent_lengths):
    perp = []
    for idx, example in enumerate(pred):
        pp = 0
        for word_idx in range(sent_lengths[idx]-2):
            pp += np.log(softmax(pred[idx, word_idx, :])[labels[idx][word_idx]])
        perp.append(-pp/(sent_lengths[idx]-1))
    return perp

# Perplexity on test

In [34]:
print(np.exp(np.mean(perplexity(pred, Y_test, lengths_test))))

72.09152590741388


# Question 5: Example text:

In [55]:
for i in range(10):
    sent = np.zeros(X.shape[1])
    sent[0] = vocabulary['<sos>']
    pick = None
    idx = 1
    while (pick != vocabulary['<eos>']) and idx < X.shape[1]:
        model_pred = model.predict(np.array([sos]))
        model_pred = model_pred[0] # get the first example in "batch"
        model_pred = model_pred[0] # get the first RNN output
        pred = softmax(model_pred)
        pick = np.random.choice(vocabulary_size, p=pred) # choosing from this distribution for our first word
        sent[idx] = pick
        idx += 1

    print(' '.join([reverse_vocabulary[word] for word in sent]))
    print()

<sos> so le meeting " but chapter " in he to " i rost'ov " his " " " prince what father i " kur'agin " " only " ( what " it this i napoleon at d'olokhov soon " " "

<sos> balash"ev good there " others he " what napoleon p'etya everything " why everybody " " " pierre " " this i he the she prince how he " " the it after " " ah the " he " she

<sos> " eh this " nat'asha at there " he nicholas i prince " yet he " <unk> the " without he " it pfuel i he m'arya he it there while the suv'orov there <unk> the " " at next the

<sos> really and " " some a he " they the ... if and " " the yes the " he she this alp'atych i " evidently but but chapter she he " there he " thought oh " they nat'asha un

<sos> this i do " " at she she " " tim'okhin i but my occasionally the - " we he chapter and you " den'isov a go but it all " " having at his " " the an why mind

<sos> " " nat'asha some she let " she the the if a well he a on the but since a " " " " mimi bazd'eev " let the " and " nicholas " nat'asha 

# Question 6: Report the perplexity of both language models on the following sentences from The Tale of Two Cities:

* It was the best of times, it was the worst of times . 
* It was the age of wisdom, it was the age of foolishness.
* It was the epoch of belief, it was the epoch of incredulity. 
* It was the season of Light, it was the season of Darkness.
* It was the spring of hope, it was the winter of despair. 
* We had everything before us, we had nothing before us .


In [56]:
val_data = ['It was the best of times , it was the worst of times', 
            'It was the age of wisdom , it was the age of foolishness',
            'It was the epoch of belief , it was the epoch of incredulity', 
            'It was the season of Light it was the season of Darkness',
            'It was the spring of hope , it was the winter of despair', 
            'We had everything before us , we had nothing before us']
val_data = ['<sos> ' + sent + ' <eos>' for sent in val_data]
if lower_case:
    val_data = [sent.lower() for sent in val_data]

In [57]:
val = np.array([np.array([vocabulary[token] if token in vocabulary else vocabulary['<unk>'] for token in sentence.split()]) for sentence in val_data])
val

array([array([  49, 4018, 8175, 7521,  789, 5076, 7620,   16, 4018, 8175, 7521,
       8375, 5076, 7620,   47]),
       array([  49, 4018, 8175, 7521,  206, 5076, 8317,   16, 4018, 8175, 7521,
        206, 5076,   50,   47]),
       array([  49, 4018, 8175, 7521, 2549, 5076,  753,   16, 4018, 8175, 7521,
       2549, 5076,   50,   47]),
       array([  49, 4018, 8175, 7521,   50, 5076, 4317, 4018, 8175, 7521,   50,
       5076, 1849,   47]),
       array([  49, 4018, 8175, 7521, 7054, 5076, 3607,   16, 4018, 8175, 7521,
       8311, 5076, 2004,   47]),
       array([  49, 8198, 3391, 2603,  732, 7965,   16, 8198, 3391, 4990,  732,
       7965,   47])], dtype=object)

In [58]:
X_val, Y_val, lengths_val, _ = generate_padded_sequences(val, X.shape[1])

In [59]:
print(X_val[0])
print(Y_val[0])

[  49 4018 8175 7521  789 5076 7620   16 4018 8175 7521 8375 5076 7620
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0]
[4018 8175 7521  789 5076 7620   16 4018 8175 7521 8375 5076 7620   47
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0]


In [60]:
pred_val = model.predict(X_val)

## perplexities:

In [61]:
print(np.exp(perplexity(pred_val, Y_val, lengths_val)))

[39.69536812 61.53427454 75.07962459 35.0528755  59.86974412 65.49199422]


In [62]:
model.save('model.h5')