In [3]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [4]:
# Read the whole article about Typhoon Yagi into memory as a single UTF-8 string.
corpus_path = "/kaggle/input/yagityphoon/YagiTyphoon.txt"
with open(corpus_path, mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [5]:
# Echo the loaded corpus string as the cell output.
text

'from https://en.vietnamplus.vn/pm-demands-urgent-response-to-super-typhoon-yagi-post296016.vnp\n\nHanoi (VNA) – Prime Minister Pham Minh Chinh has signed an official dispatch demanding efforts to be hastened in response to the approaching Typhoon Yagi, which intensified into a super typhoon on September 5 morning.\n\nThe dispatch was sent to leaders of provinces and cities along the coast from the northern to the central regions, namely Quang Ninh, Hai Phong, Thai Binh, Nam Dinh, Ninh Binh, Thanh Hoa, Nghe An, Ha Tinh, Quang Binh, Quang Tri, Thua Thien - Hue, Da Nang, Quang Nam, Quang Ngai, and Binh Dinh, along with the inland northern localities of Lang Son, Cao Bang, Bac Kan, Thai Nguyen, Ha Giang, Lao Cai, Yen Bai, Tuyen Quang, Phu Tho, Vinh Phuc, Dien Bien, Lai Chau, Son La, Hoa Binh, Hanoi, Ha Nam, Hung Yen, Hai Duong, Bac Ninh, and Bac Giang.\n\nIt was also sent to the ministers of national defence, public security, natural resources and environment, agriculture and rural develo

In [6]:
# Fit a word-level vocabulary on the corpus, passed as a single document.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
# Word ids in word_index start at 1 and 0 is the value used for padding,
# so the model needs one more class than there are distinct words.
total_words = len(tokenizer.word_index) + 1

In [5]:
# Inspect the word -> integer id mapping learned by the tokenizer.
tokenizer.word_index

{'the': 1,
 'and': 2,
 'to': 3,
 'of': 4,
 'response': 5,
 'with': 6,
 'in': 7,
 'level': 8,
 'quang': 9,
 'for': 10,
 'from': 11,
 'pm': 12,
 'on': 13,
 'binh': 14,
 'ha': 15,
 'localities': 16,
 'areas': 17,
 'typhoon': 18,
 'be': 19,
 'september': 20,
 'leaders': 21,
 'along': 22,
 'it': 23,
 'also': 24,
 'at': 25,
 'up': 26,
 'as': 27,
 'measures': 28,
 'must': 29,
 'super': 30,
 'yagi': 31,
 'dispatch': 32,
 'into': 33,
 'morning': 34,
 'was': 35,
 'central': 36,
 'ninh': 37,
 'nam': 38,
 'bac': 39,
 'sea': 40,
 'storm': 41,
 'mainland': 42,
 'committees': 43,
 'safety': 44,
 'people': 45,
 'urgent': 46,
 'hanoi': 47,
 'an': 48,
 'a': 49,
 '5': 50,
 'sent': 51,
 'northern': 52,
 'regions': 53,
 'hai': 54,
 'thai': 55,
 'dinh': 56,
 'hoa': 57,
 'tinh': 58,
 'son': 59,
 'giang': 60,
 'yen': 61,
 'ministers': 62,
 'public': 63,
 'noted': 64,
 'fastest': 65,
 'wind': 66,
 'speed': 67,
 '16': 68,
 'over': 69,
 'is': 70,
 'directly': 71,
 'gulf': 72,
 'tonkin': 73,
 '6': 74,
 '13': 75,


In [7]:
# Build the training prefixes: for every line of the corpus, collect each
# leading slice of its token ids that holds at least two tokens
# (one or more context words followed by the word to predict).
n_gram_sequences = []
for line in text.split('\n'):
    tokens = tokenizer.texts_to_sequences([line])[0]
    n_gram_sequences.extend(tokens[:end] for end in range(2, len(tokens) + 1))

In [9]:
# Left-pad ('pre') every prefix with zeros up to the length of the longest one,
# so the word to predict always ends up in the last column. Pre-padding is the
# layout used here for RNN-based models (GRU, LSTM, ...).
max_sequence_len = max(map(len, n_gram_sequences))
pad_n_gram_sequences = np.array(
    pad_sequences(
        n_gram_sequences,
        maxlen=max_sequence_len,
        padding='pre',
    )
)

In [8]:
# Look at one padded row: zeros on the left, the token ids of the prefix on the right.
pad_n_gram_sequences[1]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,  11, 100, 101], dtype=int32)

In [10]:
# Every column but the last is the (padded) context; the last column is the word to predict.
X, y = pad_n_gram_sequences[:, :-1], pad_n_gram_sequences[:, -1]

In [19]:
# Report, one per line: input matrix shape, number of output classes, input sequence length.
print(X.shape, total_words, max_sequence_len - 1, sep='\n')

(633, 99)
342
99


In [11]:
# One-hot encode the target word ids for categorical cross-entropy.
# The labels are derived from the last column of the padded matrix rather than
# from `y` itself, so re-running this cell always yields the same
# (num_samples, total_words) array instead of re-encoding an already
# one-hot `y`.
y = np.array(
    tf.keras.utils.to_categorical(pad_n_gram_sequences[:, -1], num_classes=total_words)
)

In [12]:
# Hyperparameters of the next-word model.
n_layers = 2          # number of stacked LSTM layers
embedding_dim = 128   # embedding size; also reused below as the LSTM hidden size

model = Sequential()
# NOTE(review): the padding id 0 is embedded like a real word here (no
# mask_zero=True) even though inputs are left-padded with many zeros.
# Consider enabling masking — confirm the effect on training speed/results first.
model.add(Embedding(input_dim=total_words, output_dim=embedding_dim))

# Every LSTM except the last returns the full sequence so the next LSTM can consume it.
for _layer in range(n_layers - 1):
    model.add(LSTM(embedding_dim, return_sequences=True))

# The last LSTM returns only its final state, which feeds the softmax over the vocabulary.
model.add(LSTM(embedding_dim))
model.add(Dense(total_words, activation='softmax'))

model.build(input_shape=(None, max_sequence_len-1))

# summary() prints the table itself and returns None, so it must not be wrapped in print().
model.summary()

None


In [13]:
# Train with categorical cross-entropy on the one-hot labels; accuracy is tracked per epoch.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.fit(X, y, epochs=100, verbose=1)

Epoch 1/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 180ms/step - accuracy: 0.0354 - loss: 5.8028
Epoch 2/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 181ms/step - accuracy: 0.0466 - loss: 5.4505
Epoch 3/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 183ms/step - accuracy: 0.0661 - loss: 5.3719
Epoch 4/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 179ms/step - accuracy: 0.0476 - loss: 5.3161
Epoch 5/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 202ms/step - accuracy: 0.0636 - loss: 5.3330
Epoch 6/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 182ms/step - accuracy: 0.0935 - loss: 5.2180
Epoch 7/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 180ms/step - accuracy: 0.0678 - loss: 5.2336
Epoch 8/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 179ms/step - accuracy: 0.0774 - loss: 5.1817
Epoch 9/100
[1m20/20[0m [32m━

<keras.src.callbacks.history.History at 0x7fc5c515cd90>

In [14]:
# Test model: greedy next-word generation. The growing sentence is fed back
# into the model and the most probable next word is appended each step.
input_text = "Yagi is"
predict_next_words = 6

for i in range(predict_next_words):
    tokens = tokenizer.texts_to_sequences([input_text])[0]
    print(tokens)
    # Left-pad to the width the model was trained on.
    pad_tokens = pad_sequences([tokens], maxlen=max_sequence_len - 1, padding='pre')
    predicted = np.argmax(model.predict(pad_tokens), axis=-1)
    print(predicted)
    # `predicted` has shape (1,); take the scalar id explicitly and use the
    # tokenizer's own id -> word table instead of scanning word_index.
    # An id with no word (e.g. the padding id 0) maps to an empty string.
    output_word = tokenizer.index_word.get(int(predicted[0]), "")
    input_text += " " + output_word

print(input_text)

[31, 70]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 350ms/step
[322]
[31, 70, 322]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[3]
[31, 70, 322, 3]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[71]
[31, 70, 322, 3, 71]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
[194]
[31, 70, 322, 3, 71, 194]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
[1]
[31, 70, 322, 3, 71, 194, 1]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
[72]
Yagi is bear to directly impact the gulf


# Evaluation

To evaluate language models, we usually use perplexity, a measure of how well a probability model predicts a sample (lower is better). Perplexity is defined as:

$$P(W) = \sqrt[N]{\prod_{i=1}^{N} \frac{1}{P(w_i \mid w_1,\dots,w_{i-1})}}$$

As an implementation hack, you would usually take the log of that formula. This lets us work with the log probabilities obtained from the output of our `RNN`, and it turns the $N$-th root into a factor of $\frac{1}{N}$ and the product into a sum, which makes the computation simpler and more efficient. You should also take care of the padding: padded positions must be excluded when calculating the perplexity, because including them would make the perplexity look artificially good.


$$\begin{aligned}
\log P(W) &= \log\left(\sqrt[N]{\prod_{i=1}^{N} \frac{1}{P(w_i \mid w_1,\dots,w_{i-1})}}\right) \\
&= \log\left(\left(\prod_{i=1}^{N} \frac{1}{P(w_i \mid w_1,\dots,w_{i-1})}\right)^{\frac{1}{N}}\right) \\
&= \log\left(\left(\prod_{i=1}^{N} P(w_i \mid w_1,\dots,w_{i-1})\right)^{-\frac{1}{N}}\right) \\
&= -\frac{1}{N}\log\left(\prod_{i=1}^{N} P(w_i \mid w_1,\dots,w_{i-1})\right) \\
&= -\frac{1}{N}\sum_{i=1}^{N} \log P(w_i \mid w_1,\dots,w_{i-1})
\end{aligned}$$

In [26]:
def evaluate(preds, target, pad_token_index):
    """Compute the perplexity of predicted distributions against target ids.

    Perplexity is exp(-(1/N) * sum_i log p_i), where p_i is the probability
    the model assigned to the i-th target and N is the number of targets
    that are not padding.

    Args:
        preds: Array-like of shape (num_samples, num_classes); row i holds the
            predicted probability of every class for sample i.
        target: 1-D array-like (NumPy array or plain list) of num_samples
            integer class ids.
        pad_token_index: Class id used for padding; samples whose target
            equals it are left out of the average.

    Returns:
        The perplexity over the non-padding samples, or ``float('inf')`` when
        no such sample exists.
    """
    epsilon = 1e-10

    # Coerce to an array so the comparison below is element-wise. With a plain
    # list, `target != pad_token_index` is a single Python bool and the mask
    # would silently turn the mean into a sum.
    target = np.asarray(target)
    valid_targets = target != pad_token_index

    N = int(np.count_nonzero(valid_targets))
    if N == 0:
        return float('inf')

    # Clip so that a zero probability yields a large finite penalty instead of log(0).
    preds = np.clip(preds, epsilon, 1.0)

    # Probability assigned to the true class, for non-padding samples only.
    rows = np.flatnonzero(valid_targets)
    log_probs = np.log(preds[rows, target[rows]])

    log_perplexity = np.sum(log_probs) / -N
    perplexity = np.exp(log_perplexity)

    return perplexity

In [27]:
# Score the model on X, the same inputs that were passed to model.fit,
# so the number reported below is a training-set perplexity.
preds = model.predict(X)
print(preds)
# Recover integer class ids from the one-hot rows of y.
target = np.argmax(y, axis=1)  
print(target)
# Id 0 is the value pad_sequences fills with; evaluate() drops targets equal to it.
pad_token_index = 0 

perplexity = evaluate(preds, target, pad_token_index)
print(f'Perplexity: {perplexity}')

[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 72ms/step
[[7.6440259e-08 5.5135939e-02 7.2526128e-04 ... 6.5499057e-07
  8.4390131e-06 2.2133586e-06]
 [2.3741181e-06 1.1008451e-03 3.5514470e-06 ... 9.1831935e-06
  1.2851865e-05 6.2041199e-06]
 [8.0906275e-06 1.9280498e-04 1.9338429e-06 ... 1.6672844e-05
  1.8297238e-05 7.1656368e-06]
 ...
 [8.7753325e-09 7.6438760e-04 9.4080257e-01 ... 1.0894432e-07
  5.6366247e-05 5.4358417e-04]
 [1.0667900e-06 2.5083908e-04 2.1117178e-03 ... 3.3932181e-06
  1.9308591e-05 8.6514646e-01]
 [1.0069282e-07 1.8836376e-04 3.9300561e-02 ... 1.7468852e-07
  7.1139926e-05 1.2177945e-02]]
[100 101 102 103  12 104  46   5   3  30  18  31 105 106 107 108 109 110
 111 112 113 114 115  48 116  32 117 118   3  19 119   7   5   3   1 120
  18  31 121 122  33  49  30  18  13  20  50  34  32  35  51   3  21   4
 123   2 124  22   1 125  11   1  52   3   1  36  53 126   9  37  54 127
  55  14  38  56  37  14 128  57 129  48  15  58   9  14   9 130 131 1