In [1]:
#@title Import libraries
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.callbacks import EarlyStopping, ModelCheckpoint

from google.colab import drive
# Mount Google Drive into the Colab filesystem; the dataset is read from a path under /content/drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Dataset load

In [2]:
# Location of the preprocessed poems CSV on the mounted Google Drive
file_path = '/content/drive/MyDrive/NLP/dataset_processed.csv'
# Parse the CSV into a DataFrame using the pandas defaults
df = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
# Display the loaded DataFrame (pandas elides the middle rows in the rendered output)
df

Unnamed: 0,index,author,title,text,category,date,link
0,0,M,* * * * *,laska v basnach smutok v dusi bolest v srdci v...,Zaľúbené,10. 2. 2008 14:18,http://www.basnicky.sk/basnicky/22200
1,1,Joe690,* * * For You - Happy New Year * * *,nemysli uz v tejto chvili na trable a starosti...,Priania,20. 2. 2007 21:19,http://www.basnicky.sk/basnicky/12038
2,2,Joe690,* * * Pre Ivku - zdravotnú sestričku * * *,ked je rano a hned vstane este chodi opantane ...,Zaľúbené,20. 2. 2007 19:58,http://www.basnicky.sk/basnicky/12034
3,3,Joe690,* * * Twenty Happy Birthdays * * *,dnes mas prave rokov dvadcat nikdy nezacni ziv...,Priania,20. 2. 2007 20:44,http://www.basnicky.sk/basnicky/12035
4,4,milson5,* * * *,pri tebe sa mi zda ze svet je stastny kopec ru...,Ostatné,15. 9. 2008 20:36,http://www.basnicky.sk/basnicky/31778
...,...,...,...,...,...,...,...
39211,55424,Stuno,čim čim.,na strome sedi vrabec necvirika cim cim to ja ...,Ostatné,19. 10. 2010 11:27,http://www.basnicky.sk/basnicky/45398
39212,55425,patrik598,Činy,z listu usli slova zbalil som si kufor a zo za...,Zaľúbené,14. 2. 2015 19:14,http://www.basnicky.sk/basnicky/55719
39213,55426,zLtok,Čin,zosnula nadej dovrsit jeho a jej dej ich krasy...,Smutné,20. 4. 2011 23:21,http://www.basnicky.sk/basnicky/47861
39214,55427,Dorotha,čistiaci prostriedok.,potrebujem oprat srdce je zanesene tie skvrny ...,Smutné,1. 12. 2009 11:02,http://www.basnicky.sk/basnicky/41263


In [4]:
# Training uses the poem bodies alone, so keep just the 'text' column
data = df.loc[:, 'text']
# Peek at the first five poems
data.head(5)

0    laska v basnach smutok v dusi bolest v srdci v...
1    nemysli uz v tejto chvili na trable a starosti...
2    ked je rano a hned vstane este chodi opantane ...
3    dnes mas prave rokov dvadcat nikdy nezacni ziv...
4    pri tebe sa mi zda ze svet je stastny kopec ru...
Name: text, dtype: object

### Data processing

In [5]:
# Build the word index from every poem in the corpus.
# `num_words` only limits which (most frequent) words are kept when texts are later
# converted to sequences; the index itself is still built for all words.
# No `oov_token` is configured, so words outside that limit are dropped, not replaced.
num_words = 1000
token = Tokenizer(num_words=num_words)
token.fit_on_texts(data)

In [6]:
# Number of distinct words seen while fitting (this count is not limited by `num_words`)
len(token.word_index)

181794

In [7]:
# Convert every poem into a list of integer word ids
encoded_text = token.texts_to_sequences(data)
# The tokenizer was built with `num_words`, so it only emits ids below `num_words`,
# and id 0 is left for padding. `num_words` is therefore already the number of
# classes / embedding rows needed; no extra "+ 1" is required here.
vocab_size = num_words
vocab_size

1000

In [8]:
# Raw text of the first poem, before tokenization
data[0]

'laska v basnach smutok v dusi bolest v srdci v ociach slzy pocit prazdna v mysli ty malickou utechou su spomienky a sny nesmiem uz cakat tuzit lubit tak sa uz neda musim z mysle ta odstranit nesmiem uz ti dat sancu opat ma ranit bojujem proti vlastnemu srdcu musim sa ta stranit dost bolo ublizovania si'

In [9]:
# Word ids of the first poem, printed space-separated on a single line
print(' '.join(str(word_id) for word_id in encoded_text[0]))

33 4 222 4 187 175 4 134 4 230 133 148 713 4 428 26 57 288 1 239 13 566 278 18 2 13 212 273 21 16 13 24 271 732 209 17 779 823 273 2 16 302 96 5


In [10]:
# Spot-check the word index against the encoded poem printed above:
# "laska" (1st word) should map to 33 and "smutok" (4th word) to 222
print(*(token.word_index[word] for word in ('laska', 'smutok')), sep='\n')

33
222


In [11]:
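# Sanity check (intentionally disabled): the tokenizer drops words outside the
# `num_words` most frequent ones, so the encoded poem has fewer tokens than the raw text has words.
# assert len(data[0].split()) == len(encoded_text[0])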

In [12]:
# Drop every poem whose encoded length exceeds `max_len_n` tokens
max_len_n = 300
print(f'Len of list with long poems: {len(encoded_text)}')
encoded_text = list(filter(lambda poem: len(poem) <= max_len_n, encoded_text))
print(f'Len of list with short poems only: {len(encoded_text)}')
# The two printed counts show that only a small share of the poems is removed

Len of list with long poems: 39216
Len of list with short poems only: 38876


In [13]:
# Length (in tokens) of the longest poem that survived the filtering step.
# It is at most `max_len_n`, but can be smaller if no remaining poem hits the cap exactly.
# `max(..., key=len)` returns the first poem of maximal length, and `default=[]`
# keeps the cell from raising when `encoded_text` is empty (giving `longest == 0`).
longest_text = max(encoded_text, key=len, default=[])
# The original cell stored the poem under this misspelled name; keep it as an alias
# in case another cell refers to it.
lognest_text = longest_text
longest = len(longest_text)

longest

300

In [14]:
# Left-pad every poem to `longest` tokens so they stack into one 2-D array
sequences = pad_sequences(encoded_text, maxlen=longest, padding='pre')
# Inputs are all tokens except the final one; the target is that final token
X, y = sequences[:, :-1], sequences[:, -1]
# One-hot encode the targets over `vocab_size` classes
y = to_categorical(y, num_classes=vocab_size)
# Number of input timesteps per sample
seq_length = X.shape[-1]

In [15]:
# `sequences` is now a NumPy array; its shape is (number of poems, padded length)
sequences.shape

(38876, 300)

### Model

In [16]:
# NOTE(review): leftover draft cell. The empty model created here is overwritten by the
# `model = Sequential()` in the following cell, and the commented-out layers below are
# never added. Consider deleting this cell.
model = Sequential()
# model.add(Embedding(vocab_size, 50, input_length=seq_length))
# model.add(Bidirectional(LSTM(150)))
# model.add(LSTM(100, return_sequences=True))
# model.add(LSTM(100))
# model.add(Dense(100, activation='relu'))
# model.add(Dense(vocab_size, activation='softmax'))

In [17]:
# Last-word classifier: embedding -> bidirectional LSTM -> softmax over the vocabulary
model = Sequential([
    Embedding(vocab_size, 100, input_length=seq_length),  # 100-dimensional vector per word id
    Bidirectional(LSTM(150)),                             # 150 LSTM units in each direction
    Dense(vocab_size, activation='softmax'),              # one probability per word id
])

In [18]:
# Training callbacks. `fit` is called without validation data, so both monitor training accuracy.
callbacks = [
    # Stop once accuracy has not improved for 15 consecutive epochs.
    # Accuracy is a higher-is-better metric, so the mode must be 'max': with 'min' every
    # increase counts as "no improvement" and training is cut short while still improving.
    EarlyStopping(patience=15, monitor='accuracy', min_delta=0, mode='max'),
    # Save the weights each time accuracy reaches a new best (mode made explicit for consistency).
    ModelCheckpoint('best-weights.h5', monitor='accuracy', mode='max', save_best_only=True, save_weights_only=True)
    ]

In [19]:
# One-hot targets -> categorical cross-entropy, optimized with Adam, tracking accuracy
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for at most 50 epochs in mini-batches of 128; the callbacks may end training earlier
model.fit(
    x=X,
    y=y,
    batch_size=128,
    epochs=50,
    verbose=1,
    callbacks=callbacks,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50