In [109]:
import re
from urllib.parse import urlparse

import numpy
import requests
from bs4 import BeautifulSoup
from keras.callbacks import ModelCheckpoint
from keras.layers import (
    Dense,
    LSTM, Dropout,
)
from keras.models import Sequential
from keras.utils import np_utils
from nltk import download
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/denisivanov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [110]:
TEXT_RESOURCE_PAGES = 10
TEXT_CORPUS_PATH = 'text_corpus.txt'
MODEL_WEIGHTS_PATH = 'model_weights_saved.hdf5'
NEED_TO_LOAD_TEXT = False
NEED_TO_FIT_MODEL = False
SEQUENCE_LENGTH = 20
EPOCH_NUM = 16
BATCH_SIZE = 512
NEURO_SIZE = 512

In [111]:
def _get_domain(url):
    return '.'.join(urlparse(url).netloc.split('.')[-2:])


def _parse_default(html):
    all_texts = ' '.join(re.findall(r'[а-яА-ЯёЁ]+', html))
    all_texts = re.sub(r'\n', r' ', all_texts)
    all_texts = re.sub(r'\s+', ' ', all_texts)
    all_texts = all_texts.strip()
    return all_texts


def _parse_livejournal_page(html):
    soup = BeautifulSoup(html, "html.parser")
    all_texts = soup.find_all('p', {'class': None})
    all_texts = ' '.join((p.text for p in all_texts))
    return _parse_default(all_texts)


page_parsers = {
    'livejournal.com': _parse_livejournal_page,
    'default': _parse_default,
}

def get_text_from_url(url) -> str:
    print(f'Getting {url} ...')
    response = requests.get(url)
    if response.status_code != 200:
        print(f'Some error: {response.status_code}')
        return ''

    parser = page_parsers.get(_get_domain(url))
    if not parser:
        parser = page_parsers['default']
    return parser(response.text)

In [112]:
if NEED_TO_LOAD_TEXT:
    urls = [f'https://pesen-net.livejournal.com/?skip={i*10}' for i in range(TEXT_RESOURCE_PAGES)]

    texts = []
    for url in urls:
        text = get_text_from_url(url)
        texts.append(text)

    text = ' '.join(texts)
    cleaned_text = text.lower()

    with open(TEXT_CORPUS_PATH, 'w') as f:
        f.write(cleaned_text)

else:
    with open(TEXT_CORPUS_PATH, 'r') as f:
        cleaned_text = f.read()

In [113]:
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(cleaned_text)

In [114]:
cleaned_text = ' '.join(filter(lambda x: x not in stopwords.words('russian'), tokens))
chars = sorted(list(set(cleaned_text)))
char_to_num = {char: num for num, char in enumerate(chars)}
num_to_char = {num: char for num, char in enumerate(chars)}

In [115]:
input_len = len(cleaned_text)
vocab_len = len(chars)
x_data, y_data = [], []

In [116]:
for i in range(0, input_len - SEQUENCE_LENGTH, 1):
    in_seq = cleaned_text[i:i + SEQUENCE_LENGTH]
    out_seq = cleaned_text[i + SEQUENCE_LENGTH]

    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])

    if i <= 3:
        print(
            f'In seq:\t{in_seq}',
            f'Out seq:\t{out_seq}',
            f'X data:\t{x_data}',
            f'Y data: \t{y_data}',
            sep='\n'
        )


In seq:	самолёте пассажир со
Out seq:	б
X data:	[[18, 1, 13, 15, 12, 33, 19, 6, 0, 16, 1, 18, 18, 1, 7, 9, 17, 0, 18, 15]]
Y data: 	[2]
In seq:	амолёте пассажир соб
Out seq:	л
X data:	[[18, 1, 13, 15, 12, 33, 19, 6, 0, 16, 1, 18, 18, 1, 7, 9, 17, 0, 18, 15], [1, 13, 15, 12, 33, 19, 6, 0, 16, 1, 18, 18, 1, 7, 9, 17, 0, 18, 15, 2]]
Y data: 	[2, 12]
In seq:	молёте пассажир собл
Out seq:	а
X data:	[[18, 1, 13, 15, 12, 33, 19, 6, 0, 16, 1, 18, 18, 1, 7, 9, 17, 0, 18, 15], [1, 13, 15, 12, 33, 19, 6, 0, 16, 1, 18, 18, 1, 7, 9, 17, 0, 18, 15, 2], [13, 15, 12, 33, 19, 6, 0, 16, 1, 18, 18, 1, 7, 9, 17, 0, 18, 15, 2, 12]]
Y data: 	[2, 12, 1]
In seq:	олёте пассажир собла
Out seq:	з
X data:	[[18, 1, 13, 15, 12, 33, 19, 6, 0, 16, 1, 18, 18, 1, 7, 9, 17, 0, 18, 15], [1, 13, 15, 12, 33, 19, 6, 0, 16, 1, 18, 18, 1, 7, 9, 17, 0, 18, 15, 2], [13, 15, 12, 33, 19, 6, 0, 16, 1, 18, 18, 1, 7, 9, 17, 0, 18, 15, 2, 12], [15, 12, 33, 19, 6, 0, 16, 1, 18, 18, 1, 7, 9, 17, 0, 18, 15, 2, 12, 1]]
Y data: 	[2, 12, 1

If  you want to print something from `for`:

```
IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)
```

```
To create a jupyter_notebook_config.py file, with all the defaults commented out, you can use the following command line:

$ jupyter notebook --generate-config

Open the file and search for c.NotebookApp.iopub_data_rate_limit

Comment out the line c.NotebookApp.iopub_data_rate_limit = 1000000 and change it to a higher default rate. l used c.NotebookApp.iopub_data_rate_limit = 10000000
```

https://stackoverflow.com/questions/43288550/iopub-data-rate-exceeded-in-jupyter-notebook-when-viewing-image

In [117]:
n_patterns = len(x_data)
print ("Total Patterns:", n_patterns)

Total Patterns: 100481


In [118]:
X = numpy.reshape(x_data, (n_patterns, SEQUENCE_LENGTH, 1))
X = X / float(vocab_len)
X

array([[[0.52941176],
        [0.02941176],
        [0.38235294],
        ...,
        [0.        ],
        [0.52941176],
        [0.44117647]],

       [[0.02941176],
        [0.38235294],
        [0.44117647],
        ...,
        [0.52941176],
        [0.44117647],
        [0.05882353]],

       [[0.38235294],
        [0.44117647],
        [0.35294118],
        ...,
        [0.44117647],
        [0.05882353],
        [0.35294118]],

       ...,

       [[0.44117647],
        [0.47058824],
        [0.58823529],
        ...,
        [0.35294118],
        [0.26470588],
        [0.32352941]],

       [[0.47058824],
        [0.58823529],
        [0.64705882],
        ...,
        [0.26470588],
        [0.32352941],
        [0.44117647]],

       [[0.58823529],
        [0.64705882],
        [0.        ],
        ...,
        [0.32352941],
        [0.44117647],
        [0.41176471]]])

In [119]:
y = np_utils.to_categorical(y_data)
y

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]], dtype=float32)

In [120]:
model = Sequential()
model.add(LSTM(NEURO_SIZE, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(NEURO_SIZE, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(int(NEURO_SIZE / 2)))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam')

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_22 (LSTM)               (None, 20, 512)           1052672   
_________________________________________________________________
dropout_19 (Dropout)         (None, 20, 512)           0         
_________________________________________________________________
lstm_23 (LSTM)               (None, 20, 512)           2099200   
_________________________________________________________________
dropout_20 (Dropout)         (None, 20, 512)           0         
_________________________________________________________________
lstm_24 (LSTM)               (None, 256)               787456    
_________________________________________________________________
dropout_21 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 34)               

In [121]:
if NEED_TO_FIT_MODEL:
    checkpoint = ModelCheckpoint(MODEL_WEIGHTS_PATH, monitor='loss', verbose=1, save_best_only=True, mode='min')
    desired_callbacks = [checkpoint]
    model.fit(X, y, epochs=EPOCH_NUM, batch_size=BATCH_SIZE, callbacks=desired_callbacks)

I've reinstall h5py, because of an error during loading weights
```AttributeError: 'str' object has no attribute 'decode'```
```bash
pip install h5py==2.10.0 --force-reinstall
```

In [122]:
model.load_weights(MODEL_WEIGHTS_PATH)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [124]:
chars_indexes = []
for i in range(SEQUENCE_LENGTH):
    char_index = numpy.random.randint(0, vocab_len - 1)
    chars_indexes.append(char_index)

pattern = chars_indexes
# pattern = x_data[start]
print("Random Seed:")
print(pattern)
print(''.join([num_to_char[value] for value in pattern]), '\n\n')

result = []

for i in range(300):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)

    index = numpy.argmax(prediction)
    char = num_to_char[index]
    result.append(char)

    if i < 1:
        print(
            f'Prediction:\t{prediction}',
            f'Max:\t{numpy.max(prediction)}',
            f'Argmax:\t{index}',
            f'Char:\t{char}',
            sep='\n',
        )

    pattern.append(index)

    pattern = pattern[1:len(pattern)]

''.join(result)

Random Seed:
[17, 4, 13, 11, 22, 16, 0, 19, 22, 19, 22, 15, 8, 9, 8, 31, 19, 10, 0, 10]
ргмкхп тхтхозизютй й
Prediction:	[[3.7754485e-03 2.9628980e-01 1.0279366e-02 1.8305356e-02 1.9361570e-02
  4.1098166e-02 5.7384878e-02 1.1294904e-03 2.5825379e-02 3.9272014e-02
  2.8626146e-04 1.3105953e-03 2.0950457e-02 1.2283113e-02 1.4278574e-01
  1.1952854e-01 1.4679553e-03 7.3389836e-02 2.8962744e-02 6.3163107e-03
  4.1250158e-02 5.5246078e-04 2.1606458e-03 1.3026056e-03 8.4159913e-04
  1.0998097e-03 3.0364662e-03 1.8357257e-04 6.4738379e-03 4.1809585e-04
  6.4184674e-04 1.1243784e-02 1.9141474e-03 8.8779386e-03]]
Max:	0.2962898015975952
Argmax:	1
Char:	а


'арантине собак  саздели собака сазвилась селефон пододненно сазвитель сакон поддотова саидели пододроиненно пододненно сазвитель сакон поддотова саидели пододроиненно пододненно сазвитель сакон поддотова саидели пододроиненно пододненно сазвитель сакон поддотова саидели пододроиненно пододненно сазв'