In [1]:
import re
from urllib.parse import urlparse

import numpy
import requests
from bs4 import BeautifulSoup
from keras.callbacks import ModelCheckpoint
from keras.layers import (
    Dense,
    LSTM, Dropout,
)
from keras.models import Sequential
from keras.utils import np_utils
from nltk import download
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

# Make sure the NLTK stop-word corpus is available locally; it is read later
# through stopwords.words('russian').
download('stopwords')

Using TensorFlow backend.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/denisivanov/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# --- Corpus ---
# Number of listing pages to scrape when the corpus is rebuilt.
TEXT_RESOURCE_PAGES = 10
# File the lower-cased corpus is written to / read from.
TEXT_CORPUS_PATH = 'text_corpus.txt'
# True: download the pages and overwrite TEXT_CORPUS_PATH; False: reuse the file.
NEED_TO_LOAD_TEXT = False

# --- Model ---
# File the training checkpoint writes the weights to.
MODEL_WEIGHTS_PATH = 'model_weights_saved.hdf5'
# True: train the model (and overwrite MODEL_WEIGHTS_PATH) before generating text.
NEED_TO_FIT_MODEL = True
# Number of characters in every input window fed to the network.
SEQUENCE_LENGTH = 9
# Training epochs and mini-batch size passed to model.fit.
EPOCH_NUM = 4
BATCH_SIZE = 256
# Units in every LSTM layer.
NEURO_SIZE = 32
# Misspelled historical name, kept as an alias because other cells still use it.
NEURO_ISZE = NEURO_SIZE

In [3]:
def _get_domain(url):
    return '.'.join(urlparse(url).netloc.split('.')[-2:])


def _parse_default(html):
    all_texts = ' '.join(re.findall(r'[а-яА-ЯёЁ]+', html))
    all_texts = re.sub(r'\n', r' ', all_texts)
    all_texts = re.sub(r'\s+', ' ', all_texts)
    all_texts = all_texts.strip()
    return all_texts


def _parse_livejournal_page(html):
    """Extract the Russian words from the paragraphs of a LiveJournal page.

    The page is parsed with BeautifulSoup's ``html.parser``; the text of the
    selected ``<p>`` elements is joined and passed to ``_parse_default``.
    """
    document = BeautifulSoup(html, "html.parser")
    # NOTE(review): the class=None filter presumably keeps only <p> tags that
    # have no class attribute -- confirm against the bs4 documentation.
    plain_paragraphs = document.find_all('p', {'class': None})
    paragraph_texts = [paragraph.text for paragraph in plain_paragraphs]
    return _parse_default(' '.join(paragraph_texts))


# Maps a domain, in the form produced by _get_domain, to the function that
# turns that site's HTML into plain text. The 'default' entry is the fallback
# used by get_text_from_url for domains that have no dedicated parser.
page_parsers = {
    'livejournal.com': _parse_livejournal_page,
    'default': _parse_default,
}

def get_text_from_url(url, timeout=30) -> str:
    """Download ``url`` and return the text extracted by the matching parser.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled connection.

    Returns:
        The parsed page text, or ``''`` when the request fails or the server
        answers with a status other than 200 (a message is printed in both
        cases so one bad page does not abort a whole scraping run).
    """
    print(f'Getting {url} ...')
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as error:
        # Network-level failures are reported the same way as HTTP errors.
        print(f'Some error: {error}')
        return ''
    if response.status_code != 200:
        print(f'Some error: {response.status_code}')
        return ''

    # Fall back to the generic parser for domains without a dedicated one.
    parser = page_parsers.get(_get_domain(url), page_parsers['default'])
    return parser(response.text)

In [4]:
# Either rebuild the corpus from the web and cache it on disk, or reuse the
# cached copy. The file is always read and written as UTF-8: the corpus is
# Cyrillic, and the platform default encoding is not UTF-8 everywhere.
if NEED_TO_LOAD_TEXT:
    # The blog's listing pages are addressed by ?skip=0, 10, 20, ...
    urls = [f'https://pesen-net.livejournal.com/?skip={i*10}' for i in range(TEXT_RESOURCE_PAGES)]

    # Pages that failed to download contribute an empty string.
    texts = [get_text_from_url(url) for url in urls]

    text = ' '.join(texts)
    cleaned_text = text.lower()

    with open(TEXT_CORPUS_PATH, 'w', encoding='utf-8') as f:
        f.write(cleaned_text)

else:
    with open(TEXT_CORPUS_PATH, 'r', encoding='utf-8') as f:
        cleaned_text = f.read()

In [5]:
# Split the corpus into word tokens: every run of \w characters is one token.
tokenizer = RegexpTokenizer(pattern=r'\w+')
tokens = tokenizer.tokenize(text=cleaned_text)

In [6]:
# Fetch the stop-word list once and keep it in a set; calling
# stopwords.words('russian') for every token and searching the returned list
# makes this step needlessly slow on a large corpus.
russian_stopwords = set(stopwords.words('russian'))

# Corpus text with the stop words removed, rebuilt from the tokens.
cleaned_text = ' '.join(token for token in tokens if token not in russian_stopwords)

# Character vocabulary and the two lookup tables (character <-> integer code).
chars = sorted(set(cleaned_text))
char_to_num = {char: num for num, char in enumerate(chars)}
num_to_char = dict(enumerate(chars))

In [7]:
# Number of distinct characters and total corpus length in characters.
vocab_len = len(chars)
input_len = len(cleaned_text)

# Encoded input windows and, for each window, the code of the next character.
x_data = []
y_data = []

In [8]:
# Start from empty lists so that re-running this cell rebuilds the data set
# instead of appending a second copy of every pattern.
x_data, y_data = [], []

# Slide a SEQUENCE_LENGTH-character window over the corpus one character at a
# time; the character right after the window is the prediction target.
for i in range(input_len - SEQUENCE_LENGTH):
    in_seq = cleaned_text[i:i + SEQUENCE_LENGTH]
    out_seq = cleaned_text[i + SEQUENCE_LENGTH]

    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])

    # Preview only the first few steps: printing on every iteration exceeds
    # the notebook's IOPub data rate limit.
    if i <= 3:
        print(
            f'In seq:\t{in_seq}',
            f'Out seq:\t{out_seq}',
            f'X data:\t{x_data}',
            f'Y data: \t{y_data}',
            sep='\n'
        )


In seq:	самолёте 
Out seq:	п
X data:	[[18, 1, 13, 15, 12, 33, 19, 6, 0]]
Y data: 	[16]
In seq:	амолёте п
Out seq:	а
X data:	[[18, 1, 13, 15, 12, 33, 19, 6, 0], [1, 13, 15, 12, 33, 19, 6, 0, 16]]
Y data: 	[16, 1]
In seq:	молёте па
Out seq:	с
X data:	[[18, 1, 13, 15, 12, 33, 19, 6, 0], [1, 13, 15, 12, 33, 19, 6, 0, 16], [13, 15, 12, 33, 19, 6, 0, 16, 1]]
Y data: 	[16, 1, 18]
In seq:	олёте пас
Out seq:	с
X data:	[[18, 1, 13, 15, 12, 33, 19, 6, 0], [1, 13, 15, 12, 33, 19, 6, 0, 16], [13, 15, 12, 33, 19, 6, 0, 16, 1], [15, 12, 33, 19, 6, 0, 16, 1, 18]]
Y data: 	[16, 1, 18, 18]


If you print too much output from inside the `for` loop, Jupyter stops with the following message:

```
IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)
```

```
To create a jupyter_notebook_config.py file, with all the defaults commented out, you can use the following command line:

$ jupyter notebook --generate-config

Open the file and search for c.NotebookApp.iopub_data_rate_limit

Comment out the line c.NotebookApp.iopub_data_rate_limit = 1000000 and change it to a higher default rate. I used c.NotebookApp.iopub_data_rate_limit = 10000000
```

https://stackoverflow.com/questions/43288550/iopub-data-rate-exceeded-in-jupyter-notebook-when-viewing-image

In [9]:
# Number of (window, next character) training examples.
n_patterns = len(x_data)
print("Total Patterns:", n_patterns)

Total Patterns: 100492


In [10]:
# Arrange the encoded windows as (patterns, SEQUENCE_LENGTH, 1) and divide the
# integer codes by the vocabulary size, which maps them into [0, 1).
X = numpy.asarray(x_data).reshape(n_patterns, SEQUENCE_LENGTH, 1) / float(vocab_len)
X

array([[[0.52941176],
        [0.02941176],
        [0.38235294],
        ...,
        [0.55882353],
        [0.17647059],
        [0.        ]],

       [[0.02941176],
        [0.38235294],
        [0.44117647],
        ...,
        [0.17647059],
        [0.        ],
        [0.47058824]],

       [[0.38235294],
        [0.44117647],
        [0.35294118],
        ...,
        [0.        ],
        [0.47058824],
        [0.02941176]],

       ...,

       [[0.26470588],
        [0.26470588],
        [0.29411765],
        ...,
        [0.35294118],
        [0.26470588],
        [0.32352941]],

       [[0.26470588],
        [0.29411765],
        [0.        ],
        ...,
        [0.26470588],
        [0.32352941],
        [0.44117647]],

       [[0.29411765],
        [0.        ],
        [0.52941176],
        ...,
        [0.32352941],
        [0.44117647],
        [0.41176471]]])

In [11]:
# One-hot encode the target characters. num_classes is pinned to the
# vocabulary size so the width of y (and therefore of the softmax layer)
# always matches num_to_char, even if the highest character code never occurs
# as a target.
y = np_utils.to_categorical(y_data, num_classes=vocab_len)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.]], dtype=float32)

In [20]:
# Character-level network: seven stacked LSTM layers with dropout after the
# first one, and a softmax over the character classes on top.
model = Sequential()

# Input layer: one window of shape (time steps, features) taken from X.
model.add(LSTM(NEURO_ISZE, input_shape=X.shape[1:], return_sequences=True))
model.add(Dropout(0.5))

# Five identical middle layers; each passes its full output sequence on.
for _layer_number in range(5):
    model.add(LSTM(NEURO_ISZE, return_sequences=True))

# The last LSTM returns only its final output, which feeds the classifier.
model.add(LSTM(NEURO_ISZE))
model.add(Dense(y.shape[1], activation='softmax'))

model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam')

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_12 (LSTM)               (None, 9, 64)             16896     
_________________________________________________________________
dropout_6 (Dropout)          (None, 9, 64)             0         
_________________________________________________________________
lstm_13 (LSTM)               (None, 9, 64)             33024     
_________________________________________________________________
lstm_14 (LSTM)               (None, 9, 64)             33024     
_________________________________________________________________
lstm_15 (LSTM)               (None, 9, 64)             33024     
_________________________________________________________________
lstm_16 (LSTM)               (None, 9, 64)             33024     
_________________________________________________________________
lstm_17 (LSTM)               (None, 9, 64)            

In [21]:
# Training is optional; when it is skipped, the weights saved by a previous
# run are expected at MODEL_WEIGHTS_PATH.
if NEED_TO_FIT_MODEL:
    # Write the weights to disk every time the training loss reaches a new minimum.
    checkpoint = ModelCheckpoint(
        MODEL_WEIGHTS_PATH,
        monitor='loss',
        verbose=1,
        save_best_only=True,
        mode='min',
    )
    desired_callbacks = [checkpoint]
    model.fit(
        X,
        y,
        epochs=EPOCH_NUM,
        batch_size=BATCH_SIZE,
        callbacks=desired_callbacks,
    )

Epoch 1/4

Epoch 00001: loss improved from inf to 3.13890, saving model to model_weights_saved.hdf5
Epoch 2/4

Epoch 00002: loss improved from 3.13890 to 3.12179, saving model to model_weights_saved.hdf5
Epoch 3/4

Epoch 00003: loss improved from 3.12179 to 3.12146, saving model to model_weights_saved.hdf5
Epoch 4/4

Epoch 00004: loss improved from 3.12146 to 3.12121, saving model to model_weights_saved.hdf5


I had to reinstall h5py because loading the weights failed with the following error:
```AttributeError: 'str' object has no attribute 'decode'```
```bash
pip install h5py==2.10.0 --force-reinstall
```

In [14]:
# Restore the weights stored at MODEL_WEIGHTS_PATH and compile the model again
# with the same loss and optimizer as before.
model.load_weights(MODEL_WEIGHTS_PATH)
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
)

In [15]:
# Pick a random training window as the seed for text generation. randint's
# upper bound is exclusive, so len(x_data) is what makes every pattern
# (including the last one) eligible.
start = numpy.random.randint(0, len(x_data))

# Take a copy: the seed is modified during generation and must not alias a
# row of the training data.
pattern = list(x_data[start])

print("Random Seed:")
print(pattern)
print(''.join([num_to_char[value] for value in pattern]))

Random Seed:
[16, 20, 19, 6, 25, 6, 18, 19, 3]
путешеств


In [16]:
# Generate 30 characters, always taking the most probable next character.
result = []

# Work on a private copy of the seed. Appending to `pattern` directly would
# also append to the x_data row it came from, and reassigning it would make
# this cell continue from a different seed on every re-run.
window = list(pattern)

for i in range(30):
    # Same layout and scaling as the training input: (1, time steps, 1).
    x = numpy.reshape(window, (1, len(window), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)

    index = int(numpy.argmax(prediction))
    char = num_to_char[index]
    result.append(char)

    # Show the raw network output for the first few steps only.
    if i < 3:
        print(
            f'Prediction:\t{prediction}',
            f'Max:\t{numpy.max(prediction)}',
            f'Argmax:\t{index}',
            f'Char:\t{char}',
            sep='\n',
        )

    # Slide the window: drop the oldest character, append the predicted one.
    window = window[1:] + [index]

''.join(result)

Prediction:	[[0.12744235 0.04126055 0.01702779 0.04760411 0.01921226 0.02991275
  0.03535248 0.01482173 0.01767742 0.04371662 0.01254199 0.04195932
  0.05523998 0.04256954 0.0654618  0.06206031 0.03534507 0.03536747
  0.07051218 0.04500466 0.02420885 0.00448    0.01444068 0.00736378
  0.01591341 0.00972368 0.00367934 0.00086351 0.01495632 0.0065235
  0.00379237 0.00928333 0.02151926 0.00316161]]
Max:	0.1274423450231552
Argmax:	0
Char:	 
Prediction:	[[0.0598829  0.0394749  0.02250754 0.04475373 0.0169071  0.04324585
  0.04116851 0.0145909  0.03516687 0.02749087 0.00462094 0.03964369
  0.06361637 0.04141466 0.05975288 0.06545603 0.08433738 0.05505801
  0.07762568 0.04511007 0.0263584  0.00619349 0.01137932 0.00551456
  0.01374922 0.01082249 0.00388751 0.00089075 0.01043459 0.00319752
  0.00581302 0.00709635 0.00953626 0.00330168]]
Max:	0.08433738350868225
Argmax:	16
Char:	п
Prediction:	[[0.0338835  0.13441618 0.01559567 0.01849491 0.00848222 0.02143398
  0.10082453 0.00601043 0.00827069 

' пооа оооа оооа оооа оооа оооа'