In [2]:
import os
import numpy
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.utils import to_categorical
import re
import numpy as np
import sys

In [10]:
# Number of consecutive characters that form one input sample in init_data().
SEQUENCE_LENGTH = 20
# Number of epochs passed to model.fit() in teach_model().
EPOCHS = 200
# Batch size passed to model.fit() in teach_model().
BATCH_SIZE = 64
# Tolerance used by brench() when comparing transition weights with the row maximum.
EPS = 1e-6

In [4]:
def clear_file(input):
    """Normalise ``results/<input>.txt`` and write it to ``results/New<input>.txt``.

    The text is lower-cased and every character is dropped except whitespace,
    the ``”`` and ``'`` marks, and letters that are not in the basic Latin
    alphabet (``ê`` is dropped as well). Afterwards runs of spaces, a space
    directly after a newline, and runs of newlines are collapsed.

    Args:
        input: Base name of the source file, without directory or ``.txt``.

    Returns:
        Path of the cleaned file that was written.
    """
    # Letters rejected even though str.isalpha() accepts them; a frozenset
    # gives constant-time membership for the per-character filter.
    rejected_letters = frozenset("abcdefghijklmnopqrstuvwxyzê")

    def isCorrectChar(c):
        if c.isspace() or c == '”' or c == '\'':
            return True
        return c.isalpha() and c not in rejected_letters

    inputFile = os.path.join("results", f'{input}.txt')
    outputFile = os.path.join("results", f'New{input}.txt')

    # Context managers close both handles, so the cleaned text is fully
    # flushed to disk before the path is handed back to the caller.
    with open(inputFile) as source:
        text = source.read().lower()

    text = "".join(filter(isCorrectChar, text))
    text = re.sub(' +', ' ', text)      # collapse runs of spaces
    text = re.sub('\n ', '\n', text)    # drop a space that starts a line
    text = re.sub('\n+', '\n', text)    # collapse blank lines

    with open(outputFile, "w") as target:
        target.write(text)
    return outputFile

def init_data(file):
    """Turn a text file into training tensors for the character model.

    Every run of ``SEQUENCE_LENGTH`` consecutive characters becomes one input
    sample and the character that follows it becomes the target.

    Args:
        file: Path of the text file to read.

    Returns:
        A tuple ``(x, y, xt, intDic)``:
            x: array of shape ``(samples, SEQUENCE_LENGTH, 1)`` holding the
               character indices divided by the vocabulary size.
            y: one-hot encoded targets, one column per vocabulary entry.
            xt: the same samples as plain lists of integer indices.
            intDic: mapping from integer index back to character.

    Raises:
        ValueError: if the text is not longer than ``SEQUENCE_LENGTH``.
    """
    with open(file) as source:
        raw_text = source.read()

    # Build the character <-> index dictionaries
    chars = sorted(set(raw_text))
    charDic = {c: i for i, c in enumerate(chars)}
    intDic = dict(enumerate(chars))
    amount_chars = len(raw_text)
    amount_different_chars = len(chars)

    if amount_chars <= SEQUENCE_LENGTH:
        raise ValueError(
            f"text has {amount_chars} characters; more than "
            f"{SEQUENCE_LENGTH} are needed to build a training sample"
        )

    # Encode once, then slice: each slice is a fresh list, so samples never
    # share storage with each other.
    encoded = [charDic[c] for c in raw_text]
    xt, yt = [], []
    for i in range(amount_chars - SEQUENCE_LENGTH):
        xt.append(encoded[i:i + SEQUENCE_LENGTH])
        yt.append(encoded[i + SEQUENCE_LENGTH])

    # Sample preview; guarded because short texts yield fewer than 3 samples.
    if len(xt) > 2:
        print(xt[2], yt[2])
    x = numpy.reshape(xt, (len(xt), SEQUENCE_LENGTH, 1))
    x = x / float(amount_different_chars)
    # Pin the class count to the vocabulary size; otherwise it is inferred
    # from the largest target seen and can be smaller than the vocabulary.
    y = to_categorical(yt, num_classes=amount_different_chars)

    return x, y, xt, intDic

In [5]:
def brench(file, n, k, m):
    """Print up to ``m`` characters produced by an n-gram transition walk.

    The distinct ``n``-character windows of the text become graph nodes (see
    ``get_graph``). Starting from the tail of a random line prefix, each step
    moves to one of the most likely next windows and prints its last character.

    Args:
        file: Path of the text file to learn transitions from.
        n: Window length in characters.
        k: Maximum length of the start phrase cut from a random line.
        m: Maximum number of characters to generate.
    """
    print("Brench gen")
    with open(file) as source:
        text = source.read()

    s = {text[i:i + n] for i in range(len(text) - n + 1)}
    print("Set:", len(s))

    graph, w_to_int, int_to_w = get_graph(s, text, n)

    # Only lines whose start phrase holds a full n-character window can seed
    # the walk; shorter phrases have no matching graph node.
    candidates = [line for line in text.split('\n') if len(line[0:k]) >= n]
    if not candidates:
        print("Can't continue", flush=True)
        return

    # randint's upper bound is exclusive, so len(...) makes every entry eligible.
    start_line = candidates[np.random.randint(0, len(candidates))]

    prefix = start_line[0:k]
    print('Start phrase:')
    print(prefix)
    print("Generating:")
    start_window = prefix[len(prefix) - n:]
    for _ in range(m):
        sug_next_pos = graph[w_to_int[start_window]]
        max_v = sug_next_pos.max()
        if max_v <= 0:
            # An all-zero row means this window was never followed by another.
            print("Can't continue", flush=True)
            break
        # All windows tied (within EPS) for the highest transition weight.
        res = np.flatnonzero(np.abs(sug_next_pos - max_v) < EPS)
        next_pos = int(res[np.random.randint(0, len(res))])
        start_window = int_to_w[next_pos]
        sys.stdout.write(start_window[len(start_window) - 1])
    # Leading newline keeps the closing message off the generated text line.
    print("\nEnd generating text")



def get_graph(s, text, n):
    """Build the row-normalised transition matrix between n-character windows.

    Args:
        s: Collection of the distinct ``n``-character windows found in ``text``.
        text: Source text the transitions are counted from.
        n: Window length in characters.

    Returns:
        A tuple ``(matrix, window_to_int, int_to_window)`` where ``matrix`` is
        a float array whose entry ``[a, b]`` is the share of occurrences of
        window ``a`` that are followed (one character later) by window ``b``.
        Rows of windows that are never followed by anything stay all-zero.
    """
    # Sorting fixes the window -> index assignment; iterating the set directly
    # would give an order that changes between interpreter runs.
    windows = sorted(s)
    window_to_int = {w: i for i, w in enumerate(windows)}
    int_to_window = dict(enumerate(windows))

    # One NumPy buffer instead of nested Python lists of ints.
    size = len(windows)
    matrix = np.zeros((size, size), dtype=float)

    for i in range(len(text) - n):
        cur_w = text[i:i + n]
        next_w = text[(i + 1):(i + n + 1)]
        matrix[window_to_int[cur_w], window_to_int[next_w]] += 1

    # Divide each row by its sum in place, skipping rows that sum to zero.
    row_sums = matrix.sum(axis=1, keepdims=True)
    np.divide(matrix, row_sums, out=matrix, where=row_sums != 0)
    return matrix, window_to_int, int_to_window




def norm_matrix(matrix):
    """Scale every row of ``matrix`` so that its entries sum to one.

    A row whose sum is zero cannot be scaled and is passed through as the
    very same object; every other row is replaced by a new list.

    Args:
        matrix: Iterable of numeric rows.

    Returns:
        List of rows in the original order.
    """
    def _scaled(values):
        total = sum(values)
        if total == 0:
            return values
        return [value / total for value in values]

    return [_scaled(values) for values in matrix]

In [6]:
def init_model(x_arr_dataset, y_arr_dataset):
    """Assemble and compile the LSTM next-character classifier.

    Args:
        x_arr_dataset: Input array; axes 1 and 2 of its shape define the
            LSTM input shape.
        y_arr_dataset: Target array; axis 1 of its shape sets the size of
            the softmax output layer.

    Returns:
        The compiled ``Sequential`` model (not yet trained).
    """
    window_shape = (x_arr_dataset.shape[1], x_arr_dataset.shape[2])
    class_count = y_arr_dataset.shape[1]

    optimizer = SGD(learning_rate=0.1, decay=1e-6, momentum=0.9, nesterov=True)

    network = Sequential()
    # LSTM encoder -> dropout -> softmax over the output classes.
    network.add(LSTM(256, input_shape=window_shape))
    network.add(Dropout(0.2))
    network.add(Dense(class_count, activation='softmax'))

    network.compile(loss='categorical_crossentropy', optimizer=optimizer)
    return network


def teach_model(file):
    """Prepare the data from ``file``, then build and fit the model on it.

    Args:
        file: Path of the cleaned text file handed to ``init_data``.

    Returns:
        A tuple ``(model, sequences, index_to_char)``: the fitted model plus
        the third and fourth values returned by ``init_data``.
    """
    print("Start teaching model", flush=True)

    inputs, targets, sequences, index_to_char = init_data(file)
    print(inputs, targets)

    network = init_model(inputs, targets)
    network.fit(inputs, targets, epochs=EPOCHS, batch_size=BATCH_SIZE)

    print("End teaching model", flush=True)
    return network, sequences, index_to_char

In [7]:
def gen(file, model, amount_sequence, tx, ty):
    """Print ``amount_sequence`` characters predicted by ``model``.

    A random sequence from ``tx`` seeds a sliding window. At each step the
    window is scaled by the vocabulary size, the model predicts the next
    character, and that character is written out and pushed into the window.

    Args:
        file: Unused; kept so existing calls keep working.
        model: Object whose ``predict(x, verbose=0)`` returns class scores.
        amount_sequence: Number of characters to generate.
        tx: List of integer-encoded sequences to pick the start phrase from.
        ty: Mapping from integer index to character.

    Raises:
        ValueError: if ``tx`` is empty.
    """
    print("Generating")

    if len(tx) == 0:
        raise ValueError("tx must contain at least one sequence")

    amount_different_chars = len(ty)

    # randint's upper bound is exclusive: len(tx) makes the last sequence
    # eligible and also works when there is exactly one sequence.
    start_sequence_id = numpy.random.randint(0, len(tx))
    # Work on a copy so sliding the window never alters the caller's data.
    window = list(tx[start_sequence_id])
    sequence_from = "".join(ty[value] for value in window)

    print("Start phrase:")
    print(sequence_from)

    print("Generating:")
    for _ in range(amount_sequence):
        x = numpy.reshape(window, (1, len(window), 1))
        x = x / float(amount_different_chars)

        prediction = model.predict(x, verbose=0)
        index = int(numpy.argmax(prediction))
        sys.stdout.write(ty[index])
        # Slide the window: drop the oldest index, append the predicted one.
        window = window[1:] + [index]
    print(f'\nEnd generating text', flush=True)

In [25]:
# Clean the Stoik corpus and train the character model on it.
raw_file = "Stoik"

cleared_file = clear_file(raw_file)
# teach_model returns (model, sequences, index->char map); unpack all three
# so `model` is the model itself rather than the whole tuple.
model, tx, ty = teach_model(cleared_file)

Start teaching model
[[[0.        ]
  [0.61111111]
  [0.25      ]
  ...
  [0.61111111]
  [0.5       ]
  [0.33333333]]

 [[0.61111111]
  [0.25      ]
  [0.5       ]
  ...
  [0.5       ]
  [0.33333333]
  [0.38888889]]

 [[0.25      ]
  [0.5       ]
  [0.22222222]
  ...
  [0.33333333]
  [0.38888889]
  [0.        ]]

 ...

 [[0.38888889]
  [0.5       ]
  [0.19444444]
  ...
  [0.11111111]
  [0.55555556]
  [0.11111111]]

 [[0.5       ]
  [0.19444444]
  [0.5       ]
  ...
  [0.55555556]
  [0.11111111]
  [0.61111111]]

 [[0.19444444]
  [0.5       ]
  [0.02777778]
  ...
  [0.11111111]
  [0.61111111]
  [0.11111111]]] [[0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]
Epoch 1/6
Epoch 1: loss improved from inf to 2.93134, saving model to results/radion\epoch_01__loss_2.9313.hdf5
Epoch 2/6
Epoch 2: loss improved from 2.93134 to 2.76905, saving model to results/radion\epoch_02__loss_2.7691.hdf5
E

In [27]:
# NOTE(review): gen is defined as gen(file, model, amount_sequence, tx, ty);
# this call omits tx and ty and would raise TypeError against that definition.
gen(cleared_file, model, amount_sequence=20)

Start generating text under teached model on results/radion/epoch_30__loss_1.8504.hdf5
Start phrase:
я хочу избежать лишн
Generating:
 и соолан пооомуел с
End generating text


In [14]:
# Clean the Chehov corpus and run the n-gram walk:
# window length 4, start phrase of up to 10 characters, 20 generated characters.
raw_file = "Chehov"

cleared_file = clear_file(raw_file)
brench(cleared_file, 4, 10, 20)

Start generating text for results\___clear___Chehov.txt
Windows: 7978
Start phrase:
 пойдемте
Generating:
 кто не мог бы ни од
End generating text


In [11]:
# Clean the Chehov corpus and train the model on it; tx and ty (the encoded
# sequences and the index->char map) are kept for the gen() calls.
raw_file = "Chehov"

cleared_file = clear_file(raw_file)
model, tx ,ty = teach_model(cleared_file)

Start teaching model
[30, 9, 17, 5, 16, 11, 13, 0, 8, 28, 8, 1, 20, 1, 19, 3, 16, 16, 8, 6] 17
[[[0.36111111]
  [0.52777778]
  [0.83333333]
  ...
  [0.08333333]
  [0.44444444]
  [0.44444444]]

 [[0.52777778]
  [0.83333333]
  [0.25      ]
  ...
  [0.44444444]
  [0.44444444]
  [0.22222222]]

 [[0.83333333]
  [0.25      ]
  [0.47222222]
  ...
  [0.44444444]
  [0.22222222]
  [0.16666667]]

 ...

 [[0.55555556]
  [0.58333333]
  [0.61111111]
  ...
  [0.02777778]
  [0.44444444]
  [0.47222222]]

 [[0.58333333]
  [0.61111111]
  [0.72222222]
  ...
  [0.44444444]
  [0.47222222]
  [0.72222222]]

 [[0.61111111]
  [0.72222222]
  [0.08333333]
  ...
  [0.47222222]
  [0.72222222]
  [0.86111111]]] [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]]
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13

In [15]:
# Generate 20 characters from a randomly chosen start phrase.
gen(cleared_file, model, 20, tx ,ty)

Generating
Start phrase:
но для простых людей
Generating:
 пока довольно одной
End generating text


In [16]:
# Same call again: the start phrase is drawn at random on every run.
gen(cleared_file, model, 20, tx ,ty)

Generating
Start phrase:
ерного не видала вдо
Generating:
воль стала чахнуть о
End generating text


In [19]:
# Generate a longer continuation of 50 characters.
gen(cleared_file, model, 50, tx ,ty)

Generating
Start phrase:
 мужа и хлеба черног
Generating:
о не видала вдоволь стала чахнуть от такой жизни д
End generating text


In [20]:
# Another 50-character sample from a new random start phrase.
gen(cleared_file, model, 50, tx ,ty)

Generating
Start phrase:
 историю
 да я хотел
Generating:
 тогда рассказать про своего брата
иван иваныч про
End generating text


In [44]:
# Call init_data directly to inspect its four return values
# (scaled inputs, one-hot targets, raw index sequences, index->char map).
a,b,c,d = init_data(cleared_file)

[30, 9, 17, 5, 16, 11, 13, 0, 8, 28, 8, 1, 20, 1, 19, 3, 16, 16, 8, 6] 17


In [57]:
# Re-run the n-gram walk on the Chehov corpus (n=4, k=10, m=20).
raw_file = "Chehov"

cleared_file = clear_file(raw_file)
brench(cleared_file, 4, 10, 20)

Brench gen
Windows: 7978
Start phrase:
 нам нужно
Generating:
 это было скорблю дуEnd generating text


In [63]:
# Same n-gram walk on the Chehov corpus (n=4, k=10, m=20), run once more.
raw_file = "Chehov"

cleared_file = clear_file(raw_file)
brench(cleared_file, 4, 10, 20)

Brench gen
Set: 7978
Start phrase:
Generating:
 тут пока но в казалEnd generating text
