In [2]:
import scipy
print('scipy: %s' % scipy.__version__)
# numpy
import numpy as np
print('numpy: %s' % np.__version__)
# matplotlib
import matplotlib
print('matplotlib: %s' % matplotlib.__version__)
# pandas
import pandas
print('pandas: %s' % pandas.__version__)
# statsmodels
import statsmodels
print('statsmodels: %s' % statsmodels.__version__)
# scikit-learn
import sklearn
print('sklearn: %s' % sklearn.__version__)
# tensorflow
import tensorflow
print('tensorflow: %s' % tensorflow.__version__)
#keras
import keras
print('keras: %s' % keras.__version__)
from pickle import load
from numpy.random import rand
from numpy.random import shuffle
import string
import re
from pickle import dump
from unicodedata import normalize
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Embedding,TimeDistributed,Activation, Dense, Dropout
from keras.layers.recurrent import LSTM
from tensorflow.keras.layers import Input, RepeatVector
from keras.utils.vis_utils import plot_model
from tensorflow.keras.utils import to_categorical
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu
from numpy import argmax
from keras.preprocessing.sequence import pad_sequences
from keras.callbacks import ModelCheckpoint

scipy: 1.7.1
numpy: 1.20.3
matplotlib: 3.4.3
pandas: 1.3.4
statsmodels: 0.12.2
sklearn: 0.24.2
tensorflow: 2.8.0
keras: 2.8.0


In [3]:

#Загрузка данных
def load_doc(filename):
    file = open(filename, mode='rt', encoding='utf-8')
    text = file.read()
    file.close()
    return text

# Разбивание в массив
def to_pairs(doc):
    lines = doc.strip().split('\n')
    pairs = [line.split('\t') for line in  lines]
    pairs = np.delete(pairs,2,1)
    return pairs

# Очистка данных от знаков препинания + перевод в нижний регистр
def clean_pairs(lines):
    cleaned = list()
    table = str.maketrans('', '', string.punctuation)
    for pair in lines:
        clean_pair = list()
        for line in pair:
            line = line.split()
            line = [word.lower() for word in line]
            line = [word.translate(table) for word in line]
            clean_pair.append(' '.join(line))
        cleaned.append(clean_pair)
    return array(cleaned)

# Сохранение в pkl
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)

filename = 'rus.txt'
doc = load_doc(filename)
pairs = to_pairs(doc)
print(pairs)
clean_pairs = clean_pairs(pairs)
print(clean_pairs)
save_clean_data(clean_pairs, 'english-russian.pkl')
for i in range(100):
    print('[%s] => [%s]' % (clean_pairs[i,0], clean_pairs[i,1]))

In [4]:

# Загрузка уже чистых данных
def load_clean_sentences(filename):
    return load(open(filename, 'rb'))

# Сохрание чистых данных
def save_clean_data(sentences, filename):
    dump(sentences, open(filename, 'wb'))
    print('Saved: %s' % filename)

# загрузка набора данных
raw_dataset = load_clean_sentences('english-russian.pkl')

# Уменьшение кол-ва примеров
n_sentences = 50000
dataset = raw_dataset[:n_sentences, :]
# Перемешка
shuffle(dataset)
# Разделения на обучающие и проверчноые примеры
train, test = dataset[:45000], dataset[45000:]
# Сохранение
save_clean_data(dataset, 'english-russian-both.pkl')
save_clean_data(train, 'english-russian-train.pkl')
save_clean_data(test, 'english-russian-test.pkl')

Saved: english-russian-both.pkl
Saved: english-russian-train.pkl
Saved: english-russian-test.pkl


In [5]:
def create_tokenizer(lines):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer
def max_length(lines):
    return max(len(line.split()) for line in lines)
# Токенизация Английского словаря
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
# Токенизация Русского словаря
rus_tokenizer = create_tokenizer(dataset[:, 1])
rus_vocab_size = len(rus_tokenizer.word_index) + 1
rus_length = max_length(dataset[:, 1])
print('Russian Vocabulary Size: %d' % rus_vocab_size)
print('Russian Max Length: %d' % (rus_length))

English Vocabulary Size: 5065
English Max Length: 6
Russian Vocabulary Size: 13792
Russian Max Length: 10


In [9]:
def encode_sequences(tokenizer, length, lines):
    X = tokenizer.texts_to_sequences(lines)
    X = pad_sequences(X, maxlen=length, padding='post')
    return X
def encode_output(sequences, vocab_size):
    ylist = list()
    for sequence in sequences:
        encoded = to_categorical(sequence, num_classes=vocab_size)
        ylist.append(encoded)
    y = array(ylist)
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size)
    return y

trainX = encode_sequences(rus_tokenizer, rus_length, train[:, 1])
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0])
trainY = encode_output(trainY, eng_vocab_size)

testX = encode_sequences(rus_tokenizer, rus_length, test[:, 1])
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_output(testY, eng_vocab_size)

[[  33   25 4059 ...    0    0    0]
 [  48  985    0 ...    0    0    0]
 [   1  763  101 ...    0    0    0]
 ...
 [ 406    4   19 ...    0    0    0]
 [   2 4502 4835 ...    0    0    0]
 [1970  276    6 ...    0    0    0]]


In [15]:
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True))
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps))
    model.add(LSTM(n_units, return_sequences=True))
    model.add(TimeDistributed(Dense(tar_vocab, activation='softmax')))
    return model

model = define_model(rus_vocab_size, eng_vocab_size, rus_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
print(model.summary())
plot_model(model, to_file='model.png', show_shapes=True)
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 10, 256)           3530752   
                                                                 
 lstm_4 (LSTM)               (None, 256)               525312    
                                                                 
 repeat_vector_2 (RepeatVect  (None, 6, 256)           0         
 or)                                                             
                                                                 
 lstm_5 (LSTM)               (None, 6, 256)            525312    
                                                                 
 time_distributed_2 (TimeDis  (None, 6, 5065)          1301705   
 tributed)                                                       
                                                                 
Total params: 5,883,081
Trainable params: 5,883,081
No

<keras.callbacks.History at 0x17baf6aa190>

In [11]:
def word_for_id(interus, tokenizer):
    for word, index in tokenizer.word_index.items():
        if index == interus:
            return word
    return None

def predict_sequence(model, tokenizer, source):
    prediction = model.predict(source, verbose=0)[0]
    interuss = [argmax(vector) for vector in prediction]
    target = list()
    for i in interuss:
        word = word_for_id(i, tokenizer)
        if word is None:
            break
        target.append(word)
    return ' '.join(target)

def evaluate_model(model, tokenizer, sources, raw_dataset):
    actual, predicted = list(), list()
    for i, source in enumerate(sources):
        source = source.reshape((1, source.shape[0]))
        translation = predict_sequence(model, eng_tokenizer, source)
        raw_target, raw_src = raw_dataset[i]
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation))
        actual.append(raw_target.split())
        predicted.append(translation.split())
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

dataset = load_clean_sentences('english-russian-both.pkl')
train = load_clean_sentences('english-russian-train.pkl')
test = load_clean_sentences('english-russian-test.pkl')
eng_tokenizer = create_tokenizer(dataset[:, 0])
eng_vocab_size = len(eng_tokenizer.word_index) + 1
eng_length = max_length(dataset[:, 0])
rus_tokenizer = create_tokenizer(dataset[:, 1])
rus_vocab_size = len(rus_tokenizer.word_index) + 1
rus_length = max_length(dataset[:, 1])
trainX = encode_sequences(rus_tokenizer, rus_length, train[:, 1])
testX = encode_sequences(rus_tokenizer, rus_length, test[:, 1])
model = load_model('model.h5')
print('train')
evaluate_model(model, eng_tokenizer, trainX, train)
print('test')
evaluate_model(model, eng_tokenizer, testX, test)

train
src=[кто вас избил], target=[who beat you up], predicted=[who nails you]
src=[сейчас полтретьего], target=[its now 230], predicted=[now that]
src=[я боюсь идти], target=[i am afraid to go], predicted=[im going to]
src=[он огляделся], target=[he looked around], predicted=[hes is]
src=[будьте кратки пожалуйста], target=[be brief please], predicted=[please be please]
src=[ты была несчастна], target=[were you unhappy], predicted=[you you eat]
src=[это не моя машина], target=[thats not my car], predicted=[its isnt my car]
src=[ты не устала], target=[werent you tired], predicted=[you you go in]
src=[разбей это], target=[smash it], predicted=[modest it]
src=[я не ем], target=[i am not eating], predicted=[i not a put]


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU-1: 0.079154
BLEU-2: 0.000903
BLEU-3: 0.000000
BLEU-4: 0.000000
test
src=[мы его исправим], target=[well fix it], predicted=[we okay it]
src=[вы мне надоели], target=[im sick of you], predicted=[did you me me]
src=[не прикасайся ко мне], target=[dont touch me], predicted=[keep on of me]
src=[я не так уж и устал], target=[im not so tired], predicted=[im not still day]
src=[том увидел кошку], target=[tom saw the cat], predicted=[tom will theirs]
src=[вставай с постели], target=[get up out of bed], predicted=[get up next]
src=[я видел нескольких], target=[ive seen a few], predicted=[i saw a sacred]
src=[можно потрогать], target=[can i touch it], predicted=[may can an fishing]
src=[ловите мяч], target=[catch the ball], predicted=[everyone it gate]
src=[у тебя дети есть], target=[do you have kids], predicted=[do you have a]


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


BLEU-1: 0.073889
BLEU-2: 0.000000
BLEU-3: 0.000000
BLEU-4: 0.000000


TypeError: 'Tokenizer' object is not subscriptable