In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import string
from string import digits
import re
from sklearn.utils import shuffle
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import LSTM, Input, Dense,Embedding, Concatenate, TimeDistributed
from tensorflow.keras.models import Model,load_model, model_from_json
from tensorflow.keras.utils import plot_model
from tensorflow.keras.preprocessing.text import one_hot, Tokenizer
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np

2021-11-18 14:12:39.879456: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-11-18 14:12:39.879482: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


# Read the Data and Create DF

In [2]:
en_text = open('data/europarl-v7.pl-en.en', 'r').read().split('\n')
pl_text = open('data/europarl-v7.pl-en.pl', 'r').read().split('\n')
print(len(pl_text), len(en_text))

632566 632566


In [3]:
language_df = pd.DataFrame(columns=['en', 'pl'])
en_list = []
pl_list = []
for i in range(len(pl_text)):
    en_list.append(en_text[i])
    pl_list.append(pl_text[i])
language_df['en'] = en_list
language_df['pl'] = pl_list
language_df.head()

Unnamed: 0,en,pl
0,Action taken on Parliament's resolutions: see ...,Działania podjęte w wyniku rezolucji Parlament...
1,Documents received: see Minutes,Składanie dokumentów: patrz protokół
2,Written statements (Rule 116): see Minutes,Oświadczenia pisemne (art. 116 Regulaminu): pa...
3,Texts of agreements forwarded by the Council: ...,Teksty porozumień przekazane przez Radę: patrz...
4,Membership of Parliament: see Minutes,Skład Parlamentu: patrz protokół


# Data Cleaning

In [4]:
def clean_text(text):
    # to lower case and remove spaces and end and start
    text = text.lower().strip()
    # remove digits
    text = re.sub('\d', '', text)
    # remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

language_df['en'] = language_df['en'].apply(clean_text)
language_df['pl'] = language_df['pl'].apply(clean_text)
language_df.head()

Unnamed: 0,en,pl
0,action taken on parliaments resolutions see mi...,działania podjęte w wyniku rezolucji parlament...
1,documents received see minutes,składanie dokumentów patrz protokół
2,written statements rule see minutes,oświadczenia pisemne art regulaminu patrz pro...
3,texts of agreements forwarded by the council s...,teksty porozumień przekazane przez radę patrz ...
4,membership of parliament see minutes,skład parlamentu patrz protokół


In [5]:
# putting START and END in polish sentences
language_df['pl'] = language_df['pl'].apply(lambda text: "START "+text+" END")
print("English Sentence: " + language_df['en'].iloc[0])
print("Polish Sentence: " + language_df['pl'].iloc[0])

English Sentence: action taken on parliaments resolutions see minutes
Polish Sentence: START działania podjęte w wyniku rezolucji parlamentu patrz protokól END


# Data Preparation for Model Building

In [7]:
# train test split
x = language_df['en']
y = language_df['pl']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)

# determine the maximum length of sentence
def determine_max_len(data):
    return max([len(x.split(' ')) for x in data])
max_len = max(determine_max_len(x_train), determine_max_len(y_train))
max_len

254

In [8]:
# tokenization
en_tokenizer = Tokenizer()
en_tokenizer.fit_on_texts(x_train)
vocab_size_en = len(en_tokenizer.word_index) + 1

x_train = en_tokenizer.texts_to_sequences(x_train)
x_train = pad_sequences(x_train, maxlen=max_len, padding='post')
x_test = en_tokenizer.texts_to_sequences(x_test)
x_test = pad_sequences(x_test, maxlen=max_len, padding='post')

pl_tokenizer = Tokenizer()
pl_tokenizer.fit_on_texts(y_train)
vocab_size_pl = len(pl_tokenizer.word_index) + 1

y_train = pl_tokenizer.texts_to_sequences(y_train)
y_train = pad_sequences(y_train, maxlen=max_len, padding='post')
y_test = pl_tokenizer.texts_to_sequences(y_test)
y_test = pad_sequences(y_test, maxlen=max_len, padding='post')
x_train[0], y_train[0]

(array([  14,   52,   57, 3568,   35, 3413,   85, 2781,    5,  521,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0, 

# Define Model

In [9]:
from tensorflow.keras.layers import Attention

## Encoder

In [13]:
encoder_input = Input(shape=(max_len, ))
encoder_embedding = Embedding(vocab_size_en, 500, trainable=True)(encoder_input)

# LSTM with states
encoder_lstm_1 = LSTM(256, return_sequences=True)(encoder_embedding)
encoder_lstm_2 = LSTM(256, return_sequences=True)(encoder_lstm_1)
encoder_output, state_h, state_c = LSTM(256, return_sequences=True, return_state=True)(encoder_lstm_2)

## Decoder

In [14]:
decoder_input = Input(shape=(None, ))
decoder_embedding = Embedding(vocab_size_pl, 500, trainable=True)(decoder_input)

# decoder LSTM on encoder states
decoder_lstm_output, decoder_fwd_state, decoder_bwd_state, = LSTM(256, return_sequences=True, return_state=True)(decoder_embedding, initial_state=[state_h, state_c])

# attention layer
attn_out, attn_states = Attention()([encoder_output, decoder_lstm_output])
decoder_concat_input = Concatenate(axis=-1,)([decoder_lstm_output, attn_out])

decoder_output = TimeDistributed(Dense(vocab_size_pl, activation='softmax'))(decoder_concat_input)

TypeError: Cannot iterate over a Tensor with unknown first dimension.