In [1]:
import re
import pandas as pd
import numpy as np
import os
import sys
import tensorflow as tf

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
import warnings
warnings.simplefilter('ignore')

In [3]:
import keras
from keras.models import Sequential
from keras.layers import Embedding, LSTM, LSTMCell
from keras.layers import TimeDistributed, Bidirectional, Dense
from keras.layers import RepeatVector
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences 
from keras.callbacks import ModelCheckpoint
from keras import optimizers

Using TensorFlow backend.


In [4]:
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
pd.set_option('display.max_colwidth', 200) #max col width.

In [6]:
def read_text_from_file(filename):
    
    file=open(filename, 'rt')
    text = file.read()
    file.close()
    print("\nFile read to system.")
    return text

In [7]:
def convert_to_lines(text):
    
    sent=text.strip().split('\n') #remove truncating spaces after end of sentences, split by sentence.
    sent=[i.split('\t') for i in sent]
    print("\nConverted to lines.")
    return sent

In [8]:
def get_processed_data():
   
    try:
        data = read_text_from_file("deu.txt")
        separated_data = convert_to_lines(data)
        
        data_df = pd.DataFrame()
        eng,other=[],[]        

        for item in separated_data:
            eng.append(item[0])
            other.append(item[1])
        
        data_df['English']=eng
        data_df['Other']=other
        print("\nNumber of sentence pairs : ",len(data_df))
        
        return data_df
    
    except FileNotFoundError as e:
        print("\nFile not found in the present directory")
        

In [9]:
get_processed_data()


File read to system.

Converted to lines.

Number of sentence pairs :  195847


Unnamed: 0,English,Other
0,Hi.,Hallo!
1,Hi.,Grüß Gott!
2,Run!,Lauf!
3,Wow!,Potzdonner!
4,Wow!,Donnerwetter!
5,Fire!,Feuer!
6,Help!,Hilfe!
7,Help!,Zu Hülf!
8,Stop!,Stopp!
9,Wait!,Warte!


In [10]:
def clean_and_preprocess():
    
    df = pd.DataFrame()
    df = get_processed_data()
    
    processed_english, processed_other = [],[]
    
    import string
    from string import punctuation as punc
    
    for item in df.iterrows():
        eng_sen, oth_sen = item[1][0], item[1][1]
        eng_sen = eng_sen.translate(str.maketrans('','',punc))
        oth_sen = oth_sen.translate(str.maketrans('','',punc))
        processed_english.append(eng_sen.lower())
        processed_other.append(oth_sen.lower())
        
    df['Processed English']=processed_english
    df['Processed Other']=processed_other
    
    print(df.columns)
    
    return df

In [11]:
df = clean_and_preprocess()


File read to system.

Converted to lines.

Number of sentence pairs :  195847
Index(['English', 'Other', 'Processed English', 'Processed Other'], dtype='object')


In [12]:
df.iloc[:,[3]]

Unnamed: 0,Processed Other
0,hallo
1,grüß gott
2,lauf
3,potzdonner
4,donnerwetter
5,feuer
6,hilfe
7,zu hülf
8,stopp
9,warte


In [13]:
def tokenization(lines):
    
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(lines)
    return tokenizer

In [14]:
def encode_sequences(tokenizer, length, lines):
    
    seq=tokenizer.texts_to_sequences(lines)
    seq=pad_sequences(seq, maxlen=length, padding='post')
    return seq

In [15]:
def train_and_test_split():
    
    df = pd.DataFrame()
    df = clean_and_preprocess()
    
    from sklearn.model_selection import train_test_split
    train,test = train_test_split(df, test_size = 0.2, random_state=42)
    
#     print(train)
#     print(train.iloc[:,3])
    
    eng_tokenizer, oth_tokenizer = tokenization(df.iloc[:,[2]]) , tokenization(df.iloc[:,[3]])
    
    x_train = encode_sequences(oth_tokenizer, 8, train.iloc[:,[3]])
    y_train = encode_sequences(eng_tokenizer, 8, train.iloc[:,[2]])
    
    x_test = encode_sequences(oth_tokenizer, 8, test.iloc[:,[3]])
    y_test = encode_sequences(eng_tokenizer, 8, train.iloc[:,[2]])
    
#     print(train, test)
    
    return x_train, x_test, y_train, y_test, eng_tokenizer, oth_tokenizer

#     return df

In [22]:
def build_model(in_vocab, out_vocab, in_timesteps, out_timesteps, units): #seq2seq architecture, deep LSTM encoder
    
    model=Sequential()
    model.add(Embedding(in_vocab, units, input_length= in_timesteps, mask_zero=True))
#     model.add(LSTM(units, return_sequences=True))
    model.add(LSTM(units))
    model.add(RepeatVector(out_timesteps))
    model.add(LSTM(units, return_sequences=True))
    model.add(Dense(out_vocab, activation='softmax'))
    return model

In [24]:
x_tr, x_te, y_tr, y_te, e_tok, o_tok = train_and_test_split()
# print(len(e_tok.word_index)+1)
model=build_model(6453, 10998, 8, 8, 512)
rms=optimizers.RMSprop(lr=0.001)
model.compile(optimizer=rms, metrics=['accuracy'], loss='sparse_categorical_crossentropy')

#save model file for least loss

filename='model.encoder.aug_27'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')


File read to system.

Converted to lines.

Number of sentence pairs :  195847
Index(['English', 'Other', 'Processed English', 'Processed Other'], dtype='object')
                                                                               English  \
179806                             Do you really want a job in the same office as Tom?   
180948                             Tom, Mary, John, Alice and I used to sing together.   
22051                                                               What is your name?   
152344                                        I wish you'd told me that a bit earlier.   
55584                                                         It appears to be broken.   
44085                                                           Tom doesn't know much.   
31949                                                             Tom is a bad driver.   
170559                                  I hope to have that sort of opportunity again.   
17526                      

In [25]:
history = model.fit(x_tr, y_tr.reshape(y_tr.shape[0], y_tr.shape[1], 1), 
          epochs=50, batch_size=512, 
          validation_split = 0.2,
          callbacks=[checkpoint], verbose=0)

In [27]:
y_tr

array([[1, 2, 0, 0, 0, 0, 0, 0]], dtype=int32)