In [424]:
import re 
import os
import pandas as pd
import json
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding,Conv1D,MaxPooling1D,Dense
from tensorflow.keras.activations import relu,sigmoid
import tensorflow.keras.backend  as k
import contractions

In [425]:
def readlines(file):
    with open(file,'r') as file:
        data=file.readlines()
        file.close()
    data=[x.strip().lower() for x in data]
    return data

In [426]:
def convertIndicesToTensor(data):
    val=np.array(data)
    return k.variable(value=val)

In [427]:
def Build_char_indexes(file):
    with open(file,'r') as file:
        data=file.readlines()
        file.close()
        
    data=[x.strip() for x in data]
    data=[re.sub("[^A-Za-z0-9\s]+"," ",x) for x in data]
    text=" ".join(data).lower()
    text=re.sub("\s+","",text)
    tokens=list(text)
    tokens=np.unique(tokens)
    char2id={}
    id2char={}
    tokens=sorted(tokens)
    tokens=['<pad>','<u>']+tokens
    for i,tok in enumerate(tokens):
        char2id[tok]=i
        id2char[i]=tok
    index={}
    index['char2id']=char2id
    index['id2char']=id2char
    return index

In [428]:
def charPadding(sent_indices,char_max_len,index):
    new_sents=[ [ word[:char_max_len] if len(word)>char_max_len else word+[index['char2id']['<pad>']]*(char_max_len-len(word))  for word in sent]for sent in sent_indices]
    return new_sents

In [429]:
def wordPadding(sent_indices,char_max_len,index):
    max_words=max([len(x) for x in sent_indices])
    #new_sents=[sent if len(sent)==max_words else sent+[index['char2id']['<pad>']*char_max_len]*(max_words-len(sent)) for sent in sent_indices]
    new_sents=[]
    
    for sent in sent_indices:
        if len(sent)==max_words:
            new_sents.append(sent)
        else:
            diff=max_words-len(sent)
            ref=sent+[[index['char2id']['<pad>']]*char_max_len]*diff
            new_sents.append(ref)
    return new_sents

In [430]:
def convertSentstoCharIndices(sents,index):
    sents=[re.sub("[^A-Za-z0-9\s]+"," ",x) for x in sents]
    sents=[re.sub("\s+"," ",x).split(" ") for x in sents]
    idx=index['char2id']
    output=[[[ idx[char] if char in idx.keys() else idx['<u>'] for char in word ]for word in sent]for sent in sents]
    return output

In [431]:
def convOnCharacters(src_sent_indices):
    print("No of sentences Fed : ",str(len(src_sent_indices)))
    outputs=[]
    for sent_ids in src_sent_indices:
        sent_tensor=convertIndicesToTensor([sent_ids])
        embeds=src_embeddings(sent_tensor)
        embeds=tf.transpose(embeds,[0,1,3,2])
        embeds=tf.squeeze(embeds,[0])
        conv_output=conv_layer(embeds)
        relu_output=relu(conv_output)
        pool_out=max_pool_layer(relu_output)
        pool_out=tf.squeeze(pool_out,[2])
        outputs.append(pool_out)
    output=tf.stack(outputs)
    print(output.shape)
    return output

In [432]:
src_file="/home/ravi/codeBase/Neural-Machine-Translation/Charcter-NMT/en_es_data/test_tiny.es"
tgt_file="/home/ravi/codeBase/Neural-Machine-Translation/Charcter-NMT/en_es_data/test_tiny.en"
MAX_WORD_LENGTH=21
embedding_size=50
src_index=Build_char_indexes(src_file)
tgt_index=Build_char_indexes(tgt_file)

src_sents=readlines(src_file)
tgt_sents=readlines(tgt_file)

In [433]:
src_sent_indices=convertSentstoCharIndices(src_sents,src_index)

In [434]:
src_sent_indices=charPadding(src_sent_indices,MAX_WORD_LENGTH,src_index)
src_sent_indices=wordPadding(src_sent_indices,MAX_WORD_LENGTH,src_index)

In [435]:
src_embeddings=Embedding(input_dim=len(src_index['char2id']),output_dim=embedding_size)
conv_layer= Conv1D(filters=embedding_size,kernel_size=5,strides=1,use_bias=True,padding="valid",data_format="channels_first")
max_pool_layer=MaxPooling1D(pool_size=17,strides=None,padding="valid",data_format="channels_first")
highway_proj=Dense(embedding_size,activation=None,use_bias=True)
highway_gate=Dense(embedding_size,activation=None,use_bias=True)

In [436]:
def highwayLayer(conv_output):
    proj_output=relu(highway_proj(conv_output))
    gate_output=sigmoid(highway_gate(conv_output))
    out=tf.math.multiply(proj_output,gate_output)+tf.math.multiply((1-gate_output),conv_output)
    return out

In [437]:
conv_output=convOnCharacters(src_sent_indices)
highway_output=highwayLayer(conv_output)

print(conv_output.shape)
print(highway_output.shape)

No of sentences Fed :  4
(4, 33, 50)
(4, 33, 50)
(4, 33, 50)
