In [2]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tensorflow.keras.utils import plot_model
from string import punctuation,digits,ascii_letters
import re
import functions as fn
import classes as cs
import pickle
import tokenizers as tk
import transformers
from collections.abc import Iterable
from inltk.inltk import get_similar_sentences,setup
# Select the 'mixed_float16' dtype policy and install it as the global Keras policy.
# NOTE(review): this should run before any layer is created so that the layers
# built further down pick the policy up -- confirm if cells are reordered.
policy = tf.keras.mixed_precision.Policy('mixed_float16')
tf.keras.mixed_precision.set_global_policy(policy)

INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA GeForce RTX 3060 Laptop GPU, compute capability 8.6


In [3]:
def _normalize_whitespace(text):
    """Lower-case `text`, trim both ends and collapse whitespace runs into single spaces."""
    return ' '.join(re.split(r'\s+', text.lower().strip()))


def _drop_chars(text, excluded):
    """Return `text` with every character contained in `excluded` removed."""
    return ''.join(ch for ch in text if ch not in excluded)


def _token_count(text):
    """Return the number of whitespace-separated pieces in `text`."""
    return len(re.split(r'\s+', text))


# Sentences with more tokens than this are discarded at the end of the cell.
# NOTE(review): presumably has to agree with SEQ_LEN used for the model inputs -- confirm.
MAX_TOKENS = 64

# Load the parallel corpus and give the two text columns short names.
dataset = pd.read_csv('../GoogleTransDataset.csv')
dataset = dataset.rename(columns={'english_sentence':'English','hindi_sentence':'Hindi'})
# Drop incomplete rows first and renumber afterwards so the index is contiguous.
dataset = dataset.dropna().reset_index(drop=True)

# Normalise case and whitespace in both languages.
dataset['English'] = dataset['English'].apply(_normalize_whitespace)
dataset['Hindi'] = dataset['Hindi'].apply(_normalize_whitespace)

# Strip punctuation and digits; the Hindi side additionally loses Latin letters
# and Devanagari digits.
exclude_chars_hindi = set(punctuation+digits+ascii_letters+'[२३०८१५७९४६“”"]')
exclude_chars_english = set(punctuation+digits+'“”')
dataset['Hindi'] = dataset['Hindi'].apply(lambda text: _drop_chars(text, exclude_chars_hindi))
dataset['English'] = dataset['English'].apply(lambda text: _drop_chars(text, exclude_chars_english))

# Re-express every sentence as space-joined word pieces from the multilingual BERT tokenizer.
tokenizer = transformers.BertTokenizer.from_pretrained("bert-base-multilingual-cased")
dataset['English'] = dataset['English'].apply(lambda text: ' '.join(tokenizer.tokenize(text)))
dataset['Hindi'] = dataset['Hindi'].apply(lambda text: ' '.join(tokenizer.tokenize(text)))

# Wrap the target sentences in start/end markers.
dataset['Hindi'] = dataset['Hindi'].apply(lambda text: 'START__ ' + text + ' __END')

# Token counts per sentence (the Hindi count includes the two markers).
dataset['len_english'] = dataset['English'].apply(_token_count)
dataset['len_hindi'] = dataset['Hindi'].apply(_token_count)

# Remove every pair in which either side is longer than MAX_TOKENS.
indexes = dataset[ ( dataset['len_english']>MAX_TOKENS ) | ( dataset['len_hindi']>MAX_TOKENS ) ].index
dataset = dataset.drop(indexes)

In [4]:
# Making the vocabularies
def _count_tokens(sentences):
    """Count whitespace-separated tokens over `sentences`.

    The result is seeded with '[UNK]' at frequency 0 and keeps tokens in
    first-seen order, which later decides the order of equally frequent tokens.
    """
    counts = {'[UNK]': 0}
    for sentence in sentences:
        for token in re.split(r'\s+', sentence):
            counts[token] = counts.get(token, 0) + 1
    return counts


def _sort_by_frequency(counts):
    """Return a dict ordered from most to least frequent token.

    sorted() is stable, so tokens with equal frequency keep their first-seen order.
    """
    return dict(sorted(counts.items(), key=lambda pair: pair[1], reverse=True))


# Folding rare tokens (freq <= 2) into '[UNK]' was prototyped here and is currently disabled.

english_vocab_dict = _sort_by_frequency(_count_tokens(dataset['English']))
hindi_vocab_dict = _sort_by_frequency(_count_tokens(dataset['Hindi']))
english_vocab_keys = list(english_vocab_dict)
hindi_vocab_keys = list(hindi_vocab_dict)

In [5]:
# Token -> id lookup tables. Ids follow the frequency-sorted vocabulary order and
# start at 1, so id 0 is not assigned to any token (presumably padding -- confirm).
e_wtoi = {token: idx for idx, token in enumerate(english_vocab_dict, start=1)}
h_wtoi = {token: idx for idx, token in enumerate(hindi_vocab_dict, start=1)}

# Id -> token tables: the exact inverses of the tables above.
e_itow = {idx: token for token, idx in e_wtoi.items()}
h_itow = {idx: token for token, idx in h_wtoi.items()}

# Display the two vocabulary sizes.
len(e_wtoi),len(h_wtoi)

(15866, 1369)

In [6]:
# Disabled alternative (presumably how the cached file below was produced -- confirm):
# dataset_googled = fn.make_dataset(dataset,e_wtoi,h_wtoi,64)
# SECURITY: pickle.load can execute arbitrary code -- only load files you created or trust.
with open('Datasets/fullGOOGLED_E&H','rb') as pickle_file:
    dataset_vecced = pickle.load(pickle_file)

In [7]:
# Decode the last sample of the second and third arrays back to text with the
# Hindi id -> token table, and display both for a visual check.
last_decoder_input_text = fn.mapToVocab(dataset_vecced[1][-1], h_itow)
last_decoder_target_text = fn.mapToVocab(dataset_vecced[2][-1], h_itow)
last_decoder_input_text, last_decoder_target_text

(' START__ उन्होंने अपनी एम ##् ##ब ##ुल ##ें ##स बनाने के लिए स ##िर ##्फ चार सरकारी अ ##न ##ु ##बंध जी ##ते',
 ' उन्होंने अपनी एम ##् ##ब ##ुल ##ें ##स बनाने के लिए स ##िर ##्फ चार सरकारी अ ##न ##ु ##बंध जी ##ते __END')

In [8]:
# Hyper-parameters shared by the model and the input pipeline below.
EMBED_DIMS = 256  # passed to the embedding, encoder and decoder layers
SEQ_LEN = 64      # length of the encoder and decoder input sequences
BATCH_SIZE = 32   # batch size used when batching the training dataset
# Disabled: trim every array to a whole number of batches.
# dataset_vecced = list(dataset_vecced)
# for i,input in enumerate(dataset_vecced):
#     dataset_vecced[i] = input[:int(len(input)/BATCH_SIZE)*BATCH_SIZE]
# dataset_vecced = tuple(dataset_vecced)

In [9]:
# Token-id inputs for the source sentence and the decoder stream, both of length SEQ_LEN.
encoder_input = tf.keras.layers.Input((SEQ_LEN,))
decoder_input = tf.keras.layers.Input((SEQ_LEN,))

# NOTE(review): the embedding sizes are len(vocab) while e_wtoi/h_wtoi hand out
# ids 1..len(vocab); confirm in classes.py that cs.Embeddings has room for the largest id.
x = cs.Embeddings(len(english_vocab_keys),EMBED_DIMS,SEQ_LEN,name='encoder_embedding')(encoder_input)
x = tf.keras.layers.BatchNormalization()(x)
y = cs.Embeddings(len(hindi_vocab_keys),EMBED_DIMS,SEQ_LEN,name='decoder_embedding')(decoder_input)
y = tf.keras.layers.BatchNormalization()(y)

# Six stacked encoder blocks. The positional arguments are defined in classes.py;
# note that their order differs between the encoder and the decoder class.
for i in range(6):
    x = cs.TransformerEncoder(6,EMBED_DIMS,2048,name=f'encoder_{i+1}')(x)
x = tf.keras.layers.BatchNormalization()(x)

# Six stacked decoder blocks, each called with the decoder stream and the final encoder output.
for i in range(6):
    y = cs.TransformerDecoder(2048,6,EMBED_DIMS,name=f'decoder_{i+1}')(y,x)
y = tf.keras.layers.BatchNormalization()(y)

# Classification head over the Hindi vocabulary.
dense_output = tf.keras.models.Sequential([
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(units=2048,activation='relu'),
    tf.keras.layers.Dropout(0.3),
    # The global policy is mixed_float16; keep the softmax output in float32 so the
    # probabilities and the loss are not computed in half precision.
    tf.keras.layers.Dense(units=len(hindi_vocab_keys),activation='softmax',dtype='float32')
])(y)

transformer = tf.keras.Model(inputs=[encoder_input,decoder_input],outputs=dense_output)
# NOTE(review): clipvalue and global_clipnorm are both set; newer Keras releases accept
# only one clipping option -- confirm against the installed version before upgrading.
transformer.compile(
    optimizer=tf.keras.optimizers.Adam(clipvalue=0.8,global_clipnorm=1,learning_rate=1e-5),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],  # Keras documents `metrics` as a list
)
transformer.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 64)]         0           []                               
                                                                                                  
 encoder_embedding (Embeddings)  (None, 64, 256)     4061696     ['input_1[0][0]']                
                                                                                                  
 batch_normalization (BatchNorm  (None, 64, 256)     1024        ['encoder_embedding[0][0]']      
 alization)                                                                                       
                                                                                                  
 encoder_1 (TransformerEncoder)  (None, 64, 256)     2629632     ['batch_normalization[0][0]']

In [10]:
# Number of batches kept for training.
NUM_TRAIN_BATCHES = 2667

# Inputs are (encoder ids, decoder ids) pairs; targets come from the third array.
Xtrain = tf.data.Dataset.from_tensor_slices((dataset_vecced[0],dataset_vecced[1]))
Ytrain = tf.data.Dataset.from_tensor_slices(dataset_vecced[2])

# take() must come before cache(): a cache that is never read to the end
# (cache().take(k)) is discarded by tf.data instead of being reused.
training_dataset = (
    tf.data.Dataset.zip((Xtrain,Ytrain))
    .batch(BATCH_SIZE)
    .take(NUM_TRAIN_BATCHES)
    .cache()
)
# Shuffle whole batches each epoch with a buffer covering every batch, and keep
# prefetch as the final step so it overlaps input preparation with training.
training_dataset = (
    training_dataset
    .shuffle(buffer_size=len(training_dataset),reshuffle_each_iteration=True)
    .prefetch(tf.data.AUTOTUNE)
)

In [11]:
# Previous learning rate was 1e-3 (the optimizer above now uses 1e-5).
# history = transformer.fit(x=training_dataset,epochs=30,batch_size=BATCH_SIZE)

In [12]:
# tf.keras.models.save_model(transformer,'JOD_MODELS/model_E&Htoken.h5')
# Replace `transformer` with the saved model; the project-defined layer classes
# have to be supplied by name so Keras can rebuild them.
custom_layers = {
    'Embeddings': cs.Embeddings,
    'TransformerEncoder': cs.TransformerEncoder,
    'TransformerDecoder': cs.TransformerDecoder,
}
transformer = tf.keras.models.load_model(
    'GOOD_MODELS/model_E&Htoken_good.h5',
    custom_objects=custom_layers,
)

In [15]:
# Spot-check: pass a short English question to the project helper fn.translate.
fn.translate('what are you doing')

'आप क्या कर रहे हैं'

In [16]:
# Spot-check: pass a short English statement to the project helper fn.translate.
fn.translate('education is the key to success')

'शिक्षा सफलता की कुंजी है'

In [18]:
# Spot-check: pass a longer English sentence with a relative clause to fn.translate.
fn.translate('he is the only one who can tell you about this topic')

'वह केवल एकमात्र है जो इस बारे में आप विषय के बारे में बता सकते हैं'

In [20]:
# Spot-check: pass a short first-person English sentence to fn.translate.
# (A stray trailing `fn.` that made this cell a syntax error has been removed.)
fn.translate('I am very happy')

'मैं बहुत खुश हूँ'