In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import tensorflow as tf
import random
import re
import json
# import contractions
from collections import Counter
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/english-vocab/words_alpha.txt
/kaggle/input/hindinlp/HindiNLP.csv
/kaggle/input/samanantar-en-to-hi-1/dataset_1.csv
/kaggle/input/english-hindi-dataset/Dataset_English_Hindi.csv
/kaggle/input/english-to-hindi-parallel-dataset/newdata.csv
/kaggle/input/english-to-hindi-dataset/english_to_hindi.txt
/kaggle/input/hindi-english-parallel-corpus/hindi_english_parallel.csv


In [2]:
np.random.seed(42)
tf.random.set_seed(42)
random.seed(42)

In [3]:
#english_dictionary
path0 = "/kaggle/input/english-vocab/words_alpha.txt"
#main_vocab_files
path1 = '/kaggle/input/hindinlp/HindiNLP.csv'
path2 = "/kaggle/input/english-to-hindi-dataset/english_to_hindi.txt"
#samanantar_file
path3 = '/kaggle/input/samanantar-en-to-hi-1/dataset_1.csv'
#newdata and dataset_english_hindi_file
path4 = "/kaggle/input/english-hindi-dataset/Dataset_English_Hindi.csv"
path5 = "/kaggle/input/english-to-hindi-parallel-dataset/newdata.csv"
# IIT bombay file
path6 = "/kaggle/input/hindi-english-parallel-corpus/hindi_english_parallel.csv"

In [4]:
# Lowercase, trim, and remove non-letter characters
def normalizeEng(s):
    s = str(s).lower().strip()
    s = re.sub(r"([.!?|,])", r" \1", s)
    s = re.sub(r"[^a-zA-Z!?.,]+", r" ", s)
    s = ' '.join(s.split())
    return s

def normalizeHin(s):
    s = str(s).lower().strip()
    s = re.sub(r"([.!?|,])", r" \1", s)
    s = re.sub(r"([\u0964])", r" \1", s)
    s = re.sub(r'[^\u0900-\u0965\u0970-\u097F!?.,]+',r" ",s)
    s = ' '.join(s.split())
    return s

def tf_lower_and_remove_punct(text):
    text = tf.strings.lower(text)
    text = tf.strings.regex_replace(text, "[^ a-z.?,!]", "")
    text = tf.strings.regex_replace(text, "[.?,!]", r" \0 ")
    text = tf.strings.strip(text)
    text = tf.strings.join(["[SOS]", text, "[EOS]"], separator=" ")
    return text


def tf_lower_and_remove_punct_1(text):
    text = tf.strings.regex_replace(text,'[^ \u0900-\u0965\u0970-\u097F?,!]','')
    text = tf.strings.regex_replace(text, "[\u0964?,!]", r" \0 ")
    text = tf.strings.strip(text)
    text = tf.strings.join(["[SOS]", text, "[EOS]"], separator=" ")
    return text


In [5]:
#function to create vocab using dataframe and a column
def vocab_forming(df,column):
    vocab = df[column].to_list()
    vocab = [i for j in vocab for i in j.split()]
    vocab = list(Counter(vocab).items())
    vocab = sorted(vocab,key = lambda x:x[1],reverse = True)
    vocab = [k for k,v in vocab]
    return vocab

#function to filter rows with proper eng & hin sequence length
def filter_proper_seq_len(df):
    #getting seq_lengths
    df['eng_seq_len'] = df['english'].apply(lambda x: len(x.split()))
    df['hin_seq_len'] = df['hindi'].apply(lambda x: len(x.split()))
    df['diff'] = df['eng_seq_len']-df['hin_seq_len']
    #first filter to reduce the operations
    df = df[(df['eng_seq_len']<30)&(df['hin_seq_len']<40)]
    
    #dividing the seq lengths in three categories
    #this is to allocate allowable difference in seq_lengths
    df_a = df[(df['eng_seq_len']<=10)]
    df_a = df_a[(df_a['diff']<3)&(df_a['diff']>-3)]
    
    df_b = df[(df['eng_seq_len']>10)&(df['eng_seq_len']<=20)]
    df_b = df_b[(df_b['diff']<5)&(df_b['diff']>-5)]
    
    df_c = df[(df['eng_seq_len']>20)&(df['eng_seq_len']<=30)]
    df_c = df_c[(df_c['diff']<8)&(df_c['diff']>-8)]
    
    df = pd.concat([df_a,df_b,df_c])
    df = df.reset_index(drop = True)
    return df

#to extract rows containing required vocab with 
def filtering_rows_using_vocab(text,vocab =None,temp =None):
    text = text.split()
    text_set = set(text)
    vocab_in_text = len(text_set)
    common_vocab = len(text_set&vocab)
    if common_vocab>=temp*vocab_in_text:
        return True
    return False

In [6]:
with open(path0,'r') as f:
    data = f.readlines()
    data = [i.strip('\n') for i in data]
main_english_vocab = set(data)

In [7]:
#df1 some clean sentences used in daily conversations
#this df is used as base df1 for vocabulary and also used in val_pairs to make perfect training in the model
df1 =pd.read_csv(path1)
col = df1.columns
df1 = df1.rename(columns ={
    col[0]:'english',
    col[1]:'hindi'
})
df1 = df1.dropna()

print("length of df of basic_texts: ",len(df1))

df1['english'] = df1['english'].apply(normalizeEng)
df1['hindi'] = df1['hindi'].apply(normalizeHin)

df1 = df1.drop_duplicates()
df1 = df1.reset_index(drop = True)

df1 = filter_proper_seq_len(df1)
print('length of df after filtering: ',len(df1))
eng_vocab_set_from_df1 = set(vocab_forming(df1,'english'))
print(len(eng_vocab_set_from_df1))

hin_vocab_set_from_df1 = set(vocab_forming(df1,'hindi'))
print(len(hin_vocab_set_from_df1))
df1.head(3)

length of df of basic_texts:  617
length of df after filtering:  350
1013
1155


Unnamed: 0,english,hindi,eng_seq_len,hin_seq_len,diff
0,"hello , how are you ?","नमस्ते , आप कैसे हैं ?",6,6,0
1,what is your name ?,आपका क्या नाम है ?,5,5,0
2,where are you from ?,आप कहाँ से हैं ?,5,5,0


In [8]:
df1.describe()

Unnamed: 0,eng_seq_len,hin_seq_len,diff
count,350.0,350.0,350.0
mean,13.014286,13.642857,-0.628571
std,5.200542,5.496641,1.810281
min,2.0,2.0,-6.0
25%,9.0,9.0,-2.0
50%,14.0,14.0,0.0
75%,16.0,17.0,0.0
max,29.0,29.0,6.0


In [9]:
#df2 is also clean sentences that are used in daily life,
#it is also used as base df2 for vocabulary and val_pairs to fit them perfectly
with open(path2,'r',encoding = 'utf-8') as f:
    data = f.readlines()
    data = [i.strip('\n').split('\t') for i in data]
df2 = pd.DataFrame(data,columns = ['english','hindi'])

#dropping null objects
df2 = df2.dropna()


print('initial_length of df2: ', len(df2))

#applying normalizeEng and Hin functions
df2['english'] = df2['english'].apply(normalizeEng)
df2['hindi'] = df2['hindi'].apply(normalizeHin)

#dropping duplicates
df2 = df2.drop_duplicates()

#filtering rows using filter_proper_seq_len funciton
df2 = filter_proper_seq_len(df2)
df2 = df2.reset_index(drop = True)

print('length of df after filtering: ',len(df2))

#checking vocab length
eng_vocab_set_from_df2 = set(vocab_forming(df2,'english'))
print(len(eng_vocab_set_from_df2))

hin_vocab_set_from_df2 = set(vocab_forming(df2,'hindi'))
print(len(hin_vocab_set_from_df2))
df2.head(3)

initial_length of df2:  29415
length of df after filtering:  26512
10200
11245


Unnamed: 0,english,hindi,eng_seq_len,hin_seq_len,diff
0,help !,बचाओ !,2,2,0
1,jump .,उछलो .,2,2,0
2,jump .,कूदो .,2,2,0


In [10]:
df2.describe()

Unnamed: 0,eng_seq_len,hin_seq_len,diff
count,26512.0,26512.0,26512.0
mean,3.107876,3.327399,-0.219523
std,2.398979,2.530312,0.829885
min,1.0,1.0,-6.0
25%,2.0,2.0,-1.0
50%,2.0,2.0,0.0
75%,4.0,4.0,0.0
max,26.0,28.0,4.0


In [11]:
#main english and hindi vocab to filter the rows from other dataset
selected_english_vocab = eng_vocab_set_from_df2.union(eng_vocab_set_from_df1)
print('selected_english_vocab_length: ',len(selected_english_vocab))
print('selected_english_vocab_length_in_dict: ',len(selected_english_vocab&main_english_vocab))
selected_hindi_vocab = hin_vocab_set_from_df2.union(hin_vocab_set_from_df1)
print('selected_hindi_vocab_length: ',len(selected_hindi_vocab))

selected_english_vocab_length:  10511
selected_english_vocab_length_in_dict:  8032
selected_hindi_vocab_length:  11645


In [12]:
#df3 data frame 
df3 = pd.read_csv(path3)
print(len(df3))
df3 = df3[['src','tgt']]
col = df3.columns
df3 = df3.rename(columns = {
    col[0]:'english',
    col[1]:'hindi'
})
df3.head(3)

1265714


Unnamed: 0,english,hindi
0,Add sugar and muddle the mixture.,इस मिश्रण को छानकर इसमें चीनी मिलाएं।
1,"We can't, Coop.","हम नहीं देख सकते, Coop."
2,The court had delivered the order last week.,हाई कोर्ट ने यह आदेश पिछले सप्ताह दिया।


In [13]:
#df4 data frame
df4 = pd.read_csv(path4)
print(len(df4))
col = df4.columns
df4 = df4.rename(columns = {
    col[0]:'english',
    col[1]:'hindi'
})
df4.head(3)

130476


Unnamed: 0,english,hindi
0,Help!,बचाओ!
1,Jump.,उछलो.
2,Jump.,कूदो.


In [14]:
#df5 dataframe
df5 = pd.read_csv(path5,index_col =0)
print(len(df5))
col = df5.columns
df5 = df5.rename(columns = {
    col[0]:'english',
    col[1]:'hindi'
})
df5.head(3)

177606


Unnamed: 0,english,hindi
0,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।


In [15]:
#df6 dataframe
df6 = pd.read_csv(path6)
df6 = df6[['english','hindi']]
print(len(df6))
df6.head()

1561841


Unnamed: 0,english,hindi
0,Give your application an accessibility workout,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें
1,Accerciser Accessibility Explorer,एक्सेर्साइसर पहुंचनीयता अन्वेषक
2,The default plugin layout for the bottom panel,निचले पटल के लिए डिफोल्ट प्लग-इन खाका
3,The default plugin layout for the top panel,ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका
4,A list of plugins that are disabled by default,उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...


In [16]:
#pipeline_0 for droping cleaning and extracting rows with proper sequence length
def pipeline_0(df_n):
    print('current_running_pipeline_0:')
    print('initial length of the dataframe: ',len(df_n))
    df = df_n.copy()
    df = df.dropna()
    df = df.reset_index(drop = True)
    
    #removing data with englishalpha and hindi_alpha
    english_alpha =df[df['hindi'].apply(lambda x: True if len(re.findall('[a-zA-Z]',x)) else False)]
    df = df.drop(english_alpha.index)
    df = df.reset_index(drop = True)
    hindi_alpha = df[df['english'].apply(lambda x: True if len(re.findall('[\u0900-\u097F]',x)) else False)]
    df = df.drop(hindi_alpha.index)
    df = df.reset_index(drop = True)


    
    #initial filtering using sequence lenghts
    df['eng_seq_len'] = df['english'].apply(lambda x: len(x.split()))
    df['hin_seq_len'] = df['hindi'].apply(lambda x: len(x.split()))
    df['diff'] = df['eng_seq_len']-df['hin_seq_len']
    df = df[(df['eng_seq_len']<30)&(df['hin_seq_len']<30)]
    df = df[(df['diff']<10)&(df['diff']>-10)]
    
    # extracting respective alhpabets
    df['english'] = df['english'].apply(normalizeEng)
    df['hindi'] = df['hindi'].apply(normalizeHin)

    #dropping improper rows_1
    empty_columns = df[(df['english']=='')|(df['hindi'] == '')]
    df = df.drop(empty_columns.index)
        
    #dropping improper rows_2
    indexes1 =df[(df['english'].str.len()>20)&(df['hindi'].str.len()<7)].index 
    df = df.drop(indexes1)
    
    #dropping improper rows_3
    indexes2 = df[(df['english'].str.len()<8)&(df['hindi'].str.len()>14)].index
    df = df.drop(indexes2)

    #filtering again using seq_lengths using filter_proper_seq_len function
    df = filter_proper_seq_len(df)
    
    print('length of df after proper_cleaning: ',len(df))
    return df

In [17]:
#filtering rows 
def pipeline_1(df):
    print('current_running_pipeline_1: ')

    #checking the vocab length in the dataframe
    eng_vocab1 =vocab_forming(df,'english')
    print('total_english_vocab in df: ',len(eng_vocab1))

    hin_vocab1 = vocab_forming(df,'hindi')
    print('total_english_vocab in df: ',len(hin_vocab1))
    
    #filtering the rows using required vocab
    df_a = df[
        (df['english'].apply(filtering_rows_using_vocab,args =[selected_english_vocab,0.96]))&
         (df['hindi'].apply(filtering_rows_using_vocab,args = [selected_hindi_vocab,0.96]))
         ]
    print('final_length of df: ',len(df_a))
    return df_a

In [18]:
def pipeline_2(df):
    global tf_lower_and_remove_punct,tf_lower_and_remove_punct_1,make_dataset
    print('current_running_pipeline_2: ')
    text_pairs = list(zip(df['english'],df['hindi']))
    random.seed(42)
    random.shuffle(text_pairs)
    
   # Assigning  data for validation and testing set
    num_val_samples = int(0.1*len(text_pairs))
    num_test_samples = int(0.05*len(text_pairs))
    num_train_samples = len(text_pairs) -  num_val_samples - num_test_samples
    train_pairs = text_pairs[:num_train_samples]
    val_pairs = text_pairs[num_train_samples:num_train_samples+num_val_samples]
    test_pairs = text_pairs[num_train_samples+num_val_samples:]

    vocab_size_1 = 15000
    vocab_size_2 = 15000
    sequence_length = 25

    english_vectorization = tf.keras.layers.TextVectorization(
        max_tokens = vocab_size_1,
        output_mode='int',
        output_sequence_length=sequence_length,
    #     ragged = True,
        standardize = tf_lower_and_remove_punct

    )

    hindi_vectorization = tf.keras.layers.TextVectorization(
        max_tokens = vocab_size_2,
        output_mode='int',
        output_sequence_length=sequence_length+1,
    #     ragged = True,
        standardize = tf_lower_and_remove_punct_1
    )

    eng_text = [pair[0] for pair in train_pairs]
    hin_text = [pair[1] for pair in train_pairs]
    english_vectorization.adapt(eng_text)
    hindi_vectorization.adapt(hin_text)

    english_vocab = english_vectorization.get_vocabulary()
    hindi_vocab = hindi_vectorization.get_vocabulary()
    vocab_size_eng = len(english_vocab)
    vocab_size_hin = len(hindi_vocab)
    print('english_vocab_length: ',vocab_size_eng)
    print('hindi_vocab_length: ',vocab_size_hin)
    

    batch_size = 64
    def format_dataset(eng,hin):
        eng = english_vectorization(eng)
        hin = hindi_vectorization(hin)
        tar_in = hin[:,:-1]
        tar_out = hin[:,1:]
        return (eng,tar_in),tar_out

    def make_dataset(pairs):
        eng_texts, hin_texts = zip(*pairs)
        eng_texts = list(eng_texts)
        hin_texts = list(hin_texts)
        dataset = tf.data.Dataset.from_tensor_slices((eng_texts, hin_texts))
        dataset = dataset.batch(batch_size)
        dataset = dataset.map(format_dataset)
        return dataset

    train_ds = make_dataset(train_pairs)
    val_ds = make_dataset(val_pairs)
    test_ds = make_dataset(test_pairs)

    return (train_ds,val_ds,test_ds,english_vectorization,hindi_vectorization,
train_pairs,val_pairs,test_pairs,vocab_size_eng,vocab_size_hin)

In [19]:
def main_pipeline(df):
    temp_df = pipeline_0(df)
    temp_df = pipeline_1(temp_df)
    return temp_df

In [20]:
#df_list for looping
df_list = [df3,df4,df5,df6]
#temporary list to prevent changes in orignal list
temp_df_list =[]
j=1
#loop to apply pipeline for every df in df_list
for i in df_list:
    print('current_df_running: ',j)
    print()
    temp_df = main_pipeline(i)
    print('final_english_vocab in df: ',len(vocab_forming(temp_df,'english')))
    print('final_hindi_vocab in df: ',len(vocab_forming(temp_df,'hindi')))
    temp_df_list.append(temp_df)
    j+=1
    print()
    

current_df_running:  1

current_running_pipeline_0:
initial length of the dataframe:  1265714
length of df after proper_cleaning:  735134
current_running_pipeline_1: 
total_english_vocab in df:  116738
total_english_vocab in df:  177180
final_length of df:  57335
final_english_vocab in df:  5728
final_hindi_vocab in df:  6393

current_df_running:  2

current_running_pipeline_0:
initial length of the dataframe:  130476
length of df after proper_cleaning:  85633
current_running_pipeline_1: 
total_english_vocab in df:  41181
total_english_vocab in df:  44466
final_length of df:  12755
final_english_vocab in df:  3655
final_hindi_vocab in df:  4075

current_df_running:  3

current_running_pipeline_0:
initial length of the dataframe:  177606
length of df after proper_cleaning:  123046
current_running_pipeline_1: 
total_english_vocab in df:  53033
total_english_vocab in df:  62420
final_length of df:  11035
final_english_vocab in df:  3172
final_hindi_vocab in df:  3274

current_df_running: 

In [21]:
#loop to destroy duplicates and stored in new temp_df_list1
temp_df_list1 = []
for i in range(4):
    temp_df = temp_df_list[i].drop_duplicates(subset=['english'])
    temp_df = temp_df.drop_duplicates(subset = ['hindi'])
    temp_df = temp_df[['english','hindi']]
    temp_df_list1.append(temp_df)
df1 = df1[['english','hindi']]
df2 = df2[['english','hindi']]

In [22]:
#checking the lengths of dataframe
for i in temp_df_list1:
    print(len(i))

50255
10078
8601
77503


In [23]:
print('total_english_vocab_in the temp_df_list: ',len(vocab_forming(pd.concat(temp_df_list1),'english')))
print('total_hindi_vocab_in the temp_df_list: ',len(vocab_forming(pd.concat(temp_df_list1),'hindi')))

total_english_vocab_in the temp_df_list:  9986
total_hindi_vocab_in the temp_df_list:  11512


In [24]:
# #this is to sample df if df is too long but not in our case
# temp_df_list2=[]
# for i in temp_df_list1:
#     sample_len=50000
#     if len(i)<sample_len:
#         sample_len =len(i)
#     if len(i)>100000:
#         sample_len = len(i)//2
# #     print('current_sample_len: ,'sample_len)
#     temp_df = i.sample(n = sample_len,random_state =42)
# #     temp_df = pd.concat([temp_df,temp_df.sample(n = int(0.1*len(temp_df)),random_state = 42)])
# #     print('total_df_length: ',len(temp_df))
#     temp_df_list2.append(temp_df)

In [25]:
#concating all df's in temp_df_list1
df_a =pd.concat([*temp_df_list1])
#shuffling df_a
df_a = df_a.sample(n = len(df_a),random_state = 42)

print('length of temp_df_list:',len(df_a))
print('length of unique_elements in temp_df_list:',len(df_a.drop_duplicates()))


#concating df_a with df1 and df2 to form final_df
final_df = pd.concat([df_a,*[df1]*2,
                      *[df2,df2.sample(n = int(0.3*len(df2)),random_state = 42)]])
print('final_df_length: ',len(final_df))
print('unique elements: ',len(final_df.drop_duplicates()))

length of temp_df_list: 146437
length of unique_elements in temp_df_list: 121015
final_df_length:  181602
unique elements:  127466


In [26]:
print(len(set(vocab_forming(final_df,'english'))))
print(len(set(vocab_forming(final_df,'hindi'))))
print(len(set(vocab_forming(final_df,'english'))&selected_english_vocab))
print(len(set(vocab_forming(final_df,'hindi'))&selected_hindi_vocab))

10548
11794
10511
11645


In [27]:
(train_ds,val_ds,test_ds,english_vectorization,hindi_vectorization,
train_pairs,val_pairs,test_pairs,vocab_size_eng,vocab_size_hin) = pipeline_2(final_df)

current_running_pipeline_2: 
english_vocab_length:  10438
hindi_vocab_length:  11697


In [28]:
print(len(train_ds))
print(len(val_ds))
print(len(test_ds))

2412
284
142


In [29]:
#customized val_pairs
batch_size = 64
def format_dataset(eng,hin):
    eng = english_vectorization(eng)
    hin = hindi_vectorization(hin)
    tar_in = hin[:,:-1]
    tar_out = hin[:,1:]
    return (eng,tar_in),tar_out

def make_dataset(pairs):
    eng_texts, hin_texts = zip(*pairs)
    eng_texts = list(eng_texts)
    hin_texts = list(hin_texts)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, hin_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    return dataset

#both df1 and df2 are main english_sentence_pair that need to be trained so they are present in val_pairs to 
# increase the fitting in train_data
new_val_pairs = val_pairs+list(zip(df1['english'],df1['hindi']))+list(zip(df2['english'],df2['hindi']))
new_val_pairs = list(set(new_val_pairs))
print(len(new_val_pairs))
new_val_ds = make_dataset(new_val_pairs)
print(len(new_val_ds))

38770
606


In [30]:
my_string = ""
for i,j in new_val_pairs:
    my_string += i+"\t"+j+"\n"
with open("customized_val_pairs.txt",'w',encoding = 'utf-8') as f:
    f.writelines(my_string)

In [31]:
for (inp,tar_in),tar_out in train_ds.take(1):
    pass
print(inp.shape)
print(tar_in.shape)
print(tar_out.shape)

(64, 25)
(64, 25)
(64, 25)


In [32]:
#positional encoding
def positional_encoding(positions,d_model):
    position = np.arange(positions)[:,np.newaxis]
    k = np.arange(d_model)[np.newaxis,:]
    i = k//2
    angle_rates = 1/(np.power(10000,(2*i)/np.float32(d_model)))
    angle_rads = position*angle_rates
#     print('looks of anglerads,: ',angle_rads.shape)
    angle_rads[:,0::2] = np.sin(angle_rads[:,0::2])
    angle_rads[:,1::2] = np.cos(angle_rads[:,1::2])
    pos_encoding = angle_rads[np.newaxis, ...]
#     print('new axis angle_rads: ',pos_encoding.shape)
    return tf.cast(pos_encoding,dtype = tf.float32)

In [33]:
class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self,vocab_size,d_model,mask_zero=True):
        super().__init__()
        self.d_model = d_model
        self.mask_zero= mask_zero
        self.token_embedding = tf.keras.layers.Embedding(input_dim = vocab_size,output_dim= d_model,mask_zero=mask_zero)
        self.pos_encoding = positional_encoding(128,d_model)
    
    def call(self, x):
        length = tf.shape(x)[1]
        x = self.token_embedding(x)
        # This factor sets the relative scale of the embedding and positonal_encoding.
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = x + self.pos_encoding[:, :length, :]
        return x
    def compute_mask(self,inputs,mask =None):
        if self.mask_zero:
            return tf.not_equal(inputs,0)
        else:
            return None

In [34]:
def FullyConnected(embedding_dim,dense_dim):
    feedforward = tf.keras.Sequential([
        tf.keras.layers.Dense(dense_dim,activation='relu'),
        tf.keras.layers.Dense(embedding_dim)
    ])
    return feedforward

In [35]:
#Encoder Layer
class EncoderLayer(tf.keras.layers.Layer):
    def __init__(self,embedding_dim,num_heads,dense_dim,dropout_rate =0.1,layernorm_eps =1e-6,**kwargs):
        super().__init__(**kwargs)

        self.mha = tf.keras.layers.MultiHeadAttention(
            num_heads=num_heads,
            key_dim = embedding_dim,
            dropout = dropout_rate
        )

        self.ffn = FullyConnected(embedding_dim=embedding_dim,dense_dim=dense_dim)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=layernorm_eps)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=layernorm_eps)
        self.dropout_ffn = tf.keras.layers.Dropout(dropout_rate)
        self.supports_masking = True
    def call(self,x,mask=None):
        # print('current_mask is: ',mask)
        if mask is not None:
            padding_mask = tf.cast(mask[:,None,:],dtype = tf.int32)
        else:
            padding_mask = None
#         print('current_padding_mask: ',padding_mask)
        attn_output = self.mha( x, x, x,attention_mask = padding_mask)
        layernorm1_output = self.layernorm1(x + attn_output)
        feedforward_output = self.ffn(layernorm1_output)
        dropout_ffn_output = self.dropout_ffn(feedforward_output)
        encoder_layer_output = self.layernorm2(layernorm1_output + dropout_ffn_output)

        return encoder_layer_output

In [36]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self,embedding_dim,num_heads,dense_dim,num_layers,input_vocab_size,
                 dropout_rate =0.1,layernorm_eps =1e-6):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers

        self.embedding = PositionalEmbedding(input_vocab_size,self.embedding_dim)
        self.enc_layers = [EncoderLayer(embedding_dim=self.embedding_dim,
                                        num_heads=num_heads,
                                        dense_dim=dense_dim,
                                        dropout_rate=dropout_rate,
                                        layernorm_eps=layernorm_eps) 
                           for _ in range(self.num_layers)]
        
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.supports_masking =True
    def call(self,x,mask =None):
        # print('current mask: ',mask)
        x = self.embedding(x)
        x = self.dropout(x)
        for i in range(self.num_layers):
            x = self.enc_layers[i](x)
        return x


In [37]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self,embedding_dim,num_heads,dense_dim,dropout_rate =0.1,layernorm_eps =1e-6):
        super().__init__()
        self.mha1 = tf.keras.layers.MultiHeadAttention(key_dim=embedding_dim,num_heads = num_heads,dropout=dropout_rate)
        self.mha2 = tf.keras.layers.MultiHeadAttention(key_dim = embedding_dim,num_heads = num_heads,dropout =dropout_rate)
        self.ffn = FullyConnected(embedding_dim=embedding_dim,dense_dim=dense_dim)

        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=layernorm_eps)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=layernorm_eps)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=layernorm_eps)
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.supports_masking = True
    def call(self,x,enc_output,mask=None):
        casual_mask = self.get_casual_attention_mask(x)
        if mask is not None:
            padding_mask = tf.cast(mask[:,None,:],dtype = tf.int32)
#             padding_mask = tf.minimum(padding_mask,casual_mask)
        else:
            padding_mask = None

        attn_out1 ,attn_scores = self.mha1(x,x,x,attention_mask=casual_mask,
                                           return_attention_scores = True)
        self.last_attn_scores = attn_scores
        Q = self.layernorm1(x+attn_out1)

        attn_out2 = self.mha2(query=Q,key=enc_output,value=enc_output,
                             attention_mask = padding_mask)
        attn_out2 = self.layernorm2(Q+attn_out2)

        ffn_output = self.ffn(attn_out2)
        ffn_output = self.dropout(ffn_output)
        decoder_output = self.layernorm3(attn_out2+ffn_output)
        
        return decoder_output

    def get_casual_attention_mask(self,x):
        input_shape = tf.shape(x)
        batch_size,sequence_length = input_shape[0],input_shape[1]
        i = tf.range(sequence_length)[:,None]
        j = tf.range(sequence_length)
        mask = tf.cast(i>=j,tf.int32)
        mask = tf.reshape(mask,(1,input_shape[1],input_shape[1]))
        mult = tf.concat(
            [
                tf.expand_dims(batch_size,-1),
                tf.convert_to_tensor([1,1]),
            ],
            axis =0,
        )
        return tf.tile(mask,mult)


In [38]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self,embedding_dim,num_heads,dense_dim,num_layers,output_vocab_size,dropout_rate=0.1,layernorm_eps=1e-6):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers

        self.embedding = PositionalEmbedding(output_vocab_size,self.embedding_dim)
        self.dec_layer = [DecoderLayer(embedding_dim=embedding_dim,
                               num_heads=num_heads,
                               dense_dim=dense_dim,
                               dropout_rate=dropout_rate,
                               layernorm_eps=layernorm_eps) 
                        for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self,x,enc_output):
        attention_weights = {}
        x = self.embedding(x)
        x = self.dropout(x)

        for i in range(self.num_layers):
            x = self.dec_layer[i](x,enc_output)
        #update attention_weights dictionary with the attention weights of block 1 and block 2
            # attention_weights['decoder_layer{}_block1_self_att'.format(i+1)] = block1
            # attention_weights['decoder_layer{}_block2_decenc_att'.format(i+1)] = block2
        return x
        

In [39]:
class Transformer(tf.keras.Model):
    def __init__(self, embedding_dim, num_heads,dense_dim, num_layers,input_vocab_size, 
               output_vocab_size, dropout_rate=0.1, layernorm_eps=1e-6):
        super(Transformer, self).__init__()

        self.encoder = Encoder(num_layers=num_layers,
                               embedding_dim=embedding_dim,
                               num_heads=num_heads,
                               dense_dim=dense_dim,
                               input_vocab_size=input_vocab_size,
                               dropout_rate=dropout_rate,
                               layernorm_eps=layernorm_eps)

        self.decoder = Decoder(num_layers=num_layers, 
                               embedding_dim=embedding_dim,
                               num_heads=num_heads,
                               dense_dim=dense_dim,
                               output_vocab_size=output_vocab_size, 
                               dropout_rate=dropout_rate,
                               layernorm_eps=layernorm_eps)

        self.final_layer = tf.keras.layers.Dense(output_vocab_size, activation=tf.nn.log_softmax)
    
    def call(self, inputs):
        # print(type(inputs))
        input_sentence,output_sentence = inputs
        # print('inputs okay')
        enc_output = self.encoder(input_sentence)
        # print('enc_output_generated')
        
        # call self.decoder with the appropriate arguments to get the decoder output
        # dec_output.shape == (batch_size, tar_seq_len, fully_connected_dim)
        dec_output = self.decoder(output_sentence,enc_output)
        # print('dec_output_generated')
        
        # pass decoder output through a linear layer and softmax (~1 line)
        logits = self.final_layer(dec_output)
        ### END CODE HERE ###
        try:
        # Drop the keras mask, so it doesn't scale the losses/metrics.
        # b/250038731
            del logits._keras_mask
        except AttributeError:
            pass

        # Return the final output and the attention weights.
        return logits

In [40]:
embed_dim = 128
heads = 2
latent = 512
no_of_layers = 2

transformer1 = Transformer(embedding_dim=embed_dim,num_heads=heads,dense_dim=latent,
                          num_layers=no_of_layers,input_vocab_size=vocab_size_eng,
                           output_vocab_size=vocab_size_hin)

In [41]:
#optimizer
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super().__init__()

    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    step = tf.cast(step, dtype=tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [42]:
logits = transformer1((inp,tar_in))
transformer1.summary()
learning_rate = CustomSchedule(d_model=embed_dim)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)
def masked_loss(y_true, y_pred):
    
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
    loss = loss_fn(y_true, y_pred)
    
    # Check which elements of y_true are padding
    mask = tf.cast(y_true != 0, loss.dtype)
    
    loss *= mask
    # Return the total.
    return tf.reduce_sum(loss)/tf.reduce_sum(mask)


def masked_acc(y_true, y_pred):
    y_pred = tf.argmax(y_pred, axis=-1)
    y_pred = tf.cast(y_pred, y_true.dtype)
    match = tf.cast(y_true == y_pred, tf.float32)
    mask = tf.cast(y_true != 0, tf.float32)
    match*= mask

    return tf.reduce_sum(match)/tf.reduce_sum(mask)




In [43]:
transformer1.compile(optimizer =optimizer,loss = masked_loss,metrics=[masked_acc])


In [48]:
epochs =15
# steps_per_epoch =500
transformer1.fit(train_ds,
                 epochs =epochs,
#                  steps_per_epoch = steps_per_epoch,
                 validation_data=new_val_ds,
#                  validation_steps =20,
                callbacks = tf.keras.callbacks.EarlyStopping(patience =1)
                )

Epoch 1/15
[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 19ms/step - loss: 0.8963 - masked_acc: 0.8194 - val_loss: 1.6543 - val_masked_acc: 0.7450
Epoch 2/15
[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 18ms/step - loss: 0.8689 - masked_acc: 0.8249 - val_loss: 1.6562 - val_masked_acc: 0.7460


<keras.src.callbacks.history.History at 0x7b42e9da6710>

In [49]:
x1 = transformer1.evaluate(train_ds)
print(x1)
x2 = transformer1.evaluate(val_ds)
print(x2)
x3 = transformer1.evaluate(test_ds)
print(x3)

[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 6ms/step - loss: 0.9344 - masked_acc: 0.8001
[0.8272944688796997, 0.8199390769004822]
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 2.2357 - masked_acc: 0.6403
[2.2312262058258057, 0.6417375206947327]
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 2.2844 - masked_acc: 0.6377
[2.287414073944092, 0.6375607252120972]


In [50]:
transformer1.evaluate(new_val_ds)

[1m606/606[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - loss: 1.6469 - masked_acc: 0.7463


[1.656206727027893, 0.7460039854049683]

In [51]:
#save the dataframe
final_df.to_csv('english_hindi_10k_vocab.csv')

In [52]:
#save the vectorization layers
english_vectorization_config = english_vectorization.get_config()
english_vectorization_config.pop('standardize', None)
english_vocab = english_vectorization.get_vocabulary()
with open('english_vectorization_config.json', 'w', encoding='utf-8') as f:
    json.dump(english_vectorization_config, f)
    
with open('english_vocab.json', 'w', encoding='utf-8') as f:
    json.dump(english_vocab, f)
    
hindi_vectorization_config = hindi_vectorization.get_config()
hindi_vectorization_config.pop('standardize', None)
hindi_vocab = hindi_vectorization.get_vocabulary()
with open('hindi_vectorization_config.json', 'w', encoding='utf-8') as f:
    json.dump(hindi_vectorization_config, f)
    
with open('hindi_vocab.json', 'w', encoding='utf-8') as f:
    json.dump(hindi_vocab, f)

In [53]:
#save the model weights
transformer1.save_weights('english_hindi_model.weights.h5')

In [54]:
hindi_vocab = hindi_vectorization.get_vocabulary()
hindi_index_lookup = dict(zip(range(len(hindi_vocab)), hindi_vocab))
max_decoded_sentence_length = 25

def decode_sentence(input_sentence):
    tokenized_input_sentence = english_vectorization([input_sentence])
    decoded_sentence = "[SOS]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = hindi_vectorization([decoded_sentence])[:, :-1]
        predictions = transformer1([tokenized_input_sentence, tokenized_target_sentence])
        sampled_token_index = tf.argmax(predictions[0, i, :]).numpy().item(0)
        sampled_token = hindi_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token
        if sampled_token == "[EOS]":
            break
    return decoded_sentence

In [55]:
random.seed(42)
for _ in range(5):
    input_sentence,output_sentence = random.choice(train_pairs)
    # input_sentence = input_sentence.lower()
    # input_sentence = input_sentence.translate(str.maketrans('', '', strip_chars))
    translated = decode_sentence(input_sentence)
    print(f"input: {input_sentence}")
    print(f"translated: {translated}")
    print(f"original  : [SOS] {output_sentence} [EOS]")
    print()

input: initial file chooser folder
translated: [SOS] आरंभिक फ़ाइल चयनक फ़ोल्डर [EOS]
original  : [SOS] प्रारंभिक फ़ाइल चयनकर्ता फ़ोल्डर [EOS]

input: so the first thing to figure out about this hyperbola is ,
translated: [SOS] तो , पहली बात यह है के बारे में इस अति परवलय समझ से बाहर है [EOS]
original  : [SOS] तो , पहली बात यह है के बारे में इस अति परवलय समझ से बाहर है [EOS]

input: . what does right to information mean ?
translated: [SOS] सूचना अधिकार का क्या अर्थ है ? [EOS]
original  : [SOS] . सूचना अधिकार का क्या अर्थ है ? [EOS]

input: but i don t think that it s strange at all .
translated: [SOS] लेकिन मुझे लगता है कि यह सब अजीब है [EOS]
original  : [SOS] पर मेरे खयाल से यह बिलकुल भी अजीब नहीं है । [EOS]

input: that is also a positive .
translated: [SOS] यह भी सकारात्मक है । [EOS]
original  : [SOS] वह भी सकारात्मक अर्थ में . [EOS]



In [56]:
random.seed(42)
for _ in range(5):
    input_sentence,output_sentence = random.choice(val_pairs)
    # input_sentence = input_sentence.lower()
    # input_sentence = input_sentence.translate(str.maketrans('', '', strip_chars))
    translated = decode_sentence(input_sentence)
    print(f"input: {input_sentence}")
    print(f"translated: {translated}")
    print(f"original  : [SOS] {output_sentence} [EOS]")
    print()

input: and carry the .
translated: [SOS] हासिल हुआ [EOS]
original  : [SOS] और को हासिल ले लेते हैं [EOS]

input: india is a country in south asia .
translated: [SOS] भारत दक्षिण एशिया का एक देश है । [EOS]
original  : [SOS] भारत देश उत्तरी गोलार्ध में स्थित है । [EOS]

input: we should first make sure that it s not in this world .
translated: [SOS] जो हम दुनिया को यकीन न करें वह इस संसार में नहीं जानता । [EOS]
original  : [SOS] हमे पहले यह निश्चित करना चाहिए कि यह इस दुनिया का नहीं है । [EOS]

input: job type
translated: [SOS] कार्य प्रकार [EOS]
original  : [SOS] कार्य क़िस्म [EOS]

input: but we kind of chose this shape .
translated: [SOS] लेकिन हमने इस तरह की आकृति को चुना [EOS]
original  : [SOS] लेकिन हमने इस तरह की आकृति को चुना . [EOS]



In [57]:
random.seed(42)
for _ in range(5):
    input_sentence,output_sentence = random.choice(test_pairs)
    # input_sentence = input_sentence.lower()
    # input_sentence = input_sentence.translate(str.maketrans('', '', strip_chars))
    translated = decode_sentence(input_sentence)
    print(f"input: {input_sentence}")
    print(f"translated: {translated}")
    print(f"original  : [SOS] {output_sentence} [EOS]")
    print()


input: so we wanted to win this match .
translated: [SOS] इसलिए हम इस मैच को जीतना चाहते हैं । [EOS]
original  : [SOS] इसलिए हम इस मैच को जीतना चाहते थे . [EOS]

input: it will have a common name all over the country .
translated: [SOS] यह एक आम नाम के सभी लोग ही हैं । [EOS]
original  : [SOS] देश भर में दुकान के लिए एक ही नाम होगा । [EOS]

input: please select a folder below
translated: [SOS] कृपया नीचे कोई फ़ोल्डर चुनें [EOS]
original  : [SOS] कृपया एक फोल्डर नीचे से चुनें [EOS]

input: and second of all ,
translated: [SOS] और दूसरी बात , [EOS]
original  : [SOS] और दूसरा , [EOS]

input: end of a short break
translated: [SOS] छोटे ब्रेक की समाप्ति [EOS]
original  : [SOS] छोटे ब्रेक की समाप्ति [EOS]



In [58]:
my_text = ["where are you ?",
          "i am not one of them ",
          "what do you think about them ?",
          "today is a good day",
          "who are you?",]

In [59]:
for i in my_text:
    print('sentence: ',i)
    print('translation: ',decode_sentence(i))
    print()

sentence:  where are you ?
translation:  [SOS] तुम कहाँ हो ? [EOS]

sentence:  i am not one of them 
translation:  [SOS] मैं उनमें से एक नहीं हूं [EOS]

sentence:  what do you think about them ?
translation:  [SOS] क्या आपको क्या लगता है ? [EOS]

sentence:  today is a good day
translation:  [SOS] आज के दिन अच्छा है । [EOS]

sentence:  who are you?
translation:  [SOS] तुम कौन हो ? [EOS]



In [60]:
#Running the transfomer without masked accuracy and masked loss

In [61]:
#simpler model
embed_dim2 = 128
heads2 = 2
latent2 = 512
no_of_layers2 = 2
transformer2 = Transformer(embedding_dim=embed_dim2,num_heads=heads2,dense_dim=latent2,
                          num_layers=no_of_layers2,input_vocab_size=vocab_size_eng,
                           output_vocab_size=vocab_size_hin)

In [62]:
transformer2((inp,tar_in))
transformer2.summary()
normal_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
transformer2.compile(optimizer = 'adam',loss = normal_loss,metrics = ['accuracy'])


In [63]:
epochs =15
# steps_per_epoch =500
transformer2.fit(train_ds,
                 epochs =epochs,
#                  steps_per_epoch = steps_per_epoch,
                 validation_data=new_val_ds,
#                  validation_steps =20,
                callbacks = tf.keras.callbacks.EarlyStopping(patience =3)
                )

Epoch 1/15


W0000 00:00:1727362172.497338     111 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert


[1m2410/2412[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 18ms/step - accuracy: 0.7997 - loss: 1.6050

W0000 00:00:1727362221.579760     113 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert


[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.7998 - loss: 1.6046

W0000 00:00:1727362228.400420     111 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert
W0000 00:00:1727362233.384636     111 assert_op.cc:38] Ignoring Assert operator compile_loss/sparse_categorical_crossentropy/SparseSoftmaxCrossEntropyWithLogits/assert_equal_1/Assert/Assert


[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 24ms/step - accuracy: 0.7998 - loss: 1.6044 - val_accuracy: 0.9045 - val_loss: 0.5390
Epoch 2/15
[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 19ms/step - accuracy: 0.8730 - loss: 0.6524 - val_accuracy: 0.9237 - val_loss: 0.3794
Epoch 3/15
[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 19ms/step - accuracy: 0.8922 - loss: 0.4992 - val_accuracy: 0.9312 - val_loss: 0.3316
Epoch 4/15
[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 19ms/step - accuracy: 0.9033 - loss: 0.4275 - val_accuracy: 0.9347 - val_loss: 0.3141
Epoch 5/15
[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 19ms/step - accuracy: 0.9166 - loss: 0.3509 - val_accuracy: 0.9396 - val_loss: 0.2970
Epoch 7/15
[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 19ms/step - accuracy: 0.9213 - loss: 0.3257 - val_accuracy: 0.9396 - val_loss: 0.2992
Epoch 8/15
[1m

<keras.src.callbacks.history.History at 0x7b42e24b6920>

In [65]:
y1 = transformer2.evaluate(train_ds)
print(y1)
y2 = transformer2.evaluate(val_ds)
print(y2)
y3 = transformer2.evaluate(test_ds)
print(y3)

[1m2412/2412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 6ms/step - accuracy: 0.9311 - loss: 0.2694
[0.24400301277637482, 0.9367454648017883]
[1m284/284[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 7ms/step - accuracy: 0.8990 - loss: 0.5667
[0.5677531957626343, 0.8987598419189453]
[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9010 - loss: 0.5673
[0.562375545501709, 0.9008325338363647]


In [66]:
transformer2.evaluate(new_val_ds)

[1m606/606[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - accuracy: 0.9446 - loss: 0.2985


[0.29748889803886414, 0.9448650479316711]

In [67]:
hindi_vocab = hindi_vectorization.get_vocabulary()
hindi_index_lookup = dict(zip(range(len(hindi_vocab)), hindi_vocab))
max_decoded_sentence_length = 25

def decode_sentence_2(input_sentence):
    tokenized_input_sentence = english_vectorization([input_sentence])
    decoded_sentence = "[SOS]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = hindi_vectorization([decoded_sentence])[:, :-1]
        predictions = transformer2([tokenized_input_sentence, tokenized_target_sentence])
        sampled_token_index = tf.argmax(predictions[0, i, :]).numpy().item(0)
        sampled_token = hindi_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token
        if sampled_token == "[EOS]":
            break
    return decoded_sentence

In [68]:
random.seed(42)
for _ in range(5):
    input_sentence,output_sentence = random.choice(train_pairs)
    # input_sentence = input_sentence.lower()
    # input_sentence = input_sentence.translate(str.maketrans('', '', strip_chars))
    translated = decode_sentence_2(input_sentence)
    print(f"input: {input_sentence}")
    print(f"translated: {translated}")
    print(f"original  : [SOS] {output_sentence} [EOS]")
    print()

input: initial file chooser folder
translated: [SOS] आरंभिक फ़ाइल चयनक फ़ोल्डर [EOS]
original  : [SOS] प्रारंभिक फ़ाइल चयनकर्ता फ़ोल्डर [EOS]

input: so the first thing to figure out about this hyperbola is ,
translated: [SOS] तो पहली बात यह है कि इस अति परवलय समझ से बाहर है [EOS]
original  : [SOS] तो , पहली बात यह है के बारे में इस अति परवलय समझ से बाहर है [EOS]

input: . what does right to information mean ?
translated: [SOS] जरूरी जानकारी है इसका क्या मतलब है ? [EOS]
original  : [SOS] . सूचना अधिकार का क्या अर्थ है ? [EOS]

input: but i don t think that it s strange at all .
translated: [SOS] लेकिन मुझे नहीं लगता कि यह अजीब है । [EOS]
original  : [SOS] पर मेरे खयाल से यह बिलकुल भी अजीब नहीं है । [EOS]

input: that is also a positive .
translated: [SOS] यह भी सकारात्मक है । [EOS]
original  : [SOS] वह भी सकारात्मक अर्थ में . [EOS]



In [69]:
random.seed(42)
for _ in range(5):
    input_sentence,output_sentence = random.choice(val_pairs)
    # input_sentence = input_sentence.lower()
    # input_sentence = input_sentence.translate(str.maketrans('', '', strip_chars))
    translated = decode_sentence_2(input_sentence)
    print(f"input: {input_sentence}")
    print(f"translated: {translated}")
    print(f"original  : [SOS] {output_sentence} [EOS]")
    print()

input: and carry the .
translated: [SOS] और हासिल कर [EOS]
original  : [SOS] और को हासिल ले लेते हैं [EOS]

input: india is a country in south asia .
translated: [SOS] दक्षिण एशिया का एक देश है । [EOS]
original  : [SOS] भारत देश उत्तरी गोलार्ध में स्थित है । [EOS]

input: we should first make sure that it s not in this world .
translated: [SOS] हमें यह पूरी दुनिया में यह है कि यह दुनिया में नहीं होना चाहिए [EOS]
original  : [SOS] हमे पहले यह निश्चित करना चाहिए कि यह इस दुनिया का नहीं है । [EOS]

input: job type
translated: [SOS] कार्य प्रकार [EOS]
original  : [SOS] कार्य क़िस्म [EOS]

input: but we kind of chose this shape .
translated: [SOS] लेकिन हमने इस तरह का आकार बदल दिया है [EOS]
original  : [SOS] लेकिन हमने इस तरह की आकृति को चुना . [EOS]



In [70]:
random.seed(42)
for _ in range(5):
    input_sentence,output_sentence = random.choice(test_pairs)
    # input_sentence = input_sentence.lower()
    # input_sentence = input_sentence.translate(str.maketrans('', '', strip_chars))
    translated = decode_sentence_2(input_sentence)
    print(f"input: {input_sentence}")
    print(f"translated: {translated}")
    print(f"original  : [SOS] {output_sentence} [EOS]")
    print()

input: so we wanted to win this match .
translated: [SOS] तो हम इस मैच को जीतना चाहते हैं । [EOS]
original  : [SOS] इसलिए हम इस मैच को जीतना चाहते थे . [EOS]

input: it will have a common name all over the country .
translated: [SOS] देश के साथ यह सब कुछ ही नाम होगा [EOS]
original  : [SOS] देश भर में दुकान के लिए एक ही नाम होगा । [EOS]

input: please select a folder below
translated: [SOS] कृपया निम्न फ़ोल्डर चुनें [EOS]
original  : [SOS] कृपया एक फोल्डर नीचे से चुनें [EOS]

input: and second of all ,
translated: [SOS] और दूसरी चीज़ें , [EOS]
original  : [SOS] और दूसरा , [EOS]

input: end of a short break
translated: [SOS] छोटे ब्रेक का अंत [EOS]
original  : [SOS] छोटे ब्रेक की समाप्ति [EOS]



In [None]:
# we can see some differences in masked accuracy translations and normal accuracy translations