## Making a Punjabi Transliteration Model for Smooth JavaScript Deployment

In [184]:
import tensorflow as tf
import json
import pandas as pd
import keras


In [185]:
def read_json_to_dataframe(file_path):
    """
    Read a JSON file and return its contents as a pandas DataFrame.

    The file is always decoded as UTF-8 so that non-ASCII text (the Gurmukhi
    words in this dataset) is read correctly regardless of the platform's
    default locale encoding.

    Args:
        file_path (str): Path to the JSON file.

    Returns:
        pd.DataFrame | None: DataFrame built from the parsed JSON, or None
        (after printing an error message) when the file does not exist or
        does not contain valid JSON.
    """
    try:
        # Do not rely on the locale default encoding: on some platforms it is
        # not UTF-8 and would corrupt or reject the Punjabi strings.
        with open(file_path, "r", encoding="utf-8") as json_file:
            data = json.load(json_file)
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
        return None
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON data in '{file_path}'.")
        return None
    return pd.DataFrame(data)


In [186]:

# Load the three dataset splits (train / validation / test) in one pass.
train_df, val_df, test_df = (
    read_json_to_dataframe(split_path)
    for split_path in ('pan_train.json', 'pan_valid.json', 'pan_test.json')
)


In [187]:
train_df.head()

Unnamed: 0,unique_identifier,native word,english word,source,score
0,pan1,‡®Ö‡©±‡®ú‡®ï‡©±‡®≤‡©ç‡®π,ajjkallh,Dakshina,
1,pan2,‡®ï‡©Å‡©±‡®ï‡®∞‡©Ä,kookerii,Dakshina,
2,pan3,‡®ß‡®Æ‡®®‡©Ä‡®Ü‡®Ç,dhamniyan,Dakshina,
3,pan4,‡®ñ‡®º‡©Å‡®¶‡®ó‡®º‡®∞‡®ú‡®º‡©Ä,khudgarzi,Dakshina,
4,pan5,‡®Ö‡®ó‡®≤‡©á‡®∞‡®æ,aglera,Dakshina,


In [188]:
test_df.head()

Unnamed: 0,unique_identifier,native word,english word,source
0,pan1,‡®™‡®≤‡®ü‡®æ‡®â‡®Ç‡®¶‡®ø‡®Ü‡®Ç,paltaundiyaan,AK-Freq
1,pan2,‡®Æ‡©à‡®∏‡®≤‡©Ä‡®≤‡©Ä‡®Ü,messleeleeya,AK-Freq
2,pan3,‡®™‡©ç‡®∞‡®Æ‡©ã‡®∏‡®®‡®æ‡®Ç,prmosanaan,AK-Freq
3,pan4,‡®™‡©±‡©Å‡®ú‡©á,pujje,AK-Freq
4,pan5,‡®¶‡®¨‡©ú‡©ç‡®π‡©Ä‡®ñ‡®æ‡®®‡®æ,dabadeehkhana,AK-Freq


In [189]:
val_df.head()

Unnamed: 0,unique_identifier,native word,english word,source,score
0,pan1,‡®Æ‡®ø‡®∏‡®º‡©á‡®≤‡©á,michelle,Wikidata,
1,pan2,‡®µ‡©Ä‡®®‡®∏,venus,Wikidata,
2,pan3,‡®´‡®º‡®∞‡©Ä‡®¶,fareed,Wikidata,
3,pan4,‡®Ø‡®æ‡®Æ‡®ø‡®®‡©Ä,yamini,Wikidata,
4,pan5,‡®ú‡©á‡®Æ‡®∏,james,Wikidata,


#### Testing with Character-Level Embeddings for Both English (EN) and Punjabi (PA)

In [190]:
def get_X_and_y(dataframe:pd.DataFrame):
    """Split a word-pair frame into model inputs and targets.

    Args:
        dataframe: Frame with an "english word" and a "native word" column.

    Returns:
        tuple[list, list]: ``(X, y)`` where ``X`` holds the "english word"
        values and ``y`` the "native word" values, both in row order.
    """
    english_words, native_words = (
        dataframe[column].tolist() for column in ("english word", "native word")
    )
    return (english_words, native_words)

In [191]:
# Extract (romanized, Gurmukhi) word lists for each split.
(X_train, y_train), (X_test, y_test), (X_val, y_val) = map(
    get_X_and_y, (train_df, test_df, val_df)
)

In [192]:
# Mini model: set MINI_MODEL_SIZE to an int to train/validate on only the first
# N word pairs for quick experiments. None keeps the full dataset, which is what
# the previous self-assignments (X_train = X_train, ...) effectively did.
MINI_MODEL_SIZE = None

X_train = X_train[:MINI_MODEL_SIZE]
y_train = y_train[:MINI_MODEL_SIZE]
X_val = X_val[:MINI_MODEL_SIZE]
y_val = y_val[:MINI_MODEL_SIZE]

In [193]:
len(X_train), len(y_train), len(X_test), len(y_test), len(X_val), len(y_val)

(514724, 514724, 11237, 11237, 8880, 8880)

In [194]:
X_train[:5], y_train[:5]

(['ajjkallh', 'kookerii', 'dhamniyan', 'khudgarzi', 'aglera'],
 ['‡®Ö‡©±‡®ú‡®ï‡©±‡®≤‡©ç‡®π', '‡®ï‡©Å‡©±‡®ï‡®∞‡©Ä', '‡®ß‡®Æ‡®®‡©Ä‡®Ü‡®Ç', '‡®ñ‡®º‡©Å‡®¶‡®ó‡®º‡®∞‡®ú‡®º‡©Ä', '‡®Ö‡®ó‡®≤‡©á‡®∞‡®æ'])

##### Splitting the words into chars

In [195]:
def words_to_chars(words:list[str]):
    """Insert a single space between the characters of every word.

    Args:
        words: Words to split, e.g. ``["ab", "c"]``.

    Returns:
        list[str]: One string per input word with its characters separated
        by spaces, e.g. ``["a b", "c"]``.
    """
    spaced_words = []
    for word in words:
        characters = list(word)
        spaced_words.append(" ".join(characters))
    return spaced_words


In [196]:
# Re-encode every split as space-separated characters, in place.
X_train, y_train, X_test, y_test, X_val, y_val = map(
    words_to_chars,
    (X_train, y_train, X_test, y_test, X_val, y_val),
)

In [197]:
y_test[:5], X_test[:5]

(['‡®™ ‡®≤ ‡®ü ‡®æ ‡®â ‡®Ç ‡®¶ ‡®ø ‡®Ü ‡®Ç',
  '‡®Æ ‡©à ‡®∏ ‡®≤ ‡©Ä ‡®≤ ‡©Ä ‡®Ü',
  '‡®™ ‡©ç ‡®∞ ‡®Æ ‡©ã ‡®∏ ‡®® ‡®æ ‡®Ç',
  '‡®™ ‡©± ‡©Å ‡®ú ‡©á',
  '‡®¶ ‡®¨ ‡©ú ‡©ç ‡®π ‡©Ä ‡®ñ ‡®æ ‡®® ‡®æ'],
 ['p a l t a u n d i y a a n',
  'm e s s l e e l e e y a',
  'p r m o s a n a a n',
  'p u j j e',
  'd a b a d e e h k h a n a'])

##### Preprocessing Punjabi characters into decoder inputs and labels with start and end tokens

In [198]:
def decoder_and_labels(y_text:list[str]):
    """Build the decoder input strings and the label strings for each target.

    Args:
        y_text: Target strings (here, space-separated characters).

    Returns:
        tuple[list[str], list[str]]: ``(decoder_inputs, labels)`` where every
        decoder input is the target prefixed with ``"start "`` and every label
        is the same target suffixed with ``" end"``.
    """
    decoder_inputs = []
    for chars in y_text:
        decoder_inputs.append("start " + chars)

    labels = []
    for chars in y_text:
        labels.append(chars + " end")

    return (decoder_inputs, labels)


In [199]:
# Derive the "start ..." decoder inputs and "... end" labels for every split.
(
    (y_train_decoder, y_train_labels),
    (y_test_decoder, y_test_labels),
    (y_val_decoder, y_val_labels),
) = map(decoder_and_labels, (y_train, y_test, y_val))


In [200]:
y_val_decoder[:5], y_val_labels[:5]

(['start ‡®Æ ‡®ø ‡®∏ ‡®º ‡©á ‡®≤ ‡©á',
  'start ‡®µ ‡©Ä ‡®® ‡®∏',
  'start ‡®´ ‡®º ‡®∞ ‡©Ä ‡®¶',
  'start ‡®Ø ‡®æ ‡®Æ ‡®ø ‡®® ‡©Ä',
  'start ‡®ú ‡©á ‡®Æ ‡®∏'],
 ['‡®Æ ‡®ø ‡®∏ ‡®º ‡©á ‡®≤ ‡©á end',
  '‡®µ ‡©Ä ‡®® ‡®∏ end',
  '‡®´ ‡®º ‡®∞ ‡©Ä ‡®¶ end',
  '‡®Ø ‡®æ ‡®Æ ‡®ø ‡®® ‡©Ä end',
  '‡®ú ‡©á ‡®Æ ‡®∏ end'])

##### Text Vectorization

In [201]:
# Visualizing the data
X_train[:2], y_train[:2], y_train_decoder[:2], y_train_labels[:2]

(['a j j k a l l h', 'k o o k e r i i'],
 ['‡®Ö ‡©± ‡®ú ‡®ï ‡©± ‡®≤ ‡©ç ‡®π', '‡®ï ‡©Å ‡©± ‡®ï ‡®∞ ‡©Ä'],
 ['start ‡®Ö ‡©± ‡®ú ‡®ï ‡©± ‡®≤ ‡©ç ‡®π', 'start ‡®ï ‡©Å ‡©± ‡®ï ‡®∞ ‡©Ä'],
 ['‡®Ö ‡©± ‡®ú ‡®ï ‡©± ‡®≤ ‡©ç ‡®π end', '‡®ï ‡©Å ‡©± ‡®ï ‡®∞ ‡©Ä end'])

In [202]:
from keras.layers import TextVectorization

In [203]:
# MAX_TOKENS caps each vocabulary; SEQ_LENGTH is the fixed length every vectorized
# sequence is padded to (the model's Input layers must use the same length).
MAX_TOKENS = 100
SEQ_LENGTH = 50

# One vectorizer per script. The inputs are already space-separated characters,
# so each vocabulary entry is a single character (plus the start/end markers).
# NOTE(review): pad_to_max_tokens appears to have no effect here -- the reported
# vocabulary sizes (61 and 28) are below MAX_TOKENS. Confirm and consider dropping it.
english_vectorizer = TextVectorization(max_tokens=MAX_TOKENS, output_sequence_length=SEQ_LENGTH, pad_to_max_tokens=True)

punjabi_vectorizer = TextVectorization(max_tokens=MAX_TOKENS, output_sequence_length=SEQ_LENGTH, pad_to_max_tokens=True)

english_vectorizer.adapt(X_train)
# Adapt on "start ... end" strings so both marker tokens receive vocabulary ids
# (they end up at ids 2 and 3, after '' and '[UNK]').
punjabi_vectorizer.adapt([f"start {s} end" for s in y_train])


In [204]:
punjabi_vectorizer.get_vocabulary()[:5]

['', '[UNK]', 'start', 'end', '‡®æ']

In [205]:
# Vocabulary sizes (including the '' and '[UNK]' entries); used below as the
# Embedding input_dim values and as the width of the decoder's softmax layer.
punjabi_vocab_len = len(punjabi_vectorizer.get_vocabulary())
eng_vocab_len = len(english_vectorizer.get_vocabulary())

In [206]:
punjabi_vectorizer(y_train_decoder[:1])

<tf.Tensor: shape=(1, 50), dtype=int64, numpy=
array([[ 2, 33, 31, 20, 10, 31,  9, 32, 30,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0]])>

In [207]:
punjabi_vectorizer(y_train_labels[:1])

<tf.Tensor: shape=(1, 50), dtype=int64, numpy=
array([[33, 31, 20, 10, 31,  9, 32, 30,  3,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0]])>

In [208]:
punjabi_vocab_len, eng_vocab_len

(61, 28)

## Creating the Model 🤓

In [209]:

# Encoder: integer-encoded English characters -> per-step outputs and final LSTM state.
# The input length is tied to SEQ_LENGTH (the vectorizers' output_sequence_length) and
# passed as a real 1-tuple; the previous `shape=(50)` was just the integer 50.
encoder_inputs = keras.layers.Input(shape=(SEQ_LENGTH,), dtype=tf.int32)
#eng_vector = english_vectorizer(encoder_inputs)
eng_embs = keras.layers.Embedding(input_dim=eng_vocab_len, output_dim=512, input_length=SEQ_LENGTH)(encoder_inputs)
# Bidirectional LSTM with 256 units per direction -> 512 features per time step.
eng_lstm = keras.layers.Bidirectional(keras.layers.LSTM(256, return_sequences=True))(eng_embs)
# state_h / state_c seed the decoder's first LSTM; encoder_outputs is only referenced
# by the attention variant that is currently commented out in the decoder cell.
encoder_outputs, state_h, state_c = keras.layers.LSTM(512, return_sequences=True, return_state=True)(eng_lstm)


In [210]:
# Decoder: integer-encoded Punjabi characters (prefixed with "start") -> per-step
# distribution over the Punjabi vocabulary.
# The input length is tied to SEQ_LENGTH and passed as a real 1-tuple; the previous
# `shape=(50)` was just the integer 50.
decoder_inputs = keras.layers.Input(shape=(SEQ_LENGTH,), dtype=tf.int32)
#punjabi_vector = punjabi_vectorizer(decoder_inputs)
punjabi_embs = keras.layers.Embedding(input_dim=punjabi_vocab_len, output_dim=512, input_length=SEQ_LENGTH)(decoder_inputs)
# The first decoder LSTM starts from the encoder's final hidden/cell state.
punjabi_lstm = keras.layers.LSTM(512, return_sequences=True)(punjabi_embs, initial_state=[state_h, state_c])
decoder_lstm = keras.layers.LSTM(512, return_sequences=True)(punjabi_lstm)
# Attention variant, currently disabled:
#context_vector = keras.layers.Attention()([decoder_lstm,encoder_outputs])
#context_concat = keras.layers.Concatenate()([decoder_lstm, context_vector])
dense_layer = keras.layers.Dense(64, activation='relu')(decoder_lstm)
# One softmax over the Punjabi vocabulary for every time step.
decoder_outputs = keras.layers.Dense(punjabi_vocab_len, activation='softmax')(dense_layer)


In [211]:
# Training model: takes [encoder token ids, decoder token ids] and outputs a per-step
# softmax over the Punjabi vocabulary.
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [212]:
# The sparse loss lets the labels stay as integer token ids (no one-hot encoding).
# NOTE(review): the Embedding layers are created without mask_zero, so padded positions
# (id 0) appear to count toward both the loss and the reported accuracy -- confirm
# this is intended.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [213]:
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_10 (InputLayer)       [(None, 50)]                 0         []                            
                                                                                                  
 embedding_9 (Embedding)     (None, 50, 512)              14336     ['input_10[0][0]']            
                                                                                                  
 input_11 (InputLayer)       [(None, 50)]                 0         []                            
                                                                                                  
 bidirectional_4 (Bidirecti  (None, 50, 512)              1574912   ['embedding_9[0][0]']         
 onal)                                                                                      

In [214]:
def _vectorize_as_int32(vectorizer, texts):
    """Run `texts` through `vectorizer` and cast the resulting token ids to int32.

    int32 matches the dtype declared on the model's Input layers. The vectorizer
    output is already an integer tensor, so no extra tf.constant(...) wrapping
    is needed before the cast.
    """
    return tf.cast(vectorizer(texts), dtype=tf.int32)


# NOTE: these assignments replace the string lists with id tensors, so this cell
# must run exactly once after the preprocessing cells (re-running it would feed
# integer tensors back into the text vectorizers).
y_train_labels = _vectorize_as_int32(punjabi_vectorizer, y_train_labels)
y_val_labels = _vectorize_as_int32(punjabi_vectorizer, y_val_labels)

y_train_decoder = _vectorize_as_int32(punjabi_vectorizer, y_train_decoder)
y_val_decoder = _vectorize_as_int32(punjabi_vectorizer, y_val_decoder)

X_train = _vectorize_as_int32(english_vectorizer, X_train)
X_val = _vectorize_as_int32(english_vectorizer, X_val)




In [166]:
y_train_labels[:2]

<tf.Tensor: shape=(2, 50), dtype=int16, numpy=
array([[33, 31, 20, 10, 31,  9, 32, 30,  3,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0],
       [10, 23, 31, 10,  5,  7,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0]], dtype=int16)>

In [None]:
# epochs is only an upper bound: EarlyStopping ends training after 5 epochs without
# improvement and restores the best weights; ModelCheckpoint keeps the best model on disk.
early_stopping = keras.callbacks.EarlyStopping(restore_best_weights=True, patience=5)
checkpoint = keras.callbacks.ModelCheckpoint(save_best_only=True, verbose=0, filepath="model2/model")

model.fit(
    [X_train, y_train_decoder],
    y_train_labels,
    validation_data=([X_val, y_val_decoder], y_val_labels),
    epochs=1000,
    callbacks=[early_stopping, checkpoint],
)

In [217]:
import tensorflowjs as tfjs

# Export the in-memory `model` through the tensorflowjs converter; the converted
# artifacts are written to the "TFJS" directory for use from JavaScript.
tfjs.converters.save_keras_model(model, artifacts_dir="TFJS")

  saving_api.save_model(


In [None]:
# Reload the model from 'model2/model', the path the ModelCheckpoint callback writes to
# during training. This replaces the in-memory `model` used by the cells below.
model = keras.models.load_model('model2/model')

In [169]:
# Convert the test split to tensors.
# NOTE(review): unlike the train/validation tensors (vectorized and cast to int32),
# the labels here stay int64 and the decoder inputs / X_test stay raw strings, and
# none of these are used by the visible cells below -- confirm before using them
# for evaluation.
y_test_labels = tf.constant(punjabi_vectorizer(y_test_labels), dtype=tf.int64)

y_test_decoder = tf.constant(y_test_decoder, dtype=tf.string)

X_test = tf.constant(X_test, dtype=tf.string)


In [218]:
import numpy as np
def transliterate(text:str, max_len:int=30):
    """Greedily transliterate one romanized word into Gurmukhi.

    The word is split into characters and vectorized for the encoder. The
    decoder input starts as "start" and grows by the most probable character
    at each step until the model emits "end" or the step limit is reached.

    Args:
        text: A single romanized word (no spaces).
        max_len: Maximum number of characters to generate. It is clamped to
            SEQ_LENGTH because the model only produces that many time steps.

    Returns:
        str: The predicted Gurmukhi string; "" when `text` is empty or blank.
    """
    if not text.strip():
        # Nothing to transliterate; avoid running the model on an all-padding input.
        return ""

    # The vocabulary never changes during decoding, so fetch it once instead of
    # rebuilding the list on every step.
    vocabulary = punjabi_vectorizer.get_vocabulary()

    # int32 matches the dtype declared on the model's Input layers (the rest of
    # the notebook feeds int32 as well).
    encoder_tokens = english_vectorizer(" ".join(text))
    encoder_tokens = tf.expand_dims(tf.cast(encoder_tokens, dtype=tf.int32), axis=0)

    decoder_input = "start"
    predicted_chars = []
    for step in range(min(max_len, SEQ_LENGTH)):
        decoder_tokens = punjabi_vectorizer(decoder_input)
        decoder_tokens = tf.expand_dims(tf.cast(decoder_tokens, dtype=tf.int32), axis=0)

        prediction = model.predict([encoder_tokens, decoder_tokens], verbose=0)
        # Output position `step` is the prediction for the character that follows
        # the `step + 1` tokens currently in the decoder input.
        pred_text = vocabulary[np.argmax(prediction[0, step, :])]
        if pred_text == 'end':
            break
        predicted_chars.append(pred_text)
        decoder_input = decoder_input + " " + pred_text

    # Drop any "start" marker the model may have emitted mid-sequence.
    return "".join(predicted_chars).replace('start', "")


        


In [220]:
def transliterate_text(text:str):
    """Transliterate a space-separated sentence word by word.

    Args:
        text: Romanized text whose words are separated by single spaces.

    Returns:
        str: The transliterated words joined by single spaces. Empty tokens
        (from leading, trailing or repeated spaces) are passed through as
        empty strings, so the original spacing is preserved.
    """
    words = text.split(" ")
    # Empty tokens are not sent to the model: it would decode an all-padding
    # input and inject arbitrary characters into the output.
    return " ".join([transliterate(word) if word else "" for word in words])


In [221]:
transliterate_text("mera naam sarbjeet hai ate mai bahut changa munda ha")

'‡®Æ‡©á‡®∞‡®æ ‡®®‡®æ‡®Æ ‡®∏‡®∞‡®¨‡®ú‡©Ä‡®§ ‡®π‡©à ‡®è‡®ü‡©Ä‡®à ‡®Æ‡©à ‡®¨‡®æ‡®π‡©Å‡®§ ‡®ö‡©∞‡®ó‡®æ ‡®Æ‡©Å‡©∞‡®°‡®æ ‡®π‡®æ'

In [176]:
transliterate("sarb")

'‡®∏‡®∞‡®¨'

In [222]:
def top_2_words(text:str):
    """Greedily transliterate one word and record the two best characters per step.

    Decoding follows the single most probable character at every step (the same
    greedy path as `transliterate`), while also collecting the two most probable
    vocabulary entries for that step.

    Args:
        text: A single romanized word (no spaces).

    Returns:
        tuple[str, list[list[str]]]: ``(word, letters)`` where ``word`` is the
        greedy transliteration and ``letters[i]`` is
        ``[second_best, best]`` for step ``i``. The last entry may contain
        ``'end'``. Returns ``("", [])`` when `text` is empty or blank.
    """
    if not text.strip():
        # Nothing to transliterate; avoid running the model on an all-padding input.
        return ("", [])

    # The vocabulary never changes during decoding, so fetch it once.
    vocabulary = punjabi_vectorizer.get_vocabulary()

    # int32 matches the dtype declared on the model's Input layers.
    encoder_tokens = english_vectorizer(" ".join(text))
    encoder_tokens = tf.expand_dims(tf.cast(encoder_tokens, dtype=tf.int32), axis=0)

    decoder_input = "start"
    chosen_chars = []
    letters = []

    for step in range(30):
        decoder_tokens = punjabi_vectorizer(decoder_input)
        decoder_tokens = tf.expand_dims(tf.cast(decoder_tokens, dtype=tf.int32), axis=0)

        prediction = model.predict([encoder_tokens, decoder_tokens], verbose=0)
        # argsort is ascending, so the last two indices are the two most probable
        # entries, with the best one last.
        top_indexes = np.argsort(prediction[0, step, :])[-2:]
        top_letters = [vocabulary[i] for i in top_indexes]
        letters.append(top_letters)

        pred_text = top_letters[-1]
        if pred_text == 'end':
            break
        chosen_chars.append(pred_text)
        decoder_input = decoder_input + " " + pred_text

    # Drop any "start" marker the model may have emitted mid-sequence.
    return ("".join(chosen_chars).replace('start', ""), letters)
     

    
    

In [228]:
top_2_words("bhainn")

('‡®≠‡©à‡®£', [['‡®ò', '‡®≠'], ['‡®æ', '‡©à'], ['‡®®', '‡®£'], ['‡®æ', 'end']])

In [227]:
from itertools import product

def generate_word_combinations(sublists):
    """Return every string formed by taking one item from each sublist, in order.

    Args:
        sublists: Iterable of iterables of strings, one inner iterable per
            position in the word.

    Returns:
        list[str]: All concatenations, in the order produced by the Cartesian
        product of the sublists (the last sublist varies fastest).
    """
    words = []
    for combination in product(*sublists):
        words.append(''.join(combination))
    return words

# Example usage: expand the per-step top-2 candidates for "bhainn" into every
# candidate spelling. The greedy word itself is discarded here (`_`).
_,sublists = top_2_words("bhainn")
possible_words = generate_word_combinations(sublists)
# Candidates may contain the literal "end" marker from the final step; strip it
# for display and show only the last five combinations.
print([x.replace("end","") for x in possible_words[-5:]])
print("Number of possible words:", len(possible_words))


['‡®≠‡®æ‡®£', '‡®≠‡©à‡®®‡®æ', '‡®≠‡©à‡®®', '‡®≠‡©à‡®£‡®æ', '‡®≠‡©à‡®£']
Number of possible words: 16


In [None]:
punjabi_vectorizer.get_vocabulary()

['',
 '[UNK]',
 'start',
 'end',
 '‡®æ',
 '‡®∞',
 '‡®∏',
 '‡©Ä',
 '‡®ø',
 '‡®≤',
 '‡®ï',
 '‡®®',
 '‡©ã',
 '‡®Æ',
 '‡©á',
 '‡®ü',
 '‡®™',
 '‡®µ',
 '‡®Ç',
 '‡®ó',
 '‡®ú',
 '‡®¨',
 '‡®§',
 '‡©Å',
 '‡®¶',
 '‡©à',
 '‡©∞',
 '‡®º',
 '‡®Ü',
 '‡®°',
 '‡®π',
 '‡©±',
 '‡©ç',
 '‡®Ö',
 '‡®´',
 '‡©Ç',
 '‡®ö',
 '‡®à',
 '‡®á',
 '‡®ñ',
 '‡®ê',
 '‡®£',
 '‡©å',
 '‡®â',
 '‡©ú',
 '‡®ß',
 '‡®•',
 '‡®≠',
 '‡®ì',
 '‡®Ø',
 '‡®è',
 '‡®ò',
 '‡®õ',
 '‡®ä',
 '‡®ù',
 '‡®†',
 '‡®¢',
 '‡®î',
 '‡®û',
 '‡®ô',
 '‡©≤']