In [None]:
# Remote location of the Dakshina dataset archive and the local file name it is saved under.
src_url = "https://storage.googleapis.com/gresearch/dakshina/dakshina_dataset_v1.0.tar"
src_zip = "dakshina_dataset_v1.0.tar"
# Directory inside the extracted archive that is uploaded to W&B as the raw-data artifact.
DATA_SRC="dakshina_dataset_v1.0/ta/lexicons"
# File names (with a leading "/") that are appended to the downloaded artifact directory.
DATA_TRAIN_SRC = "/ta.translit.sampled.train.tsv"
DATA_VAL_SRC = "/ta.translit.sampled.dev.tsv" 
# NOTE(review): DATA_TEST_SRC is not read anywhere in the visible notebook.
DATA_TEST_SRC = "/ta.translit.sampled.test.tsv"
#TRAIN_IMAGES_PER_LABEL = 1000
#TEST_IMAGES_PER_LABEL = 200
# NOTE(review): BALANCED_SPLITS is not referenced in the visible notebook; looks like a leftover, confirm before removing.
BALANCED_SPLITS = {"train" : 900, "val" : 100}
# W&B project name and the name of the raw-data artifact.
PROJECT_NAME = "CS6910 ASSIGNMENT 3"
dataset='dakshina-dataset'

In [None]:
%%capture
# Download the dataset archive to `src_zip` and unpack it into the working directory.
# %%capture (which must stay on the first line of the cell) hides the command output.
!curl -SL $src_url > $src_zip
!tar -xf $src_zip

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os
import numpy as np
import matplotlib.pyplot as plt
from random import shuffle
import pandas as pd
import keras
from keras.layers import Input, LSTM, Dense, Embedding, GRU, SimpleRNN, Dropout, Activation, dot, concatenate, TimeDistributed
from keras.models import Model

!pip3 install tensorflow -qqq
!pip3 install wandb -qqq
import wandb
!wandb login
from wandb.keras import WandbCallback

[K     |████████████████████████████████| 1.8MB 7.4MB/s 
[K     |████████████████████████████████| 102kB 13.0MB/s 
[K     |████████████████████████████████| 133kB 36.7MB/s 
[K     |████████████████████████████████| 174kB 28.3MB/s 
[K     |████████████████████████████████| 71kB 11.8MB/s 
[?25h  Building wheel for subprocess32 (setup.py) ... [?25l[?25hdone
  Building wheel for pathtools (setup.py) ... [?25l[?25hdone
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter: 
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [None]:
import matplotlib.font_manager as fm

!wget https://github.com/arul20be/Latha-Tamil-Font/blob/master/system/fonts/NotoSansTamil-Regular.ttf
fm.fontManager.ttflist += fm.createFontList(['NotoSansTamil-Regular.ttf'])
plt.rc('font', family='NotoSansTamil-Regular-Regular')
!mv NotoSansTamil-Regular.ttf /usr/share/fonts/truetype/liberation/

In [None]:
# List the fonts that fontconfig reports for lang=en.
# NOTE(review): the previous cell installs a Tamil font, so `:lang=ta` may have been intended; confirm.
!fc-list :lang=en

Uploading Data: only run once

In [None]:

# Open a W&B run whose only job is to upload the raw dataset files.
run = wandb.init(project=PROJECT_NAME, entity='cs6910krsrd',job_type="upload")

# create an artifact for all the raw data
raw_data_at = wandb.Artifact(dataset, type="raw_data")

# Attach the contents of the extracted lexicon directory (DATA_SRC) to the artifact.
raw_data_at.add_dir(DATA_SRC)

# save artifact to W&B
run.log_artifact(raw_data_at)
run.finish()

Downloading Data

In [None]:
# Open a W&B run used only to fetch the raw-data artifact.
run = wandb.init(project=PROJECT_NAME, entity='cs6910krsrd',job_type="download")

# Query W&B for an artifact and mark it as input to this run
artifact = run.use_artifact(dataset+':latest')

# Download the artifact's contents
# `artifact_dir` is the local directory later prefixed to DATA_TRAIN_SRC / DATA_VAL_SRC.
artifact_dir = artifact.download()
run.finish()

[34m[1mwandb[0m: Currently logged in as: [33mcs6910krsrd[0m (use `wandb login --relogin` to force relogin)


VBox(children=(Label(value=' 0.05MB of 0.05MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

In [None]:
# ---------------------------------------------------------------------------
# Load the lexicons
# ---------------------------------------------------------------------------
colnames = ["ntv", "rmn", "nAtt"]  # native, romanized and number of attestations
# na_filter=False keeps every cell as the literal text found in the file.
df_train = pd.read_csv(artifact_dir + DATA_TRAIN_SRC, sep="\t", names=colnames, na_filter=False)
df_val = pd.read_csv(artifact_dir + DATA_VAL_SRC, sep="\t", names=colnames, na_filter=False)

# Names / directories used when the trained models are exported as W&B artifacts.
MODEL_NAME = "Seq2Seq"
FINAL_MODEL_DIR = "trained_model"
ENCODER = "encoder"
DECODER = "decoder"

# ---------------------------------------------------------------------------
# Notebook-level hyperparameter defaults.
# train() reads its own values from wandb.config; the names below are what the
# module-level cells further down see (e.g. n_decoder, latent_dim).
# ---------------------------------------------------------------------------
batch_size = 64    # Batch size for training.
epochs = 20        # Number of epochs to train for.
latent_dim = 256   # Latent dimensionality of the encoding and decoding space (LSTM/GRU/RNN)
num_samples = 10000  # Number of samples to train on.
embed_dim = 16     # embedding size
n_encoder = 2      # number of encoder layers
n_decoder = 2      # number of decoder layers
cell_type = "LSTM"  # "GRU" "RNN" # Cell type of encoder and decoder
do = 0.2           # dropout
beam_size = 3

# ---------------------------------------------------------------------------
# Words. Targets fed to the decoder are wrapped in "\t" (start) and "\n" (end).
# ---------------------------------------------------------------------------
input_texts = df_train.rmn.to_list()                               # input words (romanized)
target_texts = ['\t' + word + '\n' for word in df_train.ntv]       # target words (native)
val_input_texts = df_val.rmn.to_list()                             # input words (romanized)
val_target_texts = df_val.ntv.to_list()                            # target words (native), unwrapped
valid_target_texts = ['\t' + word + '\n' for word in val_target_texts]

# ---------------------------------------------------------------------------
# Vocabularies, built from the training split only. "".join() collects the
# characters in one pass instead of repeatedly concatenating strings via
# Series.sum().
# ---------------------------------------------------------------------------
input_characters = sorted(set("".join(input_texts)))     # input vocabulary (romanized letters)
input_characters.append(' ')                             # padding symbol
target_characters = sorted(set("".join(df_train.ntv)))   # target vocabulary (native letters)
target_characters.extend([' ', '\t', '\n'])              # padding, start and end symbols

num_encoder_tokens = len(input_characters)    # size of input vocabulary
num_decoder_tokens = len(target_characters)   # size of target vocabulary
max_encoder_seq_length = max(map(len, input_texts))    # max input word size
max_decoder_seq_length = max(map(len, target_texts))   # max output word size (incl. "\t" and "\n")

print("Number of samples:", len(input_texts))
print("Number of unique input tokens:", num_encoder_tokens)
print("Number of unique output tokens:", num_decoder_tokens)
print("Max sequence length for inputs:", max_encoder_seq_length)
print("Max sequence length for outputs:", max_decoder_seq_length)

# Character -> integer id lookups.
input_token_index = {char: i for i, char in enumerate(input_characters)}
target_token_index = {char: i for i, char in enumerate(target_characters)}

# ---------------------------------------------------------------------------
# Pre-allocated buffers, filled by the next cell.
#   *_input_data  : (words, max_len) arrays of token ids
#   *_target_data : (words, max_len, target vocabulary) one-hot arrays
# ---------------------------------------------------------------------------
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length), dtype="float32"
)
val_encoder_input_data = np.zeros(
    (len(val_input_texts), max_encoder_seq_length), dtype="float32"
)

decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length), dtype="float32"
)
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, len(target_token_index)), dtype="float32"
)

val_decoder_input_data = np.zeros(
    (len(val_input_texts), max_decoder_seq_length), dtype="float32"
)
val_decoder_target_data = np.zeros(
    (len(val_input_texts), max_decoder_seq_length, len(target_token_index)), dtype="float32"
)

# Recurrent layer class selected by the `cell_type` hyperparameter.
CELL = {"LSTM": LSTM, "GRU": GRU, "RNN": SimpleRNN}

# Integer id -> character lookups, used to turn predictions back into text.
reverse_input_char_index = {i: char for char, i in input_token_index.items()}
reverse_target_char_index = {i: char for char, i in target_token_index.items()}

Number of samples: 68218
Number of unique input tokens: 27
Number of unique output tokens: 49
Max sequence length for inputs: 30
Max sequence length for outputs: 28


In [None]:
def _vectorize_pairs(source_words, target_words, enc_in, dec_in, dec_target):
    """Fill the pre-allocated seq2seq buffers in place for one data split.

    Args:
        source_words: romanized input words.
        target_words: native words already wrapped in "\t" ... "\n".
        enc_in: (words, max_encoder_len) array that receives input token ids.
        dec_in: (words, max_decoder_len) array that receives target token ids.
        dec_target: (words, max_decoder_len, target vocabulary) one-hot array;
            it is `dec_in` shifted one step to the left, so it never contains
            the start character.

    Every position after the end of a word is set to the padding symbol " ".
    Lengths are taken from the word itself rather than from a leftover loop
    variable, so an empty word yields an all-padding row instead of reusing
    the previous word's length.
    """
    source_pad = input_token_index[" "]
    target_pad = target_token_index[" "]

    for row, (source_word, target_word) in enumerate(zip(source_words, target_words)):
        source_ids = [input_token_index[char] for char in source_word]
        target_ids = [target_token_index[char] for char in target_word]

        enc_in[row, : len(source_ids)] = source_ids
        enc_in[row, len(source_ids) :] = source_pad

        dec_in[row, : len(target_ids)] = target_ids
        dec_in[row, len(target_ids) :] = target_pad

        # decoder target is ahead of decoder input by one timestep
        shifted_ids = np.asarray(target_ids[1:], dtype=np.intp)
        dec_target[row, np.arange(len(shifted_ids)), shifted_ids] = 1.0
        dec_target[row, len(shifted_ids) :, target_pad] = 1.0


# Training split
_vectorize_pairs(
    input_texts, target_texts,
    encoder_input_data, decoder_input_data, decoder_target_data,
)

# Validation split
_vectorize_pairs(
    val_input_texts, valid_target_texts,
    val_encoder_input_data, val_decoder_input_data, val_decoder_target_data,
)

In [None]:
def train():
  """Build, train and export the character-level seq2seq model for one W&B run.

  Hyperparameters come from `wandb.config`; `hyperparameter_defaults` below is
  used for any value a sweep does not override. Training and validation data
  are the module-level arrays `encoder_input_data`, `decoder_input_data`,
  `decoder_target_data` and their `val_` counterparts.

  Side effects:
    * logs training metrics through `WandbCallback`;
    * saves the training model, the inference encoder and the inference
      decoder to FINAL_MODEL_DIR / ENCODER / DECODER and logs each directory
      as a W&B model artifact;
    * saves the training model once more to "s2s_tamil".

  `beam_size` is only printed here: it does not influence training.
  """

  hyperparameter_defaults={
    "epochs" : 10,
    "num_samples" : 10000,
    "embed_dim" : 64,
    "n_encoder" : 3,
    "n_decoder" : 3,
    "drop_out" : 0.2,
    'latent_dim':128,
    'beam_size':0,
    'batch_size':64,
    'cell_type':"LSTM"
    }

  run = wandb.init(project=PROJECT_NAME, config=hyperparameter_defaults, entity='cs6910krsrd',job_type="train")

  cfg = wandb.config
  batch_size = cfg.batch_size  # Batch size for training.
  epochs = cfg.epochs  # Number of epochs to train for.
  latent_dim = cfg.latent_dim  # Latent dimensionality of the encoding and decoding space (LSTM/GRU/RNN)
  embed_dim = cfg.embed_dim  # embedding size
  n_encoder = cfg.n_encoder  # number of encoder layers
  n_decoder = cfg.n_decoder  # number of decoder layers
  do = cfg.drop_out  # dropout
  beam_size = cfg.beam_size
  rnn_layer = CELL[cfg.cell_type]  # LSTM, GRU or SimpleRNN class

  # ---- Encoder: embedding followed by a stack of recurrent layers -----------
  encoder_inputs = Input(shape=(None,))
  x = Embedding(num_encoder_tokens, embed_dim)(encoder_inputs)
  print(CELL)
  # All but the last layer only pass their output sequence on.
  for _ in range(n_encoder - 1):
    x = rnn_layer(latent_dim, dropout=do, return_state=True, return_sequences=True)(x)[0]
  # Only the states of the last encoder layer are handed to the decoder.
  encoder_states = rnn_layer(latent_dim, dropout=do, return_state=True, return_sequences=True)(x)[1:]

  # ---- Decoder: every layer starts from `encoder_states` --------------------
  decoder_inputs = Input(shape=(None,))
  embedded_word = Embedding(num_decoder_tokens, embed_dim)(decoder_inputs)
  x = embedded_word
  output_layers = []  # kept so the inference decoder can reuse the trained layers
  for _ in range(n_decoder):
    layer = rnn_layer(latent_dim, dropout=do, return_sequences=True, return_state=True)
    output_layers.append(layer)
    x = layer(x, initial_state=encoder_states)[0]

  x = Dropout(rate=do)(x)
  decoder_dense = Dense(num_decoder_tokens, activation='softmax')
  decoder_outputs = decoder_dense(x)

  # Define the model that will turn
  # `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
  model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

  print(beam_size)
  model.compile(
      optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"]
  )

  model.fit(
      [encoder_input_data, decoder_input_data],
      decoder_target_data,
      batch_size=batch_size,
      epochs=epochs,
      validation_data=([val_encoder_input_data, val_decoder_input_data],val_decoder_target_data),
      callbacks=[WandbCallback()]
  )

  # ---- Inference models ------------------------------------------------------
  encoder_model = Model(encoder_inputs, encoder_states)

  # An LSTM carries two state tensors, GRU / SimpleRNN carry one. Size the state
  # inputs from the encoder's actual states so every cell type builds.
  n_states = len(encoder_states)
  decoder_states_inputs = []
  decoder_states = []
  d_outputs = embedded_word
  for layer in output_layers:
    current_state_inputs = [Input(shape=(latent_dim,)) for _ in range(n_states)]
    layer_result = layer(d_outputs, initial_state=current_state_inputs)
    d_outputs = layer_result[0]
    decoder_states += layer_result[1:]
    decoder_states_inputs += current_state_inputs

  decoder_outputs = decoder_dense(d_outputs)
  decoder_model = Model(
      [decoder_inputs] + decoder_states_inputs,
      [decoder_outputs] + decoder_states)

  # ---- Export: save each model and log its directory as a W&B artifact ------
  exports = [
      (model, MODEL_NAME+"trn", "trained model", FINAL_MODEL_DIR),
      (encoder_model, MODEL_NAME+"enc", "encoder", ENCODER),
      (decoder_model, MODEL_NAME+"dec", "decoder", DECODER),
  ]
  for keras_model, artifact_name, description, directory in exports:
    keras_model.save(directory)
    model_artifact = wandb.Artifact(
        artifact_name, type="model",
        description=description,
        metadata=dict(cfg))
    model_artifact.add_dir(directory)
    run.log_artifact(model_artifact)
  run.finish()

  model.save("s2s_tamil")

In [None]:
# NOTE(review): `config_defaults` is never passed to train() or to wandb.init() in the
# visible notebook. train() initialises the run from its own `hyperparameter_defaults`,
# so the values below have no effect on the run started here.
config_defaults = {
    "epochs" : 5,
    "num_samples" : 10000,
    "embed_dim" : 16,
    "n_encoder" : 2,
    "n_decoder" : 2,
    "drop_out" : 0.2,
    'latent_dim':64,
    'beam_size':3,
    'batch_size':64,
    'cell_type':"LSTM"
    }

# Run one training job outside of a sweep.
train()

{'LSTM': <class 'tensorflow.python.keras.layers.recurrent_v2.LSTM'>, 'GRU': <class 'tensorflow.python.keras.layers.recurrent_v2.GRU'>, 'RNN': <class 'tensorflow.python.keras.layers.recurrent.SimpleRNN'>}
0
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10




INFO:tensorflow:Assets written to: trained_model/assets


INFO:tensorflow:Assets written to: trained_model/assets


INFO:tensorflow:Assets written to: encoder/assets


INFO:tensorflow:Assets written to: encoder/assets


INFO:tensorflow:Assets written to: decoder/assets


INFO:tensorflow:Assets written to: decoder/assets
[34m[1mwandb[0m: Adding directory to artifact (./trained_model)... Done. 0.3s
[34m[1mwandb[0m: Adding directory to artifact (./encoder)... Done. 0.1s
[34m[1mwandb[0m: Adding directory to artifact (./decoder)... Done. 0.2s


VBox(children=(Label(value=' 24.03MB of 24.03MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.…

0,1
epoch,9.0
loss,0.11231
accuracy,0.9682
val_loss,0.12398
val_accuracy,0.96696
_runtime,262.0
_timestamp,1621486342.0
_step,9.0
best_val_loss,0.12398
best_epoch,9.0


0,1
epoch,▁▂▃▃▄▅▆▆▇█
loss,█▆▅▃▃▂▂▁▁▁
accuracy,▁▃▄▅▆▇▇███
val_loss,█▆▅▃▂▂▁▁▁▁
val_accuracy,▁▃▄▆▇▇████
_runtime,▁▂▃▃▄▅▆▆▇█
_timestamp,▁▂▃▃▄▅▆▆▇█
_step,▁▂▃▃▄▅▆▆▇█




INFO:tensorflow:Assets written to: s2s_tamil/assets


INFO:tensorflow:Assets written to: s2s_tamil/assets


In [None]:
# Sweep search space: one entry per hyperparameter, each with its candidate values.
# The keys mirror the `hyperparameter_defaults` keys used by train().
parameters_dict = {
    name: {'values': choices}
    for name, choices in {
        'epochs': [10],
        'num_samples': [10000],
        'embed_dim': [16, 32, 64],
        'n_encoder': [1, 2, 3],
        'n_decoder': [1, 2, 3],
        'drop_out': [0, 0.1, 0.2],
        'latent_dim': [16, 32, 64, 128],
        'beam_size': [3],
        'batch_size': [64],
        'cell_type': ["LSTM", "GRU"],  # "RNN" is left out of the sweep
    }.items()
}

# Bayesian search that maximises the validation accuracy logged during training.
sweep_config = {
    'method': 'bayes',
    'metric': {
        'name': 'val_accuracy',
        'goal': 'maximize',
    },
    'parameters': parameters_dict,
}

def sweeper(sweep_config,PROJECT_NAME):
  """Create a W&B sweep from `sweep_config` and start an agent that runs `train`.

  Args:
    sweep_config: sweep definition passed to `wandb.sweep`.
    PROJECT_NAME: W&B project that owns both the sweep and the agent.
  """
  # Sweep and agent must target the same project / entity.
  wandb_target = dict(project=PROJECT_NAME, entity='cs6910krsrd')
  sweep_id = wandb.sweep(sweep_config, **wandb_target)
  wandb.agent(sweep_id, train, **wandb_target)

In [None]:
# Start the hyperparameter sweep defined in the previous cell.
sweeper(sweep_config,PROJECT_NAME)

Loading Model

In [None]:
# Download version v0 of the exported encoder and decoder artifacts and load them as Keras models.
run = wandb.init(project=PROJECT_NAME, job_type="inference")
enc_model_at = run.use_artifact('cs6910krsrd/CS6910 ASSIGNMENT 3/Seq2Seqenc:v0', type='model')
model_dir= enc_model_at.download()
encoder_model=tf.keras.models.load_model(model_dir)
dec_model_at = run.use_artifact('cs6910krsrd/CS6910 ASSIGNMENT 3/Seq2Seqdec:v0', type='model')
# `model_dir` is reused: from here on it refers to the decoder download.
model_dir= dec_model_at.download()
decoder_model=tf.keras.models.load_model(model_dir)

run.finish()

VBox(children=(Label(value=' 0.00MB of 0.00MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…









VBox(children=(Label(value=' 0.22MB of 0.22MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

In [None]:
# NOTE(review): calcValAcc (and the decode_sequence it calls) are defined in cells further
# down the notebook, so on a fresh kernel this cell fails unless those cells are run first.
run = wandb.init(project=PROJECT_NAME, job_type="inference")


# Log the exact-match validation accuracy for each beam width.
# The loop rebinds the module-level name `beam_size`.
for beam_size in [1,3,5]:
  valAcc=calcValAcc(encoder_model,decoder_model,beam_size)
  wandb.log({"Beam_size":beam_size,"val_Acc": valAcc})

run.finish()

VBox(children=(Label(value=' 0.42MB of 0.42MB uploaded (0.00MB deduped)\r'), FloatProgress(value=1.0, max=1.0)…

0.48044529075728726


Inference

In [None]:
def calcValAcc(encoder_model,decoder_model,beam_size):
  """Word-level exact-match accuracy of beam-search decoding on the validation set.

  Each row of the module-level `val_encoder_input_data` is decoded with
  `decode_sequence`. A prediction counts as correct only when it equals the
  reference word from `val_target_texts` followed by the end marker "\n".

  Args:
    encoder_model: inference encoder handed to `decode_sequence`.
    decoder_model: inference decoder handed to `decode_sequence`.
    beam_size: beam width handed to `decode_sequence`.

  Returns:
    The fraction of correctly decoded words; the value is also printed.
  """
  hits = np.array([
      1
      if decode_sequence(
          val_encoder_input_data[row : row + 1],  # keep the leading batch axis
          beam_size, encoder_model, decoder_model,
      ) == val_target_texts[row] + '\n'
      else 0
      for row in range(len(val_encoder_input_data))
  ])
  val_accuracy = np.mean(hits)

  print(val_accuracy)
  return val_accuracy


In [None]:
# Define sampling models
# Restore the model and construct the encoder and decoder.
# NOTE(review): `model` is loaded here but not referenced again in this cell.
model = keras.models.load_model("s2s_tamil")

# NOTE(review): encoder_inputs, encoder_states, embedded_word, output_layers, decoder_dense
# and decoder_inputs are only assigned inside train() in the visible notebook, i.e. as
# function locals. Unless another cell defines them at module level, this cell raises
# NameError on a fresh kernel. It also rebinds `encoder_model` / `decoder_model`, which the
# "Loading Model" cell already loads from W&B artifacts.
encoder_model = Model(encoder_inputs, encoder_states)

decoder_states_inputs = []
decoder_states = []
d_outputs=embedded_word
# range(n_decoder)[::-1] combined with n_decoder-j-1 visits output_layers in order 0..n_decoder-1.
for j in range(n_decoder)[::-1]:
  # Two state inputs per layer are hard-coded here.
  # NOTE(review): that matches a cell with two state tensors (LSTM); GRU / SimpleRNN would need one; confirm.
  current_state_inputs = [Input(shape=(latent_dim,)) for _ in range(2)]

  temp = output_layers[n_decoder-j-1](d_outputs, initial_state=current_state_inputs)

  d_outputs, cur_states = temp[0], temp[1:]
  decoder_states += cur_states
  decoder_states_inputs += current_state_inputs

decoder_outputs = decoder_dense(d_outputs)
# Decoder inputs: the token ids plus every layer's states; outputs: token probabilities plus the new states.
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict((i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict((i, char) for char, i in target_token_index.items())



In [None]:

def decode_sequence(input_seq, beam_size,encoder_model,decoder_model,n_encoder=3,n_decoder=3):
    """Decode one encoded input word with beam search.

    Args:
        input_seq: a batch holding exactly one encoded input word; it is passed
            unchanged to `encoder_model.predict`.
        beam_size: number of hypotheses kept after every step. Values below 1
            are treated as 1 (greedy decoding) so that a best hypothesis always
            exists.
        encoder_model: object whose `predict(input_seq)` returns the encoder
            state(s): a list of arrays, or a single array when the recurrent
            cell has only one state.
        decoder_model: object whose `predict([token] + states)` returns
            `[token probabilities] + new states`.
        n_encoder: unused; kept so existing callers keep working.
        n_decoder: number of stacked decoder layers. The encoder states are
            repeated once per layer, so this must match `decoder_model`.

    Returns:
        The characters of the best-scoring hypothesis (lowest summed negative
        log-probability). The start marker "\t" is never part of the result;
        the end marker "\n" is included when the hypothesis ended with it.
    """
    encoder_states = encoder_model.predict(input_seq)
    # A single-state encoder yields a bare array. Wrap it so the multiplication
    # below repeats the state per decoder layer instead of scaling its values.
    if not isinstance(encoder_states, (list, tuple)):
        encoder_states = [encoder_states]
    initial_states = list(encoder_states) * n_decoder

    beam_width = max(1, beam_size)

    # Decoder input for the very first step: the start character.
    start_seq = np.zeros((1, 1))
    start_seq[0, 0] = target_token_index["\t"]

    end_index = target_token_index["\n"]  # token id that finishes a hypothesis

    # Each hypothesis is [token ids, score, decoder states, finished?].
    sequences = [[list(), 0.0, initial_states, False]]
    stop_condition = False
    while not stop_condition:
        new_sequences = list()
        stop_condition = True
        for hypothesis in sequences:
            seq, score, states, finished = hypothesis

            # Finished hypotheses are carried over unchanged.
            if finished:
                new_sequences.append(hypothesis)
                continue

            # At least one hypothesis is still open, so decoding must continue.
            stop_condition = False

            if seq:
                # Feed the last emitted token back into the decoder.
                target_seq = np.zeros((1, 1))
                target_seq[0, 0] = seq[-1]
            else:
                target_seq = start_seq

            decoder_out = decoder_model.predict([target_seq] + states)
            output_tokens = decoder_out[0][0, -1, :]
            new_states = decoder_out[1:]

            # Hypotheses that have grown past the longest training target are cut off.
            too_long = len(seq) > max_decoder_seq_length
            for j in range(len(output_tokens)):
                new_sequences.append([
                    seq + [j],
                    score - np.log(output_tokens[j]),
                    new_states,
                    j == end_index or too_long,
                ])

        # Keep the `beam_width` hypotheses with the lowest score.
        sequences = sorted(new_sequences, key=lambda tup: tup[1])[:beam_width]

    best_seq = sequences[0][0]
    return "".join(reverse_target_char_index[token] for token in best_seq)

In [None]:
# Decode validation word 27 with beam size 3 and compare it with its reference.
a=decode_sequence(val_encoder_input_data[27:28],3, encoder_model,decoder_model)
b=val_target_texts[27:28][0]

print(a)
print(b)

# Compare the lengths of the decoded string and the reference word.
len(a)

len(b)

அசின்

அசின்


6

In [None]:
# Show the character at index 5 of the decoded string from the previous cell.
a[5]

'\n'

In [None]:
# Decode the first 20 training words as a quick sanity check.
for seq_index in range(20):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index : seq_index + 1]
    # decode_sequence needs the beam width and both inference models; calling it with
    # the input alone raises TypeError. Beam size 3 matches the single-word demo above.
    decoded_sentence = decode_sequence(
        input_seq, beam_size=3, encoder_model=encoder_model, decoder_model=decoder_model
    )
    print("-")
    print("Input sentence:", input_texts[seq_index])
    print("Decoded sentence:", decoded_sentence)

In [None]:
# NOTE(review): rebinds the module-level `epochs` (set to 20 in the data-preparation cell).
# train() takes its epoch count from wandb.config, so no visible code reads this value; confirm it is still needed.
epochs=10