<a href="https://colab.research.google.com/github/Shlok-Agarwal-7/DL-Assignment-2/blob/main/DL_assignment_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Extracting Data

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, GRU, SimpleRNN, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import tarfile

In [2]:
TarFile= tarfile.open("/content/drive/MyDrive/dakshina_dataset_v1.0.tar")

In [3]:
TarFile.getnames()

['dakshina_dataset_v1.0/bn',
 'dakshina_dataset_v1.0/bn/lexicons',
 'dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.test.tsv',
 'dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.train.tsv',
 'dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.dev.tsv',
 'dakshina_dataset_v1.0/bn/native_script_wikipedia',
 'dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-filt.valid.text.shuf.txt.gz',
 'dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-full.info.sorted.tsv.gz',
 'dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-filt.train.info.sorted.tsv.gz',
 'dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-filt.train.text.sorted.tsv.gz',
 'dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-filt.train.text.shuf.txt.gz',
 'dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-full.nonblock.sections.tsv.gz',
 'dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-full.omit_pages.txt.gz',
 'dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-

In [4]:
files_to_extract = [
     'dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv',
      'dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv',
]

In [5]:
for file in files_to_extract:
    TarFile.extract(file)

In [6]:
TarFile.close()

#Preprocessing

In [7]:
import pandas as pd

In [8]:
Train_df = pd.read_csv("/content/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv",sep="\t",header=None,names=["native", "romanized", "count"])
Test_df = pd.read_csv("/content/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv",sep="\t",header=None,names=["native", "romanized", "count"])

In [9]:
Train_df.head()

Unnamed: 0,native,romanized,count
0,अं,an,3
1,अंकगणित,ankganit,3
2,अंकल,uncle,4
3,अंकुर,ankur,4
4,अंकुरण,ankuran,3


In [10]:
Test_df.head()

Unnamed: 0,native,romanized,count
0,अंक,ank,5
1,अंक,anka,1
2,अंकित,ankit,3
3,अंकों,anakon,1
4,अंकों,ankhon,1


In [11]:
Train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44204 entries, 0 to 44203
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   native     44204 non-null  object
 1   romanized  44202 non-null  object
 2   count      44204 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


In [12]:
Test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4502 entries, 0 to 4501
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   native     4502 non-null   object
 1   romanized  4502 non-null   object
 2   count      4502 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 105.6+ KB


In [13]:
for df in [Train_df, Test_df]:
    df.dropna(subset=['romanized', 'native'], inplace=True)
    df['romanized'] = df['romanized'].astype(str)
    df['native']    = df['native'].astype(str)

In [14]:
def build_vocab(sequences, add_special_tokens=False):
    chars = sorted(set(''.join(sequences)))
    idx = 1
    char2idx = {'<pad>': 0}
    for c in chars:
        char2idx[c] = idx; idx += 1
    if add_special_tokens:
        char2idx['<s>']  = idx; idx += 1
        char2idx['</s>'] = idx; idx += 1
    return char2idx

src_char2idx = build_vocab(Train_df['romanized'])
# add <s>, </s> to target
tgt_char2idx = build_vocab(Train_df['native'], add_special_tokens=True)

src_vocab_size = len(src_char2idx)
tgt_vocab_size = len(tgt_char2idx)

# max lengths
max_enc_len = max(Train_df['romanized'].str.len().max(), Test_df['romanized'].str.len().max())
max_dec_len = max(Train_df['native'].str.len().max(),    Test_df['native'].str.len().max()) + 2

In [20]:
def encode_sequences(texts, char2idx, maxlen, add_tokens=False):
    seqs = []
    for t in texts:
        s = [char2idx.get(c, 0) for c in t]
        if add_tokens:
            s = [char2idx['<s>']] + s + [char2idx['</s>']]
        seqs.append(s)
    return pad_sequences(seqs, maxlen=maxlen, padding='post')

encoder_input_train = encode_sequences(Train_df['romanized'], src_char2idx, max_enc_len)
decoder_input_train = encode_sequences(Train_df['native'],    tgt_char2idx, max_dec_len, add_tokens=True)
encoder_input_test  = encode_sequences(Test_df['romanized'],  src_char2idx, max_enc_len)
decoder_input_test  = encode_sequences(Test_df['native'],     tgt_char2idx, max_dec_len, add_tokens=True)

def make_one_hot(seqs, vocab_size):
    one_hot = np.zeros((len(seqs), seqs.shape[1], vocab_size), dtype='float32')
    for i, seq in enumerate(seqs):
        for t, idx in enumerate(seq[1:]):
            if idx > 0:
                one_hot[i, t, idx] = 1.0
    return one_hot

decoder_target_train = make_one_hot(decoder_input_train, tgt_vocab_size)
decoder_target_test  = make_one_hot(decoder_input_test,  tgt_vocab_size)

In [21]:
def build_seq2seq_model(embedding_dim, hidden_dim, cell_type='LSTM', num_layers=1):
    # select RNN cell
    Cell = {'LSTM': LSTM, 'GRU': GRU, 'SimpleRNN': SimpleRNN}[cell_type]

    # Encoder
    enc_inputs = Input(shape=(max_enc_len,), name='encoder_inputs')
    enc_emb    = Embedding(input_dim=src_vocab_size, output_dim=embedding_dim,
                           mask_zero=True, name='enc_embedding')(enc_inputs)
    states = None
    x = enc_emb
    for i in range(num_layers):
        ret = {'return_state': True, 'return_sequences': (i < num_layers-1)}
        x_and_states = Cell(hidden_dim, **ret, name=f'enc_{cell_type.lower()}_{i}')(x)
        if cell_type == 'LSTM':
            x, h, c = x_and_states
            states = [h, c]
        else:
            x, h   = x_and_states
            states = [h]

    # Decoder
    dec_inputs = Input(shape=(max_dec_len,), name='decoder_inputs')
    dec_emb    = Embedding(input_dim=tgt_vocab_size, output_dim=embedding_dim,
                           mask_zero=True, name='dec_embedding')(dec_inputs)
    y = dec_emb
    for i in range(num_layers):
        ret2 = {'return_sequences': True, 'return_state': True}
        out_and_states = Cell(hidden_dim, **ret2, name=f'dec_{cell_type.lower()}_{i}')(y, initial_state=states)
        if cell_type == 'LSTM':
            y, _, _ = out_and_states
        else:
            y, _    = out_and_states

    dense = Dense(tgt_vocab_size, activation='softmax', name='output_dense')
    outputs = dense(y)

    model = Model([enc_inputs, dec_inputs], outputs)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [23]:
results = []

# Model A: LSTM, embed=64, hidden=128, layers=1
model_a = build_seq2seq_model(64, 128, 'LSTM', 1)
model_a.fit([encoder_input_train, decoder_input_train], decoder_target_train, epochs=20, batch_size=64, verbose=0)
loss_a, acc_a = model_a.evaluate([encoder_input_test, decoder_input_test], decoder_target_test, verbose=0)
results.append({'model':'A', 'cell':'LSTM', 'embed':64, 'hidden':128, 'layers':1, 'test_accuracy':acc_a})

# # Model B: GRU, embed=128, hidden=256, layers=2
# model_b = build_seq2seq_model(128, 256, 'GRU', 2)
# model_b.fit([encoder_input_train, decoder_input_train], decoder_target_train, epochs=20, batch_size=64, verbose=0)
# loss_b, acc_b = model_b.evaluate([encoder_input_test, decoder_input_test], decoder_target_test, verbose=0)
# results.append({'model':'B', 'cell':'GRU', 'embed':128, 'hidden':256, 'layers':2, 'test_accuracy':acc_b})

# Model C: SimpleRNN, embed=32, hidden=64, layers=1
model_c = build_seq2seq_model(32, 64, 'SimpleRNN', 1)
model_c.fit([encoder_input_train, decoder_input_train], decoder_target_train, epochs=20, batch_size=64, verbose=0)
loss_c, acc_c = model_c.evaluate([encoder_input_test, decoder_input_test], decoder_target_test, verbose=0)
results.append({'model':'C', 'cell':'SimpleRNN', 'embed':32, 'hidden':64, 'layers':1, 'test_accuracy':acc_c})

# Display results
df_results = pd.DataFrame(results)
print(df_results)


  model       cell  embed  hidden  layers  test_accuracy
0     A       LSTM     64     128       1       0.266940
1     C  SimpleRNN     32      64       1       0.201297
