<a href="https://colab.research.google.com/github/Shlok-Agarwal-7/DL-Assignment-2/blob/main/DL_assignment_2_Question_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Extracting Data

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, GRU, SimpleRNN, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import tarfile

In [2]:
TarFile= tarfile.open("/content/drive/MyDrive/dakshina_dataset_v1.0.tar")

In [3]:
TarFile.getnames()

['dakshina_dataset_v1.0/bn',
 'dakshina_dataset_v1.0/bn/lexicons',
 'dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.test.tsv',
 'dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.train.tsv',
 'dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.dev.tsv',
 'dakshina_dataset_v1.0/bn/native_script_wikipedia',
 'dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-filt.valid.text.shuf.txt.gz',
 'dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-full.info.sorted.tsv.gz',
 'dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-filt.train.info.sorted.tsv.gz',
 'dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-filt.train.text.sorted.tsv.gz',
 'dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-filt.train.text.shuf.txt.gz',
 'dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-full.nonblock.sections.tsv.gz',
 'dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-full.omit_pages.txt.gz',
 'dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-

In [4]:
files_to_extract = [
     'dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv',
      'dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv',
]

In [5]:
for file in files_to_extract:
    TarFile.extract(file)

In [6]:
TarFile.close()

#Preprocessing

In [7]:
import pandas as pd

In [8]:
Train_df = pd.read_csv("/content/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv",sep="\t",header=None,names=["native", "romanized", "count"])
Test_df = pd.read_csv("/content/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv",sep="\t",header=None,names=["native", "romanized", "count"])

In [9]:
Train_df.head()

Unnamed: 0,native,romanized,count
0,अं,an,3
1,अंकगणित,ankganit,3
2,अंकल,uncle,4
3,अंकुर,ankur,4
4,अंकुरण,ankuran,3


In [10]:
Test_df.head()

Unnamed: 0,native,romanized,count
0,अंक,ank,5
1,अंक,anka,1
2,अंकित,ankit,3
3,अंकों,anakon,1
4,अंकों,ankhon,1


In [11]:
Train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44204 entries, 0 to 44203
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   native     44204 non-null  object
 1   romanized  44202 non-null  object
 2   count      44204 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


In [12]:
Test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4502 entries, 0 to 4501
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   native     4502 non-null   object
 1   romanized  4502 non-null   object
 2   count      4502 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 105.6+ KB


In [13]:
for df in [Train_df, Test_df]:
    df.dropna(subset=['romanized', 'native'], inplace=True)
    df['romanized'] = df['romanized'].astype(str)
    df['native']    = df['native'].astype(str)

# Encoding Data to feed into RNN

In [14]:
def preprocess_seq2seq_data(df, input_col='romanized', target_col='native'):
    input_texts = []
    output_texts = []
    input_characters = set([" "])
    output_characters = set(["\t", "\n", " "])  # include start and end markers

    # Collect unique characters and wrap target in start/end tokens
    for _, row in df.iterrows():
        input_seq = row[input_col]
        target_seq = row[target_col]
        input_texts.append(input_seq)
        output_texts.append("\t" + target_seq + "\n")

        input_characters.update(set(input_seq))
        output_characters.update(set(target_seq))

    # Sort and build token indices
    input_characters = sorted(list(input_characters))
    target_characters = sorted(list(output_characters))

    input_token_index = {char: i for i, char in enumerate(input_characters)}
    target_token_index = {char: i for i, char in enumerate(target_characters)}

    num_encoder_tokens = len(input_characters)
    num_decoder_tokens = len(target_characters)
    max_encoder_seq_length = max(len(txt) for txt in input_texts)
    max_decoder_seq_length = max(len(txt) for txt in output_texts)

    # Initialize empty one-hot encoded arrays
    encoder_input_data = np.zeros(
        (len(input_texts), max_encoder_seq_length, num_encoder_tokens), dtype="float32"
    )
    decoder_input_data = np.zeros(
        (len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype="float32"
    )
    decoder_target_data = np.zeros(
        (len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype="float32"
    )

    # Populate one-hot arrays
    for i, (inp_text, tar_text) in enumerate(zip(input_texts, output_texts)):
        for t, char in enumerate(inp_text):
            encoder_input_data[i, t, input_token_index[char]] = 1.0
        encoder_input_data[i, t + 1 :, input_token_index[" "]] = 1.0  # padding

        for t, char in enumerate(tar_text):
            decoder_input_data[i, t, target_token_index[char]] = 1.0
            if t > 0:
                decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
        decoder_input_data[i, t + 1 :, target_token_index[" "]] = 1.0
        decoder_target_data[i, t:, target_token_index[" "]] = 1.0

    return {
        'encoder_input_data': encoder_input_data,
        'decoder_input_data': decoder_input_data,
        'decoder_target_data': decoder_target_data,
        'input_token_index': input_token_index,
        'target_token_index': target_token_index,
        'input_characters': input_characters,
        'target_characters': target_characters,
        'max_encoder_seq_length': max_encoder_seq_length,
        'max_decoder_seq_length': max_decoder_seq_length,
        'num_encoder_tokens': num_encoder_tokens,
        'num_decoder_tokens': num_decoder_tokens,
        'input_texts': input_texts,
        'output_texts': output_texts,
    }

In [15]:
Data_Info_train = preprocess_seq2seq_data(Train_df);

#Test Data Preprocessing

In [18]:
def preprocess_test_inputs(
    test_df,
    input_col,
    input_token_index,
    max_encoder_seq_length,
    num_encoder_tokens
):
    input_texts = []
    encoder_input_data = np.zeros(
        (len(test_df), max_encoder_seq_length, num_encoder_tokens), dtype="float32"
    )

    for i, text in enumerate(test_df[input_col]):
        input_texts.append(text)
        for t, char in enumerate(text):
            if char in input_token_index:
                encoder_input_data[i, t, input_token_index[char]] = 1.0
            else:
                # Handle unknown character (optional)
                pass
        # Pad the rest with space (assuming space is used for padding)
        if " " in input_token_index:
            encoder_input_data[i, t + 1 :, input_token_index[" "]] = 1.0

    return {
        'input_texts': input_texts,
        'encoder_input_data': encoder_input_data,
    }

In [20]:
test_inputs = preprocess_test_inputs(
    test_df=Test_df,
    input_col='romanized',  # replace with actual column name
    input_token_index=Data_Info_train['input_token_index'],
    max_encoder_seq_length=Data_Info_train['max_encoder_seq_length'],
    num_encoder_tokens=Data_Info_train['num_encoder_tokens']
)

# Access processed array:
encoder_input_data_for_test = test_inputs['encoder_input_data']

#Model A

In [16]:
import keras

In [17]:
latent_dim = 64

In [None]:
encoder_inputs = keras.Input(shape=(None, Data_Info_train['num_encoder_tokens']))
encoder = keras.layers.LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)

# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = keras.Input(shape=(None, Data_Info_train['num_decoder_tokens']))

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = keras.layers.Dense(Data_Info_train['num_decoder_tokens'], activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
model.compile(
    optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"]
)
model.fit(
    [Data_Info_train['encoder_input_data'],Data_Info_train['decoder_input_data']],
    Data_Info_train['decoder_target_data'],
    batch_size=64,
    epochs=20,
    validation_split=0.2,
)

Epoch 1/20
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 11ms/step - accuracy: 0.6713 - loss: 1.4855 - val_accuracy: 0.7207 - val_loss: 1.1399
Epoch 2/20
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.7454 - loss: 0.9897 - val_accuracy: 0.7391 - val_loss: 1.0496
Epoch 3/20
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.7649 - loss: 0.8633 - val_accuracy: 0.7460 - val_loss: 0.9943
Epoch 4/20
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - accuracy: 0.7844 - loss: 0.7706 - val_accuracy: 0.7672 - val_loss: 0.9100
Epoch 5/20
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.8020 - loss: 0.6890 - val_accuracy: 0.7805 - val_loss: 0.8633
Epoch 6/20
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - accuracy: 0.8182 - loss: 0.6179 - val_accuracy: 0.7832 - val_loss: 0.8411
Epoch 7/20
[1m553/553[0m 

<keras.src.callbacks.history.History at 0x7f715ceca850>

#model B

In [None]:
import keras

In [None]:
latent_dim = 32

In [None]:
encoder_inputs = keras.Input(shape=(None, Data_Info_train['num_encoder_tokens']))
encoder = keras.layers.SimpleRNN(latent_dim, return_state=True)
encoder_outputs, state_h = encoder(encoder_inputs)

# For SimpleRNN, there's only one state (state_h)
encoder_states = [state_h]

# Decoder
decoder_inputs = keras.Input(shape=(None, Data_Info_train['num_decoder_tokens']))
decoder_rnn = keras.layers.SimpleRNN(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _ = decoder_rnn(decoder_inputs, initial_state=encoder_states)

# Dense output layer
decoder_dense = keras.layers.Dense(Data_Info_train['num_decoder_tokens'], activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# Final model
model_B = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
model_B.compile(
    optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"]
)
model_B.fit(
    [Data_Info_train['encoder_input_data'],Data_Info_train['decoder_input_data']],
    Data_Info_train['decoder_target_data'],
    batch_size=64,
    epochs=20,
    validation_split=0.2,
)

Epoch 1/20
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 13ms/step - accuracy: 0.6835 - loss: 1.3917 - val_accuracy: 0.7322 - val_loss: 1.0978
Epoch 2/20
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.7782 - loss: 0.8485 - val_accuracy: 0.7406 - val_loss: 1.0960
Epoch 3/20
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 7ms/step - accuracy: 0.7973 - loss: 0.7554 - val_accuracy: 0.7487 - val_loss: 1.0829
Epoch 4/20
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 8ms/step - accuracy: 0.8050 - loss: 0.7169 - val_accuracy: 0.7438 - val_loss: 1.1119
Epoch 5/20
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.8114 - loss: 0.6894 - val_accuracy: 0.7493 - val_loss: 1.1118
Epoch 6/20
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.8140 - loss: 0.6734 - val_accuracy: 0.7508 - val_loss: 1.1142
Epoch 7/20
[1m553/553[0m

<keras.src.callbacks.history.History at 0x7f7125f82d10>

#model C

In [None]:
latent_dim = 128

# --- Encoder ---
encoder_inputs = keras.Input(shape=(None, Data_Info_train['num_encoder_tokens']), name="encoder_input")


x = keras.layers.LSTM(latent_dim, return_sequences=True, name="encoder_lstm_1")(encoder_inputs)

encoder_outputs, state_h, state_c = keras.layers.LSTM(latent_dim, return_state=True, name="encoder_lstm_2")(x)
encoder_states = [state_h, state_c]

# --- Decoder ---
decoder_inputs = keras.Input(shape=(None, Data_Info_train['num_decoder_tokens']), name="decoder_input")

# First decoder LSTM (uses encoder states as initial state)
x = keras.layers.LSTM(latent_dim, return_sequences=True, name="decoder_lstm_1")(decoder_inputs, initial_state=encoder_states)


decoder_outputs, _, _ = keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True, name="decoder_lstm_2")(x)

# Output layer
decoder_dense = keras.layers.Dense(Data_Info_train['num_decoder_tokens'], activation="softmax", name="decoder_dense")
decoder_outputs = decoder_dense(decoder_outputs)


model_C= keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)


model_C.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])

In [None]:
model_C.fit(
    [Data_Info_train['encoder_input_data'], Data_Info_train['decoder_input_data']],
    Data_Info_train['decoder_target_data'],
    batch_size=64,
    epochs=20,
    validation_split=0.2
)

Epoch 1/20
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 13ms/step - accuracy: 0.6775 - loss: 1.3679 - val_accuracy: 0.6937 - val_loss: 1.1981
Epoch 2/20
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 12ms/step - accuracy: 0.7433 - loss: 0.9892 - val_accuracy: 0.7454 - val_loss: 1.0032
Epoch 3/20
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 13ms/step - accuracy: 0.7768 - loss: 0.8036 - val_accuracy: 0.7681 - val_loss: 0.9108
Epoch 4/20
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 10ms/step - accuracy: 0.8096 - loss: 0.6481 - val_accuracy: 0.7675 - val_loss: 0.9158
Epoch 5/20
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11ms/step - accuracy: 0.8377 - loss: 0.5359 - val_accuracy: 0.7999 - val_loss: 0.7689
Epoch 6/20
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 11ms/step - accuracy: 0.8629 - loss: 0.4425 - val_accuracy: 0.8058 - val_loss: 0.7224
Epoch 7/20
[1m553/5

<keras.src.callbacks.history.History at 0x7f70b7f1ad10>