<a href="https://colab.research.google.com/github/Shlok-Agarwal-7/DL-Assignment-2/blob/main/DL_assignment_2_Question_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Extracting Data

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, LSTM, SimpleRNN, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import tarfile

In [2]:
TarFile= tarfile.open("/content/drive/MyDrive/dakshina_dataset_v1.0.tar")

In [3]:
TarFile.getnames()

['dakshina_dataset_v1.0/bn',
 'dakshina_dataset_v1.0/bn/lexicons',
 'dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.test.tsv',
 'dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.train.tsv',
 'dakshina_dataset_v1.0/bn/lexicons/bn.translit.sampled.dev.tsv',
 'dakshina_dataset_v1.0/bn/native_script_wikipedia',
 'dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-filt.valid.text.shuf.txt.gz',
 'dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-full.info.sorted.tsv.gz',
 'dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-filt.train.info.sorted.tsv.gz',
 'dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-filt.train.text.sorted.tsv.gz',
 'dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-filt.train.text.shuf.txt.gz',
 'dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-full.nonblock.sections.tsv.gz',
 'dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-full.omit_pages.txt.gz',
 'dakshina_dataset_v1.0/bn/native_script_wikipedia/bn.wiki-

In [4]:
files_to_extract = [
     'dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv',
      'dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv',
]

In [5]:
for file in files_to_extract:
    TarFile.extract(file)

In [6]:
TarFile.close()

#Preprocessing

In [7]:
import pandas as pd

In [8]:
Train_df = pd.read_csv("/content/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.train.tsv",sep="\t",header=None,names=["native", "romanized", "count"])
Test_df = pd.read_csv("/content/dakshina_dataset_v1.0/hi/lexicons/hi.translit.sampled.test.tsv",sep="\t",header=None,names=["native", "romanized", "count"])

In [9]:
Train_df.head()

Unnamed: 0,native,romanized,count
0,अं,an,3
1,अंकगणित,ankganit,3
2,अंकल,uncle,4
3,अंकुर,ankur,4
4,अंकुरण,ankuran,3


In [10]:
Test_df.head()

Unnamed: 0,native,romanized,count
0,अंक,ank,5
1,अंक,anka,1
2,अंकित,ankit,3
3,अंकों,anakon,1
4,अंकों,ankhon,1


In [11]:
Train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44204 entries, 0 to 44203
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   native     44204 non-null  object
 1   romanized  44202 non-null  object
 2   count      44204 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 1.0+ MB


In [12]:
Test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4502 entries, 0 to 4501
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   native     4502 non-null   object
 1   romanized  4502 non-null   object
 2   count      4502 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 105.6+ KB


In [13]:
for df in [Train_df, Test_df]:
    df.dropna(subset=['romanized', 'native'], inplace=True)
    df['romanized'] = df['romanized'].astype(str)
    df['native']    = df['native'].astype(str)

# Encoding Data to feed into RNN

In [14]:
input_texts = []
output_texts = []
input_characters = set([" "])
output_characters = set(["\t", "\n", " "])  # include start and end markers

# Collect unique characters and wrap target in start/end tokens
for _, row in Train_df.iterrows():
    input_seq = row['romanized']
    target_seq = row['native']
    input_texts.append(input_seq)
    output_texts.append("\t" + target_seq + "\n")

    input_characters.update(set(input_seq))
    output_characters.update(set(target_seq))

# Sort and build token indices
input_characters = sorted(list(input_characters))
target_characters = sorted(list(output_characters))

input_token_index = {char: i for i, char in enumerate(input_characters)}
target_token_index = {char: i for i, char in enumerate(target_characters)}
reverse_target_char_index = {i: char for char, i in target_token_index.items()}

num_encoder_tokens = len(input_characters)
num_decoder_tokens = len(target_characters)
max_encoder_seq_length = max(len(txt) for txt in input_texts)
max_decoder_seq_length = max(len(txt) for txt in output_texts)

# Initialize empty one-hot encoded arrays
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_length, num_encoder_tokens), dtype="float32"
)
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype="float32"
)
decoder_target_data = np.zeros(
    (len(input_texts), max_decoder_seq_length, num_decoder_tokens), dtype="float32"
)

# Populate one-hot arrays
for i, (inp_text, tar_text) in enumerate(zip(input_texts, output_texts)):
    for t, char in enumerate(inp_text):
        encoder_input_data[i, t, input_token_index[char]] = 1.0
    encoder_input_data[i, t + 1 :, input_token_index[" "]] = 1.0  # padding

    for t, char in enumerate(tar_text):
        decoder_input_data[i, t, target_token_index[char]] = 1.0
        if t > 0:
            decoder_target_data[i, t - 1, target_token_index[char]] = 1.0
    decoder_input_data[i, t + 1 :, target_token_index[" "]] = 1.0
    decoder_target_data[i, t:, target_token_index[" "]] = 1.0

In [15]:
print(num_encoder_tokens)
print(num_decoder_tokens)

27
66


#Model A

In [16]:
import keras

In [17]:
latent_dim = 128

In [None]:
encoder_inputs = keras.Input(shape=(None, num_encoder_tokens))
encoder = keras.layers.LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)

# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = keras.Input(shape=(None, num_decoder_tokens))

# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states)
decoder_dense = keras.layers.Dense(num_decoder_tokens, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [18]:
model.summary()

NameError: name 'model' is not defined

In [None]:
model.compile(
    optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"]
)
model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=32,
    epochs=5,
    validation_split=0.2,
)
# Save model


Epoch 1/5
[1m1106/1106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 11ms/step - accuracy: 0.7085 - loss: 1.2239 - val_accuracy: 0.7396 - val_loss: 1.0506
Epoch 2/5
[1m1106/1106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 11ms/step - accuracy: 0.7810 - loss: 0.8007 - val_accuracy: 0.7514 - val_loss: 0.9871
Epoch 3/5
[1m1106/1106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 7ms/step - accuracy: 0.8185 - loss: 0.6286 - val_accuracy: 0.7447 - val_loss: 1.0182
Epoch 4/5
[1m1106/1106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 8ms/step - accuracy: 0.8457 - loss: 0.5115 - val_accuracy: 0.7648 - val_loss: 0.9770
Epoch 5/5
[1m1106/1106[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 7ms/step - accuracy: 0.8683 - loss: 0.4236 - val_accuracy: 0.7875 - val_loss: 0.7665


<keras.src.callbacks.history.History at 0x7a0fea03fd90>

In [None]:
model.save("s2s_model.keras")

In [None]:
model = keras.models.load_model("s2s_model.keras")

encoder_inputs = model.input[0]  # input_1
encoder_outputs, state_h_enc, state_c_enc = model.layers[2].output  # lstm_1
encoder_states = [state_h_enc, state_c_enc]
encoder_model = keras.Model(encoder_inputs, encoder_states)

decoder_inputs = model.input[1]  # input_2
decoder_state_input_h = keras.Input(shape=(latent_dim,))
decoder_state_input_c = keras.Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
decoder_lstm = model.layers[3]
decoder_outputs, state_h_dec, state_c_dec = decoder_lstm(
    decoder_inputs, initial_state=decoder_states_inputs
)
decoder_states = [state_h_dec, state_c_dec]
decoder_dense = model.layers[4]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = keras.Model(
    [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states
)

# Reverse-lookup token index to decode sequences back to
# something readable.
reverse_input_char_index = dict((i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict((i, char) for char, i in target_token_index.items())


def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq, verbose=0)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1, num_decoder_tokens))
    # Populate the first character of target sequence with the start character.
    target_seq[0, 0, target_token_index["\t"]] = 1.0

    # Sampling loop for a batch of sequences
    # (to simplify, here we assume a batch of size 1).
    stop_condition = False
    decoded_sentence = ""
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value, verbose=0
        )

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_char = reverse_target_char_index[sampled_token_index]
        decoded_sentence += sampled_char

        # Exit condition: either hit max length
        # or find stop character.
        if sampled_char == "\n" or len(decoded_sentence) > max_decoder_seq_length:
            stop_condition = True

        # Update the target sequence (of length 1).
        target_seq = np.zeros((1, 1, num_decoder_tokens))
        target_seq[0, 0, sampled_token_index] = 1.0

        # Update states
        states_value = [h, c]
    return decoded_sentence

In [None]:
for seq_index in range(20):
    # Take one sequence (part of the training set)
    # for trying out decoding.
    input_seq = encoder_input_data[seq_index : seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)
    print("-")
    print("Input sentence:", input_texts[seq_index])
    print("Decoded sentence:", decoded_sentence)

-
Input sentence: an
Decoded sentence: ना

-
Input sentence: ankganit
Decoded sentence: अनकजेंता

-
Input sentence: uncle
Decoded sentence: एंसेड

-
Input sentence: ankur
Decoded sentence: अनकरों

-
Input sentence: ankuran
Decoded sentence: अनकरों

-
Input sentence: ankurit
Decoded sentence: अन्कृतिक

-
Input sentence: aankush
Decoded sentence: आनक्षों

-
Input sentence: ankush
Decoded sentence: अनुक्षा

-
Input sentence: ang
Decoded sentence: अंग

-
Input sentence: anga
Decoded sentence: अंगा

-
Input sentence: agandh
Decoded sentence: अगंडा

-
Input sentence: angad
Decoded sentence: अंगड़ा

-
Input sentence: angane
Decoded sentence: अनजने

-
Input sentence: angbhang
Decoded sentence: अंगंगार

-
Input sentence: angarakshak
Decoded sentence: अंसर्जार्ण

-
Input sentence: angrakshak
Decoded sentence: अंसर्जार्क

-
Input sentence: angara
Decoded sentence: अंगरा

-
Input sentence: angaare
Decoded sentence: अंगरान

-
Input sentence: angare
Decoded sentence: अंगरें

-
Input sentence: angi
D

#model B

In [None]:
import keras

In [None]:
latent_dim = 32

In [19]:
encoder_inputs = keras.Input(shape=(None,num_encoder_tokens))
encoder = keras.layers.SimpleRNN(latent_dim, return_state=True)
encoder_outputs, state_h = encoder(encoder_inputs)

# For SimpleRNN, there's only one state (state_h)
encoder_states = [state_h]

# Decoder
decoder_inputs = keras.Input(shape=(None,num_decoder_tokens))
decoder_rnn = keras.layers.SimpleRNN(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _ = decoder_rnn(decoder_inputs, initial_state=encoder_states)

# Dense output layer
decoder_dense = keras.layers.Dense(num_decoder_tokens, activation="softmax")
decoder_outputs = decoder_dense(decoder_outputs)

# Final model
model_B = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [20]:
model_B.summary()

In [None]:
model_B.compile(
    optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"]
)
model_B.fit(
    [encoder_input_data,decoder_input_data],
    decoder_target_data,
    batch_size=64,
    epochs = 5,
    validation_split=0.2,
)

Epoch 1/5
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 15ms/step - accuracy: 0.6670 - loss: 1.6493 - val_accuracy: 0.7133 - val_loss: 1.1734
Epoch 2/5
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.7456 - loss: 0.9704 - val_accuracy: 0.7346 - val_loss: 1.0876
Epoch 3/5
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.7547 - loss: 0.9129 - val_accuracy: 0.7396 - val_loss: 1.0599
Epoch 4/5
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.7645 - loss: 0.8753 - val_accuracy: 0.7415 - val_loss: 1.0410
Epoch 5/5
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.7732 - loss: 0.8281 - val_accuracy: 0.7429 - val_loss: 1.0209


<keras.src.callbacks.history.History at 0x7a0fb5fc3590>

#model C

In [21]:
latent_dim = 128

# --- Encoder ---
encoder_inputs = keras.Input(shape=(None,num_encoder_tokens), name="encoder_input")


x = keras.layers.LSTM(latent_dim, return_sequences=True, name="encoder_lstm_1")(encoder_inputs)

encoder_outputs, state_h, state_c = keras.layers.LSTM(latent_dim, return_state=True, name="encoder_lstm_2")(x)
encoder_states = [state_h, state_c]

# --- Decoder ---
decoder_inputs = keras.Input(shape=(None,num_decoder_tokens), name="decoder_input")

# First decoder LSTM (uses encoder states as initial state)
x = keras.layers.LSTM(latent_dim, return_sequences=True, name="decoder_lstm_1")(decoder_inputs, initial_state=encoder_states)


decoder_outputs, _, _ = keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True, name="decoder_lstm_2")(x)

# Output layer
decoder_dense = keras.layers.Dense(num_decoder_tokens, activation="softmax", name="decoder_dense")
decoder_outputs = decoder_dense(decoder_outputs)


model_C= keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)


model_C.compile(optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"])

In [22]:
model_C.summary()

In [None]:
model_C.fit(
    [encoder_input_data,decoder_input_data],
    decoder_target_data,
    batch_size=64,
    epochs=10,
    validation_split=0.2
)

Epoch 1/10
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 14ms/step - accuracy: 0.6794 - loss: 1.3654 - val_accuracy: 0.7135 - val_loss: 1.1365
Epoch 2/10
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 15ms/step - accuracy: 0.7460 - loss: 0.9796 - val_accuracy: 0.7461 - val_loss: 0.9962
Epoch 3/10
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 10ms/step - accuracy: 0.7755 - loss: 0.8075 - val_accuracy: 0.7589 - val_loss: 0.9425
Epoch 4/10
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 11ms/step - accuracy: 0.8067 - loss: 0.6670 - val_accuracy: 0.7710 - val_loss: 0.8827
Epoch 5/10
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 12ms/step - accuracy: 0.8371 - loss: 0.5393 - val_accuracy: 0.7966 - val_loss: 0.7716
Epoch 6/10
[1m553/553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 11ms/step - accuracy: 0.8603 - loss: 0.4507 - val_accuracy: 0.8063 - val_loss: 0.7107
Epoch 7/10
[1m553/

<keras.src.callbacks.history.History at 0x7f8c056ec590>