<a href="https://colab.research.google.com/github/Pown137/HTIC_Deep-Learning_seq2seq-model/blob/main/DL_seq2seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import files
import zipfile, os

# Upload dataset
uploaded = files.upload()

# Extrac zip file
for filename in uploaded.keys():
    if filename.endswith('.zip'):
        with zipfile.ZipFile(filename, 'r') as zip_ref:
            zip_ref.extractall("aksharantar_data")

print("Dataset uprooted successfully!")

Saving aksharantar_sampled (1).zip to aksharantar_sampled (1).zip
Dataset uprooted successfully!


In [2]:
import pandas as pd

# Load one language (example: Hindi)
datapath = "aksharantar_data/aksharantar_sampled/tam/tam_train.csv"
dfile = pd.read_csv(datapath)

print("Total samples:", len(dfile))
print(dfile.head())

Total samples: 51199
             thottacharya       தொட்டாச்சார்ய
0             menmaithaan          மென்மைதான்
1               avarantri             அவரன்றி
2             mudiyarathu            முடியறது
3         aadaiyanigalaal         ஆடையணிகளால்
4  muzhumaiyaakkugindrana  முழுமையாக்குகின்றன


In [3]:
#Fix column name
dfile.columns = ['tam_roman', 'tam_native']
print(dfile.head())

                tam_roman          tam_native
0             menmaithaan          மென்மைதான்
1               avarantri             அவரன்றி
2             mudiyarathu            முடியறது
3         aadaiyanigalaal         ஆடையணிகளால்
4  muzhumaiyaakkugindrana  முழுமையாக்குகின்றன


In [4]:
import numpy as pown
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Prepare source and target lists
input_text = dfile['tam_roman'].astype(str).tolist()
target_text = ['\t' + t + '\n' for t in dfile['tam_native'].astype(str)]  # add start/end tokens

# Tokenize
input_token = Tokenizer(char_level=True)
input_token.fit_on_texts(input_text)
target_token= Tokenizer(char_level=True)
target_token.fit_on_texts(target_text)

# Convert to sequences
encode_input = input_token.texts_to_sequences(input_text)
decode_input = target_token.texts_to_sequences(target_text)

# Pad sequences
maxencoder_seqlength = max(len(s) for s in encode_input)
maxdecoder_seqlength = max(len(s) for s in decode_input)
encode_input = pad_sequences(encode_input, maxlen=maxencoder_seqlength, padding='post')
decode_input = pad_sequences(decode_input, maxlen=maxdecoder_seqlength, padding='post')
print("Data tokenized and padded.")


Data tokenized and padded.


In [5]:
# Model definition
import tensorflow as ft
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
from tensorflow.keras.models import Model

latent_dim = 128

#Encode
enc_input = Input(shape=(None,))
x = Embedding(len(input_token.word_index)+1, 64)(enc_input)
enc_lstm, state_h, state_c = LSTM(latent_dim, return_state=True)(x)
enc_state = [state_h, state_c]

#Decode
dec_input = Input(shape=(None,))
y = Embedding(len(target_token.word_index)+1, 64)(dec_input)
dec_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
dec_output, _, _ = dec_lstm(y, initial_state=enc_state)
dec_dense = Dense(len(target_token.word_index)+1, activation='softmax')
dec_output = dec_dense(dec_output)

model = Model([enc_input, dec_input], dec_output)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()


In [6]:
# Training
dec_target = pown.expand_dims(decode_input[:,1:], -1)

model.fit([encode_input, decode_input[:,:-1]], dec_target,
          batch_size=64,
          epochs=10,
          validation_split=0.2)

print("Training completed.")


Epoch 1/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 104ms/step - loss: 1.6459 - val_loss: 1.0996
Epoch 2/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 105ms/step - loss: 1.0636 - val_loss: 0.9186
Epoch 3/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 106ms/step - loss: 0.8436 - val_loss: 0.6336
Epoch 4/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 105ms/step - loss: 0.5834 - val_loss: 0.4588
Epoch 5/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 104ms/step - loss: 0.4269 - val_loss: 0.3489
Epoch 6/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 104ms/step - loss: 0.3286 - val_loss: 0.2808
Epoch 7/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 105ms/step - loss: 0.2646 - val_loss: 0.2364
Epoch 8/10
[1m640/640[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 103ms/step - loss: 0.2215 - val_loss: 0.2095
Epoch 9/10
[1m6

In [7]:
# testing
for i in range(5):
    print(f"input : {dfile.iloc[i,0]}")
    print(f"target: {dfile.iloc[i,1]}")
    print()

input : menmaithaan
target: மென்மைதான்

input : avarantri
target: அவரன்றி

input : mudiyarathu
target: முடியறது

input : aadaiyanigalaal
target: ஆடையணிகளால்

input : muzhumaiyaakkugindrana
target: முழுமையாக்குகின்றன

