<a href="https://colab.research.google.com/github/SheikhMudassarHanif/NLP/blob/main/assignment2rnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the dataset. The header row is read first so the two text columns can
# be selected by position (2nd and 3rd CSV columns) rather than by name.
# NOTE(review): the code below assumes those two columns are named
# 'SENTENCES ' (with a trailing space) and 'MEANING' - confirm against the CSV.
DATA_PATH = '/content/fulldatafullcleaned.csv'
SEED = 42  # fixed seed so the train/val/test split is identical on every run

columns = pd.read_csv(DATA_PATH, nrows=0).columns.tolist()
df = pd.read_csv(DATA_PATH, usecols=[columns[1], columns[2]])

# Drop rows with missing values, then exact duplicate rows
# (new frame instead of inplace mutation, so re-running the cell is safe)
df = df.dropna().drop_duplicates()

# Discard (not split) every pair where either side has more than
# `threshold` whitespace-separated words
threshold = 30
df = df[df['SENTENCES '].str.split().str.len() <= threshold]
df = df[df['MEANING'].str.split().str.len() <= threshold]

# Train-validation-test split (80% / 10% / 10%) of the shuffled rows.
# Positional slicing replaces np.split on a DataFrame, which goes through a
# deprecated pandas code path and emits a warning.
shuffled = df.sample(frac=1, random_state=SEED)
train_end = int(0.8 * len(shuffled))
val_end = int(0.9 * len(shuffled))
train = shuffled.iloc[:train_end]
val = shuffled.iloc[train_end:val_end]
test = shuffled.iloc[val_end:]

# Preprocessing hyperparameters
max_length = 30  # every sequence is padded / truncated to this many tokens
vocab_size = 20000
embedding_dim = 256

# English side (model inputs): the vocabulary is fitted on the training split
# only, then reused to encode all three splits as integer sequences.
eng_tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
eng_tokenizer.fit_on_texts(train['SENTENCES '].values)
train_eng_seq, val_eng_seq, test_eng_seq = (
    eng_tokenizer.texts_to_sequences(part['SENTENCES '].values)
    for part in (train, val, test)
)

# Fixed-width input arrays: pad at the end, and cut at the end if too long
X_train, X_val, X_test = (
    pad_sequences(seqs, maxlen=max_length, padding='post', truncating='post')
    for seqs in (train_eng_seq, val_eng_seq, test_eng_seq)
)

# Urdu side (labels): same procedure with its own tokenizer, again fitted on
# the training split only.
urd_tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
urd_tokenizer.fit_on_texts(train['MEANING'].values)
train_urd_seq, val_urd_seq, test_urd_seq = (
    urd_tokenizer.texts_to_sequences(part['MEANING'].values)
    for part in (train, val, test)
)

# Fixed-width label arrays, padded / truncated the same way as the inputs
y_train, y_val, y_test = (
    pad_sequences(seqs, maxlen=max_length, padding='post', truncating='post')
    for seqs in (train_urd_seq, val_urd_seq, test_urd_seq)
)


  return bound(*args, **kwds)


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense, Bidirectional, Dropout, TimeDistributed
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Parameters (adjust as needed)
embedding_dim = 256     # Embedding dimension for words
max_length = 30         # Sequence length the model is built for
eng_vocab_size = 20000  # Size of the English (input) index space
urd_vocab_size = 20000  # Size of the Urdu (output) index space
rnn_units = 128         # Units per direction in each SimpleRNN layer

# Model architecture.
# This is a per-position sequence labeller, not an encoder-decoder: every
# recurrent layer returns the full sequence and the final layer emits one
# softmax over the Urdu vocabulary for each of the max_length input positions.
# The input length is declared with an explicit Input instead of Embedding's
# deprecated `input_length` argument, so the model is built before summary().
model = Sequential([
    tf.keras.Input(shape=(max_length,)),
    Embedding(input_dim=eng_vocab_size, output_dim=embedding_dim),

    Bidirectional(SimpleRNN(rnn_units, return_sequences=True)),  # First Bidirectional RNN layer
    Dropout(0.3),

    Bidirectional(SimpleRNN(rnn_units, return_sequences=True)),  # Second Bidirectional RNN layer
    Dropout(0.3),

    Bidirectional(SimpleRNN(rnn_units, return_sequences=True)),  # Third Bidirectional RNN layer
    Dropout(0.3),

    # Per-time-step classifier over the Urdu vocabulary
    TimeDistributed(Dense(urd_vocab_size, activation='softmax')),
])

# Compile the model. Labels are integer ids, hence the sparse loss.
# NOTE(review): the Embedding does not mask index 0, so padded positions count
# towards both loss and accuracy; the reported accuracy includes padding.
model.compile(optimizer=Adam(learning_rate=0.001), loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()

# EarlyStopping and ReduceLROnPlateau callbacks for training
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=0.00001)

# Train on the padded arrays produced by the preprocessing cell
history = model.fit(
    X_train,  # Padded English index sequences
    y_train,  # Padded Urdu index sequences (position-aligned, not shifted)
    validation_data=(X_val, y_val),  # Validation data
    batch_size=64,
    epochs=20,
    callbacks=[early_stopping, reduce_lr]
)

# Save the trained model
model.save('bidirectional_rnn_model.h5')

# Evaluate the model on the test data; displays [loss, accuracy]
model.evaluate(X_test, y_test)



Epoch 1/20
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m173s[0m 464ms/step - accuracy: 0.6214 - loss: 4.0296 - val_accuracy: 0.6587 - val_loss: 2.3978 - learning_rate: 0.0010
Epoch 2/20
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 69ms/step - accuracy: 0.6555 - loss: 2.3925 - val_accuracy: 0.6692 - val_loss: 2.2560 - learning_rate: 0.0010
Epoch 3/20
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 68ms/step - accuracy: 0.6642 - loss: 2.2439 - val_accuracy: 0.6760 - val_loss: 2.1969 - learning_rate: 0.0010
Epoch 4/20
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 68ms/step - accuracy: 0.6709 - loss: 2.1545 - val_accuracy: 0.6810 - val_loss: 2.1535 - learning_rate: 0.0010
Epoch 5/20
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 67ms/step - accuracy: 0.6803 - loss: 2.0455 - val_accuracy: 0.6855 - val_loss: 2.1214 - learning_rate: 0.0010
Epoch 6/20
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[



[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 23ms/step - accuracy: 0.6884 - loss: 2.0736


[2.059185266494751, 0.6893100142478943]

In [None]:
# Score the first model on the held-out test arrays; the displayed list is
# the loss followed by the compiled metric(s).
model.evaluate(x=X_test, y=y_test)


[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.6884 - loss: 2.0736


[2.059185266494751, 0.6893100142478943]

In [None]:
# Function to translate an English sentence to Urdu
def translate_sentence(sentence, translator=None):
    """Translate one English sentence to Urdu in a single forward pass.

    The models in this notebook take only the padded English index sequence
    and emit one Urdu token distribution per position, so no decoder input or
    token-by-token loop is needed. The training targets built in this notebook
    are not given <start>/<end> markers, so none are used here either.

    Args:
        sentence: English text to translate.
        translator: Keras model to use; defaults to the notebook-level `model`.

    Returns:
        The predicted Urdu words joined by single spaces. Padding (index 0)
        and indices without a vocabulary entry are skipped.
    """
    if translator is None:
        translator = model

    # Encode exactly like the training inputs: pad and truncate at the end
    eng_seq = eng_tokenizer.texts_to_sequences([sentence])
    pad_eng_seq = pad_sequences(eng_seq, maxlen=max_length, padding='post', truncating='post')

    # One prediction yields the distribution for every position at once
    probabilities = translator.predict(pad_eng_seq, verbose=0)
    token_ids = np.argmax(probabilities[0], axis=-1)

    # Reverse lookup from index to Urdu word
    index_to_word = {index: word for word, index in urd_tokenizer.word_index.items()}
    words = [
        index_to_word[token_id]
        for token_id in token_ids
        if token_id != 0 and token_id in index_to_word
    ]
    return ' '.join(words)

# Quick manual check of the translator on a short sentence
test_sentence = "I am good"
translated_output = translate_sentence(sentence=test_sentence)
print(f'Translated Sentence: {translated_output}')


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17



Epoch 1/40
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 366ms/step - accuracy: 0.6479 - loss: 7.1673 - val_accuracy: 0.6681 - val_loss: 2.4171
Epoch 2/40
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 72ms/step - accuracy: 0.6705 - loss: 2.2959 - val_accuracy: 0.6770 - val_loss: 2.2010
Epoch 3/40
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 70ms/step - accuracy: 0.6794 - loss: 2.1176 - val_accuracy: 0.6791 - val_loss: 2.1410
Epoch 4/40
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 73ms/step - accuracy: 0.6839 - loss: 2.0479 - val_accuracy: 0.6817 - val_loss: 2.1268
Epoch 5/40
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 73ms/step - accuracy: 0.6855 - loss: 1.9944 - val_accuracy: 0.6832 - val_loss: 2.1160
Epoch 6/40
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 70ms/step - accuracy: 0.6903 - loss: 1.9224 - val_accuracy: 0.6855 - val_loss: 2.0923
Epoch 7/40
[1

# RNN final model

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional,Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences
# One sequence length for training and inference. The arrays from the
# preprocessing cell are already 30 wide and the inference cell pads to 30,
# so training at 30 keeps all three consistent.
max_input_length = 30
X_train_padded = pad_sequences(X_train, maxlen=max_input_length, padding='post')
X_val_padded = pad_sequences(X_val, maxlen=max_input_length, padding='post')
y_train_padded = pad_sequences(y_train, maxlen=max_input_length, padding='post')
y_val_padded = pad_sequences(y_val, maxlen=max_input_length, padding='post')

# Per-position labeller: embedding -> position-wise dense -> three stacked
# bidirectional SimpleRNN layers -> softmax over the Urdu vocabulary at every
# time step. The input length is declared with an explicit Input instead of
# Embedding's deprecated `input_length` argument.
model2 = tf.keras.Sequential([
    tf.keras.Input(shape=(max_input_length,)),
    tf.keras.layers.Embedding(input_dim=eng_vocab_size, output_dim=256),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(256, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(256, return_sequences=True)),
    # Only this last recurrent layer carries an L2 penalty on its input kernel
    tf.keras.layers.Bidirectional(tf.keras.layers.SimpleRNN(256, kernel_regularizer=tf.keras.regularizers.l2(0.01), return_sequences=True)),
    tf.keras.layers.TimeDistributed(Dense(urd_vocab_size, activation='softmax')),
])

# Integer labels -> sparse categorical cross-entropy.
# NOTE(review): no early-stopping callback is passed, so all 40 epochs run.
model2.compile(optimizer=Adam(learning_rate=0.0005), loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model2.summary()
history2 = model2.fit(X_train_padded, y_train_padded, epochs=40, batch_size=64, validation_data=(X_val_padded, y_val_padded))




Epoch 1/40
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 366ms/step - accuracy: 0.6479 - loss: 7.1673 - val_accuracy: 0.6681 - val_loss: 2.4171
Epoch 2/40
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 72ms/step - accuracy: 0.6705 - loss: 2.2959 - val_accuracy: 0.6770 - val_loss: 2.2010
Epoch 3/40
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 70ms/step - accuracy: 0.6794 - loss: 2.1176 - val_accuracy: 0.6791 - val_loss: 2.1410
Epoch 4/40
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 73ms/step - accuracy: 0.6839 - loss: 2.0479 - val_accuracy: 0.6817 - val_loss: 2.1268
Epoch 5/40
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 73ms/step - accuracy: 0.6855 - loss: 1.9944 - val_accuracy: 0.6832 - val_loss: 2.1160
Epoch 6/40
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 70ms/step - accuracy: 0.6903 - loss: 1.9224 - val_accuracy: 0.6855 - val_loss: 2.0923
Epoch 7/40
[1

In [None]:
sentence = 'i am good'

# Encode like the training inputs: pad and truncate at the end to 30 tokens
pad_eng_sentence = pad_sequences(
    eng_tokenizer.texts_to_sequences([sentence]),
    maxlen=30,
    padding='post',
    truncating='post',
)

# One forward pass gives a distribution per position; keep the most likely id
predictions = model2.predict(pad_eng_sentence)
predicted_seq = np.argmax(predictions, axis=-1)

# Create a reverse mapping from index to word for Urdu
urd_index_word = {v: k for k, v in urd_tokenizer.word_index.items()}

# Convert the predicted ids of the single input sentence to words, skipping
# padding (0) and any id without a vocabulary entry so that no empty strings
# (and therefore no double spaces) end up in the output
predicted_urdu_translation = [
    urd_index_word[word_index]
    for word_index in predicted_seq[0]
    if word_index != 0 and word_index in urd_index_word
]

# Join the predicted words to form the translated sentence
translated_sentence = ' '.join(predicted_urdu_translation)
print("Predicted Urdu Translation:", translated_sentence)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
Predicted Urdu Translation: میں ٹھیک ہے


In [None]:
# Persist model2 in the native Keras format.
# NOTE(review): the reload cell reads '/content/RNN.keras', not this file -
# confirm which checkpoint is meant to be loaded.
model2.save(filepath='RNN2.keras')

In [None]:
# Rebind model2 to a checkpoint read from disk.
# NOTE(review): this path differs from the 'RNN2.keras' written by the save
# cell - confirm that loading a different file is intended.
model2 = tf.keras.models.load_model(filepath='/content/RNN.keras')

In [None]:
# Train the currently bound model2 for 10 epochs on the padded train arrays,
# validating on the padded validation arrays after every epoch.
model2.fit(
    x=X_train_padded,
    y=y_train_padded,
    validation_data=(X_val_padded, y_val_padded),
    batch_size=64,
    epochs=10,
)

Epoch 1/10
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 253ms/step - accuracy: 0.8248 - loss: 0.8150 - val_accuracy: 0.6941 - val_loss: 2.3453
Epoch 2/10
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 71ms/step - accuracy: 0.8250 - loss: 0.8136 - val_accuracy: 0.6963 - val_loss: 2.3483
Epoch 3/10
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 76ms/step - accuracy: 0.8282 - loss: 0.7958 - val_accuracy: 0.6946 - val_loss: 2.3812
Epoch 4/10
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 70ms/step - accuracy: 0.8296 - loss: 0.7890 - val_accuracy: 0.6960 - val_loss: 2.3762
Epoch 5/10
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 68ms/step - accuracy: 0.8347 - loss: 0.7647 - val_accuracy: 0.6965 - val_loss: 2.3942
Epoch 6/10
[1m244/244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 69ms/step - accuracy: 0.8364 - loss: 0.7533 - val_accuracy: 0.6971 - val_loss: 2.4026
Epoch 7/10
[1m

<keras.src.callbacks.history.History at 0x7a29c8e374c0>

# End of RNN final model