In [79]:
import pandas as pd

In [80]:
df=pd.read_excel('parallel-corpus.xlsx')
df.head(2)
df.columns
# df.info()

Index(['SENTENCES ', 'MEANING', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4',
       'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9',
       'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13',
       'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17',
       'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21',
       'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25',
       'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28', 'Unnamed: 29',
       'Unnamed: 30', 'Unnamed: 31'],
      dtype='object')

In [81]:
df.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4','Unnamed: 5', 'Unnamed: 6', 
         'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9','Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13',
         'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17',
         'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21',
         'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25',
         'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28', 'Unnamed: 29',
         'Unnamed: 30', 'Unnamed: 31'], axis = 1, inplace = True)
df.head(2)

Unnamed: 0,SENTENCES,MEANING
0,How can I communicate with my parents?,میں اپنے والدین سے کیسے بات کروں ؟
1,How can I make friends?’,میں دوست کیسے بنائوں ؟


In [82]:
print(df.isna().sum())

SENTENCES      44
MEANING       546
dtype: int64


In [83]:
# df.dropna(inplace=True)
# Clean the dataset by keeping only the relevant columns: 'SENTENCES' (English) and 'MEANING' (Urdu)
df_cleaned = df[['SENTENCES ', 'MEANING']].dropna()

# Renaming columns for clarity
df_cleaned.columns = ['English', 'Urdu']

# Display cleaned data
df_cleaned.head()


Unnamed: 0,English,Urdu
0,How can I communicate with my parents?,میں اپنے والدین سے کیسے بات کروں ؟
1,How can I make friends?’,میں دوست کیسے بنائوں ؟
2,Why do I get so sad?’,میں اتنا اداس کیوں ہوں؟.
3,"If you’ve asked yourself such questions, you’r...",اگر آپ نے اپنے آپ سے ایسے سوالات کیے ہیں، تو آ...
4,"Depending on where you’ve turned for guidance,...",اس بات پر منحصر ہے کہ آپ رہنمائی کے لیے کہاں ...


In [84]:
df.head()

Unnamed: 0,SENTENCES,MEANING
0,How can I communicate with my parents?,میں اپنے والدین سے کیسے بات کروں ؟
1,How can I make friends?’,میں دوست کیسے بنائوں ؟
2,Why do I get so sad?’,میں اتنا اداس کیوں ہوں؟.
3,"If you’ve asked yourself such questions, you’r...",اگر آپ نے اپنے آپ سے ایسے سوالات کیے ہیں، تو آ...
4,"Depending on where you’ve turned for guidance,...",اس بات پر منحصر ہے کہ آپ رہنمائی کے لیے کہاں ...


In [86]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Hyperparameters
MAX_VOCAB_SIZE = 10000  # Limit the vocabulary size
MAX_SEQUENCE_LENGTH = 20  # Max length of sentences (after padding)

# Tokenizer for English and Urdu
eng_tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, filters='')
urdu_tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE, filters='')

# Ensure that all data in the 'English' and 'Urdu' columns are strings
df_cleaned['English'] = df_cleaned['English'].astype(str)
df_cleaned['Urdu'] = df_cleaned['Urdu'].astype(str)

# Now, retry tokenization
eng_tokenizer.fit_on_texts(df_cleaned['English'])
urdu_tokenizer.fit_on_texts(df_cleaned['Urdu'])
# Convert the text into sequences of integers
eng_sequences = eng_tokenizer.texts_to_sequences(df_cleaned['English'])
urdu_sequences = urdu_tokenizer.texts_to_sequences(df_cleaned['Urdu'])

# Pad the sequences to ensure uniform input size
eng_padded = pad_sequences(eng_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
urdu_padded = pad_sequences(urdu_sequences, maxlen=MAX_SEQUENCE_LENGTH, padding='post')

# Split into training, validation, and test sets (80%, 10%, 10%)
eng_train, eng_test, urdu_train, urdu_test = train_test_split(eng_padded, urdu_padded, test_size=0.2, random_state=42)
eng_train, eng_val, urdu_train, urdu_val = train_test_split(eng_train, urdu_train, test_size=0.1, random_state=42)

# Displaying the shape of the data
print(eng_train.shape, urdu_train.shape, eng_val.shape, urdu_val.shape, eng_test.shape, urdu_test.shape)


(21323, 20) (21323, 20) (2370, 20) (2370, 20) (5924, 20) (5924, 20)


In [88]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense

# Hyperparameters
embedding_dim = 128
rnn_units = 256

# Define the model architecture
model = Sequential()
model.add(Embedding(input_dim=MAX_VOCAB_SIZE, output_dim=embedding_dim, input_length=MAX_SEQUENCE_LENGTH))
model.add(SimpleRNN(rnn_units, return_sequences=True))
model.add(Dense(MAX_VOCAB_SIZE, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Summary of the model
model.summary()




In [89]:
# Train the model
history = model.fit(eng_train, urdu_train, 
                    epochs=10, 
                    validation_data=(eng_val, urdu_val))


Epoch 1/10
[1m667/667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 221ms/step - accuracy: 0.4754 - loss: 4.3913 - val_accuracy: 0.5071 - val_loss: 3.4502
Epoch 2/10
[1m667/667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 222ms/step - accuracy: 0.5100 - loss: 3.3233 - val_accuracy: 0.5229 - val_loss: 3.2806
Epoch 3/10
[1m667/667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 236ms/step - accuracy: 0.5349 - loss: 3.0430 - val_accuracy: 0.5307 - val_loss: 3.1786
Epoch 4/10
[1m667/667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m164s[0m 245ms/step - accuracy: 0.5412 - loss: 2.8959 - val_accuracy: 0.5363 - val_loss: 3.1228
Epoch 5/10
[1m667/667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m148s[0m 222ms/step - accuracy: 0.5480 - loss: 2.7578 - val_accuracy: 0.5401 - val_loss: 3.0835
Epoch 6/10
[1m667/667[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m152s[0m 227ms/step - accuracy: 0.5524 - loss: 2.6481 - val_accuracy: 0.5429 - val_loss: 3.0653
Epoc

In [90]:
# Predict translations for the test set
predictions = model.predict(eng_test)

# Convert predictions from sequences back to words
def sequence_to_text(tokenizer, sequences):
    reverse_word_map = dict(map(reversed, tokenizer.word_index.items()))
    return " ".join([reverse_word_map.get(i, '') for i in sequences])

for i in range(5):  # Check the first 5 examples
    predicted_sequence = predictions[i]
    actual_sequence = urdu_test[i]

    predicted_sentence = sequence_to_text(urdu_tokenizer, predicted_sequence)
    actual_sentence = sequence_to_text(urdu_tokenizer, actual_sequence)

    print(f"English: {sequence_to_text(eng_tokenizer, eng_test[i])}")
    print(f"Predicted Urdu: {predicted_sentence}")
    print(f"Actual Urdu: {actual_sentence}")
    print(f"BLEU Score: {sentence_bleu([actual_sentence.split()], predicted_sentence.split())}")
    print("\n")


[1m186/186[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 40ms/step


TypeError: unhashable type: 'numpy.ndarray'