In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense,LSTM,TimeDistributed,RepeatVector,GRU,Embedding
from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
df=pd.read_csv("/content/ara_eng.txt",delimiter="\t",names=["english","arabic"])

In [None]:
df.head()

Unnamed: 0,english,arabic
0,Hi.,مرحبًا.
1,Run!,اركض!
2,Help!,النجدة!
3,Jump!,اقفز!
4,Stop!,قف!


In [None]:
english_tokenizer=Tokenizer()
english_tokenizer.fit_on_texts(df["english"])

In [None]:
vocab_size_english=len(english_tokenizer.word_index)

In [None]:
vocab_size_english

26062

In [None]:
english_word_2_idx=english_tokenizer.word_index
english_idx_2_word={idx:word for word,idx in english_word_2_idx.items()}

In [None]:
arabic_tokenizer=Tokenizer()
arabic_tokenizer.fit_on_texts(df["arabic"])

In [None]:
vocab_size_arabic=len(arabic_tokenizer.word_index)+1

In [None]:
arabic_word_2_idx=arabic_tokenizer.word_index
arabic_idx_2_word={idx:word for word,idx in arabic_word_2_idx.items()}

In [None]:
token_eng=english_tokenizer.texts_to_sequences(df["english"])
token_ara=arabic_tokenizer.texts_to_sequences(df["arabic"])

In [None]:
padded_eng=pad_sequences(token_eng,maxlen=50,padding="post")
padded_ara=pad_sequences(token_ara,maxlen=50,padding="post")

In [None]:
model=Sequential()
model.add(Embedding(vocab_size_english,100,input_length=50))
model.add(tf.keras.layers.Bidirectional(LSTM(units=256)))
model.add(tf.keras.layers.RepeatVector(50))
model.add(LSTM(256,return_sequences=True))
model.add(TimeDistributed(Dense(vocab_size_arabic,activation="softmax")))

In [None]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 100)           2606200   
                                                                 
 bidirectional (Bidirection  (None, 512)               731136    
 al)                                                             
                                                                 
 repeat_vector (RepeatVecto  (None, 50, 512)           0         
 r)                                                              
                                                                 
 lstm_1 (LSTM)               (None, 50, 256)           787456    
                                                                 
 time_distributed (TimeDist  (None, 50, 57847)         14866679  
 ributed)                                                        
                                                        

In [None]:
model.compile(loss="sparse_categorical_crossentropy",optimizer=tf.keras.optimizers.RMSprop(),metrics=["accuracy"])

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(padded_eng,padded_ara,test_size=0.2,random_state=42)

In [None]:
model.fit(X_train,y_train,batch_size=50,validation_split=0.2,verbose=2,epochs=10)

Epoch 1/10
316/316 - 102s - loss: 3.3081 - accuracy: 0.7169 - val_loss: 3.0634 - val_accuracy: 0.7116 - 102s/epoch - 323ms/step
Epoch 2/10
316/316 - 85s - loss: 2.7211 - accuracy: 0.7204 - val_loss: 2.7303 - val_accuracy: 0.7176 - 85s/epoch - 269ms/step
Epoch 3/10
316/316 - 86s - loss: 2.6027 - accuracy: 0.7236 - val_loss: 2.6767 - val_accuracy: 0.7161 - 86s/epoch - 271ms/step
Epoch 4/10
316/316 - 86s - loss: 2.5727 - accuracy: 0.7241 - val_loss: 2.6492 - val_accuracy: 0.7180 - 86s/epoch - 273ms/step
Epoch 5/10
316/316 - 97s - loss: 2.5531 - accuracy: 0.7245 - val_loss: 2.6491 - val_accuracy: 0.7179 - 97s/epoch - 306ms/step
Epoch 6/10
316/316 - 86s - loss: 2.5374 - accuracy: 0.7248 - val_loss: 2.6503 - val_accuracy: 0.7174 - 86s/epoch - 274ms/step
Epoch 7/10
316/316 - 87s - loss: 2.5257 - accuracy: 0.7252 - val_loss: 2.6334 - val_accuracy: 0.7184 - 87s/epoch - 275ms/step
Epoch 8/10
316/316 - 97s - loss: 2.5150 - accuracy: 0.7255 - val_loss: 2.6499 - val_accuracy: 0.7176 - 97s/epoch - 3

<keras.src.callbacks.History at 0x7f968c0abdc0>

In [None]:
model.evaluate(X_test,y_test)



[2.55092453956604, 0.727321445941925]

In [None]:
X_test.shape

(4928, 50)

In [None]:
preds = model.predict(X_test[:10])

predicts = []
for i in preds:
    predicts.append(np.argmax(i[[0]]))

print([english_idx_2_word[w] for w in X_test[10] if w != 0])
print([arabic_idx_2_word[w] for w in y_test[10] if w != 0])
print([arabic_idx_2_word[w] for w in predicts if w != 0])

["he's", 'my', 'brother']
['هو', 'أخي']
['هل', 'في', 'في', 'في', 'هل', 'في', 'لم', 'هل', 'في', 'هل']


In [None]:
preds.shape

(10, 50, 57847)

In [None]:
from nltk.translate.bleu_score import corpus_bleu

import re

# Function to clean and normalize text
def clean_text(text):
    # Remove extra spaces, special characters, and repeated values
    cleaned_text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)  # Remove special characters
    cleaned_text = re.sub(r'(\b\w+\b)(?=.*\1)', '', cleaned_text)  # Remove repeated words

    return cleaned_text.strip()

df["arabic_reference_cleaned"] = df["arabic"].apply(clean_text)

# Extract candidate and reference translations
candidates = [arabic_idx_2_word[w] for w in predicts if w != 0]
references = df['arabic_reference_cleaned'].tolist()[:10]

# Tokenize the sentences
candidates = [sentence.split() for sentence in candidates]
references = [[sentence.split()] for sentence in references]

# Calculate BLEU score with 2-gram without smoothing
bleu_score_value = corpus_bleu(references, candidates, weights=(0.5, 0.5), smoothing_function=None)

# Print the BLEU score
print(f"Overall BLEU Score: {bleu_score_value * 100:.2f}%")


Overall BLEU Score: 0.00%


In [None]:
!pip install sentencepiece transformers==4.26.1

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.26.1
  Downloading transformers-4.26.1-py3-none-any.whl (6.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m75.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.26.1)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m99.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, sentencepiece, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.15.0
    Uninstalling tokenizers-0.15.0:
      Successfully uninstalled tokenizers-0.15.0
  

In [None]:
from transformers import MarianMTModel, MarianTokenizer
import pandas as pd
import torch

In [None]:
# Function for translation
def translate_text(source_text, source_lang, target_lang):
    model_name = f'Helsinki-NLP/opus-mt-{source_lang}-{target_lang}'
    model = MarianMTModel.from_pretrained(model_name)
    tokenizer = MarianTokenizer.from_pretrained(model_name)

    input_ids = tokenizer.encode(source_text, return_tensors="pt")

    with torch.no_grad():
        output = model.generate(input_ids)

    translated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    return translated_text

In [None]:
# Read your DataFrame from the CSV file
df = pd.read_csv("/content/ara_eng.txt", delimiter="\t", names=["english", "arabic"])
df = df[:400]

In [None]:
# Choose the source and target languages
source_lang = "en"
target_lang = "ar"

In [None]:
# Translate the English text to Arabic and create a new column for the translations
df["arabic_translation"] = df["english"].apply(lambda x: translate_text(x, source_lang, target_lang))

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/801k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.12M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]



In [None]:
df.head()

Unnamed: 0,english,arabic,arabic_translation
0,Hi.,مرحبًا.,مرحباً.. مرحباً..
1,Run!,اركض!,! أركض! أركض!
2,Help!,النجدة!,النجدة، النجدة، المساعدة، المساعدة، المساعدة، ...
3,Jump!,اقفز!,! اقفزوا! اقفزوا
4,Stop!,قف!,! توقّف! توقّف! توقّف!


In [None]:
import re

# Function to clean and normalize text
def clean_text(text):
    # Remove extra spaces, special characters, and repeated values
    cleaned_text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)  # Remove special characters
    cleaned_text = re.sub(r'(\b\w+\b)(?=.*\1)', '', cleaned_text)  # Remove repeated words

    return cleaned_text.strip()

In [None]:
# Clean and normalize reference and translation columns
df["arabic_reference_cleaned"] = df["arabic"].apply(clean_text)
df["arabic_translation_cleaned"] = df["arabic_translation"].apply(clean_text)

In [None]:
df["arabic_reference_cleaned"]

0            مرحبا
1             اركض
2           النجدة
3             اقفز
4               قف
          ...     
395          لنبدأ
396         لنحتفل
397    دعونا ننقسم
398     دعنا ننقسم
399    دعينا ننقسم
Name: arabic_reference_cleaned, Length: 400, dtype: object

In [None]:
df["arabic_translation_cleaned"]

0                            مرحبا
1                             أركض
2      النجدة             المساعدة
3                           اقفزوا
4                             توقف
                  ...             
395                          لنبدأ
396                     دعنا نحتفل
397                         لننقسم
398                         لننقسم
399                         لننقسم
Name: arabic_translation_cleaned, Length: 400, dtype: object

In [None]:
# Function to calculate accuracy considering partial match
def partial_accuracy(reference, translation):
    reference_tokens = set(reference.split())
    translation_tokens = set(translation.split())
    common_tokens = reference_tokens.intersection(translation_tokens)

    return len(common_tokens) > 0

# Calculate overall accuracy
correct_translations = df.apply(lambda row: partial_accuracy(row["arabic_reference_cleaned"], row["arabic_translation_cleaned"]), axis=1)
accuracy = correct_translations.sum() / len(df)

print(f"Overall Accuracy: {accuracy:.2%}")

Overall Accuracy: 60.25%


In [None]:
!pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python_Levenshtein-0.23.0-py3-none-any.whl (9.4 kB)
Collecting Levenshtein==0.23.0 (from python-Levenshtein)
  Downloading Levenshtein-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (169 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m169.4/169.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rapidfuzz<4.0.0,>=3.1.0 (from Levenshtein==0.23.0->python-Levenshtein)
  Downloading rapidfuzz-3.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, Levenshtein, python-Levenshtein
Successfully installed Levenshtein-0.23.0 python-Levenshtein-0.23.0 rapidfuzz-3.5.2


In [None]:
import Levenshtein

def calculate_similarity(original_text, comparison_text):
    distance = Levenshtein.distance(original_text.lower(), comparison_text.lower())
    max_len = max(len(original_text), len(comparison_text))
    similarity_percentage = ((max_len - distance) / max_len) * 100
    return similarity_percentage

# Calculate similarity percentages
similarity = df.apply(lambda row: calculate_similarity(row["arabic_reference_cleaned"], row["arabic_translation_cleaned"]), axis=1)

# Calculate and print the average similarity
average_similarity = similarity.mean()
print(f"Average Similarity: {average_similarity:.2f}%")

Average Similarity: 55.62%


In [None]:
!pip install nltk



In [None]:
from nltk.translate.bleu_score import corpus_bleu

# Extract candidate and reference translations
candidates = df['arabic_translation_cleaned'].tolist()
references = df['arabic_reference_cleaned'].tolist()

# Tokenize the sentences
candidates = [sentence.split() for sentence in candidates]
references = [[sentence.split()] for sentence in references]

# Calculate BLEU score with 2-gram without smoothing
bleu_score_value = corpus_bleu(references, candidates, weights=(0.5, 0.5), smoothing_function=None)

# Print the BLEU score
print(f"Overall BLEU Score: {bleu_score_value * 100:.2f}%")


Overall BLEU Score: 29.06%
