In [44]:
import tensorflow as tf
import re

# Request on-demand GPU memory allocation for every visible GPU.
# With no GPUs the list is empty and the loop body simply never runs.
gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    # Report the problem and carry on instead of aborting the cell
    # (presumably raised when the devices were already initialised -- see TF docs).
    print(err)

# Locations of the two SavedModel exports used by this notebook.
model_name = 'armenian_tokenizer'
save_path = "/home/vahan/Documents/machine_translation/latest_correction_model"

# Restore the tokenizer (the `am` attribute of its SavedModel) and the correction model.
tokenizer = tf.saved_model.load(model_name).am
loaded_model = tf.saved_model.load(save_path)
print("Model and tokenizer loaded successfully")

class Corrector(tf.Module):
    """Greedy, token-by-token decoder built from a tokenizer and a correction model.

    The input sentence is tokenized and passed to ``model.generate`` together
    with the tokens produced so far; at each step the highest-scoring next
    token is appended until the end token is produced or ``max_length`` steps
    have run.
    """

    def __init__(self, tokenizer, model):
        """Store the collaborators used for decoding.

        Args:
            tokenizer: object exposing ``tokenize``, ``detokenize`` and ``lookup``.
            model: object exposing ``generate(encoder_input, decoder_input)``.
        """
        # Initialise the tf.Module base so its `name` / `name_scope` are set up.
        super().__init__()
        self.tokenizer = tokenizer
        self.model = model

    def __call__(self, sentence, max_length=65):
        """Decode a corrected version of ``sentence``.

        Args:
            sentence: a ``tf.Tensor`` (presumably of dtype string -- confirm
                against the tokenizer). A scalar is expanded to a batch of one.
            max_length: maximum number of decoding steps.

        Returns:
            A ``(text, tokens)`` pair: element 0 of ``tokenizer.detokenize(output)``
            and element 0 of ``tokenizer.lookup(output)``.

        Raises:
            AssertionError: if ``sentence`` is not a ``tf.Tensor``.
        """
        assert isinstance(sentence, tf.Tensor)
        if len(sentence.shape) == 0:
            # Scalar input: add a leading axis so the tokenizer receives a batch.
            sentence = sentence[tf.newaxis]

        # `.to_tensor()` densifies the tokenizer output
        # (presumably a RaggedTensor -- TODO confirm).
        sentence = self.tokenizer.tokenize(sentence).to_tensor()
        encoder_input = sentence

        # Tokenizing an empty string yields a two-element row, unpacked here as
        # the start and end ids (presumably the [START]/[END] markers -- confirm).
        start, end = self.tokenizer.tokenize([''])[0]
        start, end = start[tf.newaxis], end[tf.newaxis]

        # Growable buffer of generated token ids, seeded with the start id.
        output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
        output_array = output_array.write(0, start)

        for i in tf.range(max_length):
            # Everything generated so far becomes the decoder input for this step.
            output = tf.transpose(output_array.stack())
            predictions = self.model.generate(
                tf.cast(encoder_input, tf.int32),
                tf.cast(output, tf.int32)
            )
            # Keep only the last position along axis 1, then pick the
            # highest-scoring entry along the final axis.
            predictions = predictions[:, -1:, :]
            predicted_id = tf.argmax(predictions, axis=-1)

            # NOTE(review): only index 0 is stored and the stop test compares the
            # whole tensor with `end`, so this appears to assume a batch of one.
            output_array = output_array.write(i + 1, predicted_id[0])

            if predicted_id == end:
                break

        output = tf.transpose(output_array.stack())
        text = self.tokenizer.detokenize(output)[0]
        tokens = self.tokenizer.lookup(output)[0]

        return text, tokens

# Shared corrector instance wired to the tokenizer and model loaded above.
corrector = Corrector(tokenizer=tokenizer, model=loaded_model)

class Preprocessing:
    """Clean raw transcriptions into lowercase Armenian-only sentences.

    Sentences containing any digit are discarded. The remaining ones are
    lowercased, stripped of every character other than the lowercase Armenian
    letters "ա".."և", the comma, the space and the full stop, and normalised
    (":" -> "։", "եւ" -> "և", runs of whitespace -> a single space).
    """

    # Non-letter characters that survive filtering. The ASCII colon is kept
    # only so that it can be rewritten to the Armenian full stop afterwards.
    _KEPT_CHARACTERS = frozenset({",", "։", " ", ":"})

    def __init__(self, normalized_transcription):
        """Store the sentences to clean.

        Args:
            normalized_transcription: iterable of sentence strings.
        """
        self.normalized_transcription = normalized_transcription
        self.new_sentences = []

    def perform_preprocessing(self):
        """Return the cleaned sentences, skipping any that end up empty.

        The result is rebuilt from scratch on every call, so calling this
        method repeatedly (e.g. re-running a notebook cell) does not
        accumulate duplicates. The same list is stored on ``self.new_sentences``.
        """
        cleaned = []
        for sentence in self.normalized_transcription:
            # `search` finds a digit anywhere, including after a newline, which
            # a `.*\d.*` pattern anchored with `match` would miss.
            if re.search(r"\d", sentence):
                continue

            lowered = sentence.lower()
            # Single-character string comparison is by code point, so this is
            # the range check from "ա" to "և" inclusive.
            new_sent = "".join(
                ch for ch in lowered
                if "ա" <= ch <= "և" or ch in self._KEPT_CHARACTERS
            ).strip()
            new_sent = new_sent.replace(":", "։").replace("եւ", "և")
            new_sent = re.sub(r"\s+", " ", new_sent)

            if new_sent:
                cleaned.append(new_sent)

        self.new_sentences = cleaned
        return self.new_sentences

# Example usage: clean one misspelled sentence, run it through the corrector,
# and show the decoded result as the cell output.
sentence = "բարև,վսկու առհեքս ինցքան է"
prp = Preprocessing(normalized_transcription=[sentence])
filtered_sentence = prp.perform_preprocessing()

translated_text, translated_tokens = corrector(sentence=tf.constant(filtered_sentence))
# The corrector returns a tensor; turn it into a regular Python string.
translated_text = translated_text.numpy().decode("utf-8")
translated_text

Model and tokenizer loaded successfully


'բարեւ , ոսկու արտահոսք ինչքան է'

In [45]:
# The corrector output spells the conjunction with the two letters 'եւ';
# replace it with the single ligature 'և', the same form Preprocessing produces.
updated_sentence = translated_text.replace('եւ', 'և')
print(updated_sentence)


բարև , ոսկու արտահոսք ինչքան է


In [46]:
# NOTE(review): bare string literal with no computation -- presumably an earlier
# model output kept for comparison; consider moving it to markdown or removing it.
'բարեւ , ոսկու առեք ինչքան է'

'բարեւ , ոսկու առեք ինչքան է'

In [48]:
# NOTE(review): looks like a leftover smoke-test cell (execution count skips 47);
# consider removing it before sharing the notebook.
print(55)

55
