# Notebook for Training a Machine Translation Model and Translating the Whisper Transcript

## Import the required packages

In [7]:
import os
import sys
import transformers
import tensorflow as tf
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import TFAutoModelForSeq2SeqLM, DataCollatorForSeq2Seq
from transformers import AdamWeightDecay

### Load the pre-trained model - [opus-mt-bg-en](https://huggingface.co/Helsinki-NLP/opus-mt-bg-en)

In [6]:
# Hugging Face Hub id of the pre-trained Bulgarian -> English checkpoint
model_chechpoint = "Helsinki-NLP/opus-mt-bg-en"

### Load the dataset - [opus100/bg-en](https://huggingface.co/datasets/Helsinki-NLP/opus-100)

In [5]:
# Fetch the Bulgarian-English configuration of the OPUS-100 corpus
dataset = load_dataset(path="opus100", name="bg-en")

README.md:   0%|          | 0.00/65.4k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/160k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/71.3M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/154k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2000 [00:00<?, ? examples/s]

### Check the sets

In [6]:
# Display the loaded dataset object to inspect its splits and their sizes
dataset

DatasetDict({
    test: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
    train: Dataset({
        features: ['translation'],
        num_rows: 1000000
    })
    validation: Dataset({
        features: ['translation'],
        num_rows: 2000
    })
})

### Check an example of the dataset

In [7]:
# Look at the first record of the training split
dataset['train'][0]

{'translation': {'bg': 'Сериозно ли?', 'en': 'Are you serious?'}}

## Preprocessing the data

In [None]:
# load the tokenizer that belongs to the pre-trained checkpoint
# (only the tokenizer is created here; the model is loaded in a later cell)
tokenizer = AutoTokenizer.from_pretrained(model_chechpoint)



### Check the tokenizer

In [None]:
# Sanity-check the tokenizer on one Bulgarian and one English sentence
sample_sentences = ['Здравей, как си днес?', 'How are you today?']
tokenizer(sample_sentences)

{'input_ids': [[671, 3, 339, 35, 880, 5, 0], [578, 1131, 22, 6784, 22, 1930, 12, 2250, 5, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

### Define the preprocessing function to tokenize the dataset

In [None]:
# Maximum number of tokens kept for source and target sentences (longer ones are truncated)
max_input_length = 128
max_target_length = 128
# Language keys inside each "translation" record
source_lang = 'bg'
target_lang = 'en'


def preprocess_function(examples):
    """Tokenize a batch of translation pairs for seq2seq training.

    Args:
        examples: Batch mapping whose "translation" entry is a list of dicts
            keyed by language code (here ``source_lang`` and ``target_lang``).

    Returns:
        The tokenizer output for the source sentences, extended with a
        "labels" entry holding the token ids of the target sentences.
    """
    inputs = [ex[source_lang] for ex in examples["translation"]]
    targets = [ex[target_lang] for ex in examples["translation"]]

    # Source side: truncate to max_input_length (the target limit was used here before).
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Target side: `text_target=` replaces the deprecated as_target_tokenizer() context.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

### Apply the function to a sample from the dataset

In [12]:
# Run the preprocessing on the first two training examples as a quick check
preprocess_function(dataset["train"][:2])

{'input_ids': [[4258, 49, 5, 0], [644, 103, 28, 0]], 'attention_mask': [[1, 1, 1, 1], [1, 1, 1, 1]], 'labels': [[468, 14, 1681, 5, 0], [22, 662, 28, 0]]}

### Apply the preprocess function to the whole dataset

In [None]:
# Run the preprocessing over every split, handing the function whole batches at a time
tokenized_datasets = dataset.map(function=preprocess_function, batched=True)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [None]:
# Load the pre-trained seq2seq model from the checkpoint
model = TFAutoModelForSeq2SeqLM.from_pretrained(model_chechpoint)

tf_model.h5:   0%|          | 0.00/306M [00:00<?, ?B/s]

2025-03-22 13:37:05.952852: W tensorflow/core/common_runtime/gpu/gpu_bfc_allocator.cc:47] Overriding orig_value setting because the TF_FORCE_GPU_ALLOW_GROWTH environment variable is set. Original config value was 0.
2025-03-22 13:37:05.954336: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1928] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 46228 MB memory:  -> device: 0, name: NVIDIA RTX 6000 Ada Generation, pci bus id: 0000:41:00.0, compute capability: 8.9
All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at Helsinki-NLP/opus-mt-bg-en.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


### Define parameters for training

In [23]:
# Optimizer settings
learning_rate = 2e-5
weight_decay = 0.01
# Training-loop settings
num_train_epochs = 3
batch_size = 16

### Define data collator

In [None]:
# Define the data collator used to assemble training/validation batches as TensorFlow tensors
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf")

In [None]:
# Define a second collator for the generation dataset; it pads to a multiple of 128
generation_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, return_tensors="tf", pad_to_multiple_of=128)

### Prepare the training and validation sets

In [None]:
# Build the shuffled training dataset from the tokenized "train" split
train_dataset = model.prepare_tf_dataset(
    dataset=tokenized_datasets["train"],
    collate_fn=data_collator,
    batch_size=batch_size,
    shuffle=True,
)

In [None]:
# define validation set
validation_dataset = model.prepare_tf_dataset(
    tokenized_datasets["validation"],  # use the validation split of the tokenized dataset
    batch_size=batch_size,  # same batch size as for training
    # Evaluation gains nothing from shuffling. prepare_tf_dataset also ties its
    # drop_remainder default to `shuffle`, so shuffle=False keeps a final partial batch.
    shuffle=False,
    collate_fn=data_collator,  # collate function
)

In [None]:
# Build an unshuffled dataset from the validation split, batched by 8 with the generation collator
generation_dataset = model.prepare_tf_dataset(
    dataset=tokenized_datasets["validation"],
    collate_fn=generation_data_collator,
    shuffle=False,
    batch_size=8,
)

In [None]:
# Adam with decoupled weight decay, configured from the hyper-parameters defined earlier
optimizer = AdamWeightDecay(
    weight_decay_rate=weight_decay,
    learning_rate=learning_rate,
)
# No loss is passed to compile() - presumably the model's built-in loss is used; TODO confirm
model.compile(optimizer=optimizer)

### Train the model and save it

In [None]:
# Fine-tune the model on the training set for `num_train_epochs` epochs,
# evaluating on the validation set; the returned History object is shown as the cell output
model.fit(train_dataset, validation_data=validation_dataset, epochs=num_train_epochs)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x7fdc84111350>

In [None]:
# Save the fine-tuned model into the folder tf_model/
# (only the model is saved here; the tokenizer is re-loaded from the original checkpoint later)
model.save_pretrained("tf_model/")

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[61812]]}


----
### Test the model on translating a sentence

In [None]:
# load the tokenizer from the original pre-trained checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_chechpoint)
# load the fine-tuned model from the local folder tf_model/
model = TFAutoModelForSeq2SeqLM.from_pretrained("tf_model/")

All model checkpoint layers were used when initializing TFMarianMTModel.

All the layers of TFMarianMTModel were initialized from the model checkpoint at tf_model/.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFMarianMTModel for predictions without further training.


In [None]:
# Bulgarian sentence used as a quick smoke test of the fine-tuned model
input_text = 'Обичам да чета книги, когато навън вали дъжд.'
# Encode it as a batch of one, returned as NumPy arrays
tokenized = tokenizer([input_text], return_tensors='np')
# Generate the translation token ids, capped at a length of 128
out = model.generate(
    input_ids=tokenized["input_ids"],
    attention_mask=tokenized["attention_mask"],
    max_length=128,
)
# Show the raw token ids
out

<tf.Tensor: shape=(1, 13), dtype=int32, numpy=
array([[61812,    11,   111,    12,  1366,  4669,   220,    33,  5266,
           10,  1629,     2,     0]])>

In [None]:
# Decode the generated token ids back into text. Decoding does not need the
# deprecated as_target_tokenizer() context; the transcript translation loop
# further down also decodes without it.
print(tokenizer.decode(out[0], skip_special_tokens=True))

I like to read books when it rains outside.


-----------

## Use the model to translate the transcript from Whisper

In [None]:
# import pandas to read the csv file
import pandas as pd

### Load the model and tokenizer

In [None]:
# Hub id of the original checkpoint; it is used below only to load the tokenizer
model_chechpoint ="Helsinki-NLP/opus-mt-bg-en"
# load the tokenizer from the original checkpoint and the fine-tuned model from tf_model/
tokenizer = AutoTokenizer.from_pretrained(model_chechpoint)
model = TFAutoModelForSeq2SeqLM.from_pretrained("tf_model/")

### Load the Whisper CSV file and use the first 200 lines (4 group members, 50 lines each, for the quality check)

In [None]:
# define the file path to Whisper CSV file
file_path = "STT_Whisper.csv"
# Parse only the first 200 rows instead of reading the whole file and discarding the rest
df = pd.read_csv(file_path, nrows=200)

### Check the first 10 lines before translation

In [None]:
# Preview the first 10 rows of the loaded transcript
df.head(10)

Unnamed: 0,Start Time,End Time,Transcription
0,0,2,СОФИЯ
1,30,3468,"Той беше обещал, че ще я сълбажда, че ще я тъ..."
2,3546,381,"Тя много страдаше, много плачеше."
3,39,4292,Аз по някакъв начин исках да компенсирам него...
4,4352,473,"затова си позволих да ѝ купувам всичко, какво..."
5,4796,5012,обаче с годините ми се качи на главата.
6,5112,5404,Отношенията между майка и дъщеря се влушават ...
7,544,5638,"когато новият приятел на Лилияна, Теодор,"
8,5638,5996,се нанася да живее при тях заедно с сина си Н...
9,60,6516,Дори и най-дредната молба за помощ от страна ...


### Iterate over the sentences and collect their translations

In [None]:
# Translate every row of the Transcription column, one sentence at a time.
translations = []
for sentence in df["Transcription"]:
    # A blank CSV cell is read by pandas as NaN (a float), which the tokenizer
    # cannot handle; keep the row aligned by storing an empty translation instead.
    if not isinstance(sentence, str) or not sentence.strip():
        translations.append("")
        continue
    # tokenize the sentence (inputs longer than 128 tokens are truncated)
    tokenized = tokenizer([sentence], return_tensors='np', max_length=128, truncation=True)
    # generate the output token ids
    output = model.generate(**tokenized, max_length=128)
    # decode the token ids into text and collect the result
    translations.append(tokenizer.decode(output[0], skip_special_tokens=True))
# add the translations to the dataframe as a new column
df["Translation"] = translations

### Check the first 10 lines after translation

In [None]:
# Preview the first 10 rows again, now including the Translation column
df.head(10)

Unnamed: 0,Start Time,End Time,Transcription,Translation
0,0,2,СОФИЯ,SOFIA
1,30,3468,"Той беше обещал, че ще я сълбажда, че ще я тъ...","He promised he'd say he'd look for her, but he..."
2,3546,381,"Тя много страдаше, много плачеше.","She was in a lot of pain, a lot of crying."
3,39,4292,Аз по някакъв начин исках да компенсирам него...,"I somehow wanted to compensate for his absence,"
4,4352,473,"затова си позволих да ѝ купувам всичко, какво...",So I took the liberty of buying her everything...
5,4796,5012,обаче с годините ми се качи на главата.,"But over the years, he got on my head."
6,5112,5404,Отношенията между майка и дъщеря се влушават ...,The relationship between a mother and a daught...
7,544,5638,"когато новият приятел на Лилияна, Теодор,","When Liliana's new friend, Theodore,"
8,5638,5996,се нанася да живее при тях заедно с сина си Н...,"moved in with his son, Nicholas, to live with ..."
9,60,6516,Дори и най-дредната молба за помощ от страна ...,Even the most appropriate request for help fro...


### Save the dataframe as an Excel file

In [None]:
# Write the transcript and its translations to an Excel workbook, without the row index
df.to_excel("translations_score.xlsx", index=False)
# Print a confirmation once the export call has returned
print("Translation done!")

Translation done!
