<div class="markdown-google-sans">

## **Badaga - English Transliteration Project**
## **MarianMT**
</div>

Installation

In [None]:
!pip install transformers[torch] sentencepiece accelerate datasets sacrebleu

Collecting accelerate
  Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacrebleu
  Downloading sacrebleu-2.4.1-py3-none-any.whl (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.6/106.6 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, DataCollatorForSeq2Seq, AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset, load_metric
import datasets
import random
from IPython.display import display, HTML

In [None]:
# Load the parallel corpus from the Excel workbook and preview the first rows.
df = pd.read_excel('Badaga-v0.1.0.xlsx')
df.head()

Unnamed: 0,items,translated_transcript,audio_file_name,translterated_script,user_id,gender,locale,split_label,duration
0,1,what is the recipe in home,F002_1_1.mp3,manaya aena udhaka,F002,F,ba,train,2.377187
1,2,who is there near you,F002_1_2.mp3,pakka dhara edhdharae,F002,F,ba,train,2.377187
2,3,what did you prepare for lunch,F002_1_3.mp3,hagulu hasuga aena maditha,F002,F,ba,train,2.351062
3,4,did you brush,F002_1_4.mp3,hallu ujjithaya,F002,F,ba,train,2.194313
4,5,did you eat,F002_1_5.mp3,nee thindhubutaya,F002,F,ba,train,2.272687


In [None]:
# Number of rows per value of the `split_label` column.
df.split_label.value_counts()

train         6897
validation    1470
test          1470
Name: split_label, dtype: int64

In [None]:
# Discard every row that has a missing value in any column, then recount the splits.
df = df.dropna()
df['split_label'].value_counts()

train         6895
validation    1470
test          1469
Name: split_label, dtype: int64

In [None]:
# Partition the frame into one sub-frame per `split_label` value.
train_df, valid_df, test_df = (
    df[df['split_label'] == split_name]
    for split_name in ('train', 'validation', 'test')
)

In [None]:
def generate_pairs(df):
  """Turn a corpus frame into a list of ``{'en': ..., 'ba': ...}`` dicts.

  The 'en' value is read from the `translated_transcript` column and the 'ba'
  value from the `translterated_script` column, row by row in frame order.
  """
  english = df['translated_transcript'].to_list()
  badaga = df['translterated_script'].to_list()
  return [{'en': en_text, 'ba': ba_text} for en_text, ba_text in zip(english, badaga)]

# One list of en/ba pair dicts per split.
train_pairs, valid_pairs, test_pairs = (
    generate_pairs(split_df) for split_df in (train_df, valid_df, test_df)
)

In [None]:
# Generating new dataframes from dictionary pairs

def _pairs_to_translation_frame(pairs, csv_path):
  """Wrap `pairs` in a one-column 'translation' frame and write it to `csv_path`.

  The file is tab-separated, UTF-8 encoded and written without the index
  column. Returns the frame so the caller can keep using it.
  """
  frame = pd.DataFrame({'translation': pairs}).reset_index(drop=True)
  frame.to_csv(csv_path, sep="\t", encoding="utf-8", index=False)
  return frame

new_train_df = _pairs_to_translation_frame(train_pairs, "translation_train.csv")
new_valid_df = _pairs_to_translation_frame(valid_pairs, "translation_valid.csv")
new_test_df  = _pairs_to_translation_frame(test_pairs, "translation_test.csv")

In [None]:
# Loading csv files in Dataset format

data_files = {
    "train": "translation_train.csv",
    "validation": "translation_valid.csv",
    "test": "translation_test.csv",
}

raw_datasets = load_dataset("csv", data_files=data_files, delimiter="\t")

# Opt in to the metric's loading script explicitly: without `trust_remote_code=True`
# this call emits a warning stating the argument will become mandatory.
metric = load_metric("sacrebleu", trust_remote_code=True)

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

  metric = load_metric("sacrebleu")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/2.85k [00:00<?, ?B/s]

In [None]:
# Tokenizer and seq2seq weights both come from the same pretrained checkpoint.
checkpoint = "Helsinki-NLP/opus-mt-en-ro"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)



In [None]:
max_input_length = 128
max_target_length = 128
source_lang = 'en'
target_lang = 'ba'
prefix = ""
import ast

def preprocess_function(examples):
    """Tokenize a batch of translation pairs for English -> Badaga training.

    Each entry of ``examples["translation"]`` is the string form of a dict with
    'en' and 'ba' keys, so it is parsed back with ``ast.literal_eval``.
    Returns the tokenized source sentences with the tokenized target ids
    attached under the "labels" key. Both sides are truncated to
    ``max_input_length`` / ``max_target_length`` tokens.
    """
    # Parse each row once and reuse the dict for both sides of the pair.
    pairs = [ast.literal_eval(ex) for ex in examples["translation"]]
    inputs = [pair[source_lang] for pair in pairs]
    targets = [pair[target_lang] for pair in pairs]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Tokenize the targets in target mode, matching what the Badaga -> English
    # preprocessing cell does for its labels (the saved tokenizer ships separate
    # source.spm / target.spm files).
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Apply preprocess_function to every split, passing examples in batches.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

In [None]:
batch_size = 16

# Fine-tuning configuration for the English -> Badaga run.
args = Seq2SeqTrainingArguments(
    output_dir="opus-mt-en-ba",
    # optimisation
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluation: once per epoch, scoring generated sequences
    evaluation_strategy="epoch",
    predict_with_generate=True,
    # logging / checkpointing
    logging_steps=10,
    save_steps=100,
    save_total_limit=3,
    push_to_hub=False,
)

In [None]:
# Batch collator for seq2seq training, built from the tokenizer and the model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
def postprocess_text(preds, labels):
    """Strip surrounding whitespace from predictions and labels.

    Every label is additionally wrapped in a one-element list, so the second
    return value is a list of single-reference lists.
    """
    cleaned_preds = []
    for hypothesis in preds:
        cleaned_preds.append(hypothesis.strip())

    wrapped_labels = []
    for reference in labels:
        wrapped_labels.append([reference.strip()])

    return cleaned_preds, wrapped_labels

def compute_metrics(eval_preds):
    """Score generated predictions against the reference labels.

    ``eval_preds`` unpacks into ``(predictions, labels)`` token-id arrays; when
    the predictions arrive as a tuple only its first element is used. Returns
    a dict with the metric's score under "bleu" and the mean number of
    non-padding tokens per prediction under "gen_len", both rounded to 4
    decimals.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is a filler value, not a vocabulary id, so it cannot be decoded.
    # Labels carry it; generated predictions may be padded with it as well
    # (depends on the Trainer version), so both arrays are cleaned.
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels)
    result = {"bleu": result["score"]}

    # Generation length = number of non-padding tokens in each prediction.
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    result = {k: round(v, 4) for k, v in result.items()}
    return result

In [None]:
# Wire the model, training arguments, tokenized splits, collator and metric
# function into the trainer for the English -> Badaga run.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# NOTE: a trailing `dataloader_config = DataLoaderConfiguration(...)` assignment was
# removed from this cell: the name is never imported in this notebook (NameError on
# a fresh kernel) and the resulting variable was not used anywhere.


In [None]:
# Run fine-tuning for the English -> Badaga model.
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,1.9504,1.847862,28.1364,15.3061
2,1.3218,1.17258,40.4907,13.4333
3,1.0355,0.809072,49.6057,13.1932
4,0.6852,0.586931,57.6951,13.4565
5,0.6033,0.442769,67.4908,13.2497
6,0.3725,0.361802,73.1138,13.2646
7,0.3739,0.305775,78.4597,13.1796
8,0.3456,0.274885,80.8872,13.1966
9,0.2472,0.256818,82.8258,13.1483
10,0.2669,0.251786,83.3166,13.2034


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59542]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59542]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59542]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59542]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59542]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59542]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59542]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59542]], 'forced_eos_token_id': 0}


TrainOutput(global_step=4310, training_loss=0.8721012235517568, metrics={'train_runtime': 1172.1297, 'train_samples_per_second': 58.825, 'train_steps_per_second': 3.677, 'total_flos': 195358395727872.0, 'train_loss': 0.8721012235517568, 'epoch': 10.0})

In [None]:
# Persist the fine-tuned English -> Badaga model and its tokenizer.
# NOTE(review): the path points into Google Drive — presumably Drive is mounted
# at /content/drive before this cell runs; confirm.
path = "/content/drive/MyDrive/opus-mt-en-ba"

model.save_pretrained(path)
tokenizer.save_pretrained(path)

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59542]], 'forced_eos_token_id': 0}


('/content/drive/MyDrive/opus-mt-en-ba/tokenizer_config.json',
 '/content/drive/MyDrive/opus-mt-en-ba/special_tokens_map.json',
 '/content/drive/MyDrive/opus-mt-en-ba/vocab.json',
 '/content/drive/MyDrive/opus-mt-en-ba/source.spm',
 '/content/drive/MyDrive/opus-mt-en-ba/target.spm',
 '/content/drive/MyDrive/opus-mt-en-ba/added_tokens.json')

In [None]:
import torch

# Check if CUDA is available and select the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Index of the test pair used for the spot check below.
n=1000
test_example = test_pairs[n]

true_ba = []
pred_ba = []

# NOTE: this re-binds `preprocess_function`. From here on the name refers to the
# inference-time tokenizer below, not the training-time batch preprocessor.
def preprocess_function(examples):
    """Tokenize the 'en' sentence of one pair dict into tensors placed on `device`."""
    inputs = examples["en"]
    model_inputs = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True).to(device)  # Move tensors to the desired device
    return model_inputs

# One tokenized input per test pair, indexed like `test_pairs`.
tokenized_test_dataset = new_test_df['translation'].map(preprocess_function)

# Move model to the same device
model.to(device)

# Inference
model.eval()

# Generation needs no gradients; same guard as the Badaga -> English inference cell.
with torch.no_grad():
    output = model.generate(**tokenized_test_dataset[n])
decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)
print("Original English Sentence:",test_example['en'])
print("Original Badaga Sentence:",test_example['ba'])
print("Predicted Badaga Sentence:", decoded_output[0])


Original English Sentence: he is stronger
Original Badaga Sentence: emma jasthi bala bethu thanae
Predicted Badaga Sentence: emma jasthi balarava edhdhanae


In [None]:
input_en = []
true_ba = []
pred_ba = []

# Translate every test sentence, one at a time, collecting the source text, the
# reference and the prediction. Gradients are not needed for generation.
with torch.no_grad():
  for i, test_example in enumerate(test_pairs):
    # Report progress sparsely instead of printing every index.
    if i % 100 == 0:
      print(f"{i}/{len(test_pairs)}")
    output = model.generate(**tokenized_test_dataset[i])
    decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)
    input_en.append(test_example['en'])
    true_ba.append(test_example['ba'])
    pred_ba.append(decoded_output[0])

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [None]:
# Collect source sentence, reference and prediction for every test pair in one table.
marian_res = pd.DataFrame({'input_en':input_en, 'true_ba':true_ba, 'output':pred_ba})

In [None]:
# Display the English -> Badaga results table.
marian_res

Unnamed: 0,input_en,true_ba,output
0,lets see what is going to happen,nadatharava aenthu nooduva,nadatharava aenthu noodithae
1,come to village i will tell something important,hatti ga ba baevara aegina,hatti ga ba baevara aegina baarivi
2,ther is a little water in this bottle,ei putti ya osi neeru tha hadathae,ei putti ya osi neeru tha hadathae
3,did you buy a new vechile,osa bandi esithiya,osa bandi esithiya
4,i buyed a new shirt,na ondhu osa kamisu esithae,na ondhu osa kamisu esinae
...,...,...,...
1464,which fruit you like the most,ninnaga aena hannu appara edathara,ninnaga aena hannu appara edathara
1465,i feel sleepy,aenaga orruku bapaningae hadathae,aenaga orruku bapaningae hadathae
1466,how many people are there in your house,ninga maennaya aesaga edhi,ninga maennaya aesaga edhi
1467,what is price for gold,chinna baellae aesaga vario,chinna baellae aesaga vario


In [None]:
import sacrebleu

# Score every prediction against its single reference, one sentence at a time.
bleu_scores = [
    sacrebleu.raw_corpus_bleu([hypothesis], [[reference]])
    for hypothesis, reference in zip(pred_ba, true_ba)
]
bleu_scores_values = [sentence_bleu.score for sentence_bleu in bleu_scores]

# Mean of the per-sentence scores (not a single corpus-level BLEU).
average_bleu_score = sum(bleu_scores_values) / len(bleu_scores_values)

print("Average BLEU Score:", average_bleu_score)

Average BLEU Score: 82.10501907611821


In [None]:
import nltk
# Fetch the WordNet corpus — presumably required by the METEOR scoring below.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
from nltk.translate import meteor_score

# Whitespace-tokenize predictions and references.
tokenized_pred_ba = [sentence.split() for sentence in pred_ba]
tokenized_true_ba = [sentence.split() for sentence in true_ba]

# One METEOR score per sentence, each against a single reference.
meteor_scores = [
    meteor_score.meteor_score([reference_tokens], hypothesis_tokens)
    for hypothesis_tokens, reference_tokens in zip(tokenized_pred_ba, tokenized_true_ba)
]

# Mean of the per-sentence scores.
average_meteor_score = sum(meteor_scores) / len(meteor_scores)

print("Average METEOR Score:", average_meteor_score)

Average METEOR Score: 0.8665459415174618


In [None]:
# Write the English -> Badaga results table to CSV (the index is written too).
marian_res.to_csv('marian_res_enba.csv')

### Badaga to English

In [None]:
max_input_length = 128
max_target_length = 128
source_lang = 'ba'
target_lang = 'en'
prefix = ""
import ast
def preprocess_function(examples):
    """Tokenize a batch of translation pairs for Badaga -> English training.

    Each entry of ``examples["translation"]`` is the string form of a dict with
    'en' and 'ba' keys, so it is parsed back with ``ast.literal_eval``.
    Returns the tokenized source sentences with the tokenized target ids
    attached under the "labels" key. Both sides are truncated to
    ``max_input_length`` / ``max_target_length`` tokens.
    """
    # Parse each row once and reuse the dict for both sides of the pair.
    pairs = [ast.literal_eval(ex) for ex in examples["translation"]]
    inputs = [pair[source_lang] for pair in pairs]
    targets = [pair[target_lang] for pair in pairs]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # Tokenize the targets in target mode. `text_target` replaces the deprecated
    # `with tokenizer.as_target_tokenizer():` context manager.
    labels = tokenizer(text_target=targets, max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [None]:
# Re-tokenize every split with the Badaga -> English preprocess_function defined above.
tokenized_datasets = raw_datasets.map(preprocess_function, batched=True)

Map:   0%|          | 0/6895 [00:00<?, ? examples/s]



Map:   0%|          | 0/1470 [00:00<?, ? examples/s]

Map:   0%|          | 0/1469 [00:00<?, ? examples/s]

In [None]:
# Reload the original pretrained checkpoint so the Badaga -> English run does not
# start from the weights fine-tuned for English -> Badaga above.
model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-ro")

In [None]:
batch_size = 16

# Fine-tuning configuration for the Badaga -> English run.
args = Seq2SeqTrainingArguments(
    output_dir="opus-mt-ba-en",
    # optimisation
    num_train_epochs=10,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True,
    # evaluation: once per epoch, scoring generated sequences
    evaluation_strategy="epoch",
    predict_with_generate=True,
    # logging / checkpointing
    logging_steps=10,
    save_steps=100,
    save_total_limit=3,
    push_to_hub=False,
)

In [None]:
# Rebuild the collator so it is bound to the freshly reloaded model.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Wire the model, training arguments, tokenized splits, collator and metric
# function into the trainer for the Badaga -> English run.
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# NOTE: a trailing `dataloader_config = DataLoaderConfiguration(...)` assignment was
# removed from this cell: the name is never imported in this notebook (NameError on
# a fresh kernel) and the resulting variable was not used anywhere.


In [None]:
# Run fine-tuning for the Badaga -> English model.
trainer.train()

Epoch,Training Loss,Validation Loss,Bleu,Gen Len
1,1.7102,1.643929,28.2615,10.4442
2,1.2736,1.076613,38.9514,10.4748
3,0.9541,0.734604,51.0404,10.5524
4,0.6971,0.538681,60.9822,11.1122
5,0.5569,0.411375,70.7437,10.6408
6,0.3596,0.333214,77.631,10.7
7,0.3874,0.284705,81.2343,10.8816
8,0.3326,0.252562,83.2786,10.849
9,0.2502,0.237929,85.6599,10.8544
10,0.2594,0.231616,85.934,10.8408


Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59542]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59542]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59542]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59542]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59542]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59542]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59542]], 'forced_eos_token_id': 0}
Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59542]], 'forced_eos_token_id': 0}


TrainOutput(global_step=4310, training_loss=0.7909199087083202, metrics={'train_runtime': 1200.7973, 'train_samples_per_second': 57.42, 'train_steps_per_second': 3.589, 'total_flos': 389926535823360.0, 'train_loss': 0.7909199087083202, 'epoch': 10.0})

In [None]:
# Persist the fine-tuned Badaga -> English model and its tokenizer.
# NOTE(review): the path points into Google Drive — presumably Drive is mounted
# at /content/drive before this cell runs; confirm.
path = "/content/drive/MyDrive/opus-mt-ba-en"

model.save_pretrained(path)
tokenizer.save_pretrained(path)

Non-default generation parameters: {'max_length': 512, 'num_beams': 4, 'bad_words_ids': [[59542]], 'forced_eos_token_id': 0}


('/content/drive/MyDrive/opus-mt-ba-en/tokenizer_config.json',
 '/content/drive/MyDrive/opus-mt-ba-en/special_tokens_map.json',
 '/content/drive/MyDrive/opus-mt-ba-en/vocab.json',
 '/content/drive/MyDrive/opus-mt-ba-en/source.spm',
 '/content/drive/MyDrive/opus-mt-ba-en/target.spm',
 '/content/drive/MyDrive/opus-mt-ba-en/added_tokens.json')

In [None]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

input_ba = []
true_en = []
pred_en = []
# Index of the test pair used for the spot check below.
n=100
test_example = test_pairs[n]

# NOTE: this re-binds `preprocess_function`. From here on the name refers to the
# inference-time tokenizer below, not the training-time batch preprocessor.
def preprocess_function(examples):
    """Tokenize the 'ba' sentence of one pair dict into tensors placed on `device`."""
    inputs = examples["ba"]
    model_inputs = tokenizer(inputs, return_tensors="pt", padding=True, truncation=True).to(device)  # Move tensors to the desired device
    return model_inputs

# One tokenized input per test pair, indexed like `test_pairs`.
tokenized_test_dataset = new_test_df['translation'].map(preprocess_function)

# Move model to the same device
model.to(device)

# Inference
model.eval()
with torch.no_grad():
  output = model.generate(**tokenized_test_dataset[n])
  decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)
  print("Original Badaga Sentence:",test_example['ba'])
  print("Original English Sentence:",test_example['en'])
  # This direction produces English, so the prediction is labelled accordingly.
  print("Predicted English Sentence:", decoded_output[0])


Original Badaga Sentence: aella baruthu butaya
Original English Sentence: did you write everything
Predicted Badaga Sentence: did you write everyone


In [None]:
input_ba = []
true_en = []
pred_en = []

# Translate every test sentence, one at a time, collecting the source text, the
# reference and the prediction. Gradients are not needed for generation.
with torch.no_grad():
  for i, test_example in enumerate(test_pairs):
    # Report progress sparsely instead of printing every index.
    if i % 100 == 0:
      print(f"{i}/{len(test_pairs)}")
    output = model.generate(**tokenized_test_dataset[i])
    decoded_output = tokenizer.batch_decode(output, skip_special_tokens=True)
    input_ba.append(test_example['ba'])
    true_en.append(test_example['en'])
    pred_en.append(decoded_output[0])

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
27

In [None]:
# Collect source sentence, reference and prediction for every test pair, then display the table.
marian_res_baen = pd.DataFrame({'input_ba': input_ba, 'true_en': true_en, 'pred_en': pred_en})
marian_res_baen


Unnamed: 0,input_ba,true_en,pred_en
0,nadatharava aenthu nooduva,lets see what is going to happen,lets see what and see
1,hatti ga ba baevara aegina,come to village i will tell something important,come to village i will tell something
2,ei putti ya osi neeru tha hadathae,ther is a little water in this bottle,ther is a little water in this bottle
3,osa bandi esithiya,did you buy a new vechile,did you buy a new vechile
4,na ondhu osa kamisu esithae,i buyed a new shirt,i buyed buy a new shirts
...,...,...,...
1464,ninnaga aena hannu appara edathara,which fruit you like the most,which fruit you like the most
1465,aenaga orruku bapaningae hadathae,i feel sleepy,i feel sleepy
1466,ninga maennaya aesaga edhi,how many people are there in your house,how many people are there in your home
1467,chinna baellae aesaga vario,what is price for gold,what is price for gold


In [None]:
# Write the Badaga -> English results table to CSV (the index is written too).
marian_res_baen.to_csv('marian_res_baen.csv')

In [None]:
# Score every English prediction against its single reference, one sentence at a time.
bleu_scores = [
    sacrebleu.raw_corpus_bleu([hypothesis], [[reference]])
    for hypothesis, reference in zip(pred_en, true_en)
]
bleu_scores_values = [sentence_bleu.score for sentence_bleu in bleu_scores]

# Mean of the per-sentence scores (not a single corpus-level BLEU).
average_bleu_score = sum(bleu_scores_values) / len(bleu_scores_values)

print("Average BLEU Score:", average_bleu_score)

Average BLEU Score: 86.27449514783655


In [None]:
# Whitespace-tokenize predictions and references.
tokenized_pred_en = [sentence.split() for sentence in pred_en]
tokenized_true_en = [sentence.split() for sentence in true_en]

# One METEOR score per sentence, each against a single reference.
meteor_scores = [
    meteor_score.meteor_score([reference_tokens], hypothesis_tokens)
    for hypothesis_tokens, reference_tokens in zip(tokenized_pred_en, tokenized_true_en)
]

# Mean of the per-sentence scores.
average_meteor_score = sum(meteor_scores) / len(meteor_scores)

print("Average METEOR Score:", average_meteor_score)

Average METEOR Score: 0.9067790313368381
