In [None]:
#### LIST OF ALL EXPERIMENTS [En-Mr] ####
# 1.  Experiment-1 TRANSQUEST(Train data then predict on dev data)
# 2.  Experiment-2 COMET(COMET wmt21-comet-qe-da )
# 3.  Experiment-3 META/LLAMA (LLama3.2 3B parameters)
# 4.  Experiment-4 META/LLAMA (Meta-Llama-3-8B-Instruct)
# 5.  Experiment-5 Fine-Tuning 8B-Instruct

In [None]:
!pip install git+https://github.com/tharindudr/TransQuest.git
!pip install transformers --upgrade
!pip install datasets

In [None]:
#  ************************ Experiment-1 TRANSQUEST(Train data then predict on dev data)***********************************

In [None]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer
from torch.optim import AdamW
from datasets import load_dataset
import pandas as pd
from transformers import RobertaTokenizer
from datasets import Dataset # Import Dataset here
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer # Import XLMRobertaTokenizer here

import os
os.environ["WANDB_DISABLED"] = "true"

train_df = pd.read_csv("train.enmr.df.short.csv")
dev_df = pd.read_csv("dev.enmr.df.short.csv")

# Convert 'original' and 'translation' to strings (to avoid type issues)
train_df['original'] = train_df['original'].astype(str)
train_df['translation'] = train_df['translation'].astype(str)
dev_df['original'] = dev_df['original'].astype(str)
dev_df['translation'] = dev_df['translation'].astype(str)

# Rename columns as required
train_df = train_df.rename(columns={"mean": "labels"})
dev_df = dev_df.rename(columns={"mean": "labels"})

# Convert pandas DataFrames to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df[['original', 'translation', 'labels']])
dev_dataset = Dataset.from_pandas(dev_df[['original', 'translation', 'labels']])

# Tokenization function
def tokenize_function(examples):
    # Instantiate the tokenizer inside the function
    # The tokenizer is now accessible within this function's scope
    tokenizer = XLMRobertaTokenizer.from_pretrained('TransQuest/monotransquest-da-multilingual')
    return tokenizer(examples['original'], examples['translation'], padding="max_length", truncation=True)


# Apply tokenizer to the datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
dev_dataset = dev_dataset.map(tokenize_function, batched=True)

# Load model and tokenizer  # Moved this block here
model_name = 'TransQuest/monotransquest-da-multilingual'

try:
    model = XLMRobertaForSequenceClassification.from_pretrained(model_name, num_labels=1)
    tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
    print("Model and tokenizer loaded successfully.")
except Exception as e:
    print(f"Error loading model/tokenizer: {e}")


# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",  # Ensure this is a valid path
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',  # Ensure logging_dir is also valid
    logging_steps=10,
    save_steps=500,
    save_total_limit=3,
    report_to=None,  # Disable WandB
    disable_tqdm=True  # Disable progress bars (which are tied to wandb)
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Use the tokenized train dataset
    eval_dataset=dev_dataset,  # Use the tokenized dev dataset
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

Map:   0%|          | 0/26000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Some weights of the model checkpoint at TransQuest/monotransquest-da-multilingual were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Model and tokenizer loaded successfully.


  trainer = Trainer(


{'loss': 4866.1137, 'grad_norm': 3113.35986328125, 'learning_rate': 1.999076923076923e-05, 'epoch': 0.0015384615384615385}
{'loss': 4324.6008, 'grad_norm': 9137.740234375, 'learning_rate': 1.9980512820512822e-05, 'epoch': 0.003076923076923077}
{'loss': 4385.0148, 'grad_norm': 12059.947265625, 'learning_rate': 1.9970256410256414e-05, 'epoch': 0.004615384615384616}
{'loss': 4223.5188, 'grad_norm': 4595.1171875, 'learning_rate': 1.9960000000000002e-05, 'epoch': 0.006153846153846154}
{'loss': 3877.5578, 'grad_norm': 15948.2626953125, 'learning_rate': 1.994974358974359e-05, 'epoch': 0.007692307692307693}
{'loss': 3570.6469, 'grad_norm': 2817.83203125, 'learning_rate': 1.993948717948718e-05, 'epoch': 0.009230769230769232}
{'loss': 3585.8953, 'grad_norm': 4167.68359375, 'learning_rate': 1.9929230769230773e-05, 'epoch': 0.010769230769230769}
{'loss': 3328.5629, 'grad_norm': 4443.01708984375, 'learning_rate': 1.991897435897436e-05, 'epoch': 0.012307692307692308}
{'loss': 3436.3707, 'grad_norm':

TrainOutput(global_step=19500, training_loss=249.94785128236427, metrics={'train_runtime': 14196.7076, 'train_samples_per_second': 5.494, 'train_steps_per_second': 1.374, 'train_loss': 249.94785128236427, 'epoch': 3.0})

In [None]:
# 1. Evaluate
eval_results = trainer.evaluate()
print(eval_results)

# 2. Save the model
trainer.save_model("./my_finetuned_model")
tokenizer.save_pretrained("./my_finetuned_model")

# 3. Predict on dev set
predictions = trainer.predict(dev_dataset)
preds = predictions.predictions.squeeze()
print(preds)

# 4. Save predictions
dev_df['predicted'] = preds
dev_df.to_csv("Tranquest_with_predictions[en-mr].tsv", sep='\t', index=False)
print("******** Tranquest_with_predictions[en-mr] Saved Successfully !!! *********")

{'eval_loss': 120.61016845703125, 'eval_runtime': 38.4225, 'eval_samples_per_second': 26.026, 'eval_steps_per_second': 6.507, 'epoch': 3.0}
{'eval_loss': 120.61016845703125, 'eval_runtime': 38.4225, 'eval_samples_per_second': 26.026, 'eval_steps_per_second': 6.507, 'epoch': 3.0}
[70.72689  70.727    70.72692  70.726875 70.72695  70.72698  70.72692
 70.72686  70.72686  70.727    70.72697  70.726906 70.727    70.72685
 70.7269   70.72694  70.72695  70.72692  70.72692  70.72701  70.72688
 70.72695  70.72683  70.72694  70.72694  70.72694  70.72688  70.72694
 70.72688  70.726944 70.72685  70.72692  70.72692  70.72698  70.72691
 70.72697  70.72692  70.72693  70.72695  70.72695  70.72692  70.72688
 70.72693  70.7269   70.726875 70.72695  70.726974 70.72689  70.72698
 70.72703  70.72695  70.726906 70.72698  70.72694  70.72703  70.72697
 70.72697  70.72698  70.72697  70.72695  70.72697  70.726875 70.72698
 70.72698  70.72691  70.72695  70.726906 70.72688  70.72694  70.72694
 70.72695  70.727   

In [None]:
df = pd.read_csv("Tranquest_with_predictions[en-mr].tsv", sep='\t')
df.head()

Unnamed: 0.1,Unnamed: 0,index,original,translation,scores,labels,z_scores,z_mean,predicted
0,0,26000,There might be a problem with taking some herb...,काही वनौषधी उपचार आणि डिपायरिडामोलची पूरक औषधे...,"[68, 77, 78, 82]",76.25,"[-0.002773661572827, 0.10647697703997357, 1.39...",0.364989,70.72689
1,1,26001,"To help diagnose asthma, your result can be co...","अस्थमाचे निदान करण्यात मदत करण्यासाठी, तुमच्या...","[70, 78, 62, 82]",73.0,"[0.10390562968973616, 0.16305347918661187, 0.4...",0.178078,70.727
2,2,26002,The combination of crisp hot Jaleba with chill...,थंड राब्री आणि गरम जलेबाचे मिश्रण हे इंदूरचे स...,"[70, 23, 52, 80]",56.25,"[0.10390562968973616, -2.948654138878495, -0.0...",-0.776942,70.72692
3,3,26003,You may be referred to a specialist in diagnos...,त्वचेच्या आजाराचे निदान आणि उपचार करणाऱ्या तज...,"[72, 77, 54, 72]",68.75,"[0.2105849209522993, 0.10647697703997357, 0.03...",-0.097082,70.726875
4,4,26004,Prime Minister said that the same vested inter...,ज्या स्वार्थापोटी शेतकऱ्यांचे शोषण केले जात ह...,"[80, 59, 56, 92]",71.75,"[0.637302086002552, -0.9119000615995159, 0.144...",0.131115,70.72695


In [None]:
from scipy.stats import spearmanr, pearsonr
from sklearn.metrics import mean_absolute_error
import pandas as pd

# Load the saved predictions
df = pd.read_csv("Tranquest_with_predictions[en-mr].tsv", sep="\t")

# Ground truth and predicted scores
y_true = df["labels"]  # These are normalized then inverse transformed = original mean
y_pred = df["predicted"]

# Compute metrics
spearman_corr, _ = spearmanr(y_true, y_pred)
pearson_corr, _ = pearsonr(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)

# Print results
print(f"Spearman correlation: {spearman_corr:.4f}")
print(f"Pearson correlation : {pearson_corr:.4f}")
print(f"Mean Absolute Error : {mae:.4f}")

Spearman correlation: 0.1962
Pearson correlation : 0.1997
Mean Absolute Error : 7.7810


In [None]:
# /////////////////// After Tuning

In [None]:
from transformers import Trainer, TrainingArguments, XLMRobertaForSequenceClassification, XLMRobertaTokenizer
from datasets import Dataset
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import os

# Disable WandB in Colab
os.environ["WANDB_DISABLED"] = "true"

train_df = pd.read_csv("train.enmr.df.short.csv")
dev_df = pd.read_csv("dev.enmr.df.short.csv")

# Convert columns to string to avoid type issues
train_df['original'] = train_df['original'].astype(str)
train_df['translation'] = train_df['translation'].astype(str)
dev_df['original'] = dev_df['original'].astype(str)
dev_df['translation'] = dev_df['translation'].astype(str)

# Rename 'mean' column to 'labels'
train_df = train_df.rename(columns={"mean": "labels"})
dev_df = dev_df.rename(columns={"mean": "labels"})

# Normalize labels to [0, 1]
scaler = MinMaxScaler()
train_df['labels'] = scaler.fit_transform(train_df[['labels']])
dev_df['labels'] = scaler.transform(dev_df[['labels']])

# Convert to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df[['original', 'translation', 'labels']])
dev_dataset = Dataset.from_pandas(dev_df[['original', 'translation', 'labels']])

# Tokenizer
model_name = 'TransQuest/monotransquest-da-multilingual'
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples['original'], examples['translation'], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
dev_dataset = dev_dataset.map(tokenize_function, batched=True)

# Load model with regression configuration
model = XLMRobertaForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
    problem_type="regression"
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    save_total_limit=3,
    report_to=None,
    disable_tqdm=True,
    max_grad_norm=1.0  # Clip gradients to avoid explosion
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,
)

# Train
trainer.train()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/721 [00:00<?, ?B/s]

Map:   0%|          | 0/26000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at TransQuest/monotransquest-da-multilingual were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


{'loss': 0.0219, 'grad_norm': 1.9586580991744995, 'learning_rate': 1.9981538461538464e-05, 'epoch': 0.003076923076923077}
{'loss': 0.0128, 'grad_norm': 1.511494755744934, 'learning_rate': 1.9961025641025643e-05, 'epoch': 0.006153846153846154}
{'loss': 0.0134, 'grad_norm': 2.43178129196167, 'learning_rate': 1.9940512820512823e-05, 'epoch': 0.009230769230769232}
{'loss': 0.0163, 'grad_norm': 2.2590365409851074, 'learning_rate': 1.9920000000000002e-05, 'epoch': 0.012307692307692308}
{'loss': 0.0127, 'grad_norm': 1.5147675275802612, 'learning_rate': 1.989948717948718e-05, 'epoch': 0.015384615384615385}
{'loss': 0.0119, 'grad_norm': 0.7182744741439819, 'learning_rate': 1.987897435897436e-05, 'epoch': 0.018461538461538463}
{'loss': 0.0124, 'grad_norm': 0.5403687953948975, 'learning_rate': 1.985846153846154e-05, 'epoch': 0.021538461538461538}
{'loss': 0.0141, 'grad_norm': 1.5355591773986816, 'learning_rate': 1.983794871794872e-05, 'epoch': 0.024615384615384615}
{'loss': 0.0086, 'grad_norm': 1

TrainOutput(global_step=9750, training_loss=0.007798639588249035, metrics={'train_runtime': 11261.5818, 'train_samples_per_second': 6.926, 'train_steps_per_second': 0.866, 'train_loss': 0.007798639588249035, 'epoch': 3.0})

In [None]:
# Run prediction
preds = trainer.predict(dev_dataset).predictions

# Convert predictions to original scale
original_scale_preds = scaler.inverse_transform(preds)

# Flatten predictions and add to dev_df
dev_df['Transquest_predicted_score'] = original_scale_preds.flatten()

# Save to TSV
dev_df.to_csv("transquest_predictions.enmr.tsv", sep='\t', index=False)

print("Predictions saved to transquest_predictions.enmr.tsv")


Predictions saved to transquest_predictions.enmr.tsv


In [None]:
from scipy.stats import spearmanr, pearsonr
from sklearn.metrics import mean_absolute_error
import pandas as pd

# Load the saved predictions
df = pd.read_csv("transquest_predictions.enmr.tsv",sep="\t")

# Ground truth and predicted scores
y_true = df["labels"]  # These are normalized then inverse transformed = original mean
y_pred = df["predicted"]

# Compute metrics
spearman_corr, _ = spearmanr(y_true, y_pred)
pearson_corr, _ = pearsonr(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)

# Print results
print(f"Spearman correlation: {spearman_corr:.4f}")
print(f"Pearson correlation : {pearson_corr:.4f}")
print(f"Mean Absolute Error : {mae:.4f}")

Spearman correlation: -0.1131
Pearson correlation : -0.0741
Mean Absolute Error : 0.0605


In [None]:
# ***********************  COMET wmt21-comet-qe-da ******************************

In [None]:
!pip uninstall -y comet_ml -q
!pip install -U unbabel-comet -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m91.0/91.0 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.4/101.4 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.3/18.3 MB[0m [31m99.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.6/294.6 kB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.1/823.1 kB[0m [31m53.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Step 2: Load TSV and convert to JSONL format
import pandas as pd
import json

# Load the TSV file
df = pd.read_csv("dev.enmr.df.short.csv")

# COMET expects columns: src, mt, but your DataFrame may use 'original', 'translation'
# Change 'src' and 'mt' to the correct column names
jsonl_data = [{"src": row["original"], "mt": row["translation"]} for _, row in df.iterrows()]

# Save to JSONL for prediction
with open("temp_input.jsonl", "w") as f:
    for line in jsonl_data:
        f.write(json.dumps(line) + "\n")

In [None]:
!pip install --upgrade unbabel-comet



In [None]:
# Step 3: Run COMET inference
from comet import download_model, load_from_checkpoint

# Load pretrained model - updated to a supported model name
# Note: Check the COMET model hub for a suitable model if this one is not supported
model_path = download_model("wmt21-comet-qe-da")
model = load_from_checkpoint(model_path)

# Load the TSV file (this may be redundant if 'df' is already in memory)
df = pd.read_csv("dev.enmr.df.short.csv")

# Prepare data in the expected format for COMET
data = [{"src": row["original"], "mt": row["translation"]} for _, row in df.iterrows()]

# Predict
predictions = model.predict(data, batch_size=8, gpus=1)

# Add scores to original dataframe
df["comet_score"] = predictions["scores"]

# Save results
df.to_csv("dev_with_comet_scores.tsv", sep="\t", index=False)
df.head()

wmt21-comet-qe-da.tar.gz: 1.72GB [00:48, 35.8MB/s]                            
INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.3.5 to v2.5.1.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/unbabel_comet/wmt21-comet-qe-da/checkpoints/model.ckpt`


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/616 [00:00<?, ?B/s]

/usr/local/lib/python3.11/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero:You are using a CUDA device ('NVIDIA L4') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch

Unnamed: 0.1,Unnamed: 0,index,original,translation,scores,mean,z_scores,z_mean,comet_score
0,0,26000,There might be a problem with taking some herb...,काही वनौषधी उपचार आणि डिपायरिडामोलची पूरक औषधे...,"[68, 77, 78, 82]",76.25,"[-0.002773661572827, 0.10647697703997357, 1.39...",0.364989,0.12944
1,1,26001,"To help diagnose asthma, your result can be co...","अस्थमाचे निदान करण्यात मदत करण्यासाठी, तुमच्या...","[70, 78, 62, 82]",73.0,"[0.10390562968973616, 0.16305347918661187, 0.4...",0.178078,0.162059
2,2,26002,The combination of crisp hot Jaleba with chill...,थंड राब्री आणि गरम जलेबाचे मिश्रण हे इंदूरचे स...,"[70, 23, 52, 80]",56.25,"[0.10390562968973616, -2.948654138878495, -0.0...",-0.776942,0.153329
3,3,26003,You may be referred to a specialist in diagnos...,त्वचेच्या आजाराचे निदान आणि उपचार करणाऱ्या तज...,"[72, 77, 54, 72]",68.75,"[0.2105849209522993, 0.10647697703997357, 0.03...",-0.097082,0.130036
4,4,26004,Prime Minister said that the same vested inter...,ज्या स्वार्थापोटी शेतकऱ्यांचे शोषण केले जात ह...,"[80, 59, 56, 92]",71.75,"[0.637302086002552, -0.9119000615995159, 0.144...",0.131115,0.11872


In [None]:
# Step 4: Calculate Pearson, Spearman, and MAE
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import mean_absolute_error

# Assuming 'score' is the actual score in your dataset
# If 'score' is missing, you can replace it with any dummy value for testing

# Pearson Correlation
pearson_corr, _ = pearsonr(df["mean"], df["comet_score"])

# Spearman Correlation
spearman_corr, _ = spearmanr(df["mean"], df["comet_score"])

# MAE
mae = mean_absolute_error(df["mean"], df["comet_score"])

# Print results
print(f"Pearson Correlation: {pearson_corr:.4f}")
print(f"Spearman Correlation: {spearman_corr:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")

Pearson Correlation: 0.4716
Spearman Correlation: 0.3882
Mean Absolute Error (MAE): 69.6762


In [None]:
# ************************  LLama3.2 3B parameters  ********************************

In [None]:
!pip install transformers accelerate --upgrade
!pip install sentencepiece
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m33.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.5


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from huggingface_hub import login

# Authenticate with your Hugging Face token
login(token="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")

# Correct model identifier
model_id = "meta-llama/Llama-3.2-3B-Instruct"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype="auto",
    use_auth_token=True
)

# Create a text generation pipeline
qe_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=50)




tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
def build_qe_prompt(src, hyp):
    return f"""You are a quality estimation expert for machine translation.
Rate the quality of the translation from 0 (worst) to 1 (best).

Source: {src}
Translation: {hyp}

Score:"""


In [None]:
def predict_qe_score(src, hyp):
    prompt = build_qe_prompt(src, hyp)
    output = qe_pipeline(prompt)[0]['generated_text']
    try:
        score = output.split("Score:")[-1].strip()
        score = float(score.split()[0])  # Extract just the number
    except:
        score = None
    return score

In [None]:
import pandas as pd

df = pd.read_csv("dev.enmr.df.short.csv")

df['LLama_3.2_3B_predicted_score'] = df.apply(lambda row: predict_qe_score(row['original'], row['translation']), axis=1)
df.to_csv("LLama_3.2_3B_predicted_Score[en-mr].csv", sep="\t", index=False)
print("******* LLama_3.2_3B_predicted_Score[en-mr] created Successfully !!! ********")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_to

******* LLama_3.2_3B_predicted_Score[en-mr] created Successfully !!! ********


In [None]:
import pandas as pd
from scipy.stats import spearmanr, pearsonr
from sklearn.metrics import mean_absolute_error
df = pd.read_csv("LLama_3.2_3B_predicted_Score[en-mr].csv", sep="\t")
# Extract true and predicted scores
y_true = df['mean']  # Replace with the actual column name for true scores if different
y_pred = df['LLama_3.2_3B_predicted_score']

# Spearman Correlation
spearman_corr, _ = spearmanr(y_true, y_pred)

# Pearson Correlation
pearson_corr, _ = pearsonr(y_true, y_pred)

# Mean Absolute Error
mae = mean_absolute_error(y_true, y_pred)

# Print results
print(f"Spearman Correlation: {spearman_corr:.4f}")
print(f"Pearson Correlation : {pearson_corr:.4f}")
print(f"Mean Absolute Error : {mae:.4f}")

Spearman Correlation: 0.0274
Pearson Correlation : 0.0848
Mean Absolute Error : 69.2245


In [None]:
######################### LLama 3.2 8B parameters  ####################################

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from huggingface_hub import login

# Authenticate with your Hugging Face token
login(token="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")

# Correct model identifier
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype="auto",
    use_auth_token=True
)

# Create a text generation pipeline
qe_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=50)



tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
def build_qe_prompt(src, hyp):
    return f"""You are a quality estimation expert for machine translation.
Rate the quality of the translation from 0 (worst) to 1 (best).

Source: {src}
Translation: {hyp}

Score:"""

In [None]:
def predict_qe_score(src, hyp):
    prompt = build_qe_prompt(src, hyp)
    output = qe_pipeline(prompt)[0]['generated_text']
    try:
        score = output.split("Score:")[-1].strip()
        score = float(score.split()[0])  # Extract just the number
    except:
        score = None
    return score

In [None]:
import pandas as pd

df = pd.read_csv("dev.enmr.df.short.csv")

df['LLama_3.2_8B_predicted_score'] = df.apply(lambda row: predict_qe_score(row['original'], row['translation']), axis=1)
df.to_csv("LLama_3.2_8B_predicted_Score[en-mr].csv", sep="\t", index=False)
print("******* LLama_3.2_8B_predicted_Score[en-mr] created Successfully !!! ********")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

******* LLama_3.2_8B_predicted_Score[en-mr] created Successfully !!! ********


In [None]:
import pandas as pd
from scipy.stats import spearmanr, pearsonr
from sklearn.metrics import mean_absolute_error

df = pd.read_csv("LLama_3.2_8B_predicted_Score[en-mr].csv", sep="\t")

# Extract true and predicted scores
y_true = df['mean']  # Replace with the actual column name for true scores if different
y_pred = df['LLama_3.2_8B_predicted_score']

# Drop rows with missing values (NaN) in either y_true or y_pred
df = df.dropna(subset=['mean', 'LLama_3.2_8B_predicted_score'])
y_true = df['mean']
y_pred = df['LLama_3.2_8B_predicted_score']


# Spearman Correlation
spearman_corr, _ = spearmanr(y_true, y_pred)

# Pearson Correlation
pearson_corr, _ = pearsonr(y_true, y_pred)

# Mean Absolute Error
mae = mean_absolute_error(y_true, y_pred)

# Print results
print(f"Spearman Correlation: {spearman_corr:.4f}")
print(f"Pearson Correlation : {pearson_corr:.4f}")
print(f"Mean Absolute Error : {mae:.4f}")

Spearman Correlation: 0.1283
Pearson Correlation : 0.1877
Mean Absolute Error : 69.1203
