In [None]:
#### LIST OF ALL EXPERIMENTS [En-Hi] ####
# 1.  Experiment-1 TRANSQUEST(Train data then predict on dev data)
# 2.  Experiment-2 COMET(COMET wmt21-comet-qe-da )
# 3.  Experiment-3 META/LLAMA (LLama3.2 3B parameters)
# 4.  Experiment-4 META/LLAMA (Meta-Llama-3-8B-Instruct)
# 5.  Experiment-5 Fine-Tuning 8B-Instruct

In [None]:
!pip install git+https://github.com/tharindudr/TransQuest.git
# !pip install wandb
!pip install transformers --upgrade
!pip install datasets

In [None]:
#  ************************ Experiment-1 TRANSQUEST(Train data then predict on dev data)***********************************

In [None]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification, AutoTokenizer
from torch.optim import AdamW
from datasets import load_dataset
import pandas as pd
from transformers import RobertaTokenizer
from datasets import Dataset # Import Dataset here
from transformers import XLMRobertaForSequenceClassification, XLMRobertaTokenizer # Import XLMRobertaTokenizer here

import os
os.environ["WANDB_DISABLED"] = "true"

# Load your data from TSV files
train_file = "train.enhi.df.short.tsv"
dev_file = "dev.enhi.df.short.tsv"

train_df = pd.read_csv(train_file, sep='\t')
dev_df = pd.read_csv(dev_file, sep='\t')

# Convert 'original' and 'translation' to strings (to avoid type issues)
train_df['original'] = train_df['original'].astype(str)
train_df['translation'] = train_df['translation'].astype(str)
dev_df['original'] = dev_df['original'].astype(str)
dev_df['translation'] = dev_df['translation'].astype(str)

# Rename columns as required
train_df = train_df.rename(columns={"mean": "labels"})
dev_df = dev_df.rename(columns={"mean": "labels"})

# Convert pandas DataFrames to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_df[['original', 'translation', 'labels']])
dev_dataset = Dataset.from_pandas(dev_df[['original', 'translation', 'labels']])

# Tokenization function
def tokenize_function(examples):
    # Instantiate the tokenizer inside the function
    # The tokenizer is now accessible within this function's scope
    tokenizer = XLMRobertaTokenizer.from_pretrained('TransQuest/monotransquest-da-multilingual')
    return tokenizer(examples['original'], examples['translation'], padding="max_length", truncation=True)


# Apply tokenizer to the datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
dev_dataset = dev_dataset.map(tokenize_function, batched=True)

# Load model and tokenizer  # Moved this block here
model_name = 'TransQuest/monotransquest-da-multilingual'

try:
    model = XLMRobertaForSequenceClassification.from_pretrained(model_name, num_labels=1)
    tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)
    print("Model and tokenizer loaded successfully.")
except Exception as e:
    print(f"Error loading model/tokenizer: {e}")


# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",  # Ensure this is a valid path
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',  # Ensure logging_dir is also valid
    logging_steps=10,
    save_steps=500,
    save_total_limit=3,
    report_to=None,  # Disable WandB
    disable_tqdm=True  # Disable progress bars (which are tied to wandb)
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Use the tokenized train dataset
    eval_dataset=dev_dataset,  # Use the tokenized dev dataset
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/721 [00:00<?, ?B/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at TransQuest/monotransquest-da-multilingual were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Model and tokenizer loaded successfully.


  trainer = Trainer(


{'loss': 6652.3203, 'grad_norm': 3797.028564453125, 'learning_rate': 1.996571428571429e-05, 'epoch': 0.005714285714285714}
{'loss': 6058.1004, 'grad_norm': 11018.1337890625, 'learning_rate': 1.9927619047619048e-05, 'epoch': 0.011428571428571429}
{'loss': 5842.1234, 'grad_norm': 6463.75732421875, 'learning_rate': 1.988952380952381e-05, 'epoch': 0.017142857142857144}
{'loss': 5573.1801, 'grad_norm': 19390.072265625, 'learning_rate': 1.9851428571428573e-05, 'epoch': 0.022857142857142857}
{'loss': 5419.3461, 'grad_norm': 3684.85693359375, 'learning_rate': 1.9813333333333336e-05, 'epoch': 0.02857142857142857}
{'loss': 5509.4352, 'grad_norm': 11821.8994140625, 'learning_rate': 1.9775238095238095e-05, 'epoch': 0.03428571428571429}
{'loss': 4620.7277, 'grad_norm': 4218.0068359375, 'learning_rate': 1.973714285714286e-05, 'epoch': 0.04}
{'loss': 4613.766, 'grad_norm': 3699.265625, 'learning_rate': 1.969904761904762e-05, 'epoch': 0.045714285714285714}
{'loss': 4720.0773, 'grad_norm': 7410.6733398

TrainOutput(global_step=5250, training_loss=1210.748245140439, metrics={'train_runtime': 3860.6943, 'train_samples_per_second': 5.439, 'train_steps_per_second': 1.36, 'train_loss': 1210.748245140439, 'epoch': 3.0})

In [None]:
# 1. Evaluate
eval_results = trainer.evaluate()
print(eval_results)

# 2. Save the model
trainer.save_model("./my_finetuned_model")
tokenizer.save_pretrained("./my_finetuned_model")

# 3. Predict on dev set
predictions = trainer.predict(dev_dataset)
preds = predictions.predictions.squeeze()
print(preds)

# 4. Save predictions
dev_df['predicted'] = preds
dev_df.to_csv("Tranquest_with_predictions[en-hi].tsv", sep='\t', index=False)
print("******** Tranquest_with_predictions[en-hi] Saved Successfully !!! *********")


{'eval_loss': 199.274169921875, 'eval_runtime': 40.7374, 'eval_samples_per_second': 24.547, 'eval_steps_per_second': 6.137, 'epoch': 3.0}
{'eval_loss': 199.274169921875, 'eval_runtime': 40.7374, 'eval_samples_per_second': 24.547, 'eval_steps_per_second': 6.137, 'epoch': 3.0}
[69.11645  69.11645  69.116455 69.11646  69.11645  69.11645  69.11645
 69.11645  69.116455 69.11646  69.11646  69.11645  69.11645  69.11644
 69.11645  69.11645  69.116455 69.11645  69.116455 69.116455 69.11643
 69.11645  69.11645  69.11646  69.11645  69.11645  69.11643  69.11645
 69.11645  69.11646  69.11645  69.11645  69.11645  69.11645  69.11645
 69.11645  69.116455 69.11645  69.11645  69.11645  69.11645  69.11646
 69.11645  69.11646  69.11646  69.11645  69.11646  69.11646  69.11645
 69.116455 69.11645  69.11646  69.11645  69.11644  69.11645  69.11646
 69.11645  69.11645  69.11645  69.116455 69.11646  69.11645  69.11646
 69.11645  69.1164   69.11646  69.11646  69.11645  69.11646  69.11646
 69.11646  69.116455 69.

In [None]:
df = pd.read_csv("Tranquest_with_predictions[en-hi].tsv", sep='\t')
df.head()

Unnamed: 0,index,original,translation,scores,labels,z_scores,z_mean,predicted
0,0,In the flood-prone districts of the Netherland...,"नीदरलैंड के बाढ़ संभावित जिलों में, विशेष रूप ...","[90, 90, 79, 81]",85.0,"[0.10844457902530406, 0.17855383580414114, 0.6...",0.399822,69.11645
1,1,Group A Group B The top five run scorers (tota...,ग्रुप ए ग्रुप बी शीर्ष पांच रन स्कोरर (कुल रन)...,"[95, 95, 87, 89]",91.5,"[0.5387802100780963, 0.6151157815355373, 1.520...",1.001232,69.11645
2,2,"The final finished as a draw, with Essex winni...","मैच की पहली पारी में बढ़त हासिल करने के बाद, ए...","[95, 95, 70, 60]",80.0,"[0.5387802100780963, 0.6151157815355373, -0.22...",-0.085456,69.116455
3,3,These traits—establishment of a working method...,ये विशेषताएं-कला का अभिन्न अंग कार्य प्रणाली क...,"[90, 90, 78, 78]",84.0,"[0.10844457902530406, 0.17855383580414114, 0.5...",0.306865,69.11646
4,4,"Its two most important members, Britain and Fr...","इसके दो सबसे महत्वपूर्ण सदस्य, ब्रिटेन और फ्रा...","[90, 90, 88, 86]",88.5,"[0.10844457902530406, 0.17855383580414114, 1.6...",0.74285,69.11645


In [None]:
# After tuning

from transformers import Trainer, TrainingArguments, XLMRobertaForSequenceClassification, XLMRobertaTokenizer
from datasets import Dataset
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import os

# Disable WandB in Colab
os.environ["WANDB_DISABLED"] = "true"

# Load TSV files
train_file = "train.enhi.df.short.tsv"
dev_file = "dev.enhi.df.short.tsv"

train_df = pd.read_csv(train_file, sep='\t')
dev_df = pd.read_csv(dev_file, sep='\t')

# Convert columns to string to avoid type issues
train_df['original'] = train_df['original'].astype(str)
train_df['translation'] = train_df['translation'].astype(str)
dev_df['original'] = dev_df['original'].astype(str)
dev_df['translation'] = dev_df['translation'].astype(str)

# Rename 'mean' column to 'labels'
train_df = train_df.rename(columns={"mean": "labels"})
dev_df = dev_df.rename(columns={"mean": "labels"})

# Normalize labels to [0, 1]
scaler = MinMaxScaler()
train_df['labels'] = scaler.fit_transform(train_df[['labels']])
dev_df['labels'] = scaler.transform(dev_df[['labels']])

# Convert to Hugging Face Datasets
train_dataset = Dataset.from_pandas(train_df[['original', 'translation', 'labels']])
dev_dataset = Dataset.from_pandas(dev_df[['original', 'translation', 'labels']])

# Tokenizer
model_name = 'TransQuest/monotransquest-da-multilingual'
tokenizer = XLMRobertaTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples['original'], examples['translation'], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
dev_dataset = dev_dataset.map(tokenize_function, batched=True)

# Load model with regression configuration
model = XLMRobertaForSequenceClassification.from_pretrained(
    model_name,
    num_labels=1,
    problem_type="regression"
)

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    save_total_limit=3,
    report_to=None,
    disable_tqdm=True,
    max_grad_norm=1.0  # Clip gradients to avoid explosion
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,
)

# Train
trainer.train()


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/236 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/721 [00:00<?, ?B/s]

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

Some weights of the model checkpoint at TransQuest/monotransquest-da-multilingual were not used when initializing XLMRobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


{'loss': 0.0415, 'grad_norm': 2.194007158279419, 'learning_rate': 1.9931428571428572e-05, 'epoch': 0.011428571428571429}
{'loss': 0.0211, 'grad_norm': 2.8242247104644775, 'learning_rate': 1.9855238095238097e-05, 'epoch': 0.022857142857142857}
{'loss': 0.0197, 'grad_norm': 1.4216303825378418, 'learning_rate': 1.977904761904762e-05, 'epoch': 0.03428571428571429}
{'loss': 0.0179, 'grad_norm': 2.31720232963562, 'learning_rate': 1.9702857142857144e-05, 'epoch': 0.045714285714285714}
{'loss': 0.0218, 'grad_norm': 4.080995082855225, 'learning_rate': 1.9626666666666666e-05, 'epoch': 0.05714285714285714}
{'loss': 0.016, 'grad_norm': 2.2109034061431885, 'learning_rate': 1.955047619047619e-05, 'epoch': 0.06857142857142857}
{'loss': 0.0137, 'grad_norm': 0.5841450691223145, 'learning_rate': 1.9474285714285717e-05, 'epoch': 0.08}
{'loss': 0.0213, 'grad_norm': 3.6575443744659424, 'learning_rate': 1.9398095238095242e-05, 'epoch': 0.09142857142857143}
{'loss': 0.016, 'grad_norm': 1.0074753761291504, 'l

TrainOutput(global_step=2625, training_loss=0.014059570326691582, metrics={'train_runtime': 3073.6053, 'train_samples_per_second': 6.832, 'train_steps_per_second': 0.854, 'train_loss': 0.014059570326691582, 'epoch': 3.0})

In [None]:
# Run prediction
preds = trainer.predict(dev_dataset).predictions

# Convert predictions to original scale
original_scale_preds = scaler.inverse_transform(preds)

# Flatten predictions and add to dev_df
dev_df['Transquest_predicted_score'] = original_scale_preds.flatten()

# Save to TSV
dev_df.to_csv("transquest2_predictions.enhi.tsv", sep='\t', index=False)

print("Predictions saved to transquest2_predictions.enhi.tsv")


Predictions saved to transquest2_predictions.enhi.tsv


In [None]:
from scipy.stats import spearmanr, pearsonr
from sklearn.metrics import mean_absolute_error
import pandas as pd

# Load the saved predictions
df = pd.read_csv("transquest2_predictions.enhi.tsv", sep="\t")

# Ground truth and predicted scores
y_true = df["labels"]  # These are normalized then inverse transformed = original mean
y_pred = df["Transquest_predicted_score"]

# Compute metrics
spearman_corr, _ = spearmanr(y_true, y_pred)
pearson_corr, _ = pearsonr(y_true, y_pred)
mae = mean_absolute_error(y_true, y_pred)

# Print results
print(f"Spearman correlation: {spearman_corr:.4f}")
print(f"Pearson correlation : {pearson_corr:.4f}")
print(f"Mean Absolute Error : {mae:.4f}")


Spearman correlation: 0.5502
Pearson correlation : 0.6241
Mean Absolute Error : 89.5147


In [None]:
# ***********************  COMET wmt21-comet-qe-da ******************************

In [None]:
!pip uninstall -y comet_ml -q
!pip install -U unbabel-comet -q

[0m

In [None]:
# Step 2: Load TSV and convert to JSONL format
import pandas as pd
import json

# Load the TSV file
df = pd.read_csv("dev.enhi.df.short.tsv", sep="\t")

# COMET expects columns: src, mt, but your DataFrame may use 'original', 'translation'
# Change 'src' and 'mt' to the correct column names
jsonl_data = [{"src": row["original"], "mt": row["translation"]} for _, row in df.iterrows()]

# Save to JSONL for prediction
with open("temp_input.jsonl", "w") as f:
    for line in jsonl_data:
        f.write(json.dumps(line) + "\n")

In [None]:
!pip install --upgrade unbabel-comet



In [None]:
# Step 3: Run COMET inference
from comet import download_model, load_from_checkpoint

# Load pretrained model - updated to a supported model name
# Note: Check the COMET model hub for a suitable model if this one is not supported
model_path = download_model("wmt21-comet-qe-da")
model = load_from_checkpoint(model_path)

# Load the TSV file (this may be redundant if 'df' is already in memory)
df = pd.read_csv("dev.enhi.df.short.tsv", sep="\t")

# Prepare data in the expected format for COMET
data = [{"src": row["original"], "mt": row["translation"]} for _, row in df.iterrows()]

# Predict
predictions = model.predict(data, batch_size=8, gpus=1)

# Add scores to original dataframe
df["comet_score"] = predictions["scores"]

# Save results
df.to_csv("dev_with_comet_scores.tsv", sep="\t", index=False)
df.head()

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.3.5 to v2.5.1.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/unbabel_comet/wmt21-comet-qe-da/checkpoints/model.ckpt`
/usr/local/lib/python3.11/dist-packages/pytorch_lightning/core/saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']
INFO:pytorch_lightning.utilities.rank_zero:Using default `ModelCheckpoint`. Consider installing `litmodels` package to enable `LitModelCheckpoint` for automatic upload to the Lightning model registry.
INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs
INFO:pytorch_lightning.utilities.rank_zero

Unnamed: 0,index,original,translation,scores,mean,z_scores,z_mean,comet_score
0,0,In the flood-prone districts of the Netherland...,"नीदरलैंड के बाढ़ संभावित जिलों में, विशेष रूप ...","[90, 90, 79, 81]",85.0,"[0.10844457902530406, 0.17855383580414114, 0.6...",0.399822,0.098483
1,1,Group A Group B The top five run scorers (tota...,ग्रुप ए ग्रुप बी शीर्ष पांच रन स्कोरर (कुल रन)...,"[95, 95, 87, 89]",91.5,"[0.5387802100780963, 0.6151157815355373, 1.520...",1.001232,0.098313
2,2,"The final finished as a draw, with Essex winni...","मैच की पहली पारी में बढ़त हासिल करने के बाद, ए...","[95, 95, 70, 60]",80.0,"[0.5387802100780963, 0.6151157815355373, -0.22...",-0.085456,0.037866
3,3,These traits—establishment of a working method...,ये विशेषताएं-कला का अभिन्न अंग कार्य प्रणाली क...,"[90, 90, 78, 78]",84.0,"[0.10844457902530406, 0.17855383580414114, 0.5...",0.306865,-0.026047
4,4,"Its two most important members, Britain and Fr...","इसके दो सबसे महत्वपूर्ण सदस्य, ब्रिटेन और फ्रा...","[90, 90, 88, 86]",88.5,"[0.10844457902530406, 0.17855383580414114, 1.6...",0.74285,0.155051


In [None]:
# Step 4: Calculate Pearson, Spearman, and MAE
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics import mean_absolute_error

# Assuming 'score' is the actual score in your dataset
# If 'score' is missing, you can replace it with any dummy value for testing

# Pearson Correlation
pearson_corr, _ = pearsonr(df["mean"], df["comet_score"])

# Spearman Correlation
spearman_corr, _ = spearmanr(df["mean"], df["comet_score"])

# MAE
mae = mean_absolute_error(df["mean"], df["comet_score"])

# Print results
print(f"Pearson Correlation: {pearson_corr:.4f}")
print(f"Spearman Correlation: {spearman_corr:.4f}")
print(f"Mean Absolute Error (MAE): {mae:.4f}")


Pearson Correlation: 0.3015
Spearman Correlation: 0.3188
Mean Absolute Error (MAE): 80.6831


In [None]:
# ************************  Meta-Llama-3.2-3B-Instruct  ********************************

In [None]:
!pip install transformers accelerate --upgrade
!pip install sentencepiece
!pip install bitsandbytes

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.wh

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from huggingface_hub import login

# Authenticate with your Hugging Face token
login(token="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")

# Correct model identifier
model_id = "meta-llama/Llama-3.2-3B-Instruct"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype="auto",
    use_auth_token=True
)

# Create a text generation pipeline
qe_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=50)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
def build_qe_prompt(src, hyp):
    return f"""You are a quality estimation expert for machine translation.
Rate the quality of the translation from 0 (worst) to 1 (best).

Source: {src}
Translation: {hyp}

Score:"""


In [None]:
def predict_qe_score(src, hyp):
    prompt = build_qe_prompt(src, hyp)
    output = qe_pipeline(prompt)[0]['generated_text']
    try:
        score = output.split("Score:")[-1].strip()
        score = float(score.split()[0])  # Extract just the number
    except:
        score = None
    return score

In [None]:
import pandas as pd

df = pd.read_csv("dev.enhi.df.short.tsv", sep="\t")

df['LLama_3.2_3B_predicted_score'] = df.apply(lambda row: predict_qe_score(row['original'], row['translation']), axis=1)
df.to_csv("LLama_3.2_3B_predicted_Score[en-hi].tsv", sep="\t", index=False)
print("******* LLama_3.2_3B_predicted_Score[en-hi].tsv created Successfully !!! ********")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_to

******* LLama_3.2_3B_predicted_Score[en-hi].tsv created Successfully !!! ********


In [None]:
import pandas as pd
from scipy.stats import spearmanr, pearsonr
from sklearn.metrics import mean_absolute_error
df = pd.read_csv("LLama_3.2_3B_predicted_Score[en-hi].tsv", sep="\t")
# Extract true and predicted scores
y_true = df['mean']  # Replace with the actual column name for true scores if different
y_pred = df['LLama_3.2_3B_predicted_score']

# Spearman Correlation
spearman_corr, _ = spearmanr(y_true, y_pred)

# Pearson Correlation
pearson_corr, _ = pearsonr(y_true, y_pred)

# Mean Absolute Error
mae = mean_absolute_error(y_true, y_pred)

# Print results
print(f"Spearman Correlation: {spearman_corr:.4f}")
print(f"Pearson Correlation : {pearson_corr:.4f}")
print(f"Mean Absolute Error : {mae:.4f}")


Spearman Correlation: 0.0664
Pearson Correlation : 0.0767
Mean Absolute Error : 80.1322


In [None]:
#These are very weak correlations and a high error, which indicates that your model (LLama-3.2-3B on en-hi)
#is not predicting QE scores reliably:

In [None]:
############################# Meta-Llama-3-8B-Instruct   #########################################

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from huggingface_hub import login

# Authenticate with your Hugging Face token
login(token="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")

# Correct model identifier
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype="auto",
    use_auth_token=True
)

# Create a text generation pipeline
qe_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=50)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
def build_qe_prompt(src, hyp):
    return f"""You are a quality estimation expert for machine translation.
Rate the quality of the translation from 0 (worst) to 1 (best).

Source: {src}
Translation: {hyp}

Score:"""

In [None]:
def predict_qe_score(src, hyp):
    prompt = build_qe_prompt(src, hyp)
    output = qe_pipeline(prompt)[0]['generated_text']
    try:
        score = output.split("Score:")[-1].strip()
        score = float(score.split()[0])  # Extract just the number
    except:
        score = None
    return score

In [None]:
import pandas as pd

df = pd.read_csv("dev.enhi.df.short.tsv", sep="\t")

df['LLama_3.2_8B_predicted_score'] = df.apply(lambda row: predict_qe_score(row['original'], row['translation']), axis=1)
df.to_csv("LLama_3.2_8B_predicted_Score[en-hi].tsv", sep="\t", index=False)
print("******* LLama_3.2_8B_predicted_Score[en-hi].tsv created Successfully !!! ********")

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_to

******* LLama_3.2_8B_predicted_Score[en-hi].tsv created Successfully !!! ********


In [None]:
df = pd.read_csv("LLama_3.2_8B_predicted_Score[en-hi].tsv", sep="\t")
df.head()

Unnamed: 0,index,original,translation,scores,mean,z_scores,z_mean,LLama_3.2_8B_predicted_score
0,0,In the flood-prone districts of the Netherland...,"नीदरलैंड के बाढ़ संभावित जिलों में, विशेष रूप ...","[90, 90, 79, 81]",85.0,"[0.10844457902530406, 0.17855383580414114, 0.6...",0.399822,0.8
1,1,Group A Group B The top five run scorers (tota...,ग्रुप ए ग्रुप बी शीर्ष पांच रन स्कोरर (कुल रन)...,"[95, 95, 87, 89]",91.5,"[0.5387802100780963, 0.6151157815355373, 1.520...",1.001232,0.6
2,2,"The final finished as a draw, with Essex winni...","मैच की पहली पारी में बढ़त हासिल करने के बाद, ए...","[95, 95, 70, 60]",80.0,"[0.5387802100780963, 0.6151157815355373, -0.22...",-0.085456,0.6
3,3,These traits—establishment of a working method...,ये विशेषताएं-कला का अभिन्न अंग कार्य प्रणाली क...,"[90, 90, 78, 78]",84.0,"[0.10844457902530406, 0.17855383580414114, 0.5...",0.306865,0.7
4,4,"Its two most important members, Britain and Fr...","इसके दो सबसे महत्वपूर्ण सदस्य, ब्रिटेन और फ्रा...","[90, 90, 88, 86]",88.5,"[0.10844457902530406, 0.17855383580414114, 1.6...",0.74285,0.65


In [None]:
import pandas as pd
from scipy.stats import spearmanr, pearsonr
from sklearn.metrics import mean_absolute_error
df = pd.read_csv("LLama_3.2_8B_predicted_Score[en-hi].tsv", sep="\t")
# Extract true and predicted scores
y_true = df['mean']  # Replace with the actual column name for true scores if different
y_pred = df['LLama_3.2_8B_predicted_score']

# Spearman Correlation
spearman_corr, _ = spearmanr(y_true, y_pred)

# Pearson Correlation
pearson_corr, _ = pearsonr(y_true, y_pred)

# Mean Absolute Error
mae = mean_absolute_error(y_true, y_pred)

# Print results
print(f"Spearman Correlation: {spearman_corr:.4f}")
print(f"Pearson Correlation : {pearson_corr:.4f}")
print(f"Mean Absolute Error : {mae:.4f}")

Spearman Correlation: 0.1099
Pearson Correlation : 0.1297
Mean Absolute Error : 80.0512


In [None]:
 # Fine tuning meta/llama 3.2 8B parameters with LoRA

In [None]:
!pip install transformers accelerate bitsandbytes datasets peft trl

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting trl
  Downloading trl-0.17.0-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-ma

In [None]:
import pandas as pd
import json

# Load the TSV
df = pd.read_csv("/content/dev.enhi.df.short.tsv", sep="\t")

# Function to build the prompt
def build_prompt(src, hyp):
    return (
        "You are a quality estimation expert for machine translation.\n"
        "Rate the quality of the translation from 0 (worst) to 1 (best).\n\n"
        f"Source: {src}\n"
        f"Translation: {hyp}\n\n"
        "Score:"
    )

# Write to JSONL
with open("qe_data.jsonl", "w", encoding="utf-8") as f:
    for _, row in df.iterrows():
        prompt = build_prompt(row["original"], row["translation"])
        completion = f" {round(row['mean'], 4)}"
        json.dump({"prompt": prompt, "completion": completion}, f)
        f.write("\n")

print(" JSONL file 'qe_data.jsonl' created successfully.")


 JSONL file 'qe_data.jsonl' created successfully.


In [None]:
# llama_qe_finetune.py

import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from peft import LoraConfig, get_peft_model, TaskType
from trl import SFTTrainer


def build_prompt(src, hyp):
    return (
        "You are a quality estimation expert for machine translation.\n"
        "Rate the quality of the translation from 0 (worst) to 1 (best).\n\n"
        f"Source: {src}\n"
        f"Translation: {hyp}\n\n"
        "Score:"
    )


def load_and_prepare_datasets(train_path="/content/train.enhi.df.short.tsv", dev_path="/content/dev.enhi.df.short.tsv"):
    train_df = pd.read_csv(train_path, sep="\t")
    dev_df = pd.read_csv(dev_path, sep="\t")

    train_df["prompt"] = train_df.apply(lambda row: build_prompt(row["original"], row["translation"]), axis=1)
    train_df["completion"] = train_df["mean"].apply(lambda x: f" {round(x, 4)}")

    dev_df["prompt"] = dev_df.apply(lambda row: build_prompt(row["original"], row["translation"]), axis=1)
    dev_df["completion"] = dev_df["mean"].apply(lambda x: f" {round(x, 4)}")

    train_data = Dataset.from_pandas(train_df[["prompt", "completion"]].reset_index(drop=True))
    val_data = Dataset.from_pandas(dev_df[["prompt", "completion"]].reset_index(drop=True))
    return train_data, val_data

# Load tokenizer and model
from huggingface_hub import login

# Authenticate with your Hugging Face token
login(token="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")
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype="auto",
    use_auth_token=True
)
# Set up LoRA (Low-Rank Adaptation) configuration
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none"
)

# Apply LoRA to the model
model = get_peft_model(model, peft_config)

# Load training and validation datasets
train_data, val_data = load_and_prepare_datasets()

from transformers import DataCollatorForSeq2Seq

# Ensure the tokenizer has a pad_token (if it doesn't, set it to eos_token)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Use eos_token as pad_token

# Tokenize the datasets
def tokenize_function(examples):
    return tokenizer(examples['prompt'], truncation=True, padding="max_length", max_length=512)

# Apply the tokenization to the datasets
train_data = train_data.map(tokenize_function, batched=True)
val_data = val_data.map(tokenize_function, batched=True)

# If you're using a Causal LM, you could alternatively use DataCollatorForLanguageModeling
# data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
# Define training arguments
training_args = TrainingArguments(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-5,
    eval_steps=10,  # Evaluation every 10 steps
    save_strategy="epoch",
    logging_dir="./logs",
    output_dir="./llama-qe-ft",
    bf16=True,  # Change to fp16=True if BF16 is not supported
    report_to="none"
)
# Initialize the Trainer
# Remove the 'tokenizer' argument from SFTTrainer initialization
trainer = SFTTrainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=training_args,
    data_collator=data_collator,
    # tokenizer=tokenizer  # Remove this line
)
# Train the model
trainer.train()





The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

Map:   0%|          | 0/7000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/7000 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss
500,1.0885
1000,0.9169
1500,0.8981
2000,0.8904
2500,0.8777


TrainOutput(global_step=2625, training_loss=0.9317869873046875, metrics={'train_runtime': 2562.0464, 'train_samples_per_second': 8.197, 'train_steps_per_second': 1.025, 'total_flos': 4.84377549078528e+17, 'train_loss': 0.9317869873046875})

In [None]:
def predict_qe_score(model, tokenizer, prompt):
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512
    )

    # Set pad_token_id if it's not set already
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    model.eval()
    with torch.no_grad():
        outputs = model.generate(
            input_ids=inputs["input_ids"].to(model.device),
            attention_mask=inputs["attention_mask"].to(model.device),
            max_new_tokens=10,
            pad_token_id=tokenizer.pad_token_id  # Explicitly pass pad_token_id
        )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    try:
        score = generated_text.split("Score:")[-1].strip()
        score = float(score.split()[0])
    except:
        score = None

    return score


In [None]:
# Loop through the validation data to predict scores
predictions = []
for example in val_data:
    prompt = example['prompt']
    predicted_score = predict_qe_score(model, tokenizer, prompt)
    predictions.append(predicted_score)

# Add predicted scores to the dataset
val_data = val_data.add_column("predicted_score", predictions)

# Save the predictions to a file
val_data.to_csv("predicted_scores.csv", index=False)
print("Predictions saved to 'predicted_scores.csv'")




Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Predictions saved to 'predicted_scores.csv'


In [None]:
df = pd.read_csv("predicted_scores.csv")
df.head()

Unnamed: 0,prompt,completion,input_ids,attention_mask,predicted_score
0,You are a quality estimation expert for machin...,85.0,[128000 2675 527 264 4367 42304 6...,[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1...,0.6
1,You are a quality estimation expert for machin...,91.5,[128000 2675 527 264 4367 42304 6...,[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1...,0.2
2,You are a quality estimation expert for machin...,80.0,[128000 2675 527 264 4367 42304 6...,[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1...,0.9
3,You are a quality estimation expert for machin...,84.0,[128000 2675 527 264 4367 42304 6...,[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1...,0.6
4,You are a quality estimation expert for machin...,88.5,[128000 2675 527 264 4367 42304 6...,[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1...,0.2


In [None]:
import pandas as pd
from scipy.stats import spearmanr, pearsonr
from sklearn.metrics import mean_absolute_error

# Assuming `df` is your DataFrame
df = val_data.to_pandas()  # or use your existing DataFrame directly

# Drop rows where predicted_score is missing or invalid
df = df.dropna(subset=["predicted_score"])

# Convert both columns to float
df["completion"] = df["completion"].astype(float)  # Ground truth (completion column)
df["predicted_score"] = df["predicted_score"].astype(float)  # Model predictions

# Rescale completion (ground truth) to 0–1 scale
df["mean_scaled"] = df["completion"] / 100.0  # Scale from 0-100 to 0-1

# Compute metrics again with scaled ground truth
spearman_corr, _ = spearmanr(df["mean_scaled"], df["predicted_score"])
pearson_corr, _ = pearsonr(df["mean_scaled"], df["predicted_score"])
mae = mean_absolute_error(df["mean_scaled"], df["predicted_score"])

print(f"Spearman Correlation: {spearman_corr:.4f}")
print(f"Pearson Correlation: {pearson_corr:.4f}")
print(f"Mean Absolute Error: {mae:.4f}")




Spearman Correlation: -0.0072
Pearson Correlation: 0.0151
Mean Absolute Error: 0.2572


In [None]:
# LLaMA 3.2 8B for MT QE

In [None]:
!pip install -q transformers accelerate sentencepiece

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m124.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m88.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m61.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m41.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
from huggingface_hub import login

# Authenticate with your Hugging Face token
login(token="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")
model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)


tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

In [None]:
def create_prompt(src, tgt):
    return f"""<|start_header_id|>user<|end_header_id|>
Source: {src}
Translation: {tgt}
How good is this translation from 0 to 100?
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

def get_score_from_output(output):
    try:
        return float(output.strip().split()[0])
    except:
        return None

def predict_score(src, tgt):
    prompt = create_prompt(src, tgt)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=10)
    decoded = tokenizer.decode(output[0], skip_special_tokens=True)
    return get_score_from_output(decoded[len(prompt):])


In [None]:
import pandas as pd
from tqdm import tqdm

# Assuming the file is named `dev.enhi.df.short.tsv`
df = pd.read_csv("dev.enhi.df.short.tsv", sep="\t")

# Predict LLaMA scores
tqdm.pandas()
df['llama_pred'] = df.progress_apply(lambda row: predict_score(row['original'], row['translation']), axis=1)


  0%|          | 0/1000 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 2/1000 [00:00<05:38,  2.94it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 3/1000 [00:01<07:53,  2.11it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 4/1000 [00:02<09:02,  1.84it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  0%|          | 5/1000 [00:02<09:47,  1.69it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 6/1000 [00:03<10:15,  1.62it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 7/1000 [00:04<10:32,  1.57it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 8/1000 [00:04<10:44,  1.54it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 9/1000 [00:05<10:46,  1

In [None]:
def create_prompt(src, tgt):
    return (
        f"You are an expert in translation quality assessment.\n"
        f"Source sentence: {src}\n"
        f"Translation: {tgt}\n"
        f"On a scale of 0 to 100, how good is the translation? Just return a number."
    )

In [None]:
def predict_score(src, tgt):
    prompt = create_prompt(src, tgt)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        output = model.generate(**inputs, max_new_tokens=10)
    decoded = tokenizer.decode(output[0], skip_special_tokens=True)

    # Log full output for debugging
    print("===")
    print("PROMPT:", prompt)
    print("RESPONSE:", decoded[len(prompt):])
    print("===")

    return get_score_from_output(decoded[len(prompt):])




In [None]:
sample_df = df.iloc[:2]
sample_df['llama_pred'] = sample_df.apply(lambda row: predict_score(row['original'], row['translation']), axis=1)


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


===
PROMPT: You are an expert in translation quality assessment.
Source sentence: In the flood-prone districts of the Netherlands, particularly in the northern provinces of Friesland and Groningen, villages were traditionally built on low man-made hills called terpen before the introduction of regional dyke-systems.
Translation: नीदरलैंड के बाढ़ संभावित जिलों में, विशेष रूप से उत्तरी प्रांतों फ्रीसलैंड और ग्रोनिंगेन में, गांवों को पारंपरिक रूप से कम मानव निर्मित पहाड़ियों पर बनाया जाता था जिसे क्षेत्रीय डाइक-सिस्टम की शुरुआत से पहले टेरपेन कहा जाता था।
On a scale of 0 to 100, how good is the translation? Just return a number.
RESPONSE:  No need to explain your answer.
Please note that
===
===
PROMPT: You are an expert in translation quality assessment.
Source sentence: Group A Group B The top five run scorers (total runs) are included in this table.
Translation: ग्रुप ए ग्रुप बी शीर्ष पांच रन स्कोरर (कुल रन) इस तालिका में शामिल हैं।
On a scale of 0 to 100, how good is the translation? 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sample_df['llama_pred'] = sample_df.apply(lambda row: predict_score(row['original'], row['translation']), axis=1)
