In [1]:
# General imports
import json
import os

# Import the comet module for the evaluation
from comet import download_model, load_from_checkpoint

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
COMET_MODEL_NAME = "Unbabel/wmt22-comet-da"
SYSTEM_NAME = "gpt-4o-2024-08-06"
SOURCE_LANGUAGE = "en_US"
TARGET_LANGUAGE = "fr_FR"
DATA_DIR = "../data"
SPLIT = "validation"
NUM_GPUS = 1
BATCH_SIZE = 32

# The path to the references is formatted as follows:
# data/references/{split}/{target_language}.jsonl
PATH_TO_REFERENCES = os.path.join(
    DATA_DIR,
    "references",
    SPLIT,
    f"{TARGET_LANGUAGE}.jsonl",
)

# The path to the predictions is formatted as follows:
# data/predictions/{system_name}/{split}/{target_language}.jsonl
PATH_TO_PREDICTIONS = os.path.join(
    DATA_DIR,
    "predictions",
    SYSTEM_NAME,
    SPLIT,
    f"{TARGET_LANGUAGE}.jsonl",
)

In [6]:
# Load the references
references = {}

with open(PATH_TO_REFERENCES, "r") as f:

    for line in f:
        data = json.loads(line)
        references[data["id"]] = data

print(f"Loaded {len(references)} references from {PATH_TO_REFERENCES}")

Loaded 724 references from ../data\references\validation\fr_FR.jsonl


In [7]:
predictions = {}

with open(PATH_TO_PREDICTIONS, "r") as f:

    for line in f:
        data = json.loads(line)
        predictions[data["id"]] = data

print(f"Loaded {len(predictions)} predictions from {PATH_TO_PREDICTIONS}")

Loaded 724 predictions from ../data\predictions\gpt-4o-2024-08-06\validation\fr_FR.jsonl


In [8]:
# Get all those references that have a corresponding prediction
ids = set(references.keys()) & set(predictions.keys())
num_missing_predictions = len(references) - len(ids)

if num_missing_predictions > 0:
    print(f"Missing predictions for {num_missing_predictions} references")
else:
    print("All references have a corresponding prediction")

All references have a corresponding prediction


In [9]:
instance_ids = {}
instances = []
current_index = 0

for id in sorted(list(ids)):
    reference = references[id]
    prediction = predictions[id]

    for target in reference["targets"]:
        instances.append(
            {
                "src": reference["source"],
                "ref": target["translation"],
                "mt": prediction["prediction"],
            }
        )

    instance_ids[id] = [current_index, current_index + len(reference["targets"])]
    current_index += len(reference["targets"])

print(f"Created {len(instances)} instances")

Created 1316 instances


In [11]:
# Download the model
model_path = download_model(COMET_MODEL_NAME)

# Load the model
model = load_from_checkpoint(model_path)

Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00, 23.09it/s]
Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\ptlpa\.cache\huggingface\hub\models--Unbabel--wmt22-comet-da\snapshots\2760a223ac957f30acfb18c8aa649b01cf1d75f2\checkpoints\model.ckpt`
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Encoder model frozen.
c:\Users\ptlpa\anaconda3\envs\comet\lib\site-packages\pytorch_lightning\core\saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


In [12]:
# Compute the scores
outputs = model.predict(instances, batch_size=BATCH_SIZE, gpus=NUM_GPUS)

💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
  return forward_call(*args, **kwargs)
Predicting DataLoader 0: 100%|██████████| 42/42 [01:19<00:00,  1.90s/it]


In [13]:
# Extract the scores
scores = outputs.scores
max_scores = []

for id, indices in instance_ids.items():
    # Get the max score for each reference
    max_score = max(scores[indices[0] : indices[1]])
    max_scores.append(max_score)

# Compute the average score while taking into account the missing predictions (which are considered as 0)
system_score = sum(max_scores) / (len(max_scores) + num_missing_predictions)

print(f"Average COMET score: {100.*system_score:.2f}")

Average COMET score: 89.13


In [14]:
outputs.system_score

0.8763466791949012

# finetuned Model

In [15]:
COMET_MODEL_NAME = "Unbabel/wmt22-comet-da"
SYSTEM_NAME = "finetuned_placeholder_mt"
SOURCE_LANGUAGE = "en_US"
TARGET_LANGUAGE = "fr_FR"
DATA_DIR = "../data"
SPLIT = "validation"
NUM_GPUS = 1
BATCH_SIZE = 32

# The path to the references is formatted as follows:
# data/references/{split}/{target_language}.jsonl
PATH_TO_REFERENCES = os.path.join(
    DATA_DIR,
    "references",
    SPLIT,
    f"{TARGET_LANGUAGE}.jsonl",
)

# The path to the predictions is formatted as follows:
# data/predictions/{system_name}/{split}/{target_language}.jsonl
PATH_TO_PREDICTIONS = os.path.join(
    DATA_DIR,
    "predictions",
    SYSTEM_NAME,
    SPLIT,
    f"{TARGET_LANGUAGE}.jsonl",
)

In [16]:
# Load the references
references = {}

with open(PATH_TO_REFERENCES, "r") as f:

    for line in f:
        data = json.loads(line)
        references[data["id"]] = data

print(f"Loaded {len(references)} references from {PATH_TO_REFERENCES}")

Loaded 724 references from ../data\references\validation\fr_FR.jsonl


In [18]:
predictions = {}

with open(PATH_TO_PREDICTIONS, "r") as f:

    for line in f:
        data = json.loads(line)
        predictions[data["id"]] = data

print(f"Loaded {len(predictions)} predictions from {PATH_TO_PREDICTIONS}")

Loaded 724 predictions from ../data\predictions\finetuned_placeholder_mt\validation\fr_FR.jsonl


In [19]:
# Get all those references that have a corresponding prediction
ids = set(references.keys()) & set(predictions.keys())
num_missing_predictions = len(references) - len(ids)

if num_missing_predictions > 0:
    print(f"Missing predictions for {num_missing_predictions} references")
else:
    print("All references have a corresponding prediction")

All references have a corresponding prediction


In [20]:
instance_ids = {}
instances = []
current_index = 0

for id in sorted(list(ids)):
    reference = references[id]
    prediction = predictions[id]

    for target in reference["targets"]:
        instances.append(
            {
                "src": reference["source"],
                "ref": target["translation"],
                "mt": prediction["prediction"],
            }
        )

    instance_ids[id] = [current_index, current_index + len(reference["targets"])]
    current_index += len(reference["targets"])

print(f"Created {len(instances)} instances")

Created 1316 instances


In [21]:
# Download the model
model_path = download_model(COMET_MODEL_NAME)

# Load the model
model = load_from_checkpoint(model_path)

Fetching 5 files: 100%|██████████| 5/5 [00:00<?, ?it/s]
Lightning automatically upgraded your loaded checkpoint from v1.8.3.post1 to v2.5.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint C:\Users\ptlpa\.cache\huggingface\hub\models--Unbabel--wmt22-comet-da\snapshots\2760a223ac957f30acfb18c8aa649b01cf1d75f2\checkpoints\model.ckpt`
Encoder model frozen.
c:\Users\ptlpa\anaconda3\envs\comet\lib\site-packages\pytorch_lightning\core\saving.py:195: Found keys that are not in the model state dict but in the checkpoint: ['encoder.model.embeddings.position_ids']


In [22]:
# Compute the scores
outputs = model.predict(instances, batch_size=BATCH_SIZE, gpus=NUM_GPUS)

💡 Tip: For seamless cloud uploads and versioning, try installing [litmodels](https://pypi.org/project/litmodels/) to enable LitModelCheckpoint, which syncs automatically with the Lightning model registry.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
  return forward_call(*args, **kwargs)
Predicting DataLoader 0: 100%|██████████| 42/42 [01:17<00:00,  1.84s/it]


In [23]:
# Extract the scores
scores = outputs.scores
max_scores = []

for id, indices in instance_ids.items():
    # Get the max score for each reference
    max_score = max(scores[indices[0] : indices[1]])
    max_scores.append(max_score)

# Compute the average score while taking into account the missing predictions (which are considered as 0)
system_score = sum(max_scores) / (len(max_scores) + num_missing_predictions)

print(f"Average COMET score: {100.*system_score:.2f}")

Average COMET score: 84.15


In [24]:
outputs.system_score

0.8252155175687332