In [None]:
import pandas as pd
import torch
import csv
import os
import gzip

In [None]:
import pandas as pd
import torch
# NOTE(review): 'high' presumably permits faster reduced-precision float32 matmul kernels — confirm against the PyTorch docs.
torch.set_float32_matmul_precision('high') 
import csv
import os
import gzip

# Input locations: the gzip archive and the plain-text TSV it is expanded to,
# both resolved relative to DATA_DIR.
DATA_DIR = "."
COMPRESSED_FILE = "mteval-task1-test25.tsv.gz"
DECOMPRESSED_FILE = "mteval-task1-test25.tsv"

compressed_path = os.path.join(DATA_DIR, COMPRESSED_FILE)
decompressed_path = os.path.join(DATA_DIR, DECOMPRESSED_FILE)

print(f"Looking for compressed file at: {compressed_path}")
if not os.path.exists(compressed_path):
    # Fail early, listing the directory so the missing file is easy to diagnose.
    print(f"ERROR: File {COMPRESSED_FILE} not found in current directory!")
    print(f"Current directory contents: {os.listdir(DATA_DIR)}")
    raise FileNotFoundError(f"Required file {COMPRESSED_FILE} not found")
if not os.path.exists(decompressed_path):
    print(f"Decompressing {COMPRESSED_FILE}...")
    # Expand into a sibling ".part" file and rename it into place only after
    # the copy has finished. Writing straight to decompressed_path would leave
    # a truncated file behind on failure or kernel interrupt, and the
    # os.path.exists() check above would then silently reuse it on the next run.
    partial_path = decompressed_path + ".part"
    try:
        with gzip.open(compressed_path, 'rt', encoding='utf-8') as gz_file:
            with open(partial_path, 'w', encoding='utf-8') as out_file:
                for line in gz_file:
                    out_file.write(line)
        os.replace(partial_path, decompressed_path)
        print(f"Successfully decompressed to {DECOMPRESSED_FILE}")
    except Exception as e:
        print(f"Failed to decompress file: {e}")
        raise
    finally:
        # After a successful os.replace the partial file no longer exists, so
        # this only removes leftovers from a failed or interrupted copy.
        if os.path.exists(partial_path):
            os.remove(partial_path)
else:
    print(f"Using existing decompressed file: {DECOMPRESSED_FILE}")

# Sanity check: echo up to the first five lines of the decompressed TSV.
print("Checking decompressed file content before loading...")
try:
    with open(decompressed_path, 'r', encoding='utf-8') as preview_file:
        # zip() stops after five lines or at end of file, whichever comes first.
        for line_no, text in zip(range(1, 6), preview_file):
            print(f"Line {line_no}: {text.strip()}")
except Exception as e:
    print(f"Failed to read decompressed file: {e}")
    raise

# Parse the decompressed TSV into df_test and print a short summary of it.
print("Loading data into Pandas DataFrame...")
try:
    # QUOTE_NONE: quote characters inside cells are kept as literal text.
    # NOTE(review): default NA parsing stays enabled, so cells such as
    # "NA", "null" or "None" load as NaN — confirm that is intended for
    # free-text segment columns.
    read_options = {
        "sep": '\t',
        "header": 0,
        "encoding": 'utf-8',
        "quoting": csv.QUOTE_NONE,
    }
    df_test = pd.read_csv(decompressed_path, **read_options)

    segment_count = len(df_test)
    print(f"Successfully loaded and parsed {segment_count} segments.")
    print("Data Schema Verified:")
    df_test.info()
    print("\nFirst 5 rows of test data:")
    head_text = df_test.head().to_string()
    print(head_text)
except Exception as e:
    print(f"Failed to load data into DataFrame: {e}")
    raise

In [None]:
from comet import download_model, load_from_checkpoint

# Checkpoint identifier and the batch size later passed to model.predict().
MODEL_NAME = "Unbabel/wmt22-comet-da"
BATCH_SIZE = 4

# Use CUDA when torch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Execution device: {device}")

# Download the checkpoint, load it, move it to the chosen device and switch
# it to evaluation mode.
print(f"Loading SOTA model: {MODEL_NAME}...")
model_path = download_model(MODEL_NAME)
model = load_from_checkpoint(model_path)
model = model.to(device)
model.eval()

print("Formatting data for prediction...")


def _clean_segment(value):
    """Return a segment cell as plain text ready for scoring.

    Missing cells (NaN/None, or the literal string "NaN") become the empty
    string; otherwise the value is stringified and each ' \\n ' marker
    (space, backslash, 'n', space) is replaced with a real newline.
    """
    if pd.isna(value) or value == "NaN":
        return ""
    return str(value).replace(' \\n ', '\n')


# Apply the same normalisation to all three text columns. Previously only the
# reference was guarded, so a missing source or hypothesis went through str()
# and was scored as the literal text "nan".
# Zipping the columns also avoids building a Series per row as iterrows() does.
data_to_score = [
    {
        "src": _clean_segment(src_val),
        "mt": _clean_segment(hyp_val),
        "ref": _clean_segment(ref_val),
    }
    for src_val, hyp_val, ref_val in zip(
        df_test['source_segment'],
        df_test['hypothesis_segment'],
        df_test['reference_segment'],
    )
]
print(f"Data formatting complete for {len(data_to_score)} entries.")

In [None]:
# Score every prepared entry with the loaded model.
print("Starting prediction... This may take some time.")
with torch.no_grad():
    prediction = model.predict(
        data_to_score,
        batch_size=BATCH_SIZE,
        gpus=int(device == "cuda"),  # 1 when running on CUDA, otherwise 0
        progress_bar=True,
    )
    # The result unpacks into two values; only the first one is used later.
    seg_scores, _ = prediction
print(f"Prediction finished. Generated {len(seg_scores)} scores.")

In [None]:
# Output location of the submission TSV.
SUBMISSION_DIR = "."
SUBMISSION_FILE_PATH = os.path.join(SUBMISSION_DIR, "segments.tsv")

print("Creating the final submission DataFrame...")
# Metadata columns copied from the test set, in output order; the scores are
# appended as the final 'overall' column.
id_columns = [
    'doc_id', 'segment_id', 'source_lang', 'target_lang',
    'set_id', 'system_id', 'domain_name', 'method',
]
submission_df = df_test[id_columns].assign(overall=seg_scores)

print(f"Saving submission file to {SUBMISSION_FILE_PATH}...")
submission_df.to_csv(
    SUBMISSION_FILE_PATH,
    sep='\t',
    encoding='utf-8',
    header=True,
    index=False,
)
print("--- Submission file created successfully! ---")

# Read the file back from disk and show its first rows as a sanity check.
print("\nVerifying submission file head:")
written_head = pd.read_csv(SUBMISSION_FILE_PATH, sep='\t').head()
print(written_head.to_string())