In [1]:
import scipy.stats as stats
import torch
import torch.nn as nn
from datasets import load_dataset, load_from_disk
from sklearn.preprocessing import MinMaxScaler
from tqdm import tqdm
from transformers import AutoTokenizer, MT5EncoderModel
from transformers.modeling_outputs import BaseModelOutput

[2023-07-26 16:09:40,117] [INFO] [real_accelerator.py:133:get_accelerator] Setting ds_accelerator to cuda (auto detect)


In [2]:
# Hugging Face checkpoint used for both the encoder weights and the tokenizer.
model_encoder_name = "bigscience/mt0-base"
# Prefer the first GPU, but fall back to CPU so the notebook still runs
# (slowly) on a machine without CUDA instead of failing at `model.to(device)`.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

Evaluation results (Kendall tau per language pair; the best score for each pair is in bold)
1. Architecture 4 <br />
en-ru Kendall-tau: <strong>0.271</strong> <br />
zh-en Kendall-tau: 0.250 <br />
en-de Kendall-tau: 0.244
2. Architecture 4 (SQM prompt) <br />
en-ru Kendall-tau: 0.264 <br />
zh-en Kendall-tau: <strong>0.262</strong> <br />
en-de Kendall-tau: <strong>0.249</strong>
3. Architecture 5 <br />
en-ru Kendall-tau: 0.253 <br />
zh-en Kendall-tau: 0.239 <br />
en-de Kendall-tau: 0.249

# Load model

In [3]:
def mean_pooling(token_embeddings, attention_mask):
    """Weighted average of token embeddings along dimension 1.

    Args:
        token_embeddings: tensor whose dimension 1 is averaged away.
        attention_mask: per-token weights; a trailing axis is appended and
            the result is expanded to the shape of ``token_embeddings``.

    Returns:
        The weighted sum over dimension 1 divided by the summed weights.
        The divisor is clamped to at least 1e-9, so all-zero weights give
        zeros rather than a division by zero.
    """
    weights = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    weighted_total = (token_embeddings * weights).sum(1)
    weight_total = weights.sum(1).clamp(min=1e-9)
    return weighted_total / weight_total

In [4]:
class T5Regressor(nn.Module):
    """mT5 encoder with an MLP head that outputs one score in the range 0-100.

    Args:
        checkpoint: name or path passed to ``MT5EncoderModel.from_pretrained``.
        sizes_mlp: layer widths of the head; consecutive pairs become
            ``nn.Linear`` layers, so the first entry must equal the encoder's
            hidden size.
        act: activation class instantiated between the linear layers (none
            is placed after the last one).

    The attribute names (``llm``, ``mlp``, ...) and the order of layers inside
    ``mlp`` determine the state-dict keys, so they must stay as they are for
    saved checkpoints to keep loading.
    """

    def __init__(self, checkpoint, sizes_mlp, act=nn.Tanh):
        super().__init__()

        # Attention maps are requested because forward() pools with them.
        self.llm = MT5EncoderModel.from_pretrained(
            checkpoint, output_attentions=True, output_hidden_states=True
        )
        self.dropout = nn.Dropout(0.1)

        head = []
        final_linear = len(sizes_mlp) - 2
        for idx, (width_in, width_out) in enumerate(zip(sizes_mlp[:-1], sizes_mlp[1:])):
            head.append(nn.Linear(width_in, width_out))
            if idx != final_linear:
                head.append(act())
        # Dropout sits after the last linear layer, i.e. on the raw score.
        head.append(nn.Dropout(0.1))

        self.mlp = nn.Sequential(*head)
        self.output_layer = nn.Sigmoid()
        self.loss_fc = nn.MSELoss()

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Encode the inputs, pool them, and regress a score.

        Returns:
            A 3-tuple ``(encoder_output, logits, loss)`` where ``logits`` is
            the sigmoid output multiplied by 100 and ``loss`` is the MSE
            against ``labels`` (``None`` when no labels are given).
        """
        encoded = self.llm(input_ids=input_ids, attention_mask=attention_mask)

        # Pooling weights are taken from the last layer's attention tensor,
        # not from `attention_mask`. Assuming the usual Hugging Face layout
        # (batch, heads, query, key), this selects head 0 and each position's
        # attention to key position 0 -- TODO confirm this is intentional.
        pooling_weights = encoded.attentions[-1][:, 0, :, 0]
        pooled = mean_pooling(encoded.last_hidden_state, pooling_weights)

        raw_score = self.mlp(self.dropout(pooled))
        logits = self.output_layer(raw_score) * 100

        if labels is None:
            loss = None
        else:
            loss = self.loss_fc(logits.view(-1, 1), labels.view(-1, 1))

        encoder_output = BaseModelOutput(
            last_hidden_state=encoded.last_hidden_state,
            hidden_states=encoded.hidden_states,
            attentions=encoded.attentions,
        )
        return encoder_output, logits, loss

In [5]:
# Head widths: 768 matches the encoder embedding width, and the final 1 yields
# a single score per input.
model = T5Regressor(checkpoint=model_encoder_name, sizes_mlp=[768, 192, 48, 1])

In [6]:
# Restore the fine-tuned weights. `map_location="cpu"` makes the load work
# regardless of which device the checkpoint was saved from; the model is moved
# to `device` in a later cell.
# NOTE: torch.load unpickles the file -- only load checkpoints you trust
# (newer PyTorch versions also offer `weights_only=True`).
model.load_state_dict(
    torch.load("checkpoints/model_arc_4_sqm.pt", map_location="cpu")
)

<All keys matched successfully>

In [7]:
# Move the weights to the target device and switch to inference mode
# (disables dropout). Both calls return the module, so the cell still
# displays the model summary.
model.to(device).eval()

T5Regressor(
  (llm): MT5EncoderModel(
    (shared): Embedding(250112, 768)
    (encoder): MT5Stack(
      (embed_tokens): Embedding(250112, 768)
      (block): ModuleList(
        (0): MT5Block(
          (layer): ModuleList(
            (0): MT5LayerSelfAttention(
              (SelfAttention): MT5Attention(
                (q): Linear(in_features=768, out_features=768, bias=False)
                (k): Linear(in_features=768, out_features=768, bias=False)
                (v): Linear(in_features=768, out_features=768, bias=False)
                (o): Linear(in_features=768, out_features=768, bias=False)
                (relative_attention_bias): Embedding(32, 12)
              )
              (layer_norm): MT5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): MT5LayerFF(
              (DenseReluDense): MT5DenseGatedActDense(
                (wi_0): Linear(in_features=768, out_features=2048, bias=False)
                (wi_1): Linear(in_fe

# Load data

In [8]:
# Load the "train" split of the WMT MQM human-evaluation dataset.
dataset = load_dataset("RicardoRei/wmt-mqm-human-evaluation", split="train")

# Bare expression so the notebook shows the column names and row count.
dataset

Found cached dataset csv (/home/jovyan/.cache/huggingface/datasets/RicardoRei___csv/RicardoRei--wmt-mqm-human-evaluation-13a9a3b878e9c5ea/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d)


Dataset({
    features: ['lp', 'src', 'mt', 'ref', 'score', 'system', 'annotators', 'domain', 'year'],
    num_rows: 150347
})

In [9]:
# Test set: rows from year 2022 for the three evaluated language pairs.
dataset_test = dataset.filter(
    lambda row: row["year"] == 2022 and row["lp"] in ("en-ru", "zh-en", "en-de")
)

Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/RicardoRei___csv/RicardoRei--wmt-mqm-human-evaluation-13a9a3b878e9c5ea/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-792642c7a8b42625.arrow


In [10]:
# Direct-assessment style prompt. It is defined for reference only; the loop
# below does not use it.
prompt_template_da = """
Score the following translation from {source_lang} to {target_lang} with respect to the human reference on a continuous scale from 0 to 100, where score of zero means "no meaning preserved" and score of one hundred means "perfect meaning and grammar".
{source_lang} source: "{source_seg}"
{target_lang} human reference: {reference_seg}
{target_lang} translation: "{target_seg}"
Score:
"""

# SQM style prompt -- this is the one rendered for every test row.
prompt_template_sqm = """
Score the following translation from {source_lang} to {target_lang} with respect to the human reference on a continuous scale from 0 to 100 that starts with "No meaning preserved", goes through "Some meaning preserved", then "Most meaning preserved and few grammar mistakes", up to "Perfect meaning and grammar".
{source_lang} source: "{source_seg}"
{target_lang} human reference: "{reference_seg}"
{target_lang} translation: "{target_seg}"
Score (0-100):
"""

# One rendered prompt per row, in dataset order.
prompt_column = []
for example in tqdm(dataset_test):
    # "lp" looks like "zh-en": source language code, dash, target language code.
    src_code, tgt_code = example["lp"].split("-")
    rendered = prompt_template_sqm.format(
        source_lang=src_code,
        target_lang=tgt_code,
        source_seg=example["src"],
        reference_seg=example["ref"],
        target_seg=example["mt"],
    )
    prompt_column.append(rendered)

100%|██████████| 67575/67575 [00:07<00:00, 9080.41it/s]


In [11]:
# Attach the rendered prompts as a new "prompt" column.
# NOTE: this rebinds `dataset_test`, so the cell is not idempotent -- running it
# a second time tries to add a column that already exists.
dataset_test = dataset_test.add_column(name="prompt", column=prompt_column)

Loading cached processed dataset at /home/jovyan/.cache/huggingface/datasets/RicardoRei___csv/RicardoRei--wmt-mqm-human-evaluation-13a9a3b878e9c5ea/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d/cache-639f203620d94f62.arrow


In [12]:
# Spot-check one arbitrary row to confirm the prompt was rendered as expected.
dataset_test[41152]

{'lp': 'zh-en',
 'src': '用了一段时间，质量不错，屏幕分辨率也很清晰。',
 'mt': 'Been using it for a while and the quality is nice and the screen resolution is crystal clear.',
 'ref': 'I’ve used it for some time; it is of high quality, with clear screen resolution.',
 'score': -1.0,
 'system': 'comet_bestmbr',
 'annotators': 1,
 'domain': 'ecommerce',
 'year': 2022,
 'prompt': '\nScore the following translation from zh to en with respect to the human reference on a continuous scale from 0 to 100 that starts with "No meaning preserved", goes through "Some meaning preserved", then "Most meaning preserved and few grammar mistakes", up to "Perfect meaning and grammar".\nzh source: "用了一段时间，质量不错，屏幕分辨率也很清晰。"\nen human reference: "I’ve used it for some time; it is of high quality, with clear screen resolution."\nen translation: "Been using it for a while and the quality is nice and the screen resolution is crystal clear."\nScore (0-100):\n'}

# Initialize tokenizer

In [13]:
# Tokenizer loaded from the same checkpoint name as the encoder.
tokenizer = AutoTokenizer.from_pretrained(model_encoder_name)

# Tokenize data

In [14]:
# Tokenize the prompts batch by batch; this adds `input_ids` and
# `attention_mask` columns and truncates inputs to 512 tokens.
# NOTE(review): `padding=True` pads to the longest prompt within each batch
# passed by `map`, so the stored rows appear to keep their pad tokens even
# though evaluation later feeds them one at a time -- confirm this matches how
# the checkpoint was trained.
dataset_tokenized = dataset_test.map(
    lambda example: tokenizer(
        example["prompt"],
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    ),
    batched=True,
)

Map:   0%|          | 0/67575 [00:00<?, ? examples/s]

In [15]:
# Persist the tokenized dataset so evaluation can restart from the next cell
# without repeating tokenization.
dataset_tokenized.save_to_disk("wmt-mqm_tokenized_sqm")

Saving the dataset (0/1 shards):   0%|          | 0/67575 [00:00<?, ? examples/s]

# Evaluate

## Preprocess

In [16]:
# Reload the tokenized dataset written by the save cell above.
dataset_tokenized = load_from_disk("wmt-mqm_tokenized_sqm")

In [17]:
# Display the columns and row count of the reloaded dataset.
dataset_tokenized

Dataset({
    features: ['lp', 'src', 'mt', 'ref', 'score', 'system', 'annotators', 'domain', 'year', 'prompt', 'input_ids', 'attention_mask'],
    num_rows: 67575
})

In [18]:
# Return rows as torch tensors and drop the columns that are not needed for
# scoring. "lp" is kept for the per-language-pair filters further down, and
# "score" is kept until it is converted into "labels".
# NOTE: rebinding `dataset_tokenized` makes this cell non-idempotent (a second
# run tries to remove columns that are already gone).
dataset_tokenized = dataset_tokenized.with_format("torch").remove_columns(
    ["src", "mt", "ref", "annotators", "domain", "year", "prompt", "system"]
)

In [19]:
# Min-max scaler used to map the human scores into the range [0, 1].
scaler = MinMaxScaler()

In [20]:
# Fit the scaling range on the evaluation scores themselves. The scaler wants
# a 2-D input, hence the reshape into a single column.
scaler.fit(dataset_tokenized["score"].reshape(-1, 1))

In [21]:
# Scale the human scores, flatten the (n, 1) result back to one value per row,
# store it as "labels", and drop the raw "score" column.
scaled_scores = scaler.transform(dataset_tokenized["score"].reshape(-1, 1))
dataset_tokenized = dataset_tokenized.add_column(
    name="labels", column=scaled_scores.ravel()
).remove_columns(["score"])

## Calculate

In [22]:
def calculate_kendall(dataset):
    """Kendall tau between the model's predictions and the gold labels.

    Rows are scored one at a time using the notebook-level ``model`` and
    ``device``. Every column of a row is moved to ``device``, reshaped to
    ``(1, -1)`` and passed to the model as a keyword argument, so ``dataset``
    may only contain columns that ``model.forward`` accepts.

    Args:
        dataset: iterable of dict rows holding tensors, including "labels".

    Returns:
        The first element of ``scipy.stats.kendalltau(predicted, labels)``.
    """
    gold_scores = []
    model_scores = []

    with torch.no_grad():
        for row in tqdm(dataset):
            inputs = {
                name: tensor.to(device).reshape(1, -1) for name, tensor in row.items()
            }
            gold_scores.append(inputs["labels"][0][0].item())

            # The model returns (encoder_output, logits, loss). Logits are on
            # a 0-100 scale; dividing by 100 brings them to 0-1 like the labels.
            _, logits, _ = model(**inputs)
            model_scores.append(logits[0][0].item() / 100)

    return stats.kendalltau(model_scores, gold_scores)[0]

### All languages

In [24]:
# Drop "lp": calculate_kendall forwards every remaining column to the model,
# which takes only input_ids, attention_mask and labels.
dataset_all = dataset_tokenized.remove_columns(["lp"])

In [25]:
# Score every test row regardless of language pair (slow: one forward pass per row).
print(f"All lang Kendall tau-b: {calculate_kendall(dataset_all)}")

100%|██████████| 67575/67575 [15:13<00:00, 73.96it/s] 

All lang Kendall tau-b: 0.25546316922823165





### en-ru

In [23]:
# Keep only the en-ru rows, then drop "lp" so that only model inputs and
# labels remain.
dataset_enru = (
    dataset_tokenized
    .filter(lambda row: row["lp"] == "en-ru")
    .remove_columns(["lp"])
)

Filter:   0%|          | 0/67575 [00:00<?, ? examples/s]

In [24]:
# Kendall tau restricted to the en-ru rows.
print(f"en-ru lang Kendall tau-b: {calculate_kendall(dataset_enru)}")

100%|██████████| 19725/19725 [04:18<00:00, 76.32it/s]

en-ru lang Kendall tau-b: 0.26410225813730226





### zh-en

In [25]:
# Keep only the zh-en rows, then drop "lp" so that only model inputs and
# labels remain.
dataset_zhen = (
    dataset_tokenized
    .filter(lambda row: row["lp"] == "zh-en")
    .remove_columns(["lp"])
)

Filter:   0%|          | 0/67575 [00:00<?, ? examples/s]

In [26]:
# Kendall tau restricted to the zh-en rows.
print(f"zh-en lang Kendall tau-b: {calculate_kendall(dataset_zhen)}")

100%|██████████| 28125/28125 [07:06<00:00, 66.00it/s]

zh-en lang Kendall tau-b: 0.26238626620342803





### en-de

In [27]:
# Keep only the en-de rows, then drop "lp" so that only model inputs and
# labels remain.
dataset_ende = (
    dataset_tokenized
    .filter(lambda row: row["lp"] == "en-de")
    .remove_columns(["lp"])
)

Filter:   0%|          | 0/67575 [00:00<?, ? examples/s]

In [28]:
# Kendall tau restricted to the en-de rows.
print(f"en-de lang Kendall tau-b: {calculate_kendall(dataset_ende)}")

100%|██████████| 19725/19725 [04:21<00:00, 75.34it/s]

en-de lang Kendall tau-b: 0.24947199046267782



