# Testsuites:

In this notebook we test the recommended COMET metric against several sentence perturbations.

- Lowercased MT hypothesis
- Lack of punctuation
- Random Word Drops
- Typos
- Word Swaps
- No-Translation

The goal is that our models rank the original MT higher than the corrupted version.

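We will count ties (the two scores differ by less than the threshold) and fails (the corrupted version scores higher than the original).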

In [None]:
from comet.models import download_model

In [None]:
# Fetch the "wmt-large-da-estimator-1719" model through comet's download_model helper;
# every later cell scores its samples by calling model.predict.
model = download_model("wmt-large-da-estimator-1719")

In [None]:
# Source-language sentences. The three lists are zipped together later, so
# index i here lines up with hypothesis[i] and references[i].
sources = [
    "Darbų sutartis buvo pasirašyta spalio 2 dieną.",
    "I was sure I canceled, since I went through the entire process.",
    "I don't see anything new changing.",
    "中国人对俄罗斯文化有着浓厚的兴趣。",
    "Sorry, I think it is good now.",
]

# MT outputs under test, one per source sentence. These are the strings the
# perturbation cells below modify.
hypothesis = [
    "The works contract was signed on 2 October.",
    "Eu tenho certeza de que cancelei, dado que passei pelo processo todo.",
    "Ich sehe nichts Neues, das sich ändert“.",
    "The Chinese have a strong interest in Russian culture.",
    "Ursäkta, jag tror att det är bra nu."
]

# Reference translations, one per source sentence; passed unchanged to every test.
references = [
    "The contract for works was signed on 2 October.",
    "Eu tinha a certeza que cancelei, pois passei pelo processo inteiro.",
    "Ich sehe nichts Neues, das sich ändert.",
    "Chinese people have a keen interest in Russian culture.",
    "Tyvärr, jag tror att det är bra nu."
]

In [None]:
# Running counters shared by every comparison cell below.
ties = 0   # original and perturbed scores differ by less than `threshold`
fails = 0  # perturbed hypothesis scored higher than the original
tests = 0  # total number of original-vs-perturbed comparisons

# Tie margin read by every comparison cell. It was never defined anywhere in the
# notebook, so those cells raised NameError on a fresh kernel.
# NOTE(review): 0.05 is a placeholder -- tune it to the score scale of the model.
threshold = 0.05

In [None]:
# Pair every source with its MT output and reference: one dict per segment.
samples = [
    {"src": src, "mt": mt, "ref": ref}
    for src, mt, ref in zip(sources, hypothesis, references)
]
# Score the unperturbed segments. The return value is discarded; later cells read
# sample["predicted_score"], so predict presumably annotates these dicts in place
# -- TODO confirm against the installed COMET version.
model.predict(samples)
samples

## Test Lowercase:

In [None]:
# Same segments as the baseline, but with the MT output lowercased.
lowercase_samples = [
    {"src": src, "mt": mt.lower(), "ref": ref}
    for src, mt, ref in zip(sources, hypothesis, references)
]

In [None]:
# Score the lowercased hypotheses and compare each one with its original.
model.predict(lowercase_samples)
for original, perturbed in zip(samples, lowercase_samples):
    delta = original["predicted_score"] - perturbed["predicted_score"]
    if abs(delta) < threshold:
        ties += 1
    elif delta < threshold:
        # Ties are excluded above, so (for a positive threshold) this branch is
        # reached only when the perturbed hypothesis outscored the original.
        fails += 1  # was `fail += 1`, an undefined name
    tests += 1

print(f"{fails} fails and {ties} ties.")

## Test Punctuation Missing:

In [None]:
# Same segments as the baseline, but with the last character of each MT output
# removed (every hypothesis above ends with a full stop).
lack_punct_samples = [
    {"src": src, "mt": mt[:-1], "ref": ref}
    for src, mt, ref in zip(sources, hypothesis, references)
]

In [None]:
# Score the hypotheses without final punctuation and compare with the originals.
model.predict(lack_punct_samples)
for original, perturbed in zip(samples, lack_punct_samples):
    delta = original["predicted_score"] - perturbed["predicted_score"]
    if abs(delta) < threshold:
        ties += 1
    elif delta < threshold:
        # Ties are excluded above, so (for a positive threshold) this branch is
        # reached only when the perturbed hypothesis outscored the original.
        fails += 1  # was `fail += 1`, an undefined name
    tests += 1

print(f"{fails} fails and {ties} ties.")

## Test Punctuation Change:

In [None]:
# Same segments as the baseline, but with the last character of each MT output
# replaced by an exclamation mark.
change_punct_samples = [
    {"src": src, "mt": mt[:-1] + '!', "ref": ref}
    for src, mt, ref in zip(sources, hypothesis, references)
]

In [None]:
# Score the hypotheses with altered final punctuation and compare with the originals.
model.predict(change_punct_samples)
for original, perturbed in zip(samples, change_punct_samples):
    delta = original["predicted_score"] - perturbed["predicted_score"]
    if abs(delta) < threshold:
        ties += 1
    elif delta < threshold:
        # Ties are excluded above, so (for a positive threshold) this branch is
        # reached only when the perturbed hypothesis outscored the original.
        fails += 1  # was `fail += 1`, an undefined name
    tests += 1

print(f"{fails} fails and {ties} ties.")

## Random Word Drop:

In [None]:
import random

# Fixed seed so the same word is dropped from each hypothesis on every run.
random.seed(3)

wordrop_samples = []
for sample in samples:
    tokens = sample['mt'].split(' ')
    # Remove one token chosen uniformly at random.
    del tokens[random.randrange(len(tokens))]
    wordrop_samples.append(
        {'src': sample['src'], 'mt': ' '.join(tokens), 'ref': sample['ref']}
    )

In [None]:
# Score the word-drop hypotheses and compare each one with its original.
model.predict(wordrop_samples)
for original, perturbed in zip(samples, wordrop_samples):
    delta = original["predicted_score"] - perturbed["predicted_score"]
    if abs(delta) < threshold:
        ties += 1
    elif delta < threshold:
        # Ties are excluded above, so (for a positive threshold) this branch is
        # reached only when the perturbed hypothesis outscored the original.
        fails += 1
    tests += 1

print(f"{fails} fails and {ties} ties.")

In [None]:
# Display the baseline segments next to their word-drop counterparts for inspection.
samples, wordrop_samples

## Test Typos:
The typo test is less important because MT systems usually do not produce typos.

In [None]:
import numpy as np
def add_typos(string, typos=1):
    """Return `string` with `typos` random swaps of adjacent characters.

    Args:
        string: text to corrupt.
        typos: number of adjacent-character swaps to apply. Positions are drawn
            with replacement, so swaps may overlap or undo one another.

    Returns:
        The corrupted text. Strings shorter than two characters are returned
        unchanged because they contain no adjacent pair to swap.
    """
    chars = list(string)
    if len(chars) < 2:
        # np.random.choice raises ValueError when asked to draw from an empty
        # range, so short inputs are passed through untouched.
        return string
    for pos in np.random.choice(len(chars) - 1, typos):
        chars[pos], chars[pos + 1] = chars[pos + 1], chars[pos]
    return ''.join(chars)

In [None]:
# Same segments as the baseline, but with one random typo injected into each MT output.
typos_samples = [
    {"src": src, "mt": add_typos(mt), "ref": ref}
    for src, mt, ref in zip(sources, hypothesis, references)
]

In [None]:
# Score the hypotheses containing typos and compare each one with its original.
model.predict(typos_samples)
for original, perturbed in zip(samples, typos_samples):
    delta = original["predicted_score"] - perturbed["predicted_score"]
    if abs(delta) < threshold:
        ties += 1
    elif delta < threshold:
        # Ties are excluded above, so (for a positive threshold) this branch is
        # reached only when the perturbed hypothesis outscored the original.
        fails += 1  # was `fail += 1`, an undefined name
    tests += 1

print(f"{fails} fails and {ties} ties.")

## Test Word Swap:

In [None]:
import numpy as np

def swap_words(string, swaps=1):
    """Return `string` with `swaps` random swaps of adjacent words.

    Args:
        string: text to corrupt; it is split on whitespace.
        swaps: number of adjacent-word swaps to apply. Positions are drawn with
            replacement, so swaps may overlap or undo one another.

    Returns:
        The words re-joined with single spaces. Inputs with fewer than two
        words have nothing to swap and are returned without reordering.
    """
    words = string.split()
    if len(words) < 2:
        # np.random.choice raises ValueError when asked to draw from an empty
        # range, so there is nothing to do for zero or one word.
        return ' '.join(words)
    # Draw the right-hand index of each pair from 1..len(words)-1 so that every
    # adjacent pair, including the last one, is equally likely. Previously only
    # the first draw was used and index 0 was folded onto index 1.
    for idx in np.random.choice(len(words) - 1, swaps) + 1:
        words[idx - 1], words[idx] = words[idx], words[idx - 1]
    return ' '.join(words)

In [None]:
# Same segments as the baseline, but with two adjacent words swapped in each MT output.
swap_samples = [
    {"src": src, "mt": swap_words(mt), "ref": ref}
    for src, mt, ref in zip(sources, hypothesis, references)
]

In [None]:
# Score the word-swapped hypotheses and compare each one with its original.
model.predict(swap_samples)
for original, perturbed in zip(samples, swap_samples):
    delta = original["predicted_score"] - perturbed["predicted_score"]
    if abs(delta) < threshold:
        ties += 1
    elif delta < threshold:
        # Ties are excluded above, so (for a positive threshold) this branch is
        # reached only when the perturbed hypothesis outscored the original.
        fails += 1  # was `fail += 1`, an undefined name
    tests += 1

print(f"{fails} fails and {ties} ties.")

## Test No-translate:

In [None]:
# Same segments as the baseline, but the source sentence is passed as the "translation".
no_translate_samples = [
    {"src": src, "mt": src, "ref": ref}
    for src, ref in zip(sources, references)
]

In [None]:
# Score the untranslated hypotheses and compare each one with its original.
model.predict(no_translate_samples)
for original, perturbed in zip(samples, no_translate_samples):
    delta = original["predicted_score"] - perturbed["predicted_score"]
    if abs(delta) < threshold:
        ties += 1
    elif delta < threshold:
        # Ties are excluded above, so (for a positive threshold) this branch is
        # reached only when the perturbed hypothesis outscored the original.
        fails += 1  # was `fail += 1`, an undefined name
    # This cell previously forgot to count its comparisons, which inflated the
    # fail/tie ratios reported in the results cell.
    tests += 1

print(f"{fails} fails and {ties} ties.")

# Results:

In [None]:
# Summarise the outcome of all comparison cells as percentages of `tests`.
if tests == 0:
    # No comparison cell has been run yet; avoid a ZeroDivisionError.
    print("No tests were run.")
else:
    passed = tests - (ties + fails)
    # Ratios are scaled by 100 so the numbers match the "%" in the labels
    # (previously raw fractions such as 0.1 were printed under a "%" heading).
    print("Fail %: {:.2f}".format(100 * fails / tests))
    print("Ties %: {:.2f}".format(100 * ties / tests))
    print("Passed %: {:.2f}".format(100 * passed / tests))