In [1]:
# -*- coding: utf-8 -*-
import sys
import nltk
import numpy as np
import pandas as pd
import pickle5 as pickle
import json

from nltk import meteor_score
from scipy import stats
from scipy.spatial.distance import euclidean
import pulp

from collections import defaultdict
from itertools import product

import torch
from torch.autograd import Variable
from sentence_transformers import SentenceTransformer
from torch import nn

from transformers import AutoTokenizer, AutoModel
from transformers import BertTokenizer, BertModel

import matplotlib.pyplot as plt
%matplotlib inline

# WMT-17

In [31]:
def load_obj(data_path, name):
    """Unpickle and return the object stored in ``data_path + name + '.pkl'``.

    The path pieces are joined by plain string concatenation, so ``data_path``
    has to end with a path separator already.

    NOTE(review): unpickling can execute arbitrary code -- only point this at
    files you trust.
    """
    pickle_path = data_path + name + '.pkl'
    with open(pickle_path, 'rb') as handle:
        return pickle.load(handle)
    
def get_lang_translation(collections, lang):
    """Split the samples of one language pair into four parallel lists.

    Args:
        collections: mapping from language-pair key to a list of samples,
            each sample being indexable as ``[src, ref, MT, score]``.
        lang: key of the language pair to extract.

    Returns:
        ``(src, ref, MT, score)`` -- four lists in sample order; every score
        is converted with ``float()``.
    """
    src, ref, MT, score = [], [], [], []
    for sample in collections[lang]:
        src.append(sample[0])
        ref.append(sample[1])
        MT.append(sample[2])
        score.append(float(sample[3]))
    return src, ref, MT, score

In [32]:
def data_processing_wmt17():
    """Load the processed WMT-17 pickles and flatten them into scored samples.

    One pickle per language pair is read from
    ``sys.path[0] + "/wmt17-processed-data/data/"`` via ``load_obj``.  Each
    pickled value is indexed positionally: items 1 and 2 become the first two
    fields of a sample, and every pair found in item 3 contributes the last
    two fields.  ``get_lang_translation`` later reads those four fields as
    ``[src, ref, MT, score]``.

    Returns:
        dict mapping each language-pair key to its list of 4-element samples.
        Entries whose item 3 is empty contribute no samples.
    """
    data_path = sys.path[0] +"/wmt17-processed-data/data/"
    lang_pairs = ["csen","deen","enru", "enzh", "fien","lven","ruen", "zhen","tren"]
    collections = {}

    for pair in lang_pairs:
        scored_samples = []

        for entry in load_obj(data_path, pair).values():
            # One sample per human judgement; an empty entry[3] adds nothing.
            for judgement in entry[3]:
                scored_samples.append([entry[1], entry[2], judgement[0], judgement[1]])

        collections[pair] = scored_samples

    return collections

# Multi-30K

In [33]:
def data_processing_Multi30K():
    """Build the Multi-30K collections for the "de" and "fr" targets.

    Returns:
        dict with the keys ``"de"`` and ``"fr"``, each holding whatever
        ``sub_processing`` returns for that language.
    """
    collections = {}
    for target_lang in ("de", "fr"):
        collections[target_lang] = sub_processing(target_lang)
    return collections

def sub_processing(lang):
    """Assemble the Multi-30K human-assessment samples for one target language.

    Reads three JSON dictionaries from ``sys.path[0] + "/human_assessment/"``:
    the shared source sentences, the gold references for ``lang`` and the
    assessed translations for ``lang``.

    Args:
        lang: language suffix used in the file names (the notebook uses
            ``"de"`` and ``"fr"``).

    Returns:
        list of ``[source, reference, mt[0], mt[1]]`` rows, one per entry
        listed under a source id in the translation dictionary.  Source ids
        without translations are skipped.
        NOTE(review): ``mt[1]`` is passed through as stored; presumably it is
        the human score -- confirm against the data files.

    Raises:
        AssertionError: if the source and reference dictionaries differ in
            size, or a translation entry does not have exactly two elements.
    """
    data_path = sys.path[0] + "/human_assessment/"

    def _read_json(filename):
        # JSON is UTF-8 by specification; do not depend on the locale default,
        # which mis-decodes non-ASCII text on some platforms.
        with open(f"{data_path}{filename}", encoding="utf-8") as json_file:
            return json.load(json_file)

    src_dict = _read_json("MMTsourcedict.json")
    ref_dict = _read_json(f"MMTgolddict_{lang}.json")
    mt_dict = _read_json(f"MMTtranslationdict_{lang}.json")

    assert len(src_dict) == len(ref_dict)

    lang_match = []
    for src_id in src_dict:
        if src_id in mt_dict:
            for mt in mt_dict[src_id]:
                assert len(mt) == 2
                lang_match.append([src_dict[src_id], ref_dict[src_id], mt[0], mt[1]])

    return lang_match

# WMT-20

In [34]:
def data_processing_wmt20():
    """Load the processed WMT-20 pickles into ``[src, ref, MT, score]`` samples.

    One pickle per language pair is read from
    ``sys.path[0] + "/wmt20_data/processed_data/"`` via ``load_obj``.  Each
    pickled entry is indexed positionally: item 0 fills the first field,
    item 1 the third field and ``float(item 2)`` the fourth.  The second
    field (the reference slot read by ``get_lang_translation``) is always an
    empty list.

    NOTE(review): because the reference slot is empty, only the source-vs-MT
    setting (``cross_ling=True``) has real text to compare -- confirm this is
    intended.

    Returns:
        dict mapping each language-pair key to its list of samples.  Entries
        whose item 1 is empty are dropped.
    """
    data_path = sys.path[0]+"/wmt20_data/processed_data/"
    lang_pairs = ["neen","ende","eten", "enzh", "roen","sien","ruen"]
    collections = {}

    for pair in lang_pairs:
        collections[pair] = [
            [entry[0], [], entry[1], float(entry[2])]
            for entry in load_obj(data_path, pair)
            if len(entry[1]) > 0
        ]

    return collections

# WMD

## WMD with embeddings

In [35]:
### Hook method: capture the output of each encoder layer
def layer_processing(model):
    """Attach a forward hook to every module in ``model.encoder.layer``.

    Args:
        model: object exposing ``encoder.layer`` as an iterable of modules
            that support ``register_forward_hook``.

    Returns:
        The (initially empty) list the hooks append to.  Each time a hooked
        layer runs, element 0 of its output is appended; the list is never
        emptied here, so the caller has to clear it.
    """
    captured = []

    def _record_first_output(module, input_, output):
        captured.append(output[0])

    for encoder_layer in model.encoder.layer:
        encoder_layer.register_forward_hook(_record_first_output)

    return captured

## WMD model

In [36]:
def get_WMD_Model(name):
    """Load the pretrained tokenizer and model called ``name``.

    Args:
        name: model identifier handed to ``AutoTokenizer.from_pretrained``
            and ``AutoModel.from_pretrained``.

    Returns:
        ``(tokenizer, model)`` with the model switched to evaluation mode.
    """
    tokenizer = AutoTokenizer.from_pretrained(name)
    # The static token embeddings live at model.embeddings.word_embeddings.
    model = AutoModel.from_pretrained(name, return_dict=True).eval()
    return tokenizer, model

## WMD Computation

In [37]:
### Token weights for WMD
## Static model embeddings: identical tokens share one weight (their relative frequency).
def tokens_to_fracdict(tokens):
    """Map every distinct token to its relative frequency in ``tokens``.

    Keys appear in order of first occurrence and the values sum to 1.
    An empty input gives an empty dict.
    """
    counts = {}
    for token in tokens:
        counts[token] = counts.get(token, 0) + 1

    total = sum(counts.values())

    fractions = {}
    for token, count in counts.items():
        fractions[token] = float(count) / total
    return fractions

## Contextual model outputs: every token position gets its own weight,
## because the same token can mean something different in each context.
def tokens_to_fracdict_contextual(tokens):
    """Give every token *position* the same weight ``1 / len(tokens)``.

    Keys are the positions ``0 .. len(tokens) - 1``, not the tokens
    themselves, so repeated tokens keep separate entries.  An empty input
    gives an empty dict.
    """
    n_tokens = len(tokens)
    if n_tokens == 0:
        return {}
    uniform_weight = 1 / n_tokens
    return dict.fromkeys(range(n_tokens), uniform_weight)

In [58]:
## Two sources can be used as token embeddings:
## 1) the model's static input embeddings
## 2) the model's output (hidden) states

def embedding_processing(sent1, sent2, tokenizer, model, embed_type=False):
    """Compute WMD token weights and token embeddings for two sentences.

    Args:
        sent1, sent2: the two sentences (strings).
        tokenizer: tokenizer providing ``tokenize``, ``convert_tokens_to_ids``
            and ``__call__(..., return_tensors="pt")``.
        model: encoder whose layers were hooked by ``layer_processing``; the
            hooks must feed the module-level list ``layers``.
        embed_type: if truthy, use the static table
            ``model.embeddings.word_embeddings`` with one row per *distinct*
            token (weights from ``tokens_to_fracdict``).  Otherwise run the
            model and use the hooked layer output with one row per token
            *position* (weights from ``tokens_to_fracdict_contextual``).

    Returns:
        ``(sent1_buckets, sent2_buckets, sent1_embedding, sent2_embedding)``;
        the embeddings have one row per bucket entry.

    Raises:
        AssertionError: if the total number of weights does not match the
            total number of embedding rows.
    """
    sent1_tokens = tokenizer.tokenize(sent1)
    sent2_tokens = tokenizer.tokenize(sent2)

    def _hooked_layer_output(encoded):
        # The forward pass is run only for its side effect: the hooks append
        # one tensor per encoder layer to `layers`.
        model(encoded['input_ids'])
        # [-8:-7] indicates Roberta-Large layer 17
        # [-4,-3] indicates XLM Roberta-Base layer 9
        # The slice counts from the end, so it always picks from the most
        # recent forward pass even though `layers` keeps accumulating.
        return torch.mean(torch.stack(layers[-4:-3]).squeeze(1).permute(1,0,2), dim=1)

    try:
        # Inference only: no_grad avoids building an autograd graph for
        # tensors that callers immediately detach.
        with torch.no_grad():
            if embed_type:
                sent1_buckets = tokens_to_fracdict(sent1_tokens)
                sent2_buckets = tokens_to_fracdict(sent2_tokens)

                sent1_ids = torch.tensor(tokenizer.convert_tokens_to_ids(list(sent1_buckets.keys())))
                sent2_ids = torch.tensor(tokenizer.convert_tokens_to_ids(list(sent2_buckets.keys())))

                sent1_embedding = model.embeddings.word_embeddings(sent1_ids)
                sent2_embedding = model.embeddings.word_embeddings(sent2_ids)
            else:
                sent1_buckets = tokens_to_fracdict_contextual(sent1_tokens)
                sent2_buckets = tokens_to_fracdict_contextual(sent2_tokens)

                sent1_id = tokenizer(sent1,return_tensors="pt")
                sent2_id = tokenizer(sent2,return_tensors="pt")

                sent1_embedding = _hooked_layer_output(sent1_id)
                sent2_embedding = _hooked_layer_output(sent2_id)
    finally:
        # Always empty the shared hook buffer -- also when a forward pass
        # fails -- so stale activations cannot leak into the next call.
        layers.clear()

    # Exactly two surplus rows are taken to be the special tokens that
    # tokenizer(...) adds around the sentence but tokenize(...) does not.
    if sent1_embedding.size()[0] - 2 == len(sent1_tokens):
        sent1_embedding = sent1_embedding[1:-1,:] # Remove bos and eos tokens

    if sent2_embedding.size()[0] - 2 == len(sent2_tokens):
        sent2_embedding = sent2_embedding[1:-1,:] # Remove bos and eos tokens

    assert len(sent1_buckets) + len(sent2_buckets) == (sent1_embedding.size()[0] + sent2_embedding.size()[0])

    return sent1_buckets, sent2_buckets, sent1_embedding, sent2_embedding

In [None]:
def word_mover_distance_probspec(sent1_buckets, sent2_buckets, sent1_embedding, sent2_embedding, lpFile=None):
    """Build and solve the Word Mover's Distance transport problem.

    Args:
        sent1_buckets, sent2_buckets: mappings whose *values* are the token
            weights; only the order of the values matters, the keys are
            replaced by the labels ``x0, x1, ...`` and ``y0, y1, ...``.
        sent1_embedding, sent2_embedding: 2-D torch tensors with one row per
            weight, in the same order as the bucket values.
        lpFile: optional path; when given, the LP is written there before it
            is solved.

    Returns:
        The solved ``pulp.LpProblem``; its objective is the minimal total of
        flow times Euclidean distance.

    Raises:
        AssertionError: if the number of weights differs from the number of
            embedding rows.

    Only x->y flow variables are created.  Variables between tokens of the
    same sentence, or from y to x, would appear in no constraint; having a
    lower bound of 0 and a non-negative cost they are 0 at the optimum, so
    leaving them out keeps the objective value unchanged while shrinking the
    LP from (n+m)^2 to n*m variables.
    """
    # Updated buckets with labeled name
    first_sent_buckets = {f"x{idx}": weight for idx, weight in enumerate(sent1_buckets.values())}
    second_sent_buckets = {f"y{idx}": weight for idx, weight in enumerate(sent2_buckets.values())}

    var_names = list(first_sent_buckets.keys()) + list(second_sent_buckets.keys())

    all_embedding = torch.cat([sent1_embedding, sent2_embedding])

    assert len(var_names) == all_embedding.size(0)

    # Convert once; iterating the array yields one row per label.
    wordvecs = dict(zip(var_names, all_embedding.detach().numpy()))

    flow_pairs = list(product(first_sent_buckets, second_sent_buckets))

    T = pulp.LpVariable.dicts('T_matrix', flow_pairs, lowBound=0)

    prob = pulp.LpProblem('WMD', sense=pulp.LpMinimize)

    # Objective: total transport cost.
    prob += pulp.lpSum([T[token1, token2]*euclidean(wordvecs[token1], wordvecs[token2])
                        for token1, token2 in flow_pairs])

    # Every token of the second sentence receives exactly its weight ...
    for token2 in second_sent_buckets:
        prob += pulp.lpSum([T[token1, token2] for token1 in first_sent_buckets])==second_sent_buckets[token2]

    # ... and every token of the first sentence sends out exactly its weight.
    for token1 in first_sent_buckets:
        prob += pulp.lpSum([T[token1, token2] for token2 in second_sent_buckets])==first_sent_buckets[token1]

    if lpFile is not None:
        prob.writeLP(lpFile)

    prob.solve()
#     prob.solve(pulp.PULP_CBC_CMD(msg=False))

    return prob

In [40]:
def word_mover_distance(sent1, sent2, tokenizer, model, embed_type, lpFile=None):
    """Return the Word Mover's Distance between two sentences.

    Token weights and embeddings come from ``embedding_processing``; the
    transport problem is solved by ``word_mover_distance_probspec``.  The
    result is the value of the solved problem's objective.
    """
    weights_and_vectors = embedding_processing(sent1, sent2, tokenizer, model, embed_type)
    first_weights, second_weights, first_vectors, second_vectors = weights_and_vectors

    solved = word_mover_distance_probspec(
        first_weights, second_weights, first_vectors, second_vectors, lpFile=lpFile
    )
    return pulp.value(solved.objective)

## Fluent Based WMD

### Order penalty

In [41]:
from nltk import meteor_score
from nltk.stem.porter import PorterStemmer
from nltk.corpus import wordnet
from itertools import chain, product

def order_penalty(
    reference,
    hypothesis,
    preprocess=str.lower,
    stemmer=PorterStemmer(),
    wordnet=wordnet):
    """Return the fragmentation fraction ``chunks / matches`` of an alignment.

    The hypothesis/reference alignment and the chunk count come from
    underscore-prefixed helpers of NLTK's ``meteor_score`` module.
    NOTE(review): those helpers are private API; presumably they are tied to
    the NLTK release this notebook was written against -- confirm the pinned
    version.

    Args:
        reference: reference sentence.
        hypothesis: hypothesis sentence.
        preprocess: callable forwarded to ``_generate_enums``.
        stemmer: stemmer forwarded to ``_enum_allign_words``.
        wordnet: accepted for signature compatibility; not used here.

    Returns:
        ``chunk_count / matches_count`` as a float, or ``0`` when no
        unigrams match.
    """
    enum_hypothesis, enum_reference = meteor_score._generate_enums(
        hypothesis, reference, preprocess=preprocess
    )

    matches, _, _ = meteor_score._enum_allign_words(enum_hypothesis, enum_reference, stemmer=stemmer)

    matches_count = len(matches)
    if matches_count == 0: # No unigrams match
        return 0

    chunk_count = float(meteor_score._count_chunks(matches))
    return chunk_count / matches_count

In [42]:
def fluency_based_wmd(wmd, ref, hypo, gamma=0.2):
    """Adjust a WMD value by the word-order fragmentation of the hypothesis.

    Returns ``wmd - gamma * (0.5 - penalty)`` where ``penalty`` is
    ``order_penalty(ref, hypo)``: a penalty above 0.5 raises the distance,
    a penalty below 0.5 lowers it.
    """
    fragmentation = order_penalty(ref, hypo)
    adjustment = gamma * (0.5 - fragmentation)
    return wmd - adjustment

## SMD

In [43]:
def getSentenceMoverDistance(sents1, sents2, tokenizer, model, embed_type=False):
    """Min-max normalised distance between the mean embeddings of sentence pairs.

    For every pair from ``zip(sents1, sents2)`` the token embeddings returned
    by ``embedding_processing`` are averaged per sentence, and the Euclidean
    distance between the two averages is taken.

    Returns:
        numpy array with one value in [0, 1] per pair.  An empty input gives
        an empty array; if all distances are equal (for example a single
        pair) the result is all zeros rather than NaN.
    """
    distances = []

    for sent1, sent2 in zip(sents1, sents2):
        _, _, sent1_embedding, sent2_embedding = embedding_processing(sent1, sent2, tokenizer, model, embed_type)

        centroid1 = torch.mean(sent1_embedding, axis=0).detach().numpy()
        centroid2 = torch.mean(sent2_embedding, axis=0).detach().numpy()
        distances.append(euclidean(centroid1, centroid2))

    distances = np.asarray(distances, dtype=float)
    if distances.size == 0:
        return distances

    lowest = np.min(distances)
    spread = np.max(distances) - lowest
    if spread == 0:
        # Degenerate range: nothing to rescale, avoid 0/0.
        return np.zeros_like(distances)

    return (distances - lowest) / spread
    

## Cosine Similarity

In [44]:
def getSentSimilarity(sents1, sents2, model):
    """Min-max normalised cosine similarity between paired sentence embeddings.

    Args:
        sents1, sents2: equally long lists of sentences; pair ``i`` is
            ``(sents1[i], sents2[i])``.
        model: object whose ``encode(sentences, convert_to_tensor=True)``
            returns a 2-D tensor with one row per sentence.

    Returns:
        numpy array with one value in [0, 1] per pair.  If all similarities
        are equal (for example a single pair) the result is all zeros rather
        than NaN.
    """
    embed_sent1 = model.encode(sents1, convert_to_tensor=True)
    embed_sent2 = model.encode(sents2, convert_to_tensor=True)
    cos_sim = nn.CosineSimilarity(dim=1)(embed_sent1,embed_sent2)

    # Normalized
    if cos_sim.numel() > 0:
        lowest = torch.min(cos_sim)
        spread = torch.max(cos_sim) - lowest
        if spread == 0:
            # Degenerate range: nothing to rescale, avoid 0/0.
            cos_sim = torch.zeros_like(cos_sim)
        else:
            cos_sim = (cos_sim - lowest) / spread

    return cos_sim.cpu().numpy()

## Bert Score

In [45]:
from datasets import load_metric
# Loaded once when this cell runs; getBertScore() reads this module-level metric object.
bert_score_metric = load_metric('bertscore', keep_in_memory=True, cache_dir=sys.path[0])

In [46]:
# model_type: bert-base-multilingual-cased, xlm-roberta-base

def getBertScore(sents1, sents2, model):
    """Min-max normalised BERTScore F1 for paired sentences.

    Uses the module-level ``bert_score_metric`` with ``sents2`` as predictions
    and ``sents1`` as references.

    Args:
        sents1, sents2: equally long lists of sentences.
        model: model type string forwarded as ``model_type`` to the metric.

    Returns:
        list of floats in [0, 1], one per pair.  If all F1 values are equal
        (for example a single pair) the result is all zeros rather than NaN.
    """
    bert_score_metric.add_batch(predictions=sents2, references=sents1)
    score = bert_score_metric.compute(model_type=model)

    # as_tensor accepts a tensor as well as a plain list, so the
    # normalisation does not depend on which one the metric returns.
    f1 = torch.as_tensor(score["f1"])

    # Normalized Bert Score F1
    if f1.numel() > 0:
        lowest = torch.min(f1)
        spread = torch.max(f1) - lowest
        if spread == 0:
            # Degenerate range: nothing to rescale, avoid 0/0.
            f1 = torch.zeros_like(f1)
        else:
            f1 = (f1 - lowest) / spread

    return f1.tolist()

## Compound Method

In [47]:
def combine_metrics(*args, **kwargs):
    """Fuse several normalised metrics into one score per sample.

    Args:
        *args: metric sequences of the same length, every value in [0, 1].
        corr (keyword, required): one sign per metric giving its correlation
            with the target:
            -1 represents negatively correlated,
             1 represents positively correlated.

    Returns:
        list with one value per sample: the sum over all metrics of
        ``exp(m)`` for positively and ``exp(1 - m)`` for negatively
        correlated metrics.  Any number of metrics (one or more) is accepted;
        with no metrics the result is an empty list.

    Raises:
        AssertionError: if the number of signs differs from the number of
            metrics, the metrics differ in length, or a value lies outside
            [0, 1].
    """
    signs = kwargs["corr"]
    assert len(args) == len(signs)

    if not args:
        return []

    sample_count = len(args[0])
    # Check every metric, not only the first two.
    assert all(len(metric) == sample_count for metric in args)

    output = []

    for i in range(sample_count):
        value = 0
        for sign, metric in zip(signs, args):
            assert metric[i] <= 1 and metric[i] >= 0
            if sign > 0:
                value += np.exp(metric[i])
            else:
                # Flip negatively correlated metrics so larger is better.
                value += np.exp(1-metric[i])
        output.append(value)

    return output


# Evaluation

In [48]:
def compute_WMD_WMDo(sents, tokenizer, model, embed_type=False, fluent=False, cross_ling=False):
    """Compute min-max normalised WMD (and optionally WMDo) for all samples.

    Args:
        sents: a collection of samples with structure [src, ref, MT, score].
        tokenizer: WMD tokenizer.
        model: WMD model.
        embed_type: forwarded to ``word_mover_distance``; selects static
            (BPE) embeddings or model outputs.
        fluent: whether the fluency-based WMD (WMDo) is computed as well.
        cross_ling: True compares SRC with MT, False compares REF with MT.

    Returns:
        ``(wmd, wmdo)`` as numpy arrays with values in [0, 1].  ``wmdo`` is
        empty when ``fluent`` is False.  If all raw values are equal (for
        example a single sample) the normalised values are all 0.
    """

    def _min_max(values):
        # Take min/max once instead of once per element, and guard the
        # zero-range case that would otherwise divide by zero.
        if not values:
            return []
        lowest = min(values)
        spread = max(values) - lowest
        if spread == 0:
            return [0.0 for _ in values]
        return [(val - lowest) / spread for val in values]

    wmd = []
    wmdo = []

    # Sent structure: [src, ref, MT, score]
    reference_index = 0 if cross_ling else 1   # src - mt  /  ref - mt

    for sample in sents:
        hypothesis = sample[2]
        reference = sample[reference_index]

        wmd_tmp = word_mover_distance(reference, hypothesis, tokenizer, model, embed_type)
        wmd.append(wmd_tmp)

        if fluent:
            wmdo.append(fluency_based_wmd(wmd_tmp, reference, hypothesis))

    # Normalize
    return np.array(_min_max(wmd)), np.array(_min_max(wmdo))

In [49]:
## Correlation evaluation
def evaluation(wmd, score):
    """Print and return the correlation of a metric with the human scores.

    Args:
        wmd: metric values (any metric, not only WMD).
        score: human scores, same length.

    Returns:
        ``(pearson, spearman)`` -- the results of ``stats.pearsonr`` and
        ``stats.spearmanr``.  Spearman is printed first, then Pearson.
    """
    pearson, spearman = stats.pearsonr(wmd, score), stats.spearmanr(wmd, score)
    for label, result in (("Spearman Correlation:", spearman), ("Pearson Correlation:", pearson)):
        print(label, result)
    return pearson, spearman

In [50]:
## Save Metrics
def save_metrics(name, metric, score):
    """Pickle ``[metric, score]`` to ``<sys.path[0]>/Metrics/<name>.pkl``.

    Args:
        name: file name without extension; may contain sub-directories
            (for example ``"wmt17/other/deen_wmdo"``).
        metric: metric values to store.
        score: human scores to store alongside.

    Missing parent directories are created first, and the file is closed
    even if pickling fails.
    """
    import os  # local import: the notebook's import cell does not provide os

    filePath = f"{sys.path[0]}/Metrics/{name}"
    os.makedirs(os.path.dirname(filePath), exist_ok=True)

    with open(f"{filePath}.pkl", 'wb') as file:
        pickle.dump([metric, score], file)

def load_metrics(name):
    """Load a ``[metric, score]`` pair from ``<sys.path[0]>/Metrics/``.

    Args:
        name: file name as passed to ``save_metrics`` (without ``.pkl``) or
            the full file name including the extension.

    Returns:
        ``(metric, score)``.

    ``save_metrics`` always appends ``.pkl``.  The path is tried exactly as
    given first (the previous behaviour) and then with ``.pkl`` appended, so
    a name used for saving can be reused for loading.

    NOTE(review): unpickling can execute arbitrary code -- only load files
    written by this notebook.
    """
    filePath = f"{sys.path[0]}/Metrics/{name}"

    try:
        file = open(filePath, 'rb')
    except (FileNotFoundError, IsADirectoryError):
        if name.endswith(".pkl"):
            raise
        file = open(f"{filePath}.pkl", 'rb')

    with file:
        data = pickle.load(file)

    return data[0], data[1]

In [51]:
## metric: numpy array
## score: numpy array
def scatter_diagram(*args, legend, score):
    """Scatter one or more metrics against the human scores.

    Args:
        *args: metric sequences; each is min-max rescaled and plotted on the
            y axis.
        legend: legend labels, passed to ``plt.legend``.
        score: human scores; min-max rescaled and used as the x axis.

    A dashed red diagonal from (0, 0) to (1, 1) is drawn for reference and
    the figure is shown immediately.
    """
    def _rescale(values):
        arr = np.array(values)
        lowest = np.min(arr)
        return (arr - lowest) / (np.max(arr) - lowest)

    human = _rescale(score)
    for metric in args:
        plt.scatter(human, _rescale(metric))

    plt.xlabel("human score")
    plt.ylabel("Normalized score")
    plt.legend(legend)
    plt.plot([0,1],[0,1], "r--")
    plt.grid()
    plt.show()

# Test

In [23]:
# Tokenizer/encoder pair used by the WMD and SMD computations.
# If the model is changed, the layer slice inside embedding_processing has to
# be adapted as well (see the comment there).
wmd_tokenizer, wmd_model = get_WMD_Model('xlm-roberta-base')
# wmd_tokenizer, wmd_model = get_WMD_Model('roberta-large') 

In [24]:
# Model type string that getBertScore forwards to the BERTScore metric as `model_type`.
# bert_score_model = 'roberta-large'
bert_score_model = 'xlm-roberta-base'

In [25]:
# Sentence encoder used by getSentSimilarity (its encode() method is called there).
cos_sim_model = SentenceTransformer('xlm-r-bert-base-nli-stsb-mean-tokens')
# cos_sim_model = SentenceTransformer('roberta-large-nli-stsb-mean-tokens')

In [26]:
# Shared hook buffer: embedding_processing reads and clears this global.
# Run this cell once per model: re-running registers a second set of hooks,
# while the earlier set keeps appending to the previous list, which is then
# never cleared.
layers = layer_processing(wmd_model)

In [27]:
'''
wmd_tokenizer, wmd_model: tokenizer and pretrained model used for the WMD
bert_score_model: model type used for the BERT score
cos_sim_model: sentence-embedding model used for the cosine similarity
save_path: sub-path under Metrics/ where a metric is saved together with the human scores
langs: language pairs to evaluate; if omitted, the default list of the dataset is used
fluent: whether the fluency-based WMD (WMDo) is evaluated as well
cross_ling: 
    True: cross-lingual, evaluate src - MT
    False: monolingual, evaluate ref - MT
'''

def WMT20_Testing(wmd_tokenizer, wmd_model, bert_score_model, cos_sim_model, save_path, langs=None, fluent=False, cross_ling=False):
    """Run ``testing`` on the WMT-20 collections.

    When ``langs`` is empty or None, all seven WMT-20 language pairs listed
    below are evaluated.  All other arguments are forwarded unchanged.
    """
    langs = langs or ["neen","ende","eten", "enzh", "roen","sien","ruen"]

    settings = dict(
        collections=data_processing_wmt20(),
        langs=langs,
        wmd_tokenizer=wmd_tokenizer,
        wmd_model=wmd_model,
        bert_score_model=bert_score_model,
        cos_sim_model=cos_sim_model,
        save_path=save_path,
        fluent=fluent,
        cross_ling=cross_ling,
    )
    testing(**settings)
    
def WMT17_Testing(wmd_tokenizer, wmd_model, bert_score_model, cos_sim_model, save_path, langs=None, fluent=False, cross_ling=False):
    """Run ``testing`` on the WMT-17 collections.

    When ``langs`` is empty or None a default list is used: all nine pairs
    for ref-vs-MT evaluation, or the seven pairs without "enru" and "enzh"
    when ``cross_ling`` is set.  All other arguments are forwarded unchanged.
    """
    if not langs:
        langs = (
            ["csen","deen","fien","lven","ruen", "zhen", "tren"]
            if cross_ling
            else ["csen","deen","enru", "enzh", "fien","lven","ruen", "zhen", "tren"]
        )

    settings = dict(
        collections=data_processing_wmt17(),
        langs=langs,
        wmd_tokenizer=wmd_tokenizer,
        wmd_model=wmd_model,
        bert_score_model=bert_score_model,
        cos_sim_model=cos_sim_model,
        save_path=save_path,
        fluent=fluent,
        cross_ling=cross_ling,
    )
    testing(**settings)

    
def Multi_30K_Testing(wmd_tokenizer, wmd_model, bert_score_model, cos_sim_model, save_path, langs=None, fluent=False, cross_ling=False):
    """Run ``testing`` on the Multi-30K collections.

    When ``langs`` is empty or None, both "fr" and "de" are evaluated.
    All other arguments are forwarded unchanged.
    """
    langs = langs or ["fr","de"]

    settings = dict(
        collections=data_processing_Multi30K(),
        langs=langs,
        wmd_tokenizer=wmd_tokenizer,
        wmd_model=wmd_model,
        bert_score_model=bert_score_model,
        cos_sim_model=cos_sim_model,
        save_path=save_path,
        fluent=fluent,
        cross_ling=cross_ling,
    )
    testing(**settings)


In [70]:
    
def testing(**kwargs):
    """Print the full correlation report for every requested language pair.

    Required keyword arguments: ``collections``, ``langs``, ``wmd_tokenizer``,
    ``wmd_model``, ``bert_score_model``, ``cos_sim_model``, ``save_path``,
    ``fluent`` and ``cross_ling``.

    Per language pair the single metrics (WMD, SMD, cosine similarity, BERT
    score) and several combinations built with ``combine_metrics`` are
    reported: the average value is printed and ``evaluation`` prints the
    correlations with the human scores.  When ``fluent`` is set, the WMDo
    metrics are reported too; WMDo is the only metric written to disk (via
    ``save_metrics`` under ``save_path``).
    """
    save_path = kwargs['save_path']

    rule = "---------------------------------------------"
    frame = "+++++++++++++++++++++++++++++++++++++++++++++"

    def announce(title):
        # Section banner: title between two '+' lines.
        print(frame)
        print(title)
        print(frame)

    def report(headline, values, score, closing=rule):
        # Average of the metric, its correlations, then a separator line.
        print(f"{headline}{sum(values)/len(values)}")
        _,_ = evaluation(values, score)
        print(closing)

    for lang in kwargs["langs"]:

        print(f"Processing {lang} data:")

        src, ref, hypothesis, score = get_lang_translation(kwargs["collections"], lang)

        # Cross-lingual runs compare the MT output with the source sentence.
        reference = src if kwargs["cross_ling"] else ref

        announce("One Metric")

        wmd, wmdo = compute_WMD_WMDo(kwargs["collections"][lang], kwargs["wmd_tokenizer"], kwargs["wmd_model"], embed_type=False, fluent=kwargs["fluent"], cross_ling=kwargs["cross_ling"])
        report("Average WMD: \n", wmd, score)

        smd = getSentenceMoverDistance(hypothesis, reference, kwargs["wmd_tokenizer"], kwargs["wmd_model"], embed_type=False)
        report("Average SMD: \n", smd, score)

        similarity = getSentSimilarity(hypothesis, reference, kwargs["cos_sim_model"])
        report("Average Cosine similarity: \n", similarity, score)

        bert_score = getBertScore(hypothesis, reference, kwargs["bert_score_model"])
        report("Average Bert Score: \n", bert_score, score, closing=rule + "\n")

        announce("Two metrics")
        # (headline, metrics, correlation signs) -- distances get -1.
        two_metric_combos = [
            ("Average similarity + wmd: \n", (similarity, wmd), [1, -1]),
            ("Average bert score +  wmd: \n", (bert_score, wmd), [1, -1]),
            ("Average similarity + bert score: \n", (similarity, bert_score), [1, 1]),
            ("Average smd + wmd: \n", (smd, wmd), [-1, -1]),
            ("Average smd + bert score: \n", (smd, bert_score), [-1, 1]),
        ]
        for headline, metrics, signs in two_metric_combos:
            report(headline, combine_metrics(*metrics, corr=signs), score)

        announce("Three Metrics")
        report("Average + similarity + bert score - wmd: \n",
               combine_metrics(similarity, bert_score, wmd, corr=[1, 1, -1]),
               score)
        report("Average + similarity + bert score - wmd - smd: \n",
               combine_metrics(similarity, bert_score, wmd, smd, corr=[1, 1, -1, -1]),
               score,
               closing=rule + "\n")

        if kwargs["fluent"]:

            announce("WMDo Metrics")

            print(f"Average WMDo: \n {sum(wmdo)/len(wmdo)}")
            _,_ = evaluation(wmdo, score)
            save_metrics(f"{save_path}/{lang}_wmdo", wmdo, score)
            print(rule)

            report("Average similarity + wmdo: \n",
                   combine_metrics(similarity, wmdo, corr = [1,-1]),
                   score)
            # NOTE(review): same headline as above but with inverted signs --
            # confirm whether this second variant is intentional.
            report("Average similarity + wmdo: \n",
                   combine_metrics(similarity, wmdo, corr = [-1, 1]),
                   score)
            report("Average similarity + bert score - wmd: \n",
                   combine_metrics(similarity, bert_score, wmdo, corr=[1, 1, -1]),
                   score,
                   closing=rule + "\n")
    

### WMT-17

In [60]:
# Source-vs-MT (cross_ling=True) evaluation on WMT-17 without the WMDo block.
# save_path is only used when fluent=True, because testing() saves WMDo only.
WMT17_Testing(wmd_tokenizer, wmd_model, bert_score_model, cos_sim_model, save_path="wmt17/other", fluent=False, cross_ling=True)

### WMT-20

In [71]:
# Source-vs-MT (cross_ling=True) evaluation on WMT-20 without the WMDo block.
# save_path is only used when fluent=True, because testing() saves WMDo only.
WMT20_Testing(wmd_tokenizer, wmd_model, bert_score_model, cos_sim_model, save_path="wmt20/src_mt", fluent=False, cross_ling=True)

### Multi-30K


In [72]:
# Source-vs-MT (cross_ling=True) evaluation on Multi-30K without the WMDo block.
# save_path is only used when fluent=True, because testing() saves WMDo only.
Multi_30K_Testing(wmd_tokenizer, wmd_model, bert_score_model, cos_sim_model, save_path="multi30K/others/", fluent=False, cross_ling=True)