In [1]:
# -*- coding: utf-8 -*-

import sys
import nltk
import numpy as np
import pandas as pd
import pickle5 as pickle
import json
import re

from nltk import meteor_score
from scipy import stats
from scipy.spatial.distance import euclidean
import pulp

from collections import defaultdict
from itertools import product

import torch
from torch.autograd import Variable
from sentence_transformers import SentenceTransformer, util
from torch import nn

from transformers import AutoTokenizer, AutoModel
from transformers import BertTokenizer, BertModel

import matplotlib.pyplot as plt
%matplotlib inline

# WMT-17

In [2]:
def load_obj(data_path, name):
    """Unpickle and return the object stored at ``data_path + name + '.pkl'``.

    ``data_path`` is used as a plain string prefix (no separator is inserted),
    so it may end in a directory separator or in a file-name prefix.

    NOTE: unpickling executes arbitrary code; only load trusted files.
    """
    pickle_path = "".join((data_path, name, '.pkl'))
    with open(pickle_path, 'rb') as handle:
        return pickle.load(handle)
    
def get_lang_translation(collections, lang):
    """Split the samples of one language into four parallel lists.

    Each sample in ``collections[lang]`` is indexed as
    ``[src, ref, MT, score]``; the score is converted to ``float``.

    Returns
    -------
    tuple of four lists: ``(src, ref, MT, score)``.
    """
    sources, references, translations, scores = [], [], [], []
    for sample in collections[lang]:
        sources.append(sample[0])
        references.append(sample[1])
        translations.append(sample[2])
        scores.append(float(sample[3]))
    return sources, references, translations, scores

In [3]:
def data_processing_wmt17():
    """Load the processed WMT-17 pickles and flatten them per language pair.

    For every language pair the pickle ``final_<pair>.pkl`` is read through
    ``load_obj``. Each stored record contributes one row per entry of its
    fourth field: ``[record[1], record[2], entry[0], entry[1]]``. Records
    whose fourth field is empty contribute nothing.

    Returns
    -------
    dict mapping language pair -> list of rows.
    """
    data_prefix = sys.path[0]+"/wmt17-processed-data/final_"
    language_pairs = ["csen","deen","enru", "enzh", "fien","lven","ruen", "zhen"]

    collections = {}
    for pair in language_pairs:
        records = load_obj(data_prefix, pair)
        collections[pair] = [
            [record[1], record[2], judgement[0], judgement[1]]
            for record in records.values()
            if len(record[3]) > 0  # only records that carry human scores
            for judgement in record[3]
        ]
    return collections

# PASCAL-50S

In [4]:
def clean_up_french(ref_fr_dict, mt_fr_dict):
    """Replace HTML-entity residue (``&name;`` plus one whitespace) by ``'``.

    Both dictionaries are updated in place and also returned.

    ref_fr_dict : maps an id to a reference string.
    mt_fr_dict  : maps an id to a list of items; the first element of every
                  item is cleaned, the second is kept, and each item is
                  rebuilt as a two-element list.
    """
    entity_pattern = re.compile(r"&\w+;\s")

    for sent_id in ref_fr_dict:
        ref_fr_dict[sent_id] = entity_pattern.sub("'", ref_fr_dict[sent_id])

    for sent_id in mt_fr_dict:
        cleaned_items = []
        for item in mt_fr_dict[sent_id]:
            cleaned_items.append([entity_pattern.sub("'", item[0]), item[1]])
        mt_fr_dict[sent_id] = cleaned_items

    return ref_fr_dict, mt_fr_dict

In [5]:
def data_processing_pascal():
    """Return the human-assessment rows for German and French.

    The result maps ``"de"`` and ``"fr"`` to whatever ``sub_processing``
    returns for that language code (German is processed first).
    """
    return {code: sub_processing(code) for code in ("de", "fr")}

def sub_processing(lang):
    """Join source, gold and machine translations of one language into rows.

    Reads three JSON dictionaries from ``<sys.path[0]>/human_assessment/``
    (sources, gold translations for ``lang``, machine translations for
    ``lang``). French data is additionally passed through
    ``clean_up_french``.

    Returns
    -------
    list of ``[source, gold, mt_item[0], mt_item[1]]`` rows, one per item
    listed in the translation dictionary for a source id.

    Raises
    ------
    AssertionError
        If source and gold dictionaries differ in size, or a translation
        item does not have exactly two elements.
    """
    base_dir = sys.path[0] + "/human_assessment/"

    def _read_json(file_name):
        # Small helper: parse one JSON file located in base_dir.
        with open(f"{base_dir}{file_name}") as json_file:
            return json.load(json_file)

    src_dict = _read_json("MMTsourcedict.json")
    ref_dict = _read_json(f"MMTgolddict_{lang}.json")
    mt_dict = _read_json(f"MMTtranslationdict_{lang}.json")

    if lang == "fr": # Additional clean up for French language
        ref_dict, mt_dict = clean_up_french(ref_dict, mt_dict)

    assert len(src_dict) == len(ref_dict)

    rows = []
    for src_id, src_text in src_dict.items():
        if src_id not in mt_dict:
            continue  # no machine translation was assessed for this source
        for mt in mt_dict[src_id]:
            assert len(mt) == 2
            rows.append([src_text, ref_dict[src_id], mt[0], mt[1]])

    return rows

# WMT-20

In [6]:
def data_processing_wmt20():
    """Load the processed WMT-20 pickles and reshape them per language pair.

    Every stored entry whose second field is non-empty becomes the row
    ``[entry[0], [], entry[1], float(entry[2])]``; the second slot (where
    other datasets keep a reference translation) is an empty list.

    Returns
    -------
    dict mapping language pair -> list of rows.
    """
    data_dir = sys.path[0]+"/wmt20_data/processed_data/"
    language_pairs = ["neen","ende","eten", "enzh", "roen","sien","ruen"]

    collections = {}
    for pair in language_pairs:
        collections[pair] = [
            [entry[0], [], entry[1], float(entry[2])]
            for entry in load_obj(data_dir, pair)
            if len(entry[1]) > 0
        ]
    return collections

# WMD

## WMD with embeddings

In [7]:
### Hook Method: Each Layers' output
def layer_processing(model):
    """Register a forward hook on every entry of ``model.encoder.layer``.

    Each hook appends the first element of the layer's output to one shared
    list, which is returned. The list keeps growing with every forward pass
    until the caller clears it.
    """
    captured_outputs = []

    def _capture(module, input_, output):
        # Only the first element of the layer output is recorded.
        captured_outputs.append(output[0])

    for encoder_layer in model.encoder.layer:
        encoder_layer.register_forward_hook(_capture)

    return captured_outputs

### Bert

In [8]:
# bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
# bert_model = AutoModel.from_pretrained('bert-base-multilingual-cased', return_dict=True)
# # bert_model.embeddings.word_embeddings
# bert_model.eval()
# layers = layer_processing(bert_model)

### XLM-RoBERTa

In [9]:
# from transformers import AutoTokenizer, AutoModel
# xlm_r_tokenizer = AutoTokenizer.from_pretrained("xlm-roberta-base")
# xlm_r_model = AutoModel.from_pretrained("xlm-roberta-base",return_dict=True)
# xlm_r_model.eval()
# # xlm_r_model.embeddings.word_embeddings
# print()
# layers = layer_processing(xlm_r_model)

In [10]:
def get_WMD_Model(name):
    """Load the tokenizer and model called ``name`` for WMD computation.

    The model is loaded with ``return_dict=True`` and put into evaluation
    mode before being returned.

    Returns
    -------
    (tokenizer, model)
    """
    wmd_tok = AutoTokenizer.from_pretrained(name)
    wmd_net = AutoModel.from_pretrained(name, return_dict=True)
    wmd_net.eval()
    return wmd_tok, wmd_net

## WMD Computation

In [11]:
### Weights of each token for WMD
## From the aspects of model embedding.
def tokens_to_fracdict(tokens):
    """Map every distinct token to its relative frequency in ``tokens``.

    Keys appear in order of first occurrence; the values sum to 1. An empty
    input gives an empty dict.
    """
    occurrences = defaultdict(int)
    for tok in tokens:
        occurrences[tok] += 1

    total = sum(occurrences.values())

    fractions = {}
    for tok, count in occurrences.items():
        fractions[tok] = float(count)/total
    return fractions

## Weights for contextual model outputs.
## Every token position is treated as distinct, even when the same token string repeats.
def tokens_to_fracdict_contextual(tokens):
    """Give every token position the same weight ``1 / len(tokens)``.

    Keys are the positions ``0 .. len(tokens) - 1`` (not the token strings),
    so repeated tokens stay separate. An empty input gives an empty dict.
    """
    position_count = len(tokens)
    weights = {}
    for position in range(position_count):
        # Division stays inside the loop so an empty input never divides by zero.
        weights[position] = 1/position_count
    return weights

In [12]:
## There are two components can be used as embedding
## 1) model embedding 
## 2) Model output states

def embedding_processing(sent1, sent2, tokenizer, model, embed_type):
    """Build the token weights and token embeddings needed by the WMD problem.

    Parameters
    ----------
    sent1, sent2
        The two sentences handed to the tokenizer.
    tokenizer
        Object providing ``tokenize``, ``convert_tokens_to_ids`` and
        ``__call__(sentence, return_tensors="pt")``.
    model
        Object providing ``embeddings.word_embeddings`` and callable on input
        ids. For ``embed_type == 2`` the forward hooks that fill the
        module-level ``layers`` list (see ``layer_processing``) must have been
        registered on this model.
    embed_type : int
        1 -> the model's input word embeddings, one row per distinct token,
             weighted by relative token frequency.
        2 -> contextual embeddings: the mean of the last four hooked layer
             outputs, one row per token position, uniformly weighted.

    Returns
    -------
    (sent1_buckets, sent2_buckets, all_embedding)
        The two weight dictionaries and a tensor holding the rows of sentence
        one followed by the rows of sentence two.

    Raises
    ------
    ValueError
        If ``embed_type`` is neither 1 nor 2.
    AssertionError
        If the number of embedding rows does not match the number of weights.
    """
    if embed_type not in (1, 2):
        # Previously this fell through to an UnboundLocalError further down.
        raise ValueError(f"embed_type must be 1 or 2, got {embed_type!r}")

    sent1_tokens = tokenizer.tokenize(sent1)
    sent2_tokens = tokenizer.tokenize(sent2)

    def _static_embedding(buckets):
        # Look up the input embedding of every distinct token.
        token_ids = tokenizer.convert_tokens_to_ids(list(buckets.keys()))
        return model.embeddings.word_embeddings(torch.tensor(token_ids))

    def _contextual_embedding(sentence):
        # Run one forward pass and average the last four captured layer outputs.
        encoded = tokenizer(sentence, return_tensors="pt")
        layers.clear()  # start from an empty hook buffer for this pass
        model(encoded['input_ids'])
        return torch.mean(torch.stack(layers[-4:]).squeeze(1).permute(1,0,2), dim=1)

    try:
        # Inference only: no autograd graph is needed for the embeddings.
        with torch.no_grad():
            if embed_type == 1:
                sent1_buckets = tokens_to_fracdict(sent1_tokens)
                sent2_buckets = tokens_to_fracdict(sent2_tokens)

                sent1_embedding = _static_embedding(sent1_buckets)
                sent2_embedding = _static_embedding(sent2_buckets)
            else:
                sent1_buckets = tokens_to_fracdict_contextual(sent1_tokens)
                sent2_buckets = tokens_to_fracdict_contextual(sent2_tokens)

                sent1_embedding = _contextual_embedding(sent1)
                sent2_embedding = _contextual_embedding(sent2)
    finally:
        # Always leave the shared hook buffer empty, even when a pass failed.
        layers.clear()

    # When the encoding carries exactly two extra positions, they are the
    # leading/trailing special tokens and are dropped.
    if sent1_embedding.size()[0] - 2 == len(sent1_tokens):
        sent1_embedding = sent1_embedding[1:-1,:] # Remove bos and eos tokens

    if sent2_embedding.size()[0] - 2 == len(sent2_tokens):
        sent2_embedding = sent2_embedding[1:-1,:] # Remove bos and eos tokens

    all_embedding = torch.cat([sent1_embedding, sent2_embedding])

    assert len(sent1_buckets) + len(sent2_buckets) == all_embedding.size()[0]

    return sent1_buckets, sent2_buckets, all_embedding

In [13]:
def word_mover_distance_probspec(sent1_buckets, sent2_buckets, all_embedding, lpFile=None):
    """Set up and solve the Word Mover's Distance transport problem.

    Parameters
    ----------
    sent1_buckets, sent2_buckets : dict
        Token weights of the two sentences; only the values and their order
        are used.
    all_embedding : tensor
        One row per weight: the rows for sentence one followed by the rows for
        sentence two (same order as the bucket values).
    lpFile : str, optional
        If given, the LP formulation is written to this path before solving.

    Returns
    -------
    pulp.LpProblem
        The solved problem; ``pulp.value(prob.objective)`` is the distance.

    Raises
    ------
    AssertionError
        If the number of embedding rows differs from the number of weights.
    """
    # Updated buckets with labeled name
    first_sent_buckets = {f"x{idx}": weight for idx, weight in enumerate(sent1_buckets.values())}
    second_sent_buckets = {f"y{idx}": weight for idx, weight in enumerate(sent2_buckets.values())}

    var_names = list(first_sent_buckets.keys()) + list(second_sent_buckets.keys())

    assert len(var_names) == all_embedding.size(0)

    # .cpu() is a no-op for CPU tensors and keeps GPU tensors convertible.
    wordvecs = {token: embedding.detach().cpu().numpy() for token, embedding in zip(var_names, all_embedding)}

    # Only flows from a sentence-one token to a sentence-two token are
    # constrained below, so only those pairs need a variable and a distance.
    # Any other pair would be an unconstrained non-negative variable with a
    # non-negative cost and therefore contribute nothing to the optimum.
    cross_pairs = list(product(first_sent_buckets, second_sent_buckets))

    T = pulp.LpVariable.dicts('T_matrix', cross_pairs, lowBound=0)

    prob = pulp.LpProblem('WMD', sense=pulp.LpMinimize)

    # Objective: total transported mass weighted by Euclidean distance.
    prob += pulp.lpSum([T[token1, token2]*euclidean(wordvecs[token1], wordvecs[token2])
                        for token1, token2 in cross_pairs])

    # Every sentence-two token must receive exactly its weight.
    for token2 in second_sent_buckets:
        prob += pulp.lpSum([T[token1, token2] for token1 in first_sent_buckets])==second_sent_buckets[token2]

    # Every sentence-one token must send out exactly its weight.
    for token1 in first_sent_buckets:
        prob += pulp.lpSum([T[token1, token2] for token2 in second_sent_buckets])==first_sent_buckets[token1]

    if lpFile is not None:
        prob.writeLP(lpFile)

    # Solve silently with the bundled CBC solver.
    prob.solve(pulp.PULP_CBC_CMD(msg=False))

    return prob

In [14]:
def word_mover_distance(sent1, sent2, tokenizer, model, embed_type, lpFile=None):
    """Return the Word Mover's Distance between ``sent1`` and ``sent2``.

    Weights and embeddings come from ``embedding_processing``; the transport
    problem is solved by ``word_mover_distance_probspec`` and the value of
    its objective is returned. ``lpFile`` is forwarded to the solver set-up.
    """
    weights_1, weights_2, token_vectors = embedding_processing(
        sent1, sent2, tokenizer, model, embed_type
    )
    solved = word_mover_distance_probspec(weights_1, weights_2, token_vectors, lpFile=lpFile)
    return pulp.value(solved.objective)

## Fluent Based WMD

### Order penalty

In [15]:
from nltk import meteor_score
from nltk.stem.porter import PorterStemmer
from nltk.corpus import wordnet
from itertools import chain, product

def order_penalty(
    reference,
    hypothesis,
    preprocess=str.lower,
    stemmer=PorterStemmer(),
    wordnet=wordnet):
    """Fragmentation fraction (chunks / matched unigrams) of a hypothesis.

    Uses the private alignment helpers of NLTK's ``meteor_score`` module.
    NOTE(review): these helpers are not public API; their signatures may
    differ between NLTK releases — confirm against the installed version.

    ``wordnet`` is accepted for signature compatibility but not used here.

    Returns
    -------
    ``chunk_count / matches_count`` as a float, or ``0`` when no unigram of
    the hypothesis matches the reference.
    """
    hypo_enum, ref_enum = meteor_score._generate_enums(
        hypothesis, reference, preprocess=preprocess
    )

    aligned, _, _ = meteor_score._enum_allign_words(hypo_enum, ref_enum, stemmer=stemmer)
    aligned_count = len(aligned)

    try:
        return float(meteor_score._count_chunks(aligned)) / aligned_count
    except ZeroDivisionError: # No unigrams match
        return 0

In [16]:
def fluency_based_wmd(wmd, ref, hypo, gamma=0.2):
    """Adjust a WMD value by the word-order fragmentation of the hypothesis.

    Computes ``wmd - gamma * (0.5 - penalty)`` where ``penalty`` is
    ``order_penalty(ref, hypo)``: a penalty below 0.5 lowers the distance,
    a penalty above 0.5 raises it.
    """
    fragmentation = order_penalty(ref, hypo)
    adjustment = gamma *(0.5 - fragmentation)
    return wmd - adjustment

## Cosine Similarity

In [17]:
def getSentSimilarity(sents1, sents2, model):
    """Min-max normalised cosine similarity between paired sentences.

    Parameters
    ----------
    sents1, sents2
        Two equally long sequences of sentences; element ``i`` of one is
        compared with element ``i`` of the other.
    model
        Encoder providing ``encode(sentences, convert_to_tensor=True)``.

    Returns
    -------
    numpy.ndarray
        One value per pair, rescaled so the smallest similarity is 0 and the
        largest is 1. When all similarities are equal (including the
        single-pair case) every value is 0 instead of NaN.
    """
    embed_sent1 = model.encode(sents1, convert_to_tensor=True)
    embed_sent2 = model.encode(sents2, convert_to_tensor=True)
    cos_sim = nn.CosineSimilarity(dim=1)(embed_sent1,embed_sent2)

    # Normalized
    lowest = torch.min(cos_sim)
    spread = torch.max(cos_sim) - lowest
    if spread == 0:
        # Constant input: avoid 0/0 and report a flat metric.
        normalized = torch.zeros_like(cos_sim)
    else:
        normalized = (cos_sim - lowest) / spread

    # detach/cpu make the conversion work for tensors living on an accelerator.
    return normalized.detach().cpu().numpy()

## Bert Score

In [18]:
from datasets import load_metric
# Module-level BERTScore metric object shared by getBertScore; its cache
# directory is the first entry of sys.path.
# NOTE(review): `datasets.load_metric` is deprecated in newer `datasets`
# releases in favour of the `evaluate` package — confirm against the
# installed version.
bert_score_metric = load_metric('bertscore', keep_in_memory=True, cache_dir=sys.path[0])

In [19]:
# model_type: bert-base-multilingual-cased, xlm-roberta-base

def getBertScore(sents1, sents2, model):
    """Min-max normalised BERTScore F1 between paired sentences.

    Parameters
    ----------
    sents1
        Sentences passed to the metric as ``references``.
    sents2
        Sentences passed to the metric as ``predictions``.
    model
        Value forwarded as ``model_type`` to the module-level
        ``bert_score_metric``.

    Returns
    -------
    list of float
        One value per pair, rescaled so the smallest F1 is 0 and the largest
        is 1. When all F1 values are equal every value is 0 instead of NaN.
    """
    bert_score_metric.add_batch(predictions=sents2, references=sents1)
    score = bert_score_metric.compute(model_type=model)

    # The metric may hand back F1 as a tensor or as a plain list depending on
    # its version; as_tensor accepts both without copying an existing tensor.
    f1 = torch.as_tensor(score["f1"])

    # Normalized Bert Score F1
    lowest = torch.min(f1)
    spread = torch.max(f1) - lowest
    if spread == 0:
        # Constant input: avoid 0/0 and report a flat metric.
        return torch.zeros_like(f1).tolist()
    return ((f1 - lowest) / spread).tolist()

## Compound Method

In [20]:
## Two metrics are used
## Metric1 is main metric and metric2 is auxiliary
def combine_WMD_Similarity(metric1, metric2):
    """Combine two metrics element-wise as ``exp(m1) + exp(-m2)``.

    ``metric1`` enters with a positive sign and ``metric2`` with a negative
    one. The result is a list as long as the shorter input.
    """
    combined = []
    for main_value, aux_value in zip(metric1, metric2):
        combined.append(np.exp(main_value) + np.exp(-aux_value))
    return combined

# Evaluation

In [21]:
def _min_max_normalize(values):
    """Rescale ``values`` linearly to [0, 1]; constant input maps to zeros."""
    if not values:
        return []
    lowest = min(values)
    spread = max(values) - lowest
    if spread == 0:
        # All values equal (e.g. a single sample): avoid division by zero.
        return [0.0 for _ in values]
    return [(val - lowest) / spread for val in values]


def compute_WMD_WMDo(sents, tokenizer, model, embed_type, fluent=False, cross_ling=False):
    """Compute normalised WMD (and optionally fluency-based WMD) per sample.

    Parameters
    ----------
    sents
        Samples structured as ``[src, ref, MT, score]``.
    tokenizer, model, embed_type
        Forwarded to ``word_mover_distance``.
    fluent : bool
        When true, ``fluency_based_wmd`` is additionally applied to every raw
        WMD value.
    cross_ling : bool
        True compares the source with the MT output, False compares the
        reference with the MT output.

    Returns
    -------
    (wmd, wmdo) : tuple of numpy.ndarray
        Both min-max normalised over the given samples. ``wmdo`` is empty
        when ``fluent`` is false.
    """
    wmd = []
    wmdo = []

    for sample in sents:  # Sent structure: [src, ref, MT, score]
        hypothesis = sample[2]
        reference = sample[0] if cross_ling else sample[1]   # src - mt / ref - mt

        # Keep the raw distance: the fluency variant is derived from it.
        wmd_tmp = word_mover_distance(reference, hypothesis, tokenizer, model, embed_type)
        wmd.append(wmd_tmp)

        if fluent:
            wmdo.append(fluency_based_wmd(wmd_tmp, reference, hypothesis))

    # Normalize
    return np.array(_min_max_normalize(wmd)), np.array(_min_max_normalize(wmdo))

In [22]:
## Correlation evaluation
def evaluation(wmd, score):
    """Print Spearman and Pearson correlation between a metric and the scores.

    Both correlations are computed first; the Spearman result is printed
    before the Pearson result. Nothing is returned.
    """
    pearson_result = stats.pearsonr(wmd, score)
    spearman_result = stats.spearmanr(wmd, score)

    report = (
        ("Spearman Correlation:", spearman_result),
        ("Pearson Correlation:", pearson_result),
    )
    for label, result in report:
        print(label, result)

In [23]:
## Save Metrics
def save_metrics(name, metric, score):
    """Pickle ``[metric, score]`` to ``<sys.path[0]>/Metrics/<name>.pkl``.

    ``name`` may contain sub-directories; they must already exist.
    The file is closed even when pickling fails.
    """
    filePath = f"{sys.path[0]}/Metrics/{name}"
    with open(f"{filePath}.pkl", 'wb') as file:
        pickle.dump([metric, score], file)

def load_metrics(name):
    """Load a ``[metric, score]`` pair written by ``save_metrics``.

    Reads ``<sys.path[0]>/Metrics/<name>.pkl`` and returns
    ``(metric, score)``.

    NOTE: unpickling executes arbitrary code; only load trusted files.
    """
    # The path variable used to be assigned as `path` but read as `filePath`,
    # which raised NameError on every call.
    filePath = f"{sys.path[0]}/Metrics/{name}"
    with open(f"{filePath}.pkl", 'rb') as file:
        data = pickle.load(file)
    return data[0], data[1]

In [24]:
## metric: numpy array
## score: numpy array
def scatter_diagram(metric, score):
    """Scatter-plot a normalised metric (x axis) against human scores (y axis).

    The axis labels follow the plotted data: ``metric`` is drawn on the x
    axis and ``score`` on the y axis (the labels used to be swapped).
    """
    plt.scatter(metric,score)
    plt.xlabel("Normalized score")
    plt.ylabel("human score")
    plt.grid()

# Test

In [25]:
# Tokenizer and model used for all WMD computations in the cells below.
wmd_tokenizer, wmd_model = get_WMD_Model('xlm-roberta-base') 

In [26]:
# Model name handed to getBertScore (forwarded there as `model_type`).
bert_score_model = 'xlm-roberta-base'
# bert_score_model = 'bert-base-multilingual-cased'

In [27]:
# Sentence encoder used by getSentSimilarity.
cos_sim_model = SentenceTransformer('xlm-r-bert-base-nli-stsb-mean-tokens')

In [28]:
# Register the forward hooks on the WMD model. `layers` is the shared list the
# hooks append to; embedding_processing reads and clears it as a global, so
# this cell must run before any WMD computation with embed_type=2.
layers = layer_processing(wmd_model)

In [29]:
'''
wmd_tokenizer, wmd_model: tokenizers and pretrained model used in the wmd
bert_score_model: specified model type used in the bert score
cos_sim_model: cosine similarity model to compute the embedding of sentences
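save_path: sub-path under Metrics/ where each metric is saved together with the human scores
langs: Language codes to evaluate. If omitted, every language of the dataset is evaluated
fluent: Whether the fluency-based WMD (WMDo) is also computed and evaluated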
cross_ling: 
    True: Cross-linguistic, evaluate src - MT
    False: Mono-linguistic, evaluate ref - MT
'''

def WMT20_Testing(wmd_tokenizer, wmd_model, bert_score_model, cos_sim_model, save_path, langs=None, fluent=False, cross_ling=False):
    """Run the full metric evaluation on the WMT-20 data.

    When ``langs`` is empty or None, all seven WMT-20 language pairs are
    evaluated. Every other argument is forwarded to ``testing``.
    """
    selected_langs = langs or ["neen","ende","eten", "enzh", "roen","sien","ruen"]
    testing(
        data_processing_wmt20(),
        selected_langs,
        wmd_tokenizer,
        wmd_model,
        bert_score_model,
        cos_sim_model,
        save_path,
        fluent=fluent,
        cross_ling=cross_ling,
    )
    
def WMT17_Testing(wmd_tokenizer, wmd_model, bert_score_model, cos_sim_model, save_path, langs=None, fluent=False, cross_ling=False):
    """Run the full metric evaluation on the WMT-17 data.

    When ``langs`` is empty or None, all eight WMT-17 language pairs are
    evaluated. Every other argument is forwarded to ``testing``.
    """
    selected_langs = langs or ["csen","deen","enru", "enzh", "fien","lven","ruen", "zhen"]
    testing(
        data_processing_wmt17(),
        selected_langs,
        wmd_tokenizer,
        wmd_model,
        bert_score_model,
        cos_sim_model,
        save_path,
        fluent=fluent,
        cross_ling=cross_ling,
    )

    
def PASCAL_Testing(wmd_tokenizer, wmd_model, bert_score_model, cos_sim_model, save_path, langs=None, fluent=False, cross_ling=False):
    """Run the full metric evaluation on the PASCAL human-assessment data.

    When ``langs`` is empty or None, French and German are evaluated (in that
    order). Every other argument is forwarded to ``testing``.
    """
    selected_langs = langs or ["fr","de"]
    testing(
        data_processing_pascal(),
        selected_langs,
        wmd_tokenizer,
        wmd_model,
        bert_score_model,
        cos_sim_model,
        save_path,
        fluent=fluent,
        cross_ling=cross_ling,
    )
    
    
def testing(collections, langs, wmd_tokenizer, wmd_model, bert_score_model, cos_sim_model, save_path, fluent=False, cross_ling=False): 
    
    for lang in langs:
        
        print(f"Processing {lang} data:")
        
        src, ref, hypothesis, score = get_lang_translation(collections, lang)
        
        if cross_ling:             
            reference = src
        else:
            reference = ref
            
        wmd, wmdo = compute_WMD_WMDo(collections[lang], wmd_tokenizer, wmd_model, embed_type=2, fluent=fluent, cross_ling=cross_ling)
        
        print(f"Average WMD: {sum(wmd)/len(wmd)}")
        evaluation(wmd, score)
        save_metrics(f"{save_path}/{lang}_wmd", wmd, score)
        
        similarity = getSentSimilarity(hypothesis, reference, cos_sim_model)
        print(f"Average Cosine similarity: {sum(similarity)/len(similarity)}")
        evaluation(similarity, score)
        save_metrics(f"{save_path}/{lang}_cs", similarity, score)
        
        
        bert_score = getBertScore(hypothesis, reference, bert_score_model)
        print(f"Average Bert Score: {sum(bert_score)/len(bert_score)}")
        evaluation(bert_score, score)
        save_metrics(f"{save_path}/{lang}_bs", bert_score, score)
        
        
        compound_metric = combine_WMD_Similarity(similarity, wmd)
        print(f"Average compound metric: {sum(compound_metric)/len(compound_metric)}")
        evaluation(compound_metric, score)
        save_metrics(f"{save_path}/{lang}_cs", compound_metric, score)
        
        
        if fluent:
            print(f"Average WMDo: {sum(wmdo)/len(wmdo)}")
            evaluation(wmdo, score)
            save_metrics(f"{save_path}/{lang}_wmdo", wmd, score)
            
            compound_metric_o = combine_WMD_Similarity(similarity, wmdo)
            print(f"Average compound metric: {sum(compound_metric_o)/len(compound_metric_o)}")
            evaluation(compound_metric_o, score)
            save_metrics(f"{save_path}/{lang}_cso", compound_metric_o, score)
    
    

### WMT-17

In [None]:
# lang ="deen"
# collections = data_processing_wmt17()
# src, ref, MT, score = get_lang_translation(collections, lang)

In [None]:
# wmd, wmd0 = compute_WMD_WMDo(collections[lang], wmd_tokenizer, wmd_model, embed_type=2, fluent=False, cross_ling=False)

In [None]:
# evaluation(wmd, score)

In [None]:
# similarity = getSentSimilarity(ref, MT, cos_sim_model)
# evaluation(similarity, score)

In [None]:
# bert_score = getBertScore(ref, MT, bert_score_model)
# evaluation(bert_score, score)

In [None]:
# compound_metric = combine_WMD_Similarity(similarity, wmd)
# evaluation(compound_metric, score)

In [None]:
# save_metrics("deen_src", compound_metric, score)

In [None]:
# # similarity + wmd
# Spearman Correlation: SpearmanrResult(correlation=0.6354514842202942, pvalue=1.1451312453135562e-64)
# Pearson Correlation: (0.6474685652507881, 7.521815837611303e-68)
# # Bert score
# Spearman Correlation: SpearmanrResult(correlation=0.6113722119194076, pvalue=1.0676403977139098e-58)
# Pearson Correlation: (0.6199900522366385, 8.944390549431547e-61)
# # similarity
# Spearman Correlation: SpearmanrResult(correlation=0.5864668846884991, pvalue=4.845536502383926e-53)
# Pearson Correlation: (0.5800512616953695, 1.1600227084324783e-51)
# # wmd
# Spearman Correlation: SpearmanrResult(correlation=-0.5769821833433325, pvalue=5.169696250950864e-51)
# Pearson Correlation: (-0.5764824968926663, 6.583882495171967e-51)

In [97]:
# Cross-lingual (source vs. MT) evaluation of one WMT-17 language pair.
lang ="enzh"
save_path = "WMT17/src_mt"
collections = data_processing_wmt17()
# get_lang_translation returns (src, ref, MT, score); the source sentences are
# bound to `reference` because this cell compares the source with the MT output.
reference, ref, hypothesis, score = get_lang_translation(collections, lang)    
# fluent=False, so the returned `wmdo` array is empty.
wmd, wmdo = compute_WMD_WMDo(collections[lang], wmd_tokenizer, wmd_model, embed_type=2, fluent=False, cross_ling=True)
        
print(f"Average WMD: {sum(wmd)/len(wmd)}")
evaluation(wmd, score)
save_metrics(f"{save_path}/{lang}_wmd", wmd, score)

similarity = getSentSimilarity(hypothesis, reference, cos_sim_model)
print(f"Average Cosine similarity: {sum(similarity)/len(similarity)}")
evaluation(similarity, score)
save_metrics(f"{save_path}/{lang}_cs", similarity, score)

bert_score = getBertScore(hypothesis, reference, bert_score_model)
print(f"Average Bert Score: {sum(bert_score)/len(bert_score)}")
evaluation(bert_score, score)
save_metrics(f"{save_path}/{lang}_bs", bert_score, score)

# Compound metric: exp(similarity) + exp(-wmd), element-wise.
compound_metric = combine_WMD_Similarity(similarity, wmd)
print(f"Average compound metric: {sum(compound_metric)/len(compound_metric)}")
evaluation(compound_metric, score)
save_metrics(f"{save_path}/{lang}_compound", compound_metric, score)


Average WMD: 0.44656908581620436
Spearman Correlation: SpearmanrResult(correlation=-0.3594431560231835, pvalue=1.6034094853448769e-18)
Pearson Correlation: (-0.3731727424348741, 6.045474158745098e-20)
Average Cosine similarity: 0.7714485871971452
Spearman Correlation: SpearmanrResult(correlation=0.4558142951439806, pvalue=4.433524936294658e-30)
Pearson Correlation: (0.4577949452014327, 2.332494014824013e-30)




Average Bert Score: 0.6129297337095653
Spearman Correlation: SpearmanrResult(correlation=0.4343902335694401, pvalue=3.545459940247522e-27)
Pearson Correlation: (0.4456844977556345, 1.1093008814434217e-28)
Average compound metric: 2.838093718163001
Spearman Correlation: SpearmanrResult(correlation=0.48337695136600756, pvalue=3.947252268042836e-34)
Pearson Correlation: (0.4964085345318672, 3.540878161928391e-36)


In [102]:
# Ad-hoc combination: exp(wmd) + exp(-bert_score), element-wise.
# NOTE(review): the argument order differs from the other cells, which pass
# `similarity` first — confirm this pairing is intended.
m1 = combine_WMD_Similarity(wmd, bert_score)
evaluation(m1, score)

Spearman Correlation: SpearmanrResult(correlation=-0.38138687577812064, pvalue=7.880118523915315e-21)
Pearson Correlation: (-0.39793172755812567, 1.0872668821543204e-22)


### WMT-20

In [None]:
# lang ="neen"
# collections = data_processing_wmt20()
# src, ref, MT, score = get_lang_translation(collections, lang)

In [None]:
# wmd, wmd0 = compute_WMD_WMDo(collections[lang], wmd_tokenizer, wmd_model, embed_type=2, fluent=False, cross_ling=True)

In [None]:
# evaluation(wmd, score)

In [None]:
# similarity = getSentSimilarity(src, MT, cos_sim_model)
# evaluation(similarity, score)

In [None]:
# bert_score = getBertScore(src, MT, bert_score_model)
# evaluation(bert_score, score)

In [None]:
# compound_metric = combine_WMD_Similarity(wmd, similarity)
# evaluation(compound_metric, score)

In [None]:
# # wmd + similarity
# Spearman Correlation: SpearmanrResult(correlation=-0.42386348786348793, pvalue=7.177309137106313e-45)
# Pearson Correlation: (-0.4010307328248355, 6.296505116193518e-40)
# # Bert score    
# Spearman Correlation: SpearmanrResult(correlation=0.4161586641586642, pvalue=3.689162607653665e-43)
# Pearson Correlation: (0.4065892159098228, 4.2734690689076027e-41)
# # similarity    
# Spearman Correlation: SpearmanrResult(correlation=0.33880733182375383, pvalue=2.7863502204930564e-28)
# Pearson Correlation: (0.31293965321378914, 3.6938392780287396e-24)
# # wmd
# Spearman Correlation: SpearmanrResult(correlation=-0.3927715887715888, pvalue=3.1227407872395823e-38)
# Pearson Correlation: (-0.3748131198452938, 1.0431379070860892e-34)

In [53]:
# Cross-lingual (source vs. MT) evaluation of one WMT-20 language pair.
# Saving of the metrics is disabled in this cell.
lang ="ruen"
save_path = "WMT20/src_mt"
collections = data_processing_wmt20()
# get_lang_translation returns (src, ref, MT, score); the source sentences are
# bound to `reference` because this cell compares the source with the MT output.
reference, ref, hypothesis, score = get_lang_translation(collections, lang)

# fluent=False, so the returned `wmdo` array is empty.
wmd, wmdo = compute_WMD_WMDo(collections[lang], wmd_tokenizer, wmd_model, embed_type=2, fluent=False, cross_ling=True)
        
print(f"Average WMD: {sum(wmd)/len(wmd)}")
evaluation(wmd, score)
# save_metrics(f"{save_path}/{lang}_wmd", wmd, score)

similarity = getSentSimilarity(hypothesis, reference, cos_sim_model)
print(f"Average Cosine similarity: {sum(similarity)/len(similarity)}")
evaluation(similarity, score)
# save_metrics(f"{save_path}/{lang}_cs", similarity, score)

bert_score = getBertScore(hypothesis, reference, bert_score_model)
print(f"Average Bert Score: {sum(bert_score)/len(bert_score)}")
evaluation(bert_score, score)
# save_metrics(f"{save_path}/{lang}_bs", bert_score, score)

# Compound metric: exp(similarity) + exp(-wmd), element-wise.
compound_metric = combine_WMD_Similarity(similarity, wmd)
print(f"Average compound metric: {sum(compound_metric)/len(compound_metric)}")
evaluation(compound_metric, score)
# save_metrics(f"{save_path}/{lang}_compound", compound_metric, score)


Average WMD: 0.5447597079218107
Spearman Correlation: SpearmanrResult(correlation=-0.26313208713208713, pvalue=2.6718130122671233e-17)
Pearson Correlation: (-0.29166047648402227, 4.638414036310341e-21)
Average Cosine similarity: 0.8006474618464708
Spearman Correlation: SpearmanrResult(correlation=0.4586783186783187, pvalue=3.5416264958295277e-53)
Pearson Correlation: (0.44133880200919595, 6.41350465831774e-49)




Average Bert Score: 0.5580206314427778
Spearman Correlation: SpearmanrResult(correlation=0.2954577434577435, pvalue=1.3556919535309243e-21)
Pearson Correlation: (0.3196547139290076, 3.431674756689878e-25)
Average compound metric: 2.8483759578133085
Spearman Correlation: SpearmanrResult(correlation=0.457913293913294, pvalue=5.524424330449296e-53)
Pearson Correlation: (0.4740848525259266, 3.5856506175883255e-57)


In [54]:
# Ad-hoc combination: exp(similarity) + exp(-bert_score), element-wise.
# NOTE(review): bert_score enters with a negative sign here — confirm that
# is intended for this experiment.
m1 = combine_WMD_Similarity(similarity, bert_score)
evaluation(m1, score)

Spearman Correlation: SpearmanrResult(correlation=0.409045417045417, pvalue=1.2806445785385669e-41)
Pearson Correlation: (0.39535946656418003, 9.299455669166302e-39)


### PASCAL

In [None]:
# Load the PASCAL data and split the German samples into parallel lists.
lang ="de"
collections = data_processing_pascal()
src, ref, MT, score = get_lang_translation(collections, lang)

In [None]:
# Source-vs-MT WMD (cross_ling=True); fluent=False, so `wmdo` comes back empty.
wmd, wmdo = compute_WMD_WMDo(collections[lang], wmd_tokenizer, wmd_model, embed_type=2, fluent=False, cross_ling=True)

In [None]:
# NOTE(review): this compares the reference with the MT output, while the WMD
# cell for this language uses cross_ling=True (source vs. MT) — confirm that
# mixing the two settings is intended.
similarity = getSentSimilarity(ref, MT, cos_sim_model)
evaluation(similarity, score)

In [None]:
compound_metric = combine_WMD_Similarity(similarity, wmd)
# NOTE(review): when `wmdo` was computed with fluent=False it is empty, so
# this combination is an empty list and its evaluation below presumably
# fails — rerun the WMD cell with fluent=True first.
compound_metric_o = combine_WMD_Similarity(similarity, wmdo)

evaluation(compound_metric, score)
evaluation(compound_metric_o, score)
evaluation(wmd, score)

In [None]:
# save_metrics("de_src_mt", compound_metric, score)

In [55]:
# Cross-lingual (source vs. MT) evaluation of the French PASCAL data.
lang ="fr"
save_path = "PASCAL/src_mt"
collections = data_processing_pascal()
# get_lang_translation returns (src, ref, MT, score); the source sentences are
# bound to `reference` because this cell compares the source with the MT output.
reference, ref, hypothesis, score = get_lang_translation(collections, lang)    
# fluent=False, so the returned `wmdo` array is empty.
wmd, wmdo = compute_WMD_WMDo(collections[lang], wmd_tokenizer, wmd_model, embed_type=2, fluent=False, cross_ling=True)
        
print(f"Average WMD: {sum(wmd)/len(wmd)}")
evaluation(wmd, score)
save_metrics(f"{save_path}/{lang}_wmd", wmd, score)

similarity = getSentSimilarity(hypothesis, reference, cos_sim_model)
print(f"Average Cosine similarity: {sum(similarity)/len(similarity)}")
evaluation(similarity, score)
save_metrics(f"{save_path}/{lang}_cs", similarity, score)

bert_score = getBertScore(hypothesis, reference, bert_score_model)
print(f"Average Bert Score: {sum(bert_score)/len(bert_score)}")
evaluation(bert_score, score)
save_metrics(f"{save_path}/{lang}_bs", bert_score, score)

# Compound metric: exp(similarity) + exp(-wmd), element-wise.
compound_metric = combine_WMD_Similarity(similarity, wmd)
print(f"Average compound metric: {sum(compound_metric)/len(compound_metric)}")
evaluation(compound_metric, score)
save_metrics(f"{save_path}/{lang}_compound", compound_metric, score)


Average WMD: 0.426280527932965
Spearman Correlation: SpearmanrResult(correlation=-0.34767295777287427, pvalue=1.6511001375190912e-72)
Pearson Correlation: (-0.3115109157512241, 7.825970965212337e-58)
Average Cosine similarity: 0.8005517303662759
Spearman Correlation: SpearmanrResult(correlation=0.5769663065131238, pvalue=1.2716484759879025e-223)
Pearson Correlation: (0.44880634943075265, 3.6529508344078213e-125)
Average Bert Score: 0.6898668720502644
Spearman Correlation: SpearmanrResult(correlation=0.3693158281199988, pvalue=2.845619175293309e-82)
Pearson Correlation: (0.29082513817044003, 2.6086283768258104e-50)
Average compound metric: 2.941454745213255
Spearman Correlation: SpearmanrResult(correlation=0.5805957503435236, pvalue=4.439003469240075e-227)
Pearson Correlation: (0.4761938931193055, 8.133499336247869e-143)


In [56]:
# Ad-hoc combination: exp(similarity) + exp(-bert_score), element-wise.
# NOTE(review): bert_score enters with a negative sign here — confirm that
# is intended for this experiment.
m1 = combine_WMD_Similarity(similarity, bert_score)
evaluation(m1, score)

Spearman Correlation: SpearmanrResult(correlation=0.5577853959020562, pvalue=4.7170872087520807e-206)
Pearson Correlation: (0.5281844035367238, 4.3126248308364895e-181)


In [None]:
# # Compound
# Spearman Correlation: SpearmanrResult(correlation=0.5583900565528739, pvalue=7.008387361962618e-285)
# Pearson Correlation: (0.5345921289288134, 8.221075657865956e-257)
# # WMDo
# Spearman Correlation: SpearmanrResult(correlation=0.5593502697055622, pvalue=4.621492592222047e-286)
# Pearson Correlation: (0.5361630924475681, 1.3513459056589761e-258)
# # WMD
# Spearman Correlation: SpearmanrResult(correlation=-0.5089823380124615, pvalue=5.165687764724598e-229)
# Pearson Correlation: (-0.49318902941149667, 5.109081581867692e-213)
# # Cosine Similarity
# Spearman Correlation: SpearmanrResult(correlation=0.533531776812245, pvalue=1.2997631025228642e-255)
# Pearson Correlation: (0.48671856252666984, 1.053467840253653e-206)

In [None]:
# # Compound
# Spearman Correlation: SpearmanrResult(correlation=0.6277348161095964, pvalue=3.092750295851163e-276)
# Pearson Correlation: (0.5053510100353606, 1.8336657874693513e-163)
# # WMDo
# Spearman Correlation: SpearmanrResult(correlation=0.6276570124525135, pvalue=3.789024954110633e-276)
# Pearson Correlation: (0.5062459132918665, 3.957301711429879e-164)
# # WMD    
# Spearman Correlation: SpearmanrResult(correlation=-0.5260006850493311, pvalue=2.3959546961579428e-179)
# Pearson Correlation: (-0.43285363745499555, 1.3322431658079695e-115)
# # Cosine similarity
# Spearman Correlation: SpearmanrResult(correlation=0.6140189852157125, pvalue=4.533173262128837e-261)
# Pearson Correlation: (0.4497808438694378, 9.158265216134208e-126)