In [1]:
import pandas as pd
import numpy as np
import json
import tqdm
import os
import truecase
from mosestokenizer import MosesDetokenizer

from Metrics.mt_utils import (find_corpus, 
                      load_data, 
                      load_metadata, 
                      print_sys_level_correlation, 
                      output_MT_correlation,
                      df_append)
import torch

In [4]:
# Segment-level WMT17 correlations for the 'simalign<op>' metric columns, one
# pass per aggregation name.
lps = find_corpus("WMT17").values()

# Aggregation name -> reducer returning a one-element list. Only the keys are
# consumed by the loop below; gen_mean / hmean / gmean are resolved lazily when
# a reducer is actually called, and are defined in a later cell of this file.
operations = {
    'mean': lambda values: [np.mean(values, axis=0)],
    'max': lambda values: [np.max(values, axis=0)],
    'min': lambda values: [np.min(values, axis=0)],
    'p_mean_2': lambda values: [gen_mean(values, p=2.0).real],
    'p_mean_3': lambda values: [gen_mean(values, p=3.0).real],
    "hmean": lambda values: [hmean(values)],
    "gmean": lambda values: [gmean(values)],
}

seglevel_csv = os.path.join("Metrics/WMT17/", 'DA-seglevel.csv')
# The loop variable is deliberately named `o`: a later scratch cell reads it.
for o in operations:
    print(o)
    output_MT_correlation(lps, 'simalign' + o, seglevel_csv)

mean
Metrics/WMT17/DA-seglevel.csv
cs-en	0.241
de-en	0.370
fi-en	0.355
lv-en	0.254
ru-en	0.273
tr-en	0.337
zh-en	0.457
max
Metrics/WMT17/DA-seglevel.csv
cs-en	0.209
de-en	0.229
fi-en	0.331
lv-en	0.288
ru-en	0.181
tr-en	0.232
zh-en	0.322
min
Metrics/WMT17/DA-seglevel.csv
cs-en	0.231
de-en	0.254
fi-en	0.326
lv-en	0.250
ru-en	0.216
tr-en	0.326
zh-en	0.304
p_mean_2
Metrics/WMT17/DA-seglevel.csv
cs-en	0.240
de-en	0.367
fi-en	0.354
lv-en	0.252
ru-en	0.271
tr-en	0.334
zh-en	0.456
p_mean_3
Metrics/WMT17/DA-seglevel.csv
cs-en	0.238
de-en	0.365
fi-en	0.352
lv-en	0.250
ru-en	0.268
tr-en	0.331
zh-en	0.454
hmean
Metrics/WMT17/DA-seglevel.csv
cs-en	0.245
de-en	0.374
fi-en	0.358
lv-en	0.259
ru-en	0.277
tr-en	0.343
zh-en	0.458
gmean
Metrics/WMT17/DA-seglevel.csv
cs-en	0.243
de-en	0.372
fi-en	0.357
lv-en	0.257
ru-en	0.275
tr-en	0.340
zh-en	0.458


In [16]:
# Load stored awesome-align results for WMT<year> into `data`, keyed by the
# values of the mapping returned by find_corpus.
import json
year = "17"
# NOTE(review): despite its name, `include_path` holds the mapping returned by
# find_corpus here, not a path string.
include_path = find_corpus("WMT"+year)
data = {}
for k,v in include_path.items():
  with open(f"Results{year}/awesomealign{v}.json") as f:
    data[v] = json.load(f)
  # Stops after the first entry, so only one results file ends up in `data`.
  # NOTE(review): looks like a debugging shortcut — confirm before relying on it.
  break

In [9]:
from scipy.stats.mstats import gmean, hmean
def gen_mean(vals, p):
    """Generalised (power) mean of `vals` along axis 0.

    The values are promoted to complex before exponentiation, so the result
    is complex; callers in this notebook take `.real`.

    Args:
        vals: array-like of numbers.
        p: exponent of the power mean (converted to float).

    Returns:
        (mean(vals ** p, axis=0)) ** (1 / p) as a complex scalar or array.
    """
    exponent = float(p)
    powered = np.power(np.array(vals, dtype=complex), exponent)
    averaged = np.mean(powered, axis=0)
    return np.power(averaged, 1 / exponent)

# Aggregation name -> reducer. Every reducer wraps its result in a one-element
# list, so callers index the result with [0].
operations = {
    'mean': lambda values: [np.mean(values, axis=0)],
    'max': lambda values: [np.max(values, axis=0)],
    'min': lambda values: [np.min(values, axis=0)],
    'p_mean_2': lambda values: [gen_mean(values, p=2.0).real],
    'p_mean_3': lambda values: [gen_mean(values, p=3.0).real],
    "hmean": lambda values: [hmean(values)],
    "gmean": lambda values: [gmean(values)],
}


In [62]:
# Scratch lookup of one reducer. Relies on `operations` and on the leftover loop
# variable `o` from other cells; the recorded run raised NameError because
# `operations` had not been defined in that kernel session.
operations[o]

NameError: name 'operations' is not defined

In [6]:

def evaluate_results(data, level, operations, year):
    """Aggregate awesome-align similarity scores and print correlations.

    For every aggregation in `operations`, each segment's
    "awesome_align_sim_scores" list is reduced to one score, the scores are
    packed per system with df_append, and the correlation printer for the
    requested level is called.

    Args:
        data: nested mapping; the inner mappings go from
            "<testset>-<src>-<tgt>-<system>" keys to a sequence of per-segment
            dicts that contain "awesome_align_sim_scores".
        level: "sys" selects the system-level printer; any other value selects
            the segment-level one.
        operations: mapping from aggregation name to a reducer that returns a
            one-element list.
        year: two-digit year string used to pick the corpus via find_corpus.
            Note that the DA CSV paths below are fixed to Metrics/WMT17/.
    """
    dataset = find_corpus("WMT" + year)
    for op_name, reducer in operations.items():
        metric = 'awesomealign' + op_name
        frames = []
        for per_file in data.values():
            for run_key, segments in per_file.items():
                parts = run_key.split("-")
                testset = parts[0]
                lp = parts[1] + "-" + parts[2]
                # System names may contain '-', so re-join the remainder.
                system = "-".join(parts[3:])
                scores = [
                    reducer(segment["awesome_align_sim_scores"])[0]
                    for segment in segments
                ]
                frames.append(
                    df_append(metric, len(segments), lp, testset, system, scores)
                )
        lang_pairs = list(dataset.values())
        if level == "sys":
            print_sys_level_correlation(
                metric, frames, lang_pairs,
                os.path.join("Metrics/WMT17/", 'DA-syslevel.csv'))
        else:
            print_seg_level_correlation_wmt17(
                metric, frames, lang_pairs,
                os.path.join("Metrics/WMT17/", 'DA-seglevel.csv'))
    

In [None]:
# NOTE(review): this cell has no execution count. `f` is not defined in this
# cell; the only visible binding is the file handle left over from the
# JSON-loading cell, and the other output_MT_correlation call in this notebook
# passes three arguments (language pairs, metric name, CSV path) — confirm the
# intended call before running.
for o in operations:
  output_MT_correlation('awesomealign'+o,f)

In [7]:
# Segment-level evaluation of the loaded awesome-align results for WMT17.
# The recorded run failed with NameError because `data` was not defined in that
# kernel session; the cell that loads the results JSON must run first.
evaluate_results(data, "seg", operations, "17")

NameError: name 'data' is not defined

In [48]:
#Ensembles
# Combine, per segment, the mean awesome-align similarity with the mean simalign
# similarity using min / average / max, and collect one frame per system.
wmt17min = []
wmt17avg = []
wmt17max = []
for k,v in awesomealign_data.items():
    # Keys look like "<testset>-<src>-<tgt>-<system>"; system may contain '-'.
    ref = k.split("-")
    testset = ref[0]
    lp = ref[1]+"-"+ref[2]
    system = "-".join(ref[3:])

    scoresmin = []
    scoresmax = []
    scoresavg = []
    num_samples =  len(v)
    for i,date in enumerate(v):
        # simalign scores are looked up by the same key and segment position.
        sim_align_date = simalign_data[k][i]
        awesome_align_score = (sum(date["awesome_align_sim_scores"])/len(date["awesome_align_sim_scores"]))
        sim_align_score = sum(sim_align_date)/len(sim_align_date)
        scoresmin.append(min(awesome_align_score,sim_align_score))
        # Fixed: this previously appended min(...), which made 'ensemble_max'
        # an exact copy of 'ensemble_min'.
        scoresmax.append(max(awesome_align_score,sim_align_score))
        scoresavg.append((awesome_align_score+sim_align_score)/2)
    wmt17min.append(df_append('ensemble_min', num_samples, lp, testset, system, scoresmin))
    wmt17avg.append(df_append('ensemble_avg', num_samples, lp, testset, system, scoresavg))
    wmt17max.append(df_append('ensemble_max', num_samples, lp, testset, system, scoresmax))


In [3]:
from itertools import chain
from collections import defaultdict, Counter
from multiprocessing import Pool
from functools import partial
from math import log

import transformers
# Word-piece tokenizer for the multilingual cased BERT checkpoint; it is the
# tokenizer handed to get_idf_dict when the IDF tables are built.
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-multilingual-cased')
def process(a, tokenizer):
    """Encode one sentence with `tokenizer`.

    Special tokens are added and the sequence is truncated to the
    tokenizer's `model_max_length`.

    Args:
        a: the sentence to encode.
        tokenizer: object exposing `encode` and `model_max_length`.

    Returns:
        Whatever `tokenizer.encode` returns for the sentence.
    """
    encode_options = dict(
        add_special_tokens=True,
        max_length=tokenizer.model_max_length,
        truncation=True,
    )
    return tokenizer.encode(a, **encode_options)
            
def get_idf_dict(arr, tokenizer, nthreads=4):
    """
    Returns mapping from word piece index to its inverse document frequency.

    Each sentence in `arr` is treated as one document. A word piece that occurs
    in `c` of the `N` sentences gets idf = log((N + 1) / (c + 1)); word pieces
    that were never seen fall back to log(N + 1).

    Args:
        - :param: `arr` (list of str) : sentences to process.
        - :param: `tokenizer` : a BERT tokenizer corresponds to `model`.
        - :param: `nthreads` (int) : number of worker processes used to
                  tokenize the sentences.

    Returns:
        defaultdict mapping word piece id -> idf (float).
    """
    num_docs = len(arr)

    process_partial = partial(process, tokenizer=tokenizer)

    with Pool(nthreads) as p:
        encoded_docs = p.map(process_partial, arr)

    # Document frequency: count every word piece at most once per sentence.
    # Counting every occurrence (as before) yields term frequency, lets the
    # count exceed num_docs and drives the idf of frequent pieces negative.
    idf_count = Counter(chain.from_iterable(map(set, encoded_docs)))

    idf_dict = defaultdict(lambda: log((num_docs + 1) / (1)))
    idf_dict.update({idx: log((num_docs + 1) / (c + 1)) for (idx, c) in idf_count.items()})
    return idf_dict


In [4]:
# Build IDF tables for the WMT17 data: for every system output, one table over
# the (detokenized) source sentences and one over the (detokenized, true-cased)
# translations, stored under "<testset>-<lp>-<system>".
include_path = 'Metrics/WMT17/'
data_idf_dict = {}
dataset = find_corpus("WMT17")
for reference_path, lp in dataset.items():
    references = load_data(os.path.join(include_path + 'references/', reference_path))
    src, tgt = lp.split('-')
    # Source files share the reference file's stem, with 'ref' -> 'src' and the
    # source language code as extension.
    source_path = reference_path.replace('ref', 'src')
    source_path = source_path.split('.')[0] + '.' + src
    source = load_data(os.path.join(include_path + 'source', source_path))
    all_meta_data = load_metadata(os.path.join(include_path + 'system-outputs', lp))
    with MosesDetokenizer(src) as detokenize:
        source = [detokenize(s.split(' ')) for s in source]
    with MosesDetokenizer(tgt) as detokenize:
        references = [detokenize(s.split(' ')) for s in references]

    # The source sentences are the same for every system of this language pair,
    # so their IDF table is computed once instead of once per system (each call
    # tokenizes the whole corpus in a fresh process pool).
    source_idf = get_idf_dict(source, tokenizer)

    for path, testset, lp, system in all_meta_data:
        translations = load_data(path)

        with MosesDetokenizer(tgt) as detokenize:
            translations = [detokenize(s.split(' ')) for s in translations]
        translations = [truecase.get_true_case(s) for s in translations]
        data_idf_dict[f"{testset}-{lp}-{system}"] = {
            "src": source_idf,
            "trans": get_idf_dict(translations, tokenizer),
        }
    # NOTE(review): only the first language pair is processed. `source` and
    # `translations` from this pass are read by later scratch cells — confirm
    # before removing the break.
    break

In [7]:
# Peek at the first sentence of `translations`, i.e. the detokenized and
# true-cased output of the last system handled by the IDF cell.
translations[0]

'Eight-Year-Old chef found dead in store in San Francisco'

In [64]:
# %%
import transformers
# Multilingual cased BERT encoder and its word-piece tokenizer. This rebinds
# the global `tokenizer`, which another cell already loads from the same
# checkpoint.
model = transformers.BertModel.from_pretrained('bert-base-multilingual-cased')
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-multilingual-cased')
import itertools
from sklearn.metrics.pairwise import cosine_similarity
def get_similarity(X: np.ndarray, Y: np.ndarray) -> np.ndarray:
    """Pairwise cosine similarity between rows of X and Y, rescaled to [0, 1].

    The raw cosine similarity lies in [-1, 1]; it is shifted by 1 and halved.
    """
    cosine = cosine_similarity(X, Y)
    return (cosine + 1.0) / 2.0
# Prefer the GPU, but fall back to the CPU so the notebook also runs on
# machines without CUDA (moving the model to a hard-coded 'cuda' device fails
# there).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def create_awesome_data(src, trans):
    """Align the sub-words of `src` and `trans` and score the aligned pairs.

    Both sentences are split on whitespace, tokenized word by word, and run
    through the global `model`. Hidden states of layer `align_layer` (first
    and last position dropped — presumably the special tokens added by
    prepare_for_model) are compared by dot product. A sub-word pair counts as
    aligned when its softmax probability exceeds `threshold` in both
    directions.

    Args:
        src: source sentence (str).
        trans: translated sentence (str).

    Returns:
        dict with
        - "awesome_align_sim_scores": list of float, the get_similarity value
          of every aligned sub-word pair;
        - "align_subwords": list of [src_index, tgt_index] pairs;
        - "cos": the full similarity matrix returned by get_similarity.
    """
    sent_src, sent_tgt = src.strip().split(), trans.strip().split()
    token_src = [tokenizer.tokenize(word) for word in sent_src]
    token_tgt = [tokenizer.tokenize(word) for word in sent_tgt]
    wid_src = [tokenizer.convert_tokens_to_ids(x) for x in token_src]
    wid_tgt = [tokenizer.convert_tokens_to_ids(x) for x in token_tgt]

    # NOTE(review): the documented length argument of prepare_for_model is
    # `max_length`; `model_max_length` is passed through unchanged here —
    # confirm truncation behaves as intended.
    ids_src = tokenizer.prepare_for_model(list(itertools.chain(*wid_src)), return_tensors='pt', model_max_length=tokenizer.model_max_length, truncation=True)['input_ids']
    ids_tgt = tokenizer.prepare_for_model(list(itertools.chain(*wid_tgt)), return_tensors='pt', truncation=True, model_max_length=tokenizer.model_max_length)['input_ids']
    ids_src, ids_tgt = ids_src.to(device), ids_tgt.to(device)

    # alignment
    align_layer = 8
    threshold = 1e-3
    model.to(device)
    model.eval()
    with torch.no_grad():
        out_src = model(ids_src.unsqueeze(0), output_hidden_states=True)[2][align_layer][0, 1:-1]
        out_tgt = model(ids_tgt.unsqueeze(0), output_hidden_states=True)[2][align_layer][0, 1:-1]

        dot_prod = torch.matmul(out_src, out_tgt.transpose(-1, -2))

        # Softmax over target positions (src -> tgt) and over source positions
        # (tgt -> src); keep pairs that pass the threshold in both directions.
        softmax_srctgt = torch.nn.Softmax(dim=-1)(dot_prod)
        softmax_tgtsrc = torch.nn.Softmax(dim=-2)(dot_prod)

        softmax_inter = (softmax_srctgt > threshold)*(softmax_tgtsrc > threshold)

    align_subwords = torch.nonzero(softmax_inter, as_tuple=False)
    cos = get_similarity(out_src.cpu(), out_tgt.cpu())

    # Gather all aligned similarities in one indexing operation instead of
    # indexing the matrix element by element with device tensors.
    pairs = align_subwords.cpu().numpy()
    sims = cos[pairs[:, 0], pairs[:, 1]].tolist()

    return {"awesome_align_sim_scores": sims, "align_subwords": align_subwords.tolist(), "cos": cos}

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [65]:
# Sanity check: vocabulary id 117 decodes to ',' (see the recorded output).
tokenizer.decode([117])

','

In [39]:
# Smoke run on the first source/translation pair.
# NOTE(review): this rebinds `data`, the same name the JSON-loading cell uses
# for the results mapping that evaluate_results consumes — consider a distinct
# name.
data = create_awesome_data(source[0], translations[0])

In [60]:
def create_token_ids(text):
    """Return the flat list of word-piece ids for `text`.

    The text is split on whitespace and each word is tokenized on its own
    with the global `tokenizer`; no special tokens are added.
    """
    flat_ids = []
    for word in text.split():
        pieces = tokenizer.tokenize(word)
        flat_ids.extend(tokenizer.convert_tokens_to_ids(pieces))
    return flat_ids

In [25]:
# Scratch: per-word word-piece tokenization of the first source sentence and the
# first translation (the same first step create_awesome_data performs).
token_src, token_tgt = [tokenizer.tokenize(word) for word in source[0].split()], [tokenizer.tokenize(word) for word in translations[0].split()]

In [26]:
# Scratch: convert each word's word pieces to vocabulary ids.
wid_src, wid_tgt = [tokenizer.convert_tokens_to_ids(x) for x in token_src], [tokenizer.convert_tokens_to_ids(x) for x in token_tgt]

In [33]:
# Scratch length check (recorded output: 15). In the visible cells `ids_tgt` is
# only assigned inside create_awesome_data — NOTE(review): confirm where the
# notebook-level binding came from.
len(ids_tgt)

15