## Imports

In [1]:
!pip install transformers==3.3.0
import numpy as np
import string, random
import os
from time import perf_counter
import pandas as pd
from scipy.stats import pearsonr
import tracemalloc



In [2]:
# Model used for scoring; one of
# "distilbert-base-uncased", "bert-base-uncased", "roberta-base", "albert-base-v2"
model_name = "albert-base-v2"
# Publish the choice via the environment. This runs before moverscore_v2 is
# imported further down (presumably the library reads the variable at import
# time -- confirm against the moverscore source).
os.environ["MOVERSCORE_MODEL"] = model_name

In [3]:
# installs: MoverScore itself plus sentencepiece
# (sentencepiece is presumably required by the ALBERT tokenizer -- confirm).
# Both are unpinned, unlike transformers, which the first cell pins to 3.3.0.
!pip3 install moverscore
!pip install sentencepiece



In [4]:
from moverscore_v2 import get_idf_dict, word_mover_score

Downloading:   0%|          | 0.00/442 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

## Preprocessing

In [5]:
# load data file from drive
# (Colab-only: mounts Google Drive, then reads an absolute path inside the mount)
from google.colab import drive
drive.mount("/content/drive")
dpath = "/content/drive/MyDrive/data/MSE/scores_deen.csv"
# keep only the columns used below: machine translation, reference, human score
df = pd.read_csv(dpath, usecols=['mt', 'ref', 'raw_score'])

Mounted at /content/drive


In [7]:
# split the frame into three parallel lists:
# machine translations, references and human scores (same row order)
translations, references, raw_scores = (
    df[column].tolist() for column in ('mt', 'ref', 'raw_score')
)

In [8]:
# Column labels shared by the per-run results dict and the exported table.
VARIANT = 'Variant'              # name of the scored model
CORRELATION = 'Pearson Corr.'    # correlation between metric scores and human scores

# Measurements taken around the scoring call.
TIME = 'Time F1 Score'
MEMORY = 'max Memory Usage'

# BERT-specific shares: part of the results dict, but not filled by the
# cells visible in this notebook.
BERTTIME = '% Time BERT'
BERTMEM = '% Memory BERT'

In [9]:
def results_dict():
  """
  Create a fresh results container with one empty list per tracked column.
  :returns: results (dict): maps every column label to an empty list to fill in
  """
  # insertion order matters: it becomes the column order of the per-run frame
  columns = (VARIANT, CORRELATION, TIME, BERTTIME, MEMORY, BERTMEM)
  return {column: [] for column in columns}

## Scoring

In [15]:
# scoring helper: one call scores one sample with the configured model and
# records duration, traced memory and correlation with the human scores
# (call it once per sample and average the returned dicts for robust numbers)
def get_scores(references, translations, raw_scores):
  """
  calculates MoverScore for given references and translations and a pearson
  correlation with human scores
  :param:   references (list): list of reference translations
  :param:   translations (list): list of machine translations
  :param:   raw_scores (list): list of human scores between references and mt
  :returns: results (dict): dict of variant, time and memory footprint and correlation;
            every filled column holds exactly one entry, the correlation is stored
            as a string rounded to three decimals
  :raises:  ValueError: if the three inputs do not have the same length
  """
  # fail fast: scoring takes minutes, so reject misaligned inputs before starting
  if not (len(references) == len(translations) == len(raw_scores)):
    raise ValueError(
        "references, translations and raw_scores must have the same length "
        f"(got {len(references)}, {len(translations)}, {len(raw_scores)})")

  results = results_dict()
  m = model_name
  print(m)
  # idf weights are built outside the timed/traced section
  idf_dict_hyp = get_idf_dict(translations)
  idf_dict_ref = get_idf_dict(references)
  results[VARIANT].append(m)

  # NOTE: tracemalloc only traces allocations made through Python's allocators,
  # so the peak presumably excludes native/GPU memory held by the model -- treat
  # it as a lower bound. Tracing also adds some overhead to the measured time.
  tracemalloc.start()
  try:
    s = perf_counter()
    scores = word_mover_score(references, translations, idf_dict_ref, idf_dict_hyp,
                              stop_words=[], n_gram=1, remove_subwords=True, batch_size=16)
    elapsed = perf_counter() - s
    current, peak = tracemalloc.get_traced_memory()
  finally:
    # always stop tracing, even if scoring fails, so later runs start clean
    tracemalloc.stop()

  # report the same duration that is stored in the results
  results[TIME].append(elapsed)
  print(f'Current memory usage is {current / 10**6}MB; Peak was {peak / 10**6}MB, time taken {elapsed:.2f} s')
  results[MEMORY].append(peak)

  # report lengths only; dumping thousands of scores makes the message unreadable
  assert len(scores) == len(raw_scores), (
      f"Scores are not of the same length, (scores: {len(scores)}, human: {len(raw_scores)})")
  corr, _ = pearsonr(scores, raw_scores)

  results[CORRELATION].append(f'{corr:.3f}')
  return results

In [11]:
# only to ignore warning messages from transformers
# NOTE: no category/module filter is passed, so this silences every Python
# warning in the kernel, not just those raised by transformers
import warnings
warnings.filterwarnings("ignore")

# this is transformers' own logging module, not the stdlib `logging`
from transformers import logging

# reduce transformers' log output to error messages
logging.set_verbosity_error()

In [12]:
# zip translations, references and according scores together
# each entry is a (translation, reference, raw_score) tuple; keep this order
# in mind when unpacking samples drawn from d_set
d_set = list(zip(translations, references, raw_scores))

In [16]:
# limited number of examples due to colab limitations
num_ex = 10000
n = 5
res = {}
# iterate over samples of given data
# NOTE: sampling is unseeded, so the drawn samples differ between notebook runs
for i in range(n):
  sample = random.sample(d_set, num_ex)
  # d_set rows are (translation, reference, human score)
  t, r, s = zip(*sample)
  print(f'Iteration {i+1}')
  # get_scores expects references first and translations second; pass by
  # keyword so the two roles cannot be swapped by position
  res[i] = get_scores(references=r, translations=t, raw_scores=s)


Iteration 1
albert-base-v2
Current memory usage is 0.483096MB; Peak was 3.183791MB, time taken 144.52 s
Iteration 2
albert-base-v2
Current memory usage is 0.432384MB; Peak was 3.198395MB, time taken 138.64 s
Iteration 3
albert-base-v2
Current memory usage is 0.366708MB; Peak was 3.251741MB, time taken 141.82 s
Iteration 4
albert-base-v2
Current memory usage is 0.515152MB; Peak was 3.109372MB, time taken 147.08 s
Iteration 5
albert-base-v2
Current memory usage is 0.475575MB; Peak was 3.273053MB, time taken 143.91 s


## Results

In [17]:
# combine multiple runs into average DataFrame
df_list = []
for i in range(n):
  # BERTMEM / BERTTIME are never filled by get_scores; their empty lists would
  # clash with the one-entry columns when building the frame. Filter them out
  # on a copy instead of deleting from `res`, so the raw results stay intact,
  # the cell can be re-run, and no error has to be swallowed.
  run = {key: values for key, values in res[i].items()
         if key not in (BERTMEM, BERTTIME)}
  # correlations are stored as formatted strings; cast them back to floats
  df_list.append(pd.DataFrame.from_dict(run).astype({CORRELATION: 'float64'}))
# NOTE: this rebinds `df`, which previously held the loaded data file
df = pd.concat(df_list)
# one row per variant, every measurement averaged over the n runs
average = df.groupby(VARIANT, sort=False, as_index=False).mean()
result_df = average.to_dict(orient='list')

In [22]:
# rebuild the averaged results as a table with a fixed column order,
# index it by variant and write it to a CSV named after the configured model
df = pd.DataFrame(result_df, columns=[VARIANT, CORRELATION, TIME, MEMORY])
df = df.set_index(VARIANT)
df.to_csv(f"MS_{os.environ.get('MOVERSCORE_MODEL')}.csv")

In [21]:
# show the current contents of df as the cell output
df

Unnamed: 0,Variant,Pearson Corr.,Time F1 Score,max Memory Usage
0,albert-base-v2,0.251,143.193808,3203270.4
