## `BERTScore` metric with `BERT` alternatives 

In [1]:
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.10-py3-none-any.whl (59 kB)
[?25l[K     |█████▌                          | 10 kB 31.1 MB/s eta 0:00:01[K     |███████████                     | 20 kB 33.7 MB/s eta 0:00:01[K     |████████████████▍               | 30 kB 22.4 MB/s eta 0:00:01[K     |██████████████████████          | 40 kB 18.1 MB/s eta 0:00:01[K     |███████████████████████████▍    | 51 kB 9.2 MB/s eta 0:00:01[K     |████████████████████████████████| 59 kB 4.5 MB/s 
Collecting transformers>=3.0.0numpy
  Downloading transformers-4.9.2-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 18.0 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 62.1 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylin

In [2]:
# check installation: the import fails if the package is missing,
# and the last expression displays the installed version
import bert_score
bert_score.__version__

'0.3.10'

In [3]:
import transformers
import tracemalloc
import random
import pandas as pd
import numpy as np
from time import perf_counter
from bert_score import score
from scipy.stats import pearsonr

In [4]:
# get dataset
from google.colab import drive

# Colab only: make Google Drive available below /content/drive
drive.mount("/content/drive")

# read just the three columns that the rest of the notebook uses
dpath = "/content/drive/MyDrive/data/MSE/scores_deen.csv"
df = pd.read_csv(
    dpath,
    usecols=['mt', 'ref', 'raw_score'],
)

Mounted at /content/drive


In [5]:
# turn each column of the dataset into a plain Python list
translations = df['mt'].tolist()
references = df['ref'].tolist()
raw_scores = df['raw_score'].tolist()

Calculating scores


In [6]:
# Column labels of the results table; used as dict keys and DataFrame columns below.
VARIANT = "Variant"              # model identifier
CORRELATION = "Pearson Corr."    # correlation between F1 and the human scores
TIME = "Time F1 Score"           # time spent computing the scores
BERTTIME = "% Time BERT"         # time as a percentage of the first (BERT) entry
MEMORY = "max Memory Usage"      # peak memory reported by tracemalloc
BERTMEM = "% Memory BERT"        # peak memory as a percentage of the first (BERT) entry

In [7]:
def get_variants():
  """
  lists the BERT variants to evaluate and prepares an empty container for the results
  :returns: variants (list): model identifiers to test on
  :returns: results (dict): one empty list per result column, to be filled in
  """
  variants = [
      "bert-base-uncased",
      "roberta-base",
      "albert-base-v2",
      "distilbert-base-uncased",
      "microsoft/deberta-base",
  ]
  columns = (VARIANT, CORRELATION, TIME, BERTTIME, MEMORY, BERTMEM)
  # the comprehension creates a separate list object for every column
  results = {column: [] for column in columns}
  return variants, results

In [13]:
def get_scores(cands, refs, raw_scores, idf):
  """
  calculates F1 scores for given translations and references with every variant
  returned by get_variants() and a pearson correlation with human scores,
  recording the time taken and the peak traced memory per variant
  :param:   cands (list): list of machine translations
  :param:   refs (list): list of reference translations
  :param:   raw_scores (list): list of human scores between references and mt
  :param:   idf (Boolean): whether to use idf importance weighting
  :returns: results (dict): dict of variant, time and memory footprint and correlation
  """
  variants, results = get_variants()
  for v in variants:
    # NOTE(review): tracemalloc traces allocations made through Python's allocator;
    # memory held natively by the model (e.g. on a GPU) is presumably not included - confirm
    # before interpreting the memory column.
    tracemalloc.start()
    try:
      start = perf_counter()
      _, _, F1 = score(cands, refs, lang='en', model_type=v, idf=idf)
      # measure once so the stored and the printed duration agree
      elapsed = perf_counter() - start
      current, peak = tracemalloc.get_traced_memory()
    finally:
      # always switch tracing off, even if scoring fails for this variant
      tracemalloc.stop()
    print(f'Current memory usage for {v} is {current / 10**6}MB; Peak was {peak / 10**6}MB, time taken {elapsed:.2f} s ({elapsed / 60:.2f} min)')
    assert len(F1) == len(raw_scores), f"Scores are not of the same length, (F1: {len(F1)}, human: {len(raw_scores)})"
    corr, _ = pearsonr(F1, raw_scores)

    # append all columns together so the lists stay aligned
    results[VARIANT].append(v)
    results[TIME].append(elapsed)
    results[MEMORY].append(peak)
    results[CORRELATION].append(f'{corr:.3f}')
  return results

In [9]:
# ignore warning messages from transformers
# NOTE(review): filterwarnings("ignore") hides every Python warning for the rest of the
# session, not only those raised by transformers.
import warnings
warnings.filterwarnings("ignore")

# Use the logging utilities through the already imported `transformers` package so that
# the global name `logging` is not bound to a module other than the standard library one.
transformers.logging.set_verbosity_error()

In [10]:
# bundle each translation with its reference and its score: one 3-tuple per example
d_set = [*zip(translations, references, raw_scores)]

In [14]:
# limited number of examples due to colab limitations
num_ex = 10000
n = 5
# fixed seed so that a fresh run draws the same samples again
SEED = 42
random.seed(SEED)
# random.sample raises ValueError when asked for more items than the data holds
sample_size = min(num_ex, len(d_set))
res = {}
# iterate over samples of given data
for i in range(n):
  sample = random.sample(d_set, sample_size)
  # split the sampled (mt, ref, score) tuples back into three parallel sequences
  t, r, s = zip(*sample)
  print(f'Iteration {i+1}')
  res[i] = get_scores(t, r, s, idf=True)


Iteration 1
Current memory usage for bert-base-uncased is 38.823393MB; Peak was 60.366585MB, time taken 44.29 s (0.74 min)
Current memory usage for roberta-base is 28.636529MB; Peak was 54.927814MB, time taken 42.06 s (0.70 min)
Current memory usage for albert-base-v2 is 0.174684MB; Peak was 19.734051MB, time taken 34.08 s (0.57 min)
Current memory usage for distilbert-base-uncased is 9.411342MB; Peak was 30.974451MB, time taken 33.80 s (0.56 min)
Current memory usage for microsoft/deberta-base is 28.609074MB; Peak was 56.187488MB, time taken 44.90 s (0.75 min)
Iteration 2
Current memory usage for bert-base-uncased is 9.805145MB; Peak was 31.455536MB, time taken 44.02 s (0.73 min)
Current memory usage for roberta-base is 0.223263MB; Peak was 54.968443MB, time taken 42.70 s (0.71 min)
Current memory usage for albert-base-v2 is 0.163775MB; Peak was 19.818513MB, time taken 34.79 s (0.58 min)
Current memory usage for distilbert-base-uncased is 9.408906MB; Peak was 31.074832MB, time taken 3

In [15]:
# combine multiple runs into average DataFrame
# The two percentage columns are filled in later; they are left out here without
# modifying `res`, so this cell can be re-run and gives the same result.
derived_columns = (BERTMEM, BERTTIME)
df_list = []
for i in range(n):
  run = {key: values for key, values in res[i].items() if key not in derived_columns}
  # correlations are stored as formatted strings, convert them back before averaging
  df_list.append(pd.DataFrame.from_dict(run).astype({CORRELATION: 'float64'}))
# separate name so the dataset frame `df` loaded earlier is not overwritten here
runs = pd.concat(df_list)
# sort=False keeps the variants in their original order
average = runs.groupby(VARIANT, sort=False, as_index=False).mean()
results = average.to_dict(orient='list')

In [16]:
# compare time taken and memory used to base bert
BASELINE = "bert-base-uncased"
# take the variants from the averaged results instead of repeating the list by hand
variants = results[VARIANT]
# look the baseline up by name rather than assuming it is the first entry
base = variants.index(BASELINE)

results[BERTTIME] = [f'{t / results[TIME][base] * 100:.1f}' for t in results[TIME]]
results[BERTMEM] = [f'{m / results[MEMORY][base] * 100:.1f}' for m in results[MEMORY]]

In [None]:
# build the report table (reported columns only, indexed by variant) and write it to disk
report_columns = [VARIANT, CORRELATION, BERTTIME, BERTMEM]
df = pd.DataFrame(results, columns=report_columns)
df = df.set_index(VARIANT)
df.to_csv("BERTScore_results.csv")

In [None]:
df

Unnamed: 0_level_0,Pearson Corr.,% Time BERT,% Memory BERT
Variant,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
bert-base-uncased,0.3494,100.0,100.0
roberta-base,0.355,111.5,276.7
albert-base-v2,0.3444,79.1,35.9
distilbert-base-uncased,0.3496,79.7,99.0
microsoft/deberta-base,0.3592,115.5,275.7
