# tf2.6 + Python 3.8

In [1]:
# Print the version of the `python` executable found on the shell PATH.
!python --version

Python 3.8.10


In [2]:
# List the installed pip packages whose listing line contains "tensorflow".
!pip list | grep tensorflow

tensorflow               2.6.1
tensorflow-addons        0.14.0
tensorflow-estimator     2.6.0
tensorflow-hub           0.12.0
tensorflow-probability   0.13.0
tensorflow-text          2.6.0


In [3]:
# List the installed pip packages whose listing line contains "transformers".
!pip list | grep transformers

transformers             2.11.0


### Prepare

In [4]:
from tqdm import tqdm
from typing import Tuple
import pandas as pd

import common

In [5]:
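# Optional one-time setup (left disabled): uncomment to install ipywidgets
# and enable its notebook extension.
#!pip install ipywidgets
#!jupyter nbextension enable --py widgetsnbextension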

In [6]:
# Show the first entry of common.TEST_CASES as an example of the test-case structure.
common.TEST_CASES[0]

{'model_name': 'bert',
 'model_weights': None,
 'texts': ['Good evening.', 'here is the sentence I want embeddings for.'],
 'expected_shape': [(3, 768), (9, 768)],
 'expected_sequence_vec': [[0.6569931, 0.77279466],
  [0.21718428, 0.34955627, 0.59124136, 0.6869872, 0.16993292]],
 'expected_cls_vec': [[0.29528213,
   0.5543281,
   -0.4091331,
   0.65817744,
   0.81740487],
  [-0.17215663, 0.26811457, -0.1922609, -0.63926417, -1.626383]]}

In [7]:
# Sanity check: each "model_name+model_weights" key must occur only once,
# i.e. the number of distinct keys equals the number of test cases.
case_keys = {
    f"{case['model_name']}+{case['model_weights']}"
    for case in common.TEST_CASES
}
len(case_keys) == len(common.TEST_CASES)

True

In [8]:
%%capture
# Call common.get_featurizer once for every test case; the return values are discarded.
# NOTE(review): presumably this pre-loads/downloads the models so the comparison
# below is not slowed down by it — confirm in common.get_featurizer.
# %%capture silences the cell's Python-level output, which includes the tqdm bar.
for test_case in tqdm(common.TEST_CASES):
    common.get_featurizer(test_case)

2022-01-10 11:23:11.004769: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-01-10 11:23:19.830459: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


### Compare

In [9]:
# Collect the comparison results for every test case. Later cells use
# `comparisons` as a pandas DataFrame (sort_values, column selection, to_csv).
comparisons = common.collect_comparisons(common.TEST_CASES)

  return np.array(nonpadded_sequence_embeddings)
  np.array(post_processed_sequence_embeddings),
  return np.array(reshaped_sequence_embeddings)
  sequence_final_embeddings = np.array(sequence_final_embeddings)
 29%|█████████████████████████████████████████████████████████▏                                                                                                                                              | 2/7 [00:11<00:26,  5.36s/it]ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:27<00:00,  3.89s/it]


In [10]:
# Show the identifying columns plus every column whose name contains 'diff',
# ordered by ascending max_diff.
diff_columns = [name for name in comparisons.columns if 'diff' in name]
shown_columns = ['model_name', 'model_weights'] + diff_columns
comparisons.sort_values('max_diff')[shown_columns]

Unnamed: 0,model_name,model_weights,sequence_diff,sentence_diff,max_diff
10,distilbert,,2.831221e-07,1.639128e-07,2.831221e-07
13,roberta,,2.86761e-07,2.086163e-07,2.86761e-07
11,distilbert,,4.172325e-07,8.940697e-08,4.172325e-07
0,bert,,3.537218e-07,4.813641e-07,4.813641e-07
12,roberta,,9.378815e-08,5.587935e-07,5.587935e-07
6,gpt2,,2.975506e-07,5.662441e-07,5.662441e-07
3,bert,bert-base-uncased,7.756384e-07,1.48993e-07,7.756384e-07
1,bert,,5.247802e-07,9.504773e-07,9.504773e-07
7,gpt2,,9.816301e-07,2.384186e-07,9.816301e-07
4,gpt,,9.387732e-07,1.132488e-06,1.132488e-06


In [11]:
# Write the comparison table to 'comparison-tf26.csv', omitting the index column.
comparisons.to_csv('comparison-tf26.csv',index=False)

In [12]:
# Models that have at least one case whose max_diff is below the 1e-4 tolerance.
within_tolerance = comparisons['max_diff'] < 1e-4
comparisons.loc[within_tolerance, 'model_name'].unique()

array(['bert', 'gpt', 'gpt2', 'xlnet', 'distilbert', 'roberta'],
      dtype=object)

In [13]:
# Models with at least one case that does NOT meet the 1e-4 tolerance.
# Written as the negation of the pass criterion (max_diff < 1e-4) so that rows
# whose max_diff is exactly at the tolerance or NaN (any comparison against NaN
# is False) are reported here instead of being absent from both the pass and
# the fail listing.
comparisons.loc[~(comparisons['max_diff'] < 1e-4), 'model_name'].unique()

array([], dtype=object)

In [14]:
# NOTE(review): repeats the earlier export to 'comparison-tf26.csv' with the same
# arguments; `comparisons` is not reassigned in between, so this looks redundant —
# confirm before removing.
comparisons.to_csv('comparison-tf26.csv', index=False)

## Inspect Embeddings, Weights, ...

In [15]:
# Pick the first test case whose model_name is 'xlnet'
# (next() raises StopIteration if there is none).
test_case = next(
    case
    for case in common.TEST_CASES
    if case['model_name'] == 'xlnet'
)

In [16]:
# Display the selected test case.
test_case

{'model_name': 'xlnet',
 'model_weights': None,
 'texts': ['Good evening.', 'here is the sentence I want embeddings for.'],
 'expected_shape': [(3, 768), (9, 768)],
 'expected_sequence_vec': [[1.7612367868423462, 2.5819129943847656],
  [0.784195065498352,
   0.7068007588386536,
   1.5883606672286987,
   1.891886591911316,
   2.5209126472473145]],
 'expected_cls_vec': [[2.171574831008911,
   -1.5377449989318848,
   -3.2671749591827393,
   0.22520869970321655,
   -1.598855972290039],
  [1.6516317129135132,
   0.021670114248991013,
   -2.5114030838012695,
   1.447351098060608,
   -2.5866634845733643]]}

In [17]:
# Build the featurizer for the selected test case so its tokenizer can be inspected.
tmp_featurizer = common.get_featurizer(test_case)

In [18]:
# Token ids the featurizer's tokenizer produces for the first text,
# with no special tokens added.
tmp_featurizer.tokenizer.encode(test_case['texts'][0], add_special_tokens=False)

[2803, 2060, 9]

In [19]:
# Token ids the featurizer's tokenizer produces for the second text,
# with no special tokens added.
# NOTE(review): this returns 11 ids while the test case's expected_shape lists
# 9 rows for this text — presumably the featurizer merges sub-word pieces or
# drops some tokens afterwards; confirm.
tmp_featurizer.tokenizer.encode(test_case['texts'][1], add_special_tokens=False)

[193, 27, 18, 3833, 35, 210, 26405, 3487, 23, 28, 9]