# TensorFlow 2.7 + Python 3.9

In [1]:
# Record the Python interpreter version used for this run.
!python --version

Python 3.9.9


In [2]:
# Record the installed versions of every package whose name contains "tensorflow".
!pip list | grep tensorflow

tensorflow                   2.7.0
tensorflow-addons            0.15.0
tensorflow-estimator         2.7.0
tensorflow-hub               0.12.0
tensorflow-io-gcs-filesystem 0.23.1
tensorflow-text              2.7.3


In [3]:
# Record the installed transformers version.
!pip list | grep transformers

transformers                 4.13.0


### Prepare

In [4]:
from tqdm import tqdm
from typing import Tuple
import pandas as pd

import common

In [5]:
# Optional one-time setup, intentionally left commented out.
# NOTE(review): presumably only needed when notebook widgets fail to render — confirm or remove.
#!pip install ipywidgets
#!jupyter nbextension enable --py widgetsnbextension

In [6]:
# Display the first test case to show what a TEST_CASES entry looks like.
common.TEST_CASES[0]

{'model_name': 'bert',
 'model_weights': None,
 'texts': ['Good evening.', 'here is the sentence I want embeddings for.'],
 'expected_shape': [(3, 768), (9, 768)],
 'expected_sequence_vec': [[0.6569931, 0.77279466],
  [0.21718428, 0.34955627, 0.59124136, 0.6869872, 0.16993292]],
 'expected_cls_vec': [[0.29528213,
   0.5543281,
   -0.4091331,
   0.65817744,
   0.81740487],
  [-0.17215663, 0.26811457, -0.1922609, -0.63926417, -1.626383]]}

In [7]:
# Every "<model_name>+<model_weights>" key must identify exactly one test case;
# the cell evaluates to True when no two test cases share a key.
case_keys = {
    f"{case['model_name']}+{case['model_weights']}"
    for case in common.TEST_CASES
}
len(case_keys) == len(common.TEST_CASES)

True

In [8]:
%%capture
# Build a featurizer for every test case once and discard the result.
# %%capture (which must stay on the first line of the cell) silences the cell's captured output.
# NOTE(review): presumably this pre-downloads/caches model weights before the comparison — confirm in common.get_featurizer.
for test_case in tqdm(common.TEST_CASES):
    common.get_featurizer(test_case)

2022-01-10 18:31:24.253438: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-01-10 18:31:39.765596: W tensorflow/python/util/util.cc:368] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


### Compare

In [9]:
# Run the comparison over every test case. The result is used below as a
# pandas DataFrame (sort_values / to_csv / boolean filtering on 'max_diff').
comparisons = common.collect_comparisons(common.TEST_CASES)

  return np.array(nonpadded_sequence_embeddings)
  np.array(post_processed_sequence_embeddings),
  return np.array(reshaped_sequence_embeddings)
  sequence_final_embeddings = np.array(sequence_final_embeddings)
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:50<00:00,  7.25s/it]


In [10]:
# Show the identifying columns plus every column whose name contains 'diff',
# ordered from the smallest to the largest max_diff.
diff_columns = [name for name in comparisons.columns if 'diff' in name]
shown_columns = ['model_name', 'model_weights', 'text'] + diff_columns
comparisons.sort_values('max_diff')[shown_columns]

Unnamed: 0,model_name,model_weights,text,sequence_diff,sentence_diff,max_diff
10,distilbert,,Good evening.,2.831221e-07,1.639128e-07,2.831221e-07
13,roberta,,here is the sentence I want embeddings for.,2.86761e-07,2.086163e-07,2.86761e-07
11,distilbert,,here is the sentence I want embeddings for.,4.172325e-07,8.940697e-08,4.172325e-07
0,bert,,Good evening.,3.537218e-07,4.813641e-07,4.813641e-07
12,roberta,,Good evening.,9.378815e-08,5.587935e-07,5.587935e-07
6,gpt2,,Good evening.,2.975506e-07,5.662441e-07,5.662441e-07
3,bert,bert-base-uncased,here is the sentence I want embeddings for.,7.756384e-07,1.48993e-07,7.756384e-07
1,bert,,here is the sentence I want embeddings for.,5.247802e-07,9.504773e-07,9.504773e-07
7,gpt2,,here is the sentence I want embeddings for.,9.816301e-07,2.384186e-07,9.816301e-07
2,bert,bert-base-uncased,Good evening.,1.504973e-06,1.603702e-07,1.504973e-06


In [11]:
# Persist the full comparison table for this TF 2.7 run; index=False omits the row index from the file.
comparisons.to_csv('comparison-tf27.csv',index=False)

In [12]:
# Models that have at least one row whose max_diff is below the 1e-4 tolerance.
within_tolerance = comparisons['max_diff'] < 1e-4
comparisons.loc[within_tolerance, 'model_name'].unique()

array(['bert', 'gpt2', 'distilbert', 'roberta'], dtype=object)

In [13]:
# Models that have at least one row whose max_diff is at or above the 1e-4 tolerance.
# Using >= (rather than >) makes this the exact complement of the `< 1e-4` filter,
# so a row sitting exactly on the threshold is reported here instead of being
# silently left out of both lists.
comparisons.loc[comparisons['max_diff'] >= 1e-4, 'model_name'].unique()

array(['gpt', 'xlnet'], dtype=object)

## If we update the gpt and xlnet tests, the new "expected" values should be:

In [14]:
# Keep only the rows belonging to the xlnet and gpt models.
is_xlnet_or_gpt = comparisons['model_name'].isin(['xlnet', 'gpt'])
xlnet_gpt = comparisons[is_xlnet_or_gpt]

In [15]:
# For each selected row, print the model and text followed by the actual
# sequence and sentence values, each prefixed with a short label.
labelled_columns = (('seq', 'sequence_actual'), ('sent', 'sentence_actual'))
for _, row in xlnet_gpt.iterrows():
    print(row['model_name'], row['text'])
    for label, column in labelled_columns:
        print(label, row[column])

gpt Good evening.
seq [-0.06324312090873718, 0.4072571396827698]
sent [0.1720070093870163, 0.1511477530002594, 0.39497435092926025, -0.5745484828948975, 0.05334469676017761]
gpt here is the sentence I want embeddings for.
seq [0.8041259050369263, -0.08877559006214142, 0.9976294636726379, -0.38815218210220337, 0.08530596643686295]
sent [0.4095669686794281, -0.11725597828626633, -0.30236583948135376, -0.4023253917694092, 0.6285617351531982]
xlnet Good evening.
seq [1.7588920593261719, 2.578641176223755]
sent [2.168766498565674, -1.5277889966964722, -3.2499680519104004, 0.23829853534698486, -1.603652000427246]
xlnet here is the sentence I want embeddings for.
seq [0.7821242213249207, 0.6983698606491089, 1.5819640159606934, 1.891527533531189, 2.511735200881958]
sent [1.643880844116211, 0.023089325055480003, -2.497927665710449, 1.4621683359146118, -2.5919559001922607]


## Inspect Embeddings, Weights, ...

In [16]:
# Pick the first test case whose model_name is 'xlnet';
# next() raises StopIteration if there is none.
xlnet_cases = (case for case in common.TEST_CASES if case['model_name'] == 'xlnet')
test_case = next(xlnet_cases)

In [17]:
# Display the selected test case, including its currently recorded expected values.
test_case

{'model_name': 'xlnet',
 'model_weights': None,
 'texts': ['Good evening.', 'here is the sentence I want embeddings for.'],
 'expected_shape': [(3, 768), (9, 768)],
 'expected_sequence_vec': [[1.7612367868423462, 2.5819129943847656],
  [0.784195065498352,
   0.7068007588386536,
   1.5883606672286987,
   1.891886591911316,
   2.5209126472473145]],
 'expected_cls_vec': [[2.171574831008911,
   -1.5377449989318848,
   -3.2671749591827393,
   0.22520869970321655,
   -1.598855972290039],
  [1.6516317129135132,
   0.021670114248991013,
   -2.5114030838012695,
   1.447351098060608,
   -2.5866634845733643]]}

In [18]:
# Build the featurizer for the currently selected test_case so its tokenizer can be inspected below.
tmp_featurizer = common.get_featurizer(test_case)

In [19]:
# Token ids for the first text, encoded without special tokens (add_special_tokens=False).
tmp_featurizer.tokenizer.encode(test_case['texts'][0], add_special_tokens=False)

[2803, 2060, 9]

In [20]:
# Token ids for the second text, encoded without special tokens (add_special_tokens=False).
# NOTE(review): the recorded output has 11 ids while expected_shape for this text is (9, 768) —
# confirm how token ids map to embedding rows.
tmp_featurizer.tokenizer.encode(test_case['texts'][1], add_special_tokens=False)

[193, 27, 18, 3833, 35, 210, 26405, 3487, 23, 28, 9]