In [1]:
from collections import defaultdict
import itertools
import re
import subprocess

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import BertTokenizer, BertModel, BertForSequenceClassification

from stats_count import *
from grab_weights import grab_attention_weights, text_preprocessing

In [2]:
import warnings

warnings.filterwarnings('ignore')

In [3]:
!env | grep CUDA_VISIBLE

## Parameters

In [4]:
np.random.seed(42) # For reproducibility.

In [5]:
max_tokens_amount  = 128 # The number of tokens to which the tokenized text is truncated / padded.
stats_cap          = 500 # Max value that the feature can take. Is NOT applicable to Betty numbers.
    
layers_of_interest = [i for i in range(12)]  # Layers for which attention matrices and features on them are 
                                             # calculated. For calculating features on all layers, leave it be
                                             # [i for i in range(12)].
stats_name = "s_e_v_c_b0b1" # The set of topological features that will be count (see explanation below)

thresholds_array = [0.025, 0.05, 0.1, 0.25, 0.5, 0.75] # The set of thresholds
thrs = len(thresholds_array)                           # ("t" in the paper)

model_path = tokenizer_path = "bert-base-uncased"  

# You can use either standard or fine-tuned BERT. If you want to use fine-tuned BERT to your current task, save the
# model and the tokenizer with the commands tokenizer.save_pretrained(output_dir); 
# bert_classifier.save_pretrained(output_dir) into the same directory and insert the path to it here.

### Explanation of stats_name parameter

Currently, we implemented calculation of the following graphs features:
* "s"    - amount of strongly connected components
* "w"    - amount of weakly connected components
* "e"    - amount of edges
* "v"    - average vertex degree
* "c"    - amount of (directed) simple cycles
* "b0b1" - Betti numbers

The variable stats_name contains a string with the names of the features, which you want to calculate. The format of the string is the following:

"stat_name + "_" + stat_name + "_" + stat_name + ..."

**For example**:

`stats_name == "s_w"` means that the number of strongly and weakly connected components will be calculated

`stats_name == "b0b1"` means that only the Betti numbers will be calculated

`stats_name == "b0b1_c"` means that Betti numbers and the number of simple cycles will be calculated

e.t.c.

## Filenames

In [6]:
subset = "test_5k"           # .csv file with the texts, for which we count topological features
input_dir = "small_gpt_web/"  # Name of the directory with .csv file
output_dir = "small_gpt_web/" # Name of the directory with calculations results

prefix = output_dir + subset

r_file     = output_dir + 'attentions/' + subset  + "_all_heads_" + str(len(layers_of_interest)) + "_layers_MAX_LEN_" + \
             str(max_tokens_amount) + "_" + model_path.split("/")[-1]
# Name of the file for attention matrices weights

stats_file = output_dir + 'features/' + subset + "_all_heads_" + str(len(layers_of_interest)) + "_layers_" + stats_name \
             + "_lists_array_" + str(thrs) + "_thrs_MAX_LEN_" + str(max_tokens_amount) + \
             "_" + model_path.split("/")[-1] + '.npy'
# Name of the file for topological features array

In [7]:
stats_file

'small_gpt_web/features/test_5k_all_heads_12_layers_s_e_v_c_b0b1_lists_array_6_thrs_MAX_LEN_128_bert-base-uncased.npy'

In [8]:
r_file

'small_gpt_web/attentions/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased'

.csv file must contain the column with the name **sentence** with the texts. It can also contain the column **labels**, which will be needed for testing. Any other arbitrary columns will be ignored.

In [9]:
try:
    data = pd.read_csv(input_dir + subset + ".csv").reset_index(drop=True)
except:
    #data = pd.read_csv(input_dir + subset + ".tsv", delimiter="\t")
    data = pd.read_csv(input_dir + subset + ".tsv", delimiter="\t", header=None)
    data.columns = ["0", "labels", "2", "sentence"]

In [10]:
data.head()

Unnamed: 0.1,Unnamed: 0,id,ended,length,sentence,label
0,4722,259722,True,231,The Learning Co.\n\nDeveloped by\n\nThe Learni...,natural
1,2757,257813,True,563,Bush doubles down on foreign policy on Saturda...,generated
2,2194,257194,True,62,Here are six interesting things you need to kn...,natural
3,817,255817,True,293,Introduction\n\nWe would like to thank Antec f...,natural
4,3886,258886,False,1024,"ELKRIDGE, Md.—A group called ""Muslims for Trum...",natural


In [11]:
sentences = data['sentence']
print("Average amount of words in example:", \
      np.mean(list(map(len, map(lambda x: re.sub('\w', ' ', x).split(" "), data['sentence'])))))
print("Max. amount of words in example:", \
      np.max(list(map(len, map(lambda x: re.sub('\w', ' ', x).split(" "), data['sentence'])))))
print("Min. amount of words in example:", \
      np.min(list(map(len, map(lambda x: re.sub('\w', ' ', x).split(" "), data['sentence'])))))

Average amount of words in example: 2723.5124
Max. amount of words in example: 6151
Min. amount of words in example: 34


In [12]:
def get_token_length(batch_texts):
    inputs = tokenizer.batch_encode_plus(batch_texts,
       return_tensors='pt',
       add_special_tokens=True,
       max_length=MAX_LEN,             # Max length to truncate/pad
       pad_to_max_length=True,         # Pad sentence to max length
       truncation=True
    )
    inputs = inputs['input_ids'].numpy()
    n_tokens = []
    indexes = np.argwhere(inputs == tokenizer.pad_token_id)
    for i in range(inputs.shape[0]):
        ids = indexes[(indexes == i)[:, 0]]
        if not len(ids):
            n_tokens.append(MAX_LEN)
        else:
            n_tokens.append(ids[0, 1])
    return n_tokens

In [13]:
MAX_LEN = max_tokens_amount
tokenizer = BertTokenizer.from_pretrained(tokenizer_path, do_lower_case=True)

In [14]:
data['tokenizer_length'] = get_token_length(data['sentence'].values)

In [15]:
data

Unnamed: 0.1,Unnamed: 0,id,ended,length,sentence,label,tokenizer_length
0,4722,259722,True,231,The Learning Co.\n\nDeveloped by\n\nThe Learni...,natural,128
1,2757,257813,True,563,Bush doubles down on foreign policy on Saturda...,generated,128
2,2194,257194,True,62,Here are six interesting things you need to kn...,natural,71
3,817,255817,True,293,Introduction\n\nWe would like to thank Antec f...,natural,128
4,3886,258886,False,1024,"ELKRIDGE, Md.—A group called ""Muslims for Trum...",natural,128
...,...,...,...,...,...,...,...
4995,1472,256472,False,1024,"Occasionally, we come across interesting scena...",natural,128
4996,326,255337,False,1024,Providing insight not only into the memes that...,generated,128
4997,3862,258862,True,339,"Each year, MONEY digs into enrollment data and...",natural,128
4998,2862,257862,False,1024,Grounding of the Queen Elizabeth 2 (response) ...,natural,128


In [16]:
ntokens_array = data['tokenizer_length'].values

## Attention extraction

Loading **BERT** and tokenizers using **transformers** library.

In [17]:
from math import ceil

batch_size = 10 # batch size
number_of_batches = ceil(len(data['sentence']) / batch_size)
DUMP_SIZE = 100 # number of batches to be dumped
batched_sentences = np.array_split(data['sentence'].values, number_of_batches)
number_of_files = ceil(number_of_batches / DUMP_SIZE)
adj_matricies = []
adj_filenames = []
assert number_of_batches == len(batched_sentences) # sanity check

In [18]:
device='cpu'
model = BertForSequenceClassification.from_pretrained(model_path, output_attentions=True)
tokenizer = BertTokenizer.from_pretrained(tokenizer_path, do_lower_case=True)
model = model.to(device)
MAX_LEN = max_tokens_amount

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [19]:
# for i in tqdm(range(number_of_batches), desc="Weights calc"): 
#     attention_w = grab_attention_weights(model, tokenizer, batched_sentences[i], max_tokens_amount, device)
#     # sample X layer X head X n_token X n_token
#     adj_matricies.append(attention_w)
#     if (i+1) % DUMP_SIZE == 0: # dumping
#         print(f'Saving: shape {adj_matricies[0].shape}')
#         adj_matricies = np.concatenate(adj_matricies, axis=1)
#         print("Concatenated")
#         adj_matricies = np.swapaxes(adj_matricies, axis1=0, axis2=1) # sample X layer X head X n_token X n_token
#         filename = r_file + "_part" + str(ceil(i/DUMP_SIZE)) + "of" + str(number_of_files) + '.npy'
#         print(f"Saving weights to : {filename}")
#         adj_filenames.append(filename)
#         np.save(filename, adj_matricies)
#         adj_matricies = []
        
if len(adj_matricies):
    filename = r_file + "_part" + str(ceil(i/DUMP_SIZE)) + "of" + str(number_of_files) + '.npy'
    print(f'Saving: shape {adj_matricies[0].shape}')
    adj_matricies = np.concatenate(adj_matricies, axis=1)
    print("Concatenated")
    adj_matricies = np.swapaxes(adj_matricies, axis1=0, axis2=1) # sample X layer X head X n_token X n_token
    print(f"Saving weights to : {filename}")
    np.save(filename, adj_matricies)

print("Results saved.")

Results saved.


## Calculating topological features

In [20]:
stats_name.split("_")

['s', 'e', 'v', 'c', 'b0b1']

In [21]:
import os
from multiprocessing import Pool
from tqdm import tqdm

adj_filenames = [
    output_dir + 'attentions/' + filename 
    for filename in os.listdir(output_dir + 'attentions/') if r_file in (output_dir + 'attentions/' + filename)
]
# sorted by part number
adj_filenames = sorted(adj_filenames, key = lambda x: int(x.split('_')[-1].split('of')[0][4:].strip())) 
adj_filenames

['small_gpt_web/attentions/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_part1of5.npy',
 'small_gpt_web/attentions/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_part2of5.npy',
 'small_gpt_web/attentions/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_part3of5.npy',
 'small_gpt_web/attentions/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_part4of5.npy',
 'small_gpt_web/attentions/test_5k_all_heads_12_layers_MAX_LEN_128_bert-base-uncased_part5of5.npy']

In [22]:
# What is calculated in "f(v)". You can add any other function from the array with vertex degrees.

def function_for_v(list_of_v_degrees_of_graph):
    return sum(map(lambda x: np.sqrt(x*x), list_of_v_degrees_of_graph))

def split_matricies_and_lengths(adj_matricies, ntokens_array, num_of_workers):
    splitted_adj_matricies = np.array_split(adj_matricies, num_of_workers)
    splitted_ntokens = np.array_split(ntokens_array, num_of_workers)
    assert all([len(m)==len(n) for m, n in zip(splitted_adj_matricies, splitted_ntokens)]), "Split is not valid!"
    return zip(splitted_adj_matricies, splitted_ntokens)

In [23]:
num_of_workers = 20
pool = Pool(num_of_workers)

In [24]:
for i in tqdm(range(5)):
    pass

100%|██████████| 5/5 [00:00<00:00, 141699.46it/s]


In [25]:
stats_tuple_lists_array = []
for i, filename in enumerate(tqdm(adj_filenames, desc='Вычисление признаков')):
    print(i)
    adj_matricies = np.load(filename, allow_pickle=True)
    ntokens = ntokens_array[i*batch_size*DUMP_SIZE : (i+1)*batch_size*DUMP_SIZE]
    splitted = split_matricies_and_lengths(adj_matricies, ntokens, num_of_workers)
    args = [(m, thresholds_array, ntokens, stats_name.split("_"), stats_cap) for m, ntokens in splitted]
    stats_tuple_lists_array_part = pool.starmap(
        count_top_stats, args
    )
    stats_tuple_lists_array.append(np.concatenate([_ for _ in stats_tuple_lists_array_part], axis=3))

Вычисление признаков:   0%|          | 0/5 [00:00<?, ?it/s]

0


100%|██████████| 12/12 [09:26<00:00, 47.24s/it]
100%|██████████| 12/12 [09:40<00:00, 48.38s/it]
100%|██████████| 12/12 [09:33<00:00, 47.77s/it]
100%|██████████| 12/12 [09:50<00:00, 49.25s/it]
100%|██████████| 12/12 [09:48<00:00, 49.01s/it]
100%|██████████| 12/12 [09:53<00:00, 49.49s/it]
100%|██████████| 12/12 [09:56<00:00, 49.72s/it]
100%|██████████| 12/12 [09:59<00:00, 49.97s/it]
100%|██████████| 12/12 [09:51<00:00, 49.32s/it]
100%|██████████| 12/12 [09:57<00:00, 49.79s/it]
100%|██████████| 12/12 [09:54<00:00, 49.54s/it]
100%|██████████| 12/12 [10:02<00:00, 50.24s/it]
100%|██████████| 12/12 [09:51<00:00, 49.27s/it]
100%|██████████| 12/12 [09:53<00:00, 49.50s/it]
100%|██████████| 12/12 [09:56<00:00, 49.69s/it]
100%|██████████| 12/12 [09:47<00:00, 48.96s/it]
100%|██████████| 12/12 [09:47<00:00, 49.00s/it]
100%|██████████| 12/12 [09:52<00:00, 49.38s/it]
100%|██████████| 12/12 [09:51<00:00, 49.31s/it]
100%|██████████| 12/12 [09:55<00:00, 49.67s/it]
Вычисление признаков:  20%|██        | 1

1


100%|██████████| 12/12 [08:48<00:00, 44.01s/it]
100%|██████████| 12/12 [08:56<00:00, 44.68s/it]
100%|██████████| 12/12 [08:58<00:00, 44.88s/it]
100%|██████████| 12/12 [09:08<00:00, 45.69s/it]
100%|██████████| 12/12 [09:03<00:00, 45.33s/it]
100%|██████████| 12/12 [08:59<00:00, 45.00s/it]
100%|██████████| 12/12 [09:12<00:00, 46.05s/it]
100%|██████████| 12/12 [09:06<00:00, 45.56s/it]
100%|██████████| 12/12 [09:13<00:00, 46.10s/it]
100%|██████████| 12/12 [09:21<00:00, 46.77s/it]
100%|██████████| 12/12 [08:58<00:00, 44.90s/it]
100%|██████████| 12/12 [09:13<00:00, 46.17s/it]
100%|██████████| 12/12 [08:57<00:00, 44.76s/it]
100%|██████████| 12/12 [09:05<00:00, 45.49s/it]
100%|██████████| 12/12 [09:12<00:00, 46.07s/it]
100%|██████████| 12/12 [08:54<00:00, 44.58s/it]
100%|██████████| 12/12 [09:14<00:00, 46.19s/it]
100%|██████████| 12/12 [09:06<00:00, 45.54s/it]
100%|██████████| 12/12 [09:10<00:00, 45.90s/it]
100%|██████████| 12/12 [09:07<00:00, 45.64s/it]
Вычисление признаков:  40%|████      | 2

2


100%|██████████| 12/12 [09:01<00:00, 45.09s/it]
100%|██████████| 12/12 [09:06<00:00, 45.52s/it]
100%|██████████| 12/12 [09:03<00:00, 45.29s/it]
100%|██████████| 12/12 [09:07<00:00, 45.64s/it]
100%|██████████| 12/12 [09:04<00:00, 45.42s/it]
100%|██████████| 12/12 [09:11<00:00, 45.94s/it]
100%|██████████| 12/12 [09:10<00:00, 45.92s/it]
100%|██████████| 12/12 [08:57<00:00, 44.78s/it]
100%|██████████| 12/12 [09:17<00:00, 46.45s/it]
100%|██████████| 12/12 [09:08<00:00, 45.68s/it]
100%|██████████| 12/12 [09:13<00:00, 46.12s/it]
100%|██████████| 12/12 [09:20<00:00, 46.70s/it]
100%|██████████| 12/12 [09:16<00:00, 46.37s/it]
100%|██████████| 12/12 [09:13<00:00, 46.10s/it]
100%|██████████| 12/12 [09:14<00:00, 46.22s/it]
100%|██████████| 12/12 [09:08<00:00, 45.72s/it]
100%|██████████| 12/12 [09:14<00:00, 46.17s/it]
100%|██████████| 12/12 [09:11<00:00, 45.93s/it]
100%|██████████| 12/12 [09:17<00:00, 46.46s/it]
100%|██████████| 12/12 [09:18<00:00, 46.50s/it]
Вычисление признаков:  60%|██████    | 3

3


100%|██████████| 12/12 [08:42<00:00, 43.57s/it]
100%|██████████| 12/12 [08:50<00:00, 44.21s/it]
100%|██████████| 12/12 [08:54<00:00, 44.51s/it]
100%|██████████| 12/12 [09:00<00:00, 45.00s/it]
100%|██████████| 12/12 [09:02<00:00, 45.25s/it]
100%|██████████| 12/12 [09:03<00:00, 45.25s/it]
100%|██████████| 12/12 [09:04<00:00, 45.38s/it]
100%|██████████| 12/12 [09:02<00:00, 45.20s/it]
100%|██████████| 12/12 [09:04<00:00, 45.42s/it]
100%|██████████| 12/12 [09:00<00:00, 45.05s/it]
100%|██████████| 12/12 [09:02<00:00, 45.23s/it]
100%|██████████| 12/12 [09:06<00:00, 45.53s/it]
100%|██████████| 12/12 [09:02<00:00, 45.24s/it]
100%|██████████| 12/12 [08:55<00:00, 44.63s/it]
100%|██████████| 12/12 [09:00<00:00, 45.06s/it]
100%|██████████| 12/12 [09:05<00:00, 45.49s/it]
100%|██████████| 12/12 [09:00<00:00, 45.05s/it]
100%|██████████| 12/12 [09:01<00:00, 45.15s/it]
100%|██████████| 12/12 [09:06<00:00, 45.55s/it]
100%|██████████| 12/12 [08:58<00:00, 44.91s/it]
Вычисление признаков:  80%|████████  | 4

4


100%|██████████| 12/12 [08:39<00:00, 43.26s/it]
100%|██████████| 12/12 [08:41<00:00, 43.45s/it]
100%|██████████| 12/12 [08:52<00:00, 44.38s/it]
100%|██████████| 12/12 [08:48<00:00, 44.04s/it]
100%|██████████| 12/12 [08:55<00:00, 44.64s/it]
100%|██████████| 12/12 [08:47<00:00, 44.00s/it]
100%|██████████| 12/12 [08:51<00:00, 44.28s/it]
100%|██████████| 12/12 [09:01<00:00, 45.11s/it]
100%|██████████| 12/12 [09:01<00:00, 45.13s/it]
100%|██████████| 12/12 [08:53<00:00, 44.42s/it]
100%|██████████| 12/12 [09:03<00:00, 45.28s/it]
100%|██████████| 12/12 [09:01<00:00, 45.09s/it]
100%|██████████| 12/12 [09:03<00:00, 45.31s/it]
100%|██████████| 12/12 [09:03<00:00, 45.33s/it]
100%|██████████| 12/12 [08:56<00:00, 44.67s/it]
100%|██████████| 12/12 [09:00<00:00, 45.03s/it]
100%|██████████| 12/12 [08:55<00:00, 44.65s/it]
100%|██████████| 12/12 [08:57<00:00, 44.83s/it]
100%|██████████| 12/12 [08:55<00:00, 44.66s/it]
100%|██████████| 12/12 [08:59<00:00, 44.98s/it]
Вычисление признаков: 100%|██████████| 5

In [26]:
stats_tuple_lists_array = np.concatenate(stats_tuple_lists_array, axis=3)

In [27]:
stats_tuple_lists_array.shape

(12, 12, 6, 5000, 6)

In [28]:
from numpy import inf

np.sum(stats_tuple_lists_array[stats_tuple_lists_array == -inf]) + \
np.sum(stats_tuple_lists_array[stats_tuple_lists_array == inf])

0.0

In [29]:
stats_file

'small_gpt_web/features/test_5k_all_heads_12_layers_s_e_v_c_b0b1_lists_array_6_thrs_MAX_LEN_128_bert-base-uncased.npy'

In [32]:
np.save(stats_file, stats_tuple_lists_array)

##### Checking the size of features matrices:

Layers amount **Х** Heads amount **Х** Features amount **X** Examples amount **Х** Thresholds amount

**For example**:

`stats_name == "s_w"` => `Features amount == 2`

`stats_name == "b0b1"` => `Features amount == 2`

`stats_name == "b0b1_c"` => `Features amount == 3`

e.t.c.

`thresholds_array == [0.025, 0.05, 0.1, 0.25, 0.5, 0.75]` => `Thresholds amount == 6`

In [33]:
stats_tuple_lists_array.shape

(12, 12, 6, 5000, 6)