In [1]:
from collections import defaultdict
import itertools
import re
import subprocess

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from transformers import BertTokenizer, BertModel, BertForSequenceClassification

from tda4atd.stats_count import *
from tda4atd.grab_weights import grab_attention_weights, text_preprocessing

In [2]:
import warnings

# Silence every Python warning for the rest of the session. This keeps cell output
# short, but it also hides any deprecation notices raised by the code below.
warnings.filterwarnings('ignore')

In [3]:
# Seed NumPy's global random generator so that any later np.random call is repeatable.
np.random.seed(42)

In [5]:
max_tokens_amount  = 512 # The number of tokens to which the tokenized text is truncated / padded.
stats_cap          = 500 # Max value that a feature can take. NOT applicable to Betti numbers.

# Layers for which attention matrices (and the features on them) are calculated.
# To calculate features on all 12 layers, leave it as list(range(12)).
layers_of_interest = list(range(12))

# The set of topological features that will be counted, as "_"-separated codes
# (see explanation below).
stats_name = "s_e_v_c_b0b1"

thresholds_array = [0.025, 0.05, 0.1, 0.25, 0.5, 0.75] # The set of thresholds
thrs = len(thresholds_array)                           # ("t" in the paper)

model_path = tokenizer_path = "bert-base-uncased"

# You can use either the standard or a fine-tuned BERT. To use a BERT fine-tuned for your task, save the
# model and the tokenizer into the same directory with tokenizer.save_pretrained(output_dir) and
# bert_classifier.save_pretrained(output_dir), then insert the path to that directory here.

In [6]:
# Name of the dataset for which topological features are counted.
# name = "reviews_train"
name = "senteval"

# Directory that receives the calculation results, chosen per dataset.
# (The comparison previously read "revies_train", so the reviews dataset could
# never select its own directory.)
if name == "reviews_train":
    output_dir = "target_problem/"
else:
    output_dir = "senteval_proc_data/"

prefix = output_dir + name

# Common path prefix of the files that store the attention matrices weights;
# a "_part<k>of<n>.npy" suffix is appended when the weights are dumped.
r_file = (
    output_dir + 'attentions/' + name + "_all_heads_" + str(len(layers_of_interest))
    + "_layers_MAX_LEN_" + str(max_tokens_amount) + "_" + model_path.split("/")[-1]
)

In [7]:
# Length every tokenized text is padded / truncated to (read by get_token_length).
MAX_LEN = max_tokens_amount
# Tokenizer used to measure token lengths; lower-casing is requested explicitly.
tokenizer = BertTokenizer.from_pretrained(tokenizer_path, do_lower_case=True)
# Number of texts handed to the tokenizer per call inside get_token_length.
tokenize_batch_size = 2048

In [8]:
def get_token_length(batch_texts):
    """Return, for every text, the position of its first padding token.

    Texts are tokenized in chunks of ``tokenize_batch_size`` with the module-level
    ``tokenizer``, padded / truncated to ``MAX_LEN`` tokens. The position of the
    first pad token equals the number of tokens that precede the padding
    (special tokens included, since ``add_special_tokens=True``).

    Args:
        batch_texts: sequence (list or array) of strings.

    Returns:
        list[int]: one length per text; ``MAX_LEN`` when a row contains no
        padding token at all. An empty input yields an empty list.
    """
    if len(batch_texts) == 0:
        # np.concatenate([]) below would raise ValueError on empty input.
        return []

    # Diagnostic only: print the first non-string entry so a tokenizer failure
    # further down is easier to trace back to the offending value.
    for elem in batch_texts:
        if not isinstance(elem, str):
            print(elem)
            break

    all_inputs = []
    for start in tqdm(range(0, len(batch_texts), tokenize_batch_size)):
        # Hand the tokenizer a plain list even when the caller passed an array.
        chunk = list(batch_texts[start:start + tokenize_batch_size])
        encoded = tokenizer.batch_encode_plus(
            chunk,
            return_tensors='pt',
            add_special_tokens=True,
            max_length=MAX_LEN,             # Max length to truncate/pad
            padding='max_length',           # Replaces the deprecated pad_to_max_length=True
            truncation=True,
        )
        all_inputs.append(encoded['input_ids'])
    print("done")
    inputs = np.concatenate(all_inputs)
    print(inputs.shape)

    # Vectorised "index of the first pad token" for every row.
    is_pad = inputs == tokenizer.pad_token_id
    first_pad = is_pad.argmax(axis=1)
    n_tokens = np.where(is_pad.any(axis=1), first_pad, MAX_LEN)
    return n_tokens.tolist()

In [9]:
def read_sent_analyze(path="target_problem/reviews_train.csv"):
    """Load the review texts and their token lengths.

    Args:
        path: CSV file that has a ``cleaned_review`` column. Defaults to the
            location that was previously hard-coded.

    Returns:
        tuple: ``(texts, ntokens_array)`` where ``texts`` is the array of
        non-missing ``cleaned_review`` values and ``ntokens_array`` holds the
        matching result of ``get_token_length`` as an integer array.
    """
    reviews = pd.read_csv(path).reset_index(drop=True)
    # Select the non-missing texts directly instead of writing a new column onto
    # a filtered slice of the frame (chained assignment).
    texts = reviews.loc[reviews["cleaned_review"].notna(), "cleaned_review"].values
    ntokens_array = np.asarray(get_token_length(texts), dtype=np.int64)
    return texts, ntokens_array

def read_sent_eval(path="SentEval/data/all_data.txt"):
    """Load the SentEval texts (one per line) and their token lengths.

    Args:
        path: text file with one sample per line. Defaults to the location
            that was previously hard-coded.

    Returns:
        tuple: ``(data, tokenizer_length)`` where ``data`` is the list of lines
        exactly as read (trailing newline characters are kept) and
        ``tokenizer_length`` is the result of ``get_token_length(data)``.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(path, encoding="utf-8") as fin:
        data = list(fin)
    tokenizer_length = get_token_length(data)
    print(len(data))
    return data, tokenizer_length

In [10]:
# Load the texts and their token lengths for the configured dataset.
if name == "reviews_train":
    texts, ntokens_array = read_sent_analyze()
elif name == "senteval":
    texts, ntokens_array = read_sent_eval()
else:
    # Fail here rather than with a NameError on `texts` in a later cell.
    raise ValueError(f"Unknown dataset name: {name!r}")

assert len(texts) == len(ntokens_array), "Every text needs exactly one token length"

FileNotFoundError: [Errno 2] No such file or directory: 'SentEval/data/all_data.txt'

In [11]:
from math import ceil

batch_size = 64  # number of texts per batch
DUMP_SIZE = 10   # number of batches to be dumped into one file

# Consecutive slices of `texts`, each holding at most `batch_size` items.
batched_sentences = [texts[i:i+batch_size] for i in range(0, len(texts), batch_size)]
number_of_batches = len(batched_sentences)
number_of_files = ceil(number_of_batches / DUMP_SIZE)
adj_matricies = []  # buffer of attention arrays waiting to be dumped
adj_filenames = []  # names of the files written so far

# Sanity checks: batching must neither drop nor duplicate texts.
assert number_of_batches == ceil(len(texts) / batch_size)
assert sum(len(batch) for batch in batched_sentences) == len(texts)

NameError: name 'texts' is not defined

In [11]:
import torch

# Use the GPU when one is available; otherwise fall back to the CPU instead of
# failing in model.to('cuda').
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# output_attentions=True makes the model return its attention matrices.
model = BertForSequenceClassification.from_pretrained(model_path, output_attentions=True)
tokenizer = BertTokenizer.from_pretrained(tokenizer_path, do_lower_case=True)
model = model.to(device)
MAX_LEN = max_tokens_amount

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [12]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [13]:
import os

use_first_head = True  # keep only the first head of every layer


def _dump_attention_part(batches, part_number):
    """Merge buffered attention batches and save them as one ``.npy`` part.

    Args:
        batches: non-empty list of arrays laid out as
            layer X sample X head X n_token X n_token.
        part_number: 1-based index of the part, used in the file name.

    Returns:
        str: name of the file that was written.
    """
    filename = r_file + "_part" + str(part_number) + "of" + str(number_of_files) + '.npy'
    print(f'Saving: shape {batches[0].shape}')
    stacked = np.concatenate(batches, axis=1)  # join along the sample axis
    print("Concatenated")
    stacked = np.swapaxes(stacked, axis1=0, axis2=1)  # sample X layer X head X n_token X n_token
    print(f"Saving weights to : {filename}")
    np.save(filename, stacked)
    return filename


# np.save does not create missing directories.
os.makedirs(os.path.dirname(r_file), exist_ok=True)

for i in tqdm(range(number_of_batches), desc="Weights calc"):
    attention_w = grab_attention_weights(model, tokenizer, batched_sentences[i], max_tokens_amount, device)
    if use_first_head:
        attention_w = attention_w[:, :, :1, :, :]
    # layer X sample X head X n_token X n_token (samples are moved first on dump)
    adj_matricies.append(attention_w)
    if (i + 1) % DUMP_SIZE == 0:  # dumping
        # (i + 1) // DUMP_SIZE is the 1-based number of the part just completed.
        adj_filenames.append(_dump_attention_part(adj_matricies, (i + 1) // DUMP_SIZE))
        adj_matricies = []

if len(adj_matricies):
    # The incomplete trailing part is always the last one. The previous
    # ceil(i / DUMP_SIZE) reused the preceding part's number (overwriting that
    # file) whenever exactly one batch was left over.
    adj_filenames.append(_dump_attention_part(adj_matricies, number_of_files))
    adj_matricies = []

print("Results saved.")

Weights calc:   0%|          | 0/235 [00:00<?, ?it/s]

Saving: shape (12, 64, 1, 512, 512)
Concatenated
Saving weights to : senteval_proc_data/attentions/senteval_all_heads_12_layers_MAX_LEN_512_bert-base-uncased_part1of24.npy
Saving: shape (12, 64, 1, 512, 512)
Concatenated
Saving weights to : senteval_proc_data/attentions/senteval_all_heads_12_layers_MAX_LEN_512_bert-base-uncased_part2of24.npy
Saving: shape (12, 64, 1, 512, 512)
Concatenated
Saving weights to : senteval_proc_data/attentions/senteval_all_heads_12_layers_MAX_LEN_512_bert-base-uncased_part3of24.npy
Saving: shape (12, 64, 1, 512, 512)
Concatenated
Saving weights to : senteval_proc_data/attentions/senteval_all_heads_12_layers_MAX_LEN_512_bert-base-uncased_part4of24.npy
Saving: shape (12, 64, 1, 512, 512)
Concatenated
Saving weights to : senteval_proc_data/attentions/senteval_all_heads_12_layers_MAX_LEN_512_bert-base-uncased_part5of24.npy
Saving: shape (12, 64, 1, 512, 512)
Concatenated
Saving weights to : senteval_proc_data/attentions/senteval_all_heads_12_layers_MAX_LEN_512_

In [14]:
# Show the individual feature codes obtained by splitting stats_name on "_".
stats_name.split("_")

['s', 'e', 'v', 'c', 'b0b1']

In [15]:
import os
from multiprocessing import Pool
from tqdm.notebook import tqdm

attentions_dir = output_dir + 'attentions/'


def _part_number(path):
    """Extract k from a path whose last "_"-separated token is "part<k>of<n>.npy"."""
    part_token = path.split('_')[-1].split('of')[0]
    return int(part_token[4:].strip())


# Collect every file in the attentions directory whose path contains the r_file prefix.
adj_filenames = []
for filename in os.listdir(attentions_dir):
    candidate = attentions_dir + filename
    if r_file in candidate:
        adj_filenames.append(candidate)

# sorted by part number
adj_filenames.sort(key=_part_number)
adj_filenames

['senteval_proc_data/attentions/senteval_all_heads_12_layers_MAX_LEN_512_bert-base-uncased_part1of24.npy',
 'senteval_proc_data/attentions/senteval_all_heads_12_layers_MAX_LEN_512_bert-base-uncased_part2of24.npy',
 'senteval_proc_data/attentions/senteval_all_heads_12_layers_MAX_LEN_512_bert-base-uncased_part3of24.npy',
 'senteval_proc_data/attentions/senteval_all_heads_12_layers_MAX_LEN_512_bert-base-uncased_part4of24.npy',
 'senteval_proc_data/attentions/senteval_all_heads_12_layers_MAX_LEN_512_bert-base-uncased_part5of24.npy',
 'senteval_proc_data/attentions/senteval_all_heads_12_layers_MAX_LEN_512_bert-base-uncased_part6of24.npy',
 'senteval_proc_data/attentions/senteval_all_heads_12_layers_MAX_LEN_512_bert-base-uncased_part7of24.npy',
 'senteval_proc_data/attentions/senteval_all_heads_12_layers_MAX_LEN_512_bert-base-uncased_part8of24.npy',
 'senteval_proc_data/attentions/senteval_all_heads_12_layers_MAX_LEN_512_bert-base-uncased_part9of24.npy',
 'senteval_proc_data/attentions/sente

In [16]:
# What is calculated in "f(v)". You can add any other function from the array with vertex degrees.

def function_for_v(list_of_v_degrees_of_graph):
    """Sum sqrt(d * d) over all vertex degrees d of a graph.

    Args:
        list_of_v_degrees_of_graph: iterable of numeric vertex degrees.

    Returns:
        The accumulated sum; the integer 0 when the iterable is empty.
    """
    total = 0
    for degree in list_of_v_degrees_of_graph:
        total = total + np.sqrt(degree * degree)
    return total

def split_matricies_and_lengths(adj_matricies, ntokens_array, num_of_workers):
    """Split matrices and token lengths into aligned chunks, one per worker.

    Both inputs are cut with ``np.array_split`` into ``num_of_workers`` pieces.
    The total sizes and the per-chunk sizes are printed.

    Args:
        adj_matricies: array-like split along its first axis.
        ntokens_array: array-like split along its first axis.
        num_of_workers: number of chunks to produce.

    Returns:
        A ``zip`` iterator over ``(matrices_chunk, ntokens_chunk)`` pairs.

    Raises:
        AssertionError: if any pair of chunks differs in length.
    """
    matrix_chunks = np.array_split(adj_matricies, num_of_workers)
    length_chunks = np.array_split(ntokens_array, num_of_workers)
    print(len(adj_matricies), len(ntokens_array))
    chunk_sizes = [
        (len(matrices), len(lengths))
        for matrices, lengths in zip(matrix_chunks, length_chunks)
    ]
    print(chunk_sizes)
    assert all([n_matrices == n_lengths for n_matrices, n_lengths in chunk_sizes]), "Split is not valid!"
    return zip(matrix_chunks, length_chunks)

In [17]:
# Number of worker processes; split_matricies_and_lengths also uses it as the
# number of chunks each loaded file is divided into.
num_of_workers = 20
# NOTE(review): this pool is never closed or joined in the cells visible here.
pool = Pool(num_of_workers)

In [18]:
# One feature array per dumped attention file, appended in file order.
stats_tuple_lists_array = []
samples_per_file = batch_size * DUMP_SIZE  # number of texts stored in every full dump file
feature_codes = stats_name.split("_")

for i, filename in enumerate(tqdm(adj_filenames, desc='Вычисление признаков')):
    print(i)
    # NOTE(review): allow_pickle=True is only needed if the dumps hold object arrays - confirm.
    adj_matricies = np.load(filename, allow_pickle=True)
    print("loaded")
    # Token lengths of the texts stored in the i-th file.
    ntokens = ntokens_array[i * samples_per_file : (i + 1) * samples_per_file]
    print("ntokens count")
    splitted = split_matricies_and_lengths(adj_matricies, ntokens, num_of_workers)
    print("split_matricies_and_lengths")
    # One argument tuple per worker chunk.
    args = [
        (matrices_chunk, thresholds_array, lengths_chunk, feature_codes, stats_cap)
        for matrices_chunk, lengths_chunk in splitted
    ]
    stats_tuple_lists_array_part = pool.starmap(count_top_stats, args)
    # Join the per-worker results along axis 3.
    stats_tuple_lists_array.append(np.concatenate(list(stats_tuple_lists_array_part), axis=3))

Вычисление признаков:   0%|          | 0/24 [00:00<?, ?it/s]

0
loaded
ntokens count
640 640
[(32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32)]
split_matricies_and_lengths
1
loaded
ntokens count
640 640
[(32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32)]
split_matricies_and_lengths
2
loaded
ntokens count
640 640
[(32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32)]
split_matricies_and_lengths
3
loaded
ntokens count
640 640
[(32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32), (32, 32)

In [19]:
# Merge the per-file feature arrays along axis 3. The guard makes the cell
# idempotent: once the name holds the merged array, running the cell again would
# otherwise iterate over the array's first axis and concatenate its slices
# along the wrong axis without raising.
if isinstance(stats_tuple_lists_array, list):
    stats_tuple_lists_array = np.concatenate(stats_tuple_lists_array, axis=3)

In [20]:
# Inspect the dimensions of the merged feature array.
stats_tuple_lists_array.shape

(12, 1, 6, 15000, 6)

In [21]:
# Write the merged feature array to "output.npy" in the current working directory.
np.save("output.npy", stats_tuple_lists_array)