## Imports

In [1]:
import pandas as pd
import sys
from tqdm import tqdm
import json
import re
import torch
import numpy as np

In [3]:
!pip install transformers
!pip install ecco
from collections import Counter
from transformers import AutoTokenizer, BertConfig
from transformers import BertForSequenceClassification

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Reading the genres and subgenres from a file

The necessary files are stored in Google Drive, which makes quick and easy experiments in Google Colab possible.

In [4]:
# Mount Google Drive at /content/drive; the notebook reads its input files
# from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
def read_labels_set(labels_set_path):
    """Load the label mappings stored as JSON at ``labels_set_path``.

    Args:
        labels_set_path: Path of the JSON file to read.

    Returns:
        The object decoded from the file by ``json.load``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(labels_set_path, "r", encoding="utf-8") as f:
        return json.load(f)

labels_set = read_labels_set("/content/drive/MyDrive/NLP/labels_set.json")
# Invert the name -> id tables so that an id can be mapped back to its label name.
id2main = {label_id: name for name, label_id in labels_set['main2id'].items()}
id2sub = {label_id: name for name, label_id in labels_set['sub2id'].items()}

## ECCO XAI

In [6]:
### copy model to local environment 
!mkdir model_dir
!cp /content/drive/MyDrive/NLP/ecco_genre_main_ecco_bert_100_epoches.pt ./model_dir


In [7]:
dir_name = 'model_dir'
labels_num = len(id2main)

model_path = f'{dir_name}/ecco_genre_main_ecco_bert_100_epoches.pt'
model = BertForSequenceClassification.from_pretrained("TurkuNLP/eccobert-base-cased-v1", num_labels=labels_num)
tokenizer = AutoTokenizer.from_pretrained("TurkuNLP/eccobert-base-cased-v1")
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
checkpoint = torch.load(model_path, map_location=device)
model.load_state_dict(checkpoint['net'])


model.save_pretrained(dir_name)
tokenizer = AutoTokenizer.from_pretrained("TurkuNLP/eccobert-base-cased-v1")
tokenizer.save_pretrained('tkn_dir')
!cp ./tkn_dir/* ./model_dir

Downloading:   0%|          | 0.00/568 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/500M [00:00<?, ?B/s]

Some weights of the model checkpoint at TurkuNLP/eccobert-base-cased-v1 were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification wer

Downloading:   0%|          | 0.00/712k [00:00<?, ?B/s]

Import data

In [8]:
def get_data(csv_path='/content/drive/MyDrive/NLP/ecco_seq_model_test_res.csv', nrows=100):
    """Load saved chunk predictions and prepare them for the analysis cells.

    Args:
        csv_path: CSV file to read. It must contain a ``chunk_content`` column
            holding whitespace-separated subword tokens.
        nrows: Number of rows to read (forwarded to ``pandas.read_csv``).

    Returns:
        A DataFrame with the CSV columns plus a ``text`` column (the
        detokenized chunk). The ``chunk_content`` column is replaced by the
        result of calling the module-level ``tokenizer`` on ``text``.
    """

    def subwords_to_original(subword_text):
        """Join whitespace-separated subword tokens back into plain text."""
        pieces = []
        for token in subword_text.split():
            # Special tokens carry no text.
            if token in ("[CLS]", "[SEP]"):
                continue
            if token.startswith("##"):
                # Strip only the leading continuation marker; a "##" further
                # inside the token is part of the text and must be kept.
                pieces.append(token[2:])
            else:
                pieces.append(" " + token)
        return "".join(pieces).strip()

    pd_saved_chunk_preds = pd.read_csv(csv_path, nrows=nrows)
    pd_saved_chunk_preds["text"] = pd_saved_chunk_preds["chunk_content"].apply(subwords_to_original)
    # Re-tokenize the reconstructed text with the tokenizer defined at module level.
    pd_saved_chunk_preds['chunk_content'] = pd_saved_chunk_preds.apply(
        lambda row: tokenizer(row['text'], return_tensors="pt", padding=True), axis=1
    )
    return pd_saved_chunk_preds

# Load the saved chunk predictions (get_data reads the first 100 rows of the CSV).
data = get_data()

NMF factorization of the activations of every fully connected layer

In [9]:
import ecco



# Configuration passed to ecco, since the model is loaded from a local
# directory rather than by a known model name.
model_config = {
    'embedding': "embeddings.word_embeddings",
    'type': 'mlm',
    # Raw string: `\.` in a normal string literal is an invalid escape sequence
    # and triggers a warning on recent Python versions. The value is unchanged.
    'activations': [r'intermediate\.dense'],
    # NOTE(review): BERT WordPiece marks continuation pieces with "##";
    # confirm that these two prefixes are what ecco expects for this tokenizer.
    'token_prefix': '#',
    'partial_token_prefix': ''
}

lm = ecco.from_pretrained('model_dir', activations=True, model_config=model_config)
# NOTE(review): positional lookup (row 1, column 13) - presumably the "text"
# column added by get_data; confirm against the CSV's column layout.
text = data.iloc[1, 13]

inputs = lm.tokenizer([text], return_tensors="pt")
output = lm(inputs)
# Factorize the captured activations into 18 components and show the interactive view.
nmf_1 = output.run_nmf(n_components=18)
nmf_1.explore()


Some weights of the model checkpoint at model_dir were not used when initializing BertModel: ['classifier.bias', 'classifier.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<IPython.core.display.Javascript object>

First layer activations

In [10]:
# Re-run NMF with 16 components on the layer range from_layer=0 / to_layer=1
# (the first layer, per the heading above). This rebinds nmf_1, replacing the
# factorization computed in the previous cell.
nmf_1 = output.run_nmf(n_components=16, from_layer=0, to_layer=1) 
nmf_1.explore()

<IPython.core.display.Javascript object>