In [1]:
import numpy as np
import time
import datetime
import torch
import torch.nn as nn
import pandas as pd
import matplotlib.pyplot as plt
import random
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline

import datasets
import pandas as pd
from torch.utils.data import Dataset

In [2]:
all_trb_new = pd.read_csv("all_trb_new.csv")

In [3]:
all_trb_new.head()

Unnamed: 0.1,Unnamed: 0,CDR3,label
0,0,C A S S S T G L P Y G Y T F,0
1,1,C A S S S T G L P Y G Y T F,0
2,2,C A S S S R S S Y E Q Y F,0
3,3,C A S S S R S S Y E Q Y F,0
4,4,C A S S S R S S Y E Q Y F,0


In [4]:
all_trb_new = all_trb_new.drop(columns='Unnamed: 0', axis=1)

In [5]:
all_trb_new = all_trb_new.drop_duplicates(subset=['CDR3'])

In [6]:
all_trb_new.shape

(8321, 2)

In [7]:
X_train, X_test =  train_test_split(all_trb_new, test_size=0.2, random_state=42, stratify=all_trb_new["label"])

In [8]:
X_tr_dict = X_train.to_dict('list')
X_test_dict = X_test.to_dict('list')

In [9]:
train_dataset = datasets.Dataset.from_dict(X_tr_dict)
test_dataset = datasets.Dataset.from_dict(X_test_dict)
my_dataset_dict = datasets.DatasetDict({"train":train_dataset,"test":test_dataset})

2022-10-22 11:01:29.927258: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-22 11:01:38.935486: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-10-22 11:02:00.409245: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-10-22 11:02:00.410017: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such 

In [10]:
my_dataset_dict

DatasetDict({
    train: Dataset({
        features: ['CDR3', 'label'],
        num_rows: 6656
    })
    test: Dataset({
        features: ['CDR3', 'label'],
        num_rows: 1665
    })
})

In [11]:
from transformers import BertTokenizer, BertForMaskedLM, pipeline, AutoTokenizer, BertModel, BertConfig
from transformers import AutoModel

In [12]:
model_name = "Rostlab/prot_bert"

In [34]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [35]:
model = AutoModel.from_pretrained(model_name, output_attentions=True, num_hidden_layers = 6).to(device)

In [15]:
model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30, 1024, padding_idx=0)
    (position_embeddings): Embedding(40000, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.0, inplace=False

In [16]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [60]:
tokenizer.vocab

{'I': 11,
 '[SEP]': 3,
 'S': 10,
 'H': 22,
 'X': 25,
 'Y': 20,
 'C': 23,
 'V': 8,
 'L': 5,
 'Z': 28,
 'B': 27,
 'A': 6,
 'R': 13,
 '[MASK]': 4,
 'G': 7,
 'N': 17,
 'Q': 18,
 'M': 21,
 'F': 19,
 'D': 14,
 '[UNK]': 1,
 'K': 12,
 'P': 16,
 'O': 29,
 '[CLS]': 2,
 'W': 24,
 'U': 26,
 '[PAD]': 0,
 'E': 9,
 'T': 15}

In [17]:
max_length = all_trb_new["CDR3"].str.replace(' ', '').str.len().max()

In [18]:
def tokenize(batch):
    return tokenizer(batch["CDR3"], padding=True, truncation=True, max_length=max_length)

In [19]:
my_dataset_encoded = my_dataset_dict.map(tokenize, batched=True, batch_size=None)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

## Получаем скрытые слои для всех данных

In [36]:
def forward_pass(batch):
    input_ids = torch.tensor(batch["input_ids"]).to(device)
    attention_mask = torch.tensor(batch["attention_mask"]).to(device)
    
    with torch.no_grad():
        last_hidden_state = model(input_ids, attention_mask).last_hidden_state
        last_hidden_state = last_hidden_state.cpu().numpy()
        
        attention = model(input_ids, attention_mask).attentions
    
    # Use average of unmasked hidden states for classification
    lhs_shape = last_hidden_state.shape
    boolean_mask = ~np.array(batch["attention_mask"]).astype(bool)
    boolean_mask = np.repeat(boolean_mask, lhs_shape[-1], axis=-1)
    boolean_mask = boolean_mask.reshape(lhs_shape)
    masked_mean = np.ma.array(last_hidden_state, mask=boolean_mask).mean(axis=1)
    batch["hidden_state"] = masked_mean.data
    # batch["attention"] = attention.data
    return batch

my_dataset_encoded = my_dataset_encoded.map(forward_pass, batched=True,
                                        batch_size=16)

  0%|          | 0/416 [00:00<?, ?ba/s]

  0%|          | 0/105 [00:00<?, ?ba/s]

In [37]:
X_train = np.array(my_dataset_encoded["train"]["hidden_state"])
X_test = np.array(my_dataset_encoded["test"]["hidden_state"])
y_train = np.array(my_dataset_encoded["train"]["label"])
y_test = np.array(my_dataset_encoded["test"]["label"])
X_train.shape, X_test.shape

((6656, 1024), (1665, 1024))

### Линейная регрессия на эмбеддингах как на фичах

In [38]:
lr_clf = LogisticRegression(n_jobs=-1, penalty="none",  max_iter=2000)
lr_clf.fit(X_train, y_train)
lr_clf.score(X_test, y_test)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Avoid using `tokenizers` before the fork if possible
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Explicitly set the environm

0.6252252252252253

Результат = 0.6910139356078808

0.62

In [39]:
y_preds = lr_clf.predict(X_test)

In [40]:
print(classification_report(y_test, y_preds, target_names=['0', '1']))

              precision    recall  f1-score   support

           0       0.66      0.67      0.66       920
           1       0.58      0.57      0.58       745

    accuracy                           0.63      1665
   macro avg       0.62      0.62      0.62      1665
weighted avg       0.62      0.63      0.62      1665



In [41]:
train_score = roc_auc_score(
    y_train,
    lr_clf.predict(X_train)
)
test_score = roc_auc_score(
    y_test,
    lr_clf.predict(X_test)
)
train_score, test_score

(0.7274872015832792, 0.6197621826670557)

Смотрим на TP и FN

In [42]:
diff = y_test-y_preds
print('diff: ',diff)

# Correct is 0 
# FP is -1 
# FN is 1
print('Correctly classified: ', np.where(diff == 0)[0])
print('Incorrectly classified: ', np.where(diff != 0)[0])
print('False positives: ', np.where(diff == -1)[0])
print('False negatives: ', np.where(diff == 1)[0])

diff:  [1 0 0 ... 0 1 0]
Correctly classified:  [   1    2    3 ... 1661 1662 1664]
Incorrectly classified:  [   0    6    9   11   15   16   21   24   29   30   33   35   40   41
   44   45   46   48   49   52   53   56   58   60   61   63   64   65
   66   76   77   80   81   82   83   85   86   88   89   92   93   97
  103  104  109  110  111  113  117  118  119  122  123  124  125  127
  128  129  131  136  138  141  150  152  158  159  162  163  170  171
  176  178  181  182  183  185  186  189  190  195  203  204  205  207
  209  212  213  218  233  234  235  236  238  239  240  244  245  246
  247  248  249  251  253  257  259  264  265  266  267  270  275  276
  279  283  284  285  287  288  293  296  298  301  303  304  306  314
  318  323  325  326  327  331  333  334  335  336  339  340  341  342
  345  349  350  352  353  354  362  365  369  372  375  376  382  383
  384  385  388  390  397  398  399  402  403  405  411  412  413  416
  419  421  422  425  432  438  441  44

In [33]:
X_test[0] #TP

array([-0.00761613,  0.0695463 , -0.11314024, ...,  0.00792013,
       -0.0535237 ,  0.09916826])

In [34]:
my_dataset_encoded["test"]["CDR3"][0] #TP

'C A S S L G F P N Q P Q H F'

In [39]:
my_dataset_encoded["test"]["CDR3"][600] #FP

'C A W S S G A G R D E Q F F'

In [40]:
my_dataset_encoded["test"]["label"][600] #FP

0

In [41]:
X_test[600]

array([ 0.07394347,  0.17160328, -0.0707214 , ...,  0.04634371,
       -0.0111469 ,  0.13327052])

In [44]:
my_dataset_encoded["test"]["label"][775] #FN

1

In [45]:
my_dataset_encoded["test"]["CDR3"][775] #FN

'C A S G P P G T G Y S N Q P Q H F'

## TF-IDF

In [36]:
all_not_sep = pd.read_csv("all_trb_not_sep.csv")
all_not_sep = all_not_sep.drop(columns='Unnamed: 0', axis=1)

In [37]:
all_not_sep.head(3)

Unnamed: 0,CDR3,label
0,CASSSTGLPYGYTF,0
1,CASSSTGLPYGYTF,0
2,CASSSRSSYEQYF,0


In [38]:
all_not_sep = all_not_sep.drop_duplicates(subset=['CDR3'])

In [39]:
X_train, X_test, y_train, y_test = train_test_split(all_not_sep["CDR3"],all_not_sep["label"], test_size=0.2, random_state=42, stratify=all_not_sep["label"])

pipe = Pipeline([
    ('vect', TfidfVectorizer(ngram_range=(1, 3), analyzer='char', token_pattern='\S+')),
    ('log', LogisticRegression(random_state=42, max_iter=1e4))
])

pipe.fit(X_train, y_train)
train_score = roc_auc_score(
    y_train,
    pipe.predict_proba(X_train)[:, 1]
)
test_score = roc_auc_score(
    y_test,
    pipe.predict_proba(X_test)[:, 1]
)

In [40]:
train_score, test_score

(0.837866334139092, 0.7051400641960899)

In [42]:
pd.Series(
    pipe[1].coef_[0],
    pipe[0].get_feature_names_out()
).sort_values()

ssi   -2.801945
rs    -2.749827
si    -2.554590
ssm   -1.688982
s     -1.590105
         ...   
n      1.306738
sse    1.366480
nt     1.547428
di     1.660295
ntg    3.229023
Length: 5004, dtype: float64

In [31]:
# pip install eli5

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting eli5
  Downloading eli5-0.13.0.tar.gz (216 kB)
[K     |████████████████████████████████| 216 kB 2.5 MB/s eta 0:00:01
Collecting graphviz
  Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)
[K     |████████████████████████████████| 47 kB 1.1 MB/s  eta 0:00:01
[?25hCollecting tabulate>=0.7.7
  Downloading tabulate-0.9.0-py3-none-any.whl (35 kB)
Building wheels for collected packages: eli5
  Building wheel for eli5 (setup.py) ... [?25ldone
[?25h  Created wheel for eli5: filename=eli5-0.13.0-py2.py3-none-any.whl size=107748 sha256=6cda0a75181d3eea4dc8f592585b8daa6645755ad1a0fd1eb2fb6331fac6f6d3
  Stored in directory: /home/ksmotuzenko/.cache/pip/wheels/7b/26/a5/8460416695a992a2966b41caa5338e5e

In [43]:
import eli5

eli5.show_weights(estimator=pipe[1], 
                  feature_names= list(pipe[0].get_feature_names()),
                  top=(50,50))



Weight?,Feature
+3.229,ntg
+1.660,di
+1.547,nt
+1.366,sse
+1.307,n
+1.292,sls
+1.248,ie
+1.223,d
+1.204,iea
+1.200,sl


# Fine-tuning

In [53]:
from transformers import AutoModelForSequenceClassification

num_labels = 2
model = (AutoModelForSequenceClassification
.from_pretrained(model_name, num_labels=num_labels)
.to(device))

Some weights of the model checkpoint at Rostlab/prot_bert were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initiali

In [55]:
my_dataset_encoded.set_format("torch",
columns=["input_ids", "attention_mask", "label"])

In [56]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

In [57]:
from transformers import Trainer, TrainingArguments

batch_size = 64
logging_steps = len(my_dataset_encoded["train"]) // batch_size
training_args = TrainingArguments(output_dir="results",
                                  num_train_epochs=3,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  load_best_model_at_end=True,
                                  metric_for_best_model="f1",
                                  weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  disable_tqdm=False,
                                  logging_steps=logging_steps,
                                  save_strategy='epoch',) 
                                  # eval_steps = 10,
                                  # save_total_limit = 5,
                                  # evaluation_strategy='yes',)

In [35]:
from transformers import Trainer

trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=my_dataset_encoded["train"],
                  eval_dataset=my_dataset_encoded["test"])
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: hidden_state, CDR3. If hidden_state, CDR3 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 8320
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 390


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6035,0.547313,0.663623,0.660375
2,0.5189,0.511382,0.681403,0.675984
3,0.4766,0.486183,0.705911,0.701904


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: hidden_state, CDR3. If hidden_state, CDR3 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2081
  Batch size = 64
Saving model checkpoint to results/checkpoint-130
Configuration saved in results/checkpoint-130/config.json
Model weights saved in results/checkpoint-130/pytorch_model.bin
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: hidden_state, CDR3. If hidden_state, CDR3 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2081
  Batch size = 64
Saving model checkpoint to results/checkpoint-260
Configuration saved in results/checkpoint-260/config.json
Model 

TrainOutput(global_step=390, training_loss=0.5329911843324319, metrics={'train_runtime': 263.3362, 'train_samples_per_second': 94.784, 'train_steps_per_second': 1.481, 'total_flos': 2156504445987840.0, 'train_loss': 0.5329911843324319, 'epoch': 3.0})

In [36]:
results = trainer.evaluate()
results

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: hidden_state, CDR3. If hidden_state, CDR3 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 2081
  Batch size = 64


{'eval_loss': 0.48618268966674805,
 'eval_accuracy': 0.7059106198942816,
 'eval_f1': 0.7019044054730003,
 'eval_runtime': 4.1377,
 'eval_samples_per_second': 502.942,
 'eval_steps_per_second': 7.976,
 'epoch': 3.0}

In [37]:
preds_output = trainer.predict(my_dataset_encoded["test"])
preds_output.metrics

The following columns in the test set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: hidden_state, CDR3. If hidden_state, CDR3 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 2081
  Batch size = 64


{'test_loss': 0.48618268966674805,
 'test_accuracy': 0.7059106198942816,
 'test_f1': 0.7019044054730003,
 'test_runtime': 4.0341,
 'test_samples_per_second': 515.857,
 'test_steps_per_second': 8.18}

In [41]:
trainer.save_model("models/protbert-cdr3")
tokenizer.save_pretrained("models/protbert-cdr3")

Saving model checkpoint to models/protbert-cdr3
Configuration saved in models/protbert-cdr3/config.json
Model weights saved in models/protbert-cdr3/pytorch_model.bin
tokenizer config file saved in models/protbert-cdr3/tokenizer_config.json
Special tokens file saved in models/protbert-cdr3/special_tokens_map.json


('models/protbert-cdr3/tokenizer_config.json',
 'models/protbert-cdr3/special_tokens_map.json',
 'models/protbert-cdr3/vocab.txt',
 'models/protbert-cdr3/added_tokens.json',
 'models/protbert-cdr3/tokenizer.json')

## Получаем скрытые слои для всех данных

In [None]:
# config = BertConfig.from_pretrained("Rostlab/prot_bert", output_hidden_states=True, config.output_attentions=True)
# model = AutoModelForSequenceClassification.from_pretrained("Rostlab/prot_bert", config=config)
# model.to(device);

In [None]:
# config_class, model_class, tokenizer_class = BertConfig, BertForMaskedLM, BertTokenizer

# config = config_class.from_json_file('models/protbert-cdr3/config.json', output_hidden_states=True, output_attentions=True)       
# # text_feature_extractor = model_class.from_pretrained('models/protbert-cdr3/pytorch_model.bin',
# #                                             from_tf=bool('.ckpt' in 'models/protbert-cdr3/pytorch_model.bin'),
# #                                             config=config)

# tokenizer = tokenizer_class.('models/protbert-cdr3/config.json', do_lower_case=True)
# model_ft = model.from_pretrained('models/protbert-cdr3/pytorch_model.bin', from_tf=bool('.ckpt' in 'models/protbert-cdr3/pytorch_model.bin'), config=config)
# model_ft.to(device);

## Вытаскиваем лучшую модель и считаем скрытые слои

In [None]:
config_class, model_class, tokenizer_class = BertConfig, BertForMaskedLM, BertTokenizer

config = config_class.from_pretrained('models/protbert-cdr3/config.json', output_hidden_states=True, output_attentions=True, num_hidden_layers = 6)
model_ft = AutoModel.from_pretrained('models/protbert-cdr3/pytorch_model.bin',
                                            from_tf=bool('.ckpt' in 'models/protbert-cdr3/pytorch_model.bin'),
                                            config=config)
model_ft.to(device);

In [36]:
model_ft

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30, 1024, padding_idx=0)
    (position_embeddings): Embedding(40000, 1024)
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.0, inplace=False

In [22]:
# model = model.from_pretrained(name, output_hidden_states=True)

In [None]:
# config = config_class.from_pretrained(name, output_hidden_states=True, output_attentions=True)
# tokenizer = tokenizer_class.from_pretrained(name, do_lower_case=True)

# model = model.from_pretrained(name, config=config)

In [None]:
# tokenizer = AutoTokenizer.from_pretrained(model_name)

In [18]:
def forward_pass(batch):
    input_ids = torch.tensor(batch["input_ids"]).to(device)
    attention_mask = torch.tensor(batch["attention_mask"]).to(device)
    
    with torch.no_grad():
        last_hidden_state = model_ft(input_ids, attention_mask).last_hidden_state
        last_hidden_state = last_hidden_state.cpu().numpy()
    
    # Use average of unmasked hidden states for classification
    lhs_shape = last_hidden_state.shape
    boolean_mask = ~np.array(batch["attention_mask"]).astype(bool)
    boolean_mask = np.repeat(boolean_mask, lhs_shape[-1], axis=-1)
    boolean_mask = boolean_mask.reshape(lhs_shape)
    masked_mean = np.ma.array(last_hidden_state, mask=boolean_mask).mean(axis=1)
    batch["hidden_state"] = masked_mean.data
    return batch

my_dataset_encoded_fine = my_dataset_encoded.map(forward_pass, batched=True,
                                        batch_size=16)

  0%|          | 0/520 [00:00<?, ?ba/s]

  0%|          | 0/131 [00:00<?, ?ba/s]

In [31]:
my_dataset_encoded_fine

DatasetDict({
    train: Dataset({
        features: ['CDR3', 'label', 'input_ids', 'token_type_ids', 'attention_mask', 'hidden_state'],
        num_rows: 8320
    })
    test: Dataset({
        features: ['CDR3', 'label', 'input_ids', 'token_type_ids', 'attention_mask', 'hidden_state'],
        num_rows: 2081
    })
})

### Линейная регрессия на ft эмбеддингах как на фичах

In [None]:
X_train = np.array(my_dataset_encoded_fine["train"]["hidden_state"])
X_test = np.array(my_dataset_encoded_fine["test"]["hidden_state"])
y_train = np.array(my_dataset_encoded_fine["train"]["label"])
y_test = np.array(my_dataset_encoded_fine["test"]["label"])

In [20]:
lr_clf = LogisticRegression(n_jobs=-1, penalty="none",  max_iter=2000)
lr_clf.fit(X_train, y_train)
lr_clf.score(X_test, y_test)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

0.691975012013455

In [34]:
y_preds = lr_clf.predict(X_test)

In [35]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_preds, target_names=['0', '1']))

              precision    recall  f1-score   support

           0       0.75      0.77      0.76      1283
           1       0.61      0.59      0.60       798

    accuracy                           0.70      2081
   macro avg       0.68      0.68      0.68      2081
weighted avg       0.70      0.70      0.70      2081



In [36]:
y_preds

array([0, 1, 0, ..., 0, 1, 1])

In [25]:
from sklearn.metrics import roc_auc_score

train_score = roc_auc_score(
    y_train,
    lr_clf.predict(X_train)
)
test_score = roc_auc_score(
    y_test,
    lr_clf.predict(X_test)
)
train_score, test_score

(0.8377163350019394, 0.6798421423785498)

## True positive and false positive

from log reg + non-ft bert

In [37]:
import numpy as np 

diff = y_test-y_preds
print('diff: ',diff)

# Correct is 0 
# FP is -1 
# FN is 1
print('Correctly classified: ', np.where(diff == 0)[0])
print('Incorrectly classified: ', np.where(diff != 0)[0])
print('False positives: ', np.where(diff == -1)[0])
print('False negatives: ', np.where(diff == 1)[0])

diff:  [0 0 0 ... 0 0 0]
Correctly classified:  [   0    1    2 ... 2078 2079 2080]
Incorrectly classified:  [   4    5    7   14   15   22   26   30   32   34   35   50   56   57
   66   68   70   75   77   82   83   87   91   95  101  105  109  113
  116  120  121  132  134  135  145  147  151  153  154  156  160  162
  163  165  170  171  175  188  191  193  195  198  202  205  207  211
  212  215  222  228  230  242  244  248  257  259  261  264  268  269
  272  273  274  279  294  295  299  303  304  305  308  312  314  315
  319  320  322  327  329  330  336  337  343  346  351  354  357  365
  366  371  372  375  376  378  379  380  383  389  392  395  397  400
  402  404  406  413  414  418  421  423  426  427  430  431  436  441
  445  452  458  459  460  462  463  465  468  470  477  479  481  483
  487  488  490  495  500  504  505  508  509  513  515  519  524  529
  530  531  533  543  549  550  559  564  568  571  573  575  580  587
  589  591  594  599  602  603  606  60

In [42]:
np.where(diff == 0)[0]

array([   0,    1,    2, ..., 2078, 2079, 2080])

In [29]:
my_dataset_dict["test"]["CDR3"] == 'C A S S I L T G P Q P Q H F'

['C A S S L G F P N Q P Q H F',
 'C A S S P D R N T G E L F F',
 'C A S S M R S S G E L F F',
 'C A S S Q D L G A G T E A F F',
 'C A S R Y S G G D T G E L F F',
 'C A S S P L A R R G G Y N E Q F F',
 'C A S S Y S T T G R Y E Q F F',
 'C A S S T G T G V F E D T E A F F',
 'C S A R D E R A L N T G E L F F',
 'C A S A N V G L N T E A F F',
 'C A S S E L T G G G Y E Q Y F',
 'C A S S P S R I L T G E T E K L F F',
 'C A S T R G Q R N G A F F',
 'C A S Q G L N T G E L F F',
 'C S A D G G D I W S D E Q F F',
 'C R Y K G Q G V S G A N V L T F',
 'C S V R D I S T N E K L F F',
 'C A S S S P D N Y R E D T E A F F',
 'C A W S V G G G G Y G Y T F',
 'C A S S M I G T G A L N E Q F F',
 'C A W S L N G D E Q Y F',
 'C S A R D Y R G G T T Y E Q Y F',
 'C S A R D N L A G D T D T Q Y F',
 'C A S S F Q N T G E L F F',
 'C A S S L V Q A S E N E Q Y F',
 'C A S S V R S T D T Q Y F',
 'C A S S Q F N E K L F F',
 'C A S S L F Q G D E Q F F',
 'C A S D R V G S N Q P Q H F',
 'C A S S Q D Q A G G N T I Y F',


In [49]:
print(my_dataset_dict["test"]["CDR3"].index("C A S S I L T G P Q P Q H F"))

950


In [51]:
my_dataset_dict["test"]["label"][950]

0

In [55]:
corr = np.where(diff == 0)[0] 

In [56]:
950 in corr

True

## Bertviz

In [23]:
from bertviz import model_view, head_view
from transformers import AutoTokenizer, AutoModel, utils
from transformers import AutoTokenizer
from bertviz.transformers_neuron_view import BertModel
from bertviz.neuron_view import show

In [45]:
my_dataset_dict["train"]["label"][0]

1

In [None]:
show(model, "bert", tokenize, my_dataset_dict, display_mode="light",
layer=0, head=8)

In [22]:
from bertviz import model_view, head_view
from transformers import AutoTokenizer, AutoModel, utils

utils.logging.set_verbosity_error()

## 0 label

0 - coorectly classified

In [24]:
tokenizer = AutoTokenizer.from_pretrained("Rostlab/prot_bert")
model = AutoModel.from_pretrained("Rostlab/prot_bert", output_attentions=True, num_hidden_layers=6)
inputs = tokenizer.encode(my_dataset_dict["train"]["CDR3"][0], return_tensors='pt')
outputs = model(inputs)
attention = outputs[-1]  # Output includes attention weights when output_attentions=True
tokens = tokenizer.convert_ids_to_tokens(inputs[0]) 
head_view(attention, tokens)

<IPython.core.display.Javascript object>

In [25]:
model_view(attention, tokens)

<IPython.core.display.Javascript object>

In [23]:
my_dataset_dict["train"]["CDR3"][0]

'C A S S I R S G N E Q F F'