In [1]:
import pandas as pd
import numpy as np
import json
import os
import re
import datasets
from torch.utils.data import DataLoader,Dataset,RandomSampler, SequentialSampler
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from typing import List
from transformers import BertModel,BertTokenizer,AdamW, get_linear_schedule_with_warmup
from transformers import AutoModelForSequenceClassification, PreTrainedTokenizerFast
from torch import nn ,cuda
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import BertTokenizer, BertTokenizerFast
# cuda
#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


2022-11-28 11:50:36.476825: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-11-28 11:50:36.596431: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-11-28 11:50:36.596449: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-11-28 11:50:36.619951: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-11-28 11:50:37.220530: W tensorflow/stream_executor/platform/de

In [50]:
class MultiLDataset(Dataset):

    def __init__(self,
                 texts: list,
                 labels: list,
                 tokenizer,
                 max_len: int,
                 binarizer):
        self.binarizer = binarizer
        self.tokenizer = tokenizer
        self.texts = texts
        self.labels : np.ndarray =  self.binarizer.transform(labels)#np.asarray(labels)# 
        self.max_len = max_len


    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item_idx):
        text: str = self.texts[item_idx]
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=False,
            return_attention_mask=True,
            truncation=True,
            return_tensors='pt'
        )

        input_ids = inputs['input_ids'].flatten()
        attn_mask = inputs['attention_mask'].flatten()

        return {'input_ids': input_ids,
                'attention_mask': attn_mask,
                'label': torch.tensor(self.labels[item_idx], dtype=torch.float)
                }


class DataModule(pl.LightningDataModule):

    ## binarizer is automatically set to None
    ## but it must be initialized with a
    ## binarizer, such as
    def __init__(self, x_tr_:list, y_tr_:list,\
                 x_val_:list, y_val_:list,\
                 x_test_:list, y_test_:list,
                 tokenizer_, batch_size=16, max_token_len=200,
                 binarizer=None):

        super().__init__()
        self.binarizer = binarizer
        self.tr_text = x_tr_
        self.tr_label = y_tr_
        self.val_text = x_val_
        self.val_label = y_val_
        self.test_text = x_test_
        self.test_label = y_test_
        self.tokenizer = tokenizer_
        self.batch_size = batch_size
        self.max_token_len = max_token_len

    def setup(self, stage=None):
        self.train_dataset = MultiLDataset(texts=self.tr_text,
                                       labels=self.tr_label,
                                       tokenizer=self.tokenizer, max_len=self.max_token_len, binarizer=self.binarizer)
        self.val_dataset = MultiLDataset(texts=self.val_text, labels=self.val_label, tokenizer=self.tokenizer,
                                     max_len=self.max_token_len, binarizer=self.binarizer)
        self.test_dataset = MultiLDataset(texts=self.test_text, labels=self.test_label, tokenizer=self.tokenizer,
                                      max_len=self.max_token_len, binarizer=self.binarizer)


    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=4)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=16)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=16)


# we will use the BERT base model(the smaller one)
class LabelsClassifier(pl.LightningModule):
    # Set up the classifier
    def __init__(self, bert_model, n_classes:int, steps_per_epoch=None, n_epochs=3, lr=2e-5):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model, return_dict=True)
        #self.bert = AutoModelForSequenceClassification.from_pretrained(bert_model, num_labels=n_classes,return_dict=True)
        self.classifier = torch.nn.Linear(self.bert.config.hidden_size, n_classes)
        self.dropout =  nn.Dropout(0.25)
        self.steps_per_epoch = steps_per_epoch
        self.n_epochs = n_epochs
        self.lr = lr
        self.criterion = nn.BCEWithLogitsLoss()

    def forward(self, input_ids, attn_mask):
        output = self.bert(input_ids=input_ids, attention_mask=attn_mask)
        print("first output",  len(output.pooler_output))
        print("first output 0",  len(output.pooler_output[0]))
        print("first output 1",  len(output.pooler_output[1]))
        print("output pooler")
        print(len(output.pooler_output[0]))
        print(len(output.pooler_output[1]))
        pooled_output = self.dropout(output.pooler_output)
        print("after dropout ",  len(output))
        print("after dropout ---  ",  len(output[0][0]))
        print("after dropout ---  ",  len(output[0][1]))
        output = self.classifier(pooled_output)
        print("after linear transformation ",  output.shape)
        print("output is")
        print(output)
        import sys
        sys.exit
        return output

    def training_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label']

        outputs = self(input_ids, attention_mask)
        loss = self.criterion(outputs, labels)
        self.log('train_loss', loss, prog_bar=True, logger=True)

        return {"loss": loss, "predictions": outputs, "labels": labels}

    def validation_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label']

        outputs = self(input_ids, attention_mask)
        loss = self.criterion(outputs, labels)
        self.log('val_loss', loss, prog_bar=True, logger=True)

        return loss

    def test_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['label']

        outputs = self(input_ids, attention_mask)
        loss = self.criterion(outputs, labels)
        self.log('test_loss', loss, prog_bar=True, logger=True)

        return loss

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=self.lr)
        warmup_steps = self.steps_per_epoch // 3
        total_steps = self.steps_per_epoch * self.n_epochs - warmup_steps

        scheduler = get_linear_schedule_with_warmup(optimizer, warmup_steps, total_steps)

        return [optimizer], [scheduler]

### move to antoher folder

def _binarize(labels):

    #print(MultiLabelBinarizer().fit(labels).classes_.shape)
    return MultiLabelBinarizer().fit(labels)

def split_train_test(texts:list, labels:list, random_seed=99):

    x_train, x_test, y_train, y_test = \
            train_test_split(texts, labels,\
                            test_size = 0.1, random_state=random_seed,\
                             shuffle=True)
    return x_train, x_test, y_train, y_test

def func_utf8(text):
    try:
        return text.encode("latin-1").decode("UTF-8")
    except Exception as E:
        return  text


def remwithre(text, there=re.compile(re.escape('Mit freundlich')+'.*')):
    return there.sub('', text)

def modelCheckpoint(filename = None, top_models = 10, mode='min'):
    #pytorch_lightning.callbacks.ModelCheckpoint(...)
    if(type(filename) == type(None)):
        filename = 'Labels-{epoch:02d}-{val_loss:.2f}-{train_loss:.2f}'
    return ModelCheckpoint(monitor = 'val_loss' ,# monitored quantity,
                           filename=filename,
                           save_top_k=top_models ,# save the top k models
                           mode=mode,) # mode of the monitored quantity for optimization)

In [51]:
"""
## the .json file for the data is store as follow:
##        '{"data":[]}'
multi_path = "/home/fb198/BA/classification/data_files_classification_data_multi_labeled_hf_dataset_organs_disorders.json"
data = datasets.Dataset.from_json(multi_path)
data


df = pd.DataFrame(data)
#print(df.shape)
# df.text = df['text'].apply(func_utf8)
#for idx_, t in enumerate(texts):
#    texts[idx_] = func_utf8(t)


texts = [remwithre(t) for t in df.text.tolist()]
texts = [remwithre(t,there=re.compile(re.escape('Prof.')+'.*')) for t in texts]
texts = [t.replace("\n\r\n", '') for t in texts]
texts = [t.replace("\r", '') for t in texts]
texts = [re.sub(r'\.+', ".", t) for t in texts]
texts = [re.sub(r'\-+', "-", t) for t in texts]
texts = [re.sub(r'\,+', "", t) for t in texts]

index2labelname = ['colon', 'prostate', 'stomach', 'inflammation', 'carcinoma', 'adenoma', 'BPH']
labels_list = df.label.tolist()
#labels_list[:2]
for idx, l in enumerate(labels_list):
    temp = []
    for idx_, i in enumerate(l):
        if(i == 1):
            temp.append(index2labelname[idx_])
    labels_list[idx] = temp

df['text'] = texts
df['label'] = labels_list
"""
multi_path = multi_path = "/home/fb198/BA/DataNephroTexts/classification_data/filtered_multi_labeled_final.json"
data = datasets.Dataset.from_json(multi_path)
data

index2labelname = {7:'colon',
                   1:'prostate',
                   2:'stomach',
                   3:'inflammation',
                   4:'carcinoma',
                   5:'adenoma',
                   6:'BPH'}
labelname2index = {'colon': 7,
                     'prostate': 1,
                     'stomach': 2,
                     'inflammation': 3,
                     'carcinoma': 4,
                     'adenoma': 5,
                     'BPH': 6}
index2labelnamelist = ['colon', 'prostate', 'stomach', 'inflammation', 'carcinoma', 'adenoma', 'BPH']


df_mulitlabel = pd.DataFrame(data)
labels_list = df_mulitlabel.label.tolist()
#labels_list[:2]
for idx, l in enumerate(labels_list):
    temp = []
    for idx_, i in enumerate(l):
        if(i == 1):
            temp.append(index2labelnamelist[idx_])
    labels_list[idx] = temp
df_mulitlabel['labels_test'] = labels_list
df_mulitlabel.columns = ["text", "label_numerical", "label"]
print(f"first five rows in dataframe \n {df_mulitlabel.head(5)}")



x_train, x_test, y_train, y_test = split_train_test(df_mulitlabel['text'].tolist(), df_mulitlabel['label'].tolist())
x_tr, x_val, y_tr, y_val = split_train_test(x_train, y_train)
print(f"number of samples in training data {len(x_train)}, number of samples in testing data {len(x_test)}")

## binarize labels
binarizer = _binarize(df_mulitlabel['label'].tolist())


Using custom data configuration default-1be2fdcf59742fff
Reusing dataset json (/home/fb198/.cache/huggingface/datasets/json/default-1be2fdcf59742fff/0.0.0)


first five rows in dataframe 
                                                 text        label_numerical  \
0   Fragmentiert vorliegende 0,5 cm große Probeen...  [0, 0, 1, 1, 0, 0, 0]   
1   1 (rechts Basis medial): Eine maximal 1,3 cm ...  [0, 1, 0, 0, 1, 0, 0]   
2  Nachbericht:  Im nachträglich eingebetteten Fe...  [0, 0, 0, 0, 1, 0, 0]   
3   jeweils 0,5 cm durchmessende Proben vom Darm ...  [1, 0, 0, 1, 0, 0, 0]   
4   1. Lymphknoten Milzhilus: Ein 1,6 x 1,3 x 0,5...  [1, 0, 1, 0, 0, 0, 0]   

                     label  
0  [stomach, inflammation]  
1    [prostate, carcinoma]  
2              [carcinoma]  
3    [colon, inflammation]  
4         [colon, stomach]  
number of samples in training data 3813, number of samples in testing data 424


In [52]:
## tokenizer's path

#model_tk_path = "/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/LanguageModelling/LanguageModelling/mlm_evaluation_2/bert-1_sp_1_batch_size_8"
#tk_filtered = "/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/LanguageModelling/LanguageModelling/mlm_evaluation_2/bert-1_sp_1_batch_size_8"
pretrained_model_pth = "/home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/LanguageModelling/LanguageModelling/filtered_data_training_bert/sentencepiece_v1000_filtered_training_data"
BERT_MODEL_NAME = pretrained_model_pth


tokenizer = PreTrainedTokenizerFast.from_pretrained(BERT_MODEL_NAME)
#tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

MAX_LEN = 512
BATCH_SIZE = 8
data_module = DataModule(x_tr_ = x_tr ,y_tr_ = y_tr,
                            x_val_ = x_val, y_val_ = y_val,
                            x_test_ = x_test, y_test_=y_test,
                            tokenizer_ = tokenizer,
                            batch_size=BATCH_SIZE, max_token_len=MAX_LEN, binarizer=binarizer)
data_module.setup()
checkpoint_callback = modelCheckpoint()
num_unique_classes =  len(binarizer.classes_) #len(df_mulitlabel.label.tolist()[0]) ## adjust it to a
N_EPOCHS = 10
BATCH_SIZE_ = 8 
LR = 2e-05
steps_per_epoch = len(x_tr)//BATCH_SIZE_


model = LabelsClassifier(BERT_MODEL_NAME,
                    n_classes=num_unique_classes,
                    steps_per_epoch=steps_per_epoch,
                    n_epochs=N_EPOCHS,
                    lr=LR
                        )

# Instantiate and set up the data_module

model
# Instantiate the Model Trainer
trainer = pl.Trainer(max_epochs=N_EPOCHS, gpus=1, callbacks=[checkpoint_callback], enable_progress_bar=True)
# Train the Classifier Model
trainer.fit(model, data_module)

Some weights of the model checkpoint at /home/fb198/BA/nlp-in-diagnostic-texts-from-nephropathology-master/LanguageModelling/LanguageModelling/filtered_data_training_bert/sentencepiece_v1000_filtered_training_data were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertModel w

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type              | Params
-------------------------------------------------
0 | bert       | BertModel         | 86.8 M
1 | classifier | Linear            | 5.4 K 
2 | dropout    | Dropout           | 0     
3 | criterion  | BCEWithLogitsLoss | 0     
-------------------------------------------------
86.8 M    Trainable params
0         Non-trainable params
86.8 M    Total params
347.259   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


first output 16
first output 0 768
first output 1 768
output pooler
768
768
after dropout  2
after dropout ---   512
after dropout ---   512
after linear transformation  torch.Size([16, 7])
output is
tensor([[ 6.7387e-03,  1.1480e-01, -1.9100e-02, -1.7665e-01,  1.3457e-01,
          4.7156e-01, -1.2673e-01],
        [ 5.4611e-04,  1.7378e-01,  3.8233e-02, -2.9726e-01,  1.1189e-01,
          3.0127e-01, -1.9259e-01],
        [ 4.9352e-04,  2.2738e-01,  4.8346e-02, -2.0077e-01,  5.1712e-02,
          3.4578e-01, -1.7440e-01],
        [-3.7720e-02,  1.5368e-01,  4.1026e-02, -3.2399e-01,  1.4420e-01,
          2.7285e-01, -1.8885e-01],
        [-1.0046e-01,  1.7977e-01, -1.2886e-01, -2.0054e-01,  1.3153e-01,
          2.3185e-01, -2.3985e-01],
        [ 4.2761e-02,  3.1786e-01, -4.9221e-03, -2.9027e-01,  1.2250e-01,
          4.3284e-01, -1.6661e-01],
        [-1.8256e-01,  1.1400e-01, -1.1158e-01, -2.9309e-01,  2.6147e-02,
          3.7845e-01, -2.1318e-01],
        [ 1.6019e-02,  1.4518e

Training: 0it [00:00, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

first output 8
first output 0 768
first output 1 768
output pooler
768
768
after dropout  2
after dropout ---   512
after dropout ---   512
after linear transformation  torch.Size([8, 7])
output is
tensor([[ 0.0310,  0.5032, -0.0970, -0.2175, -0.1381,  0.0829,  0.0782],
        [-0.0398,  0.1543,  0.2319, -0.0936,  0.1732,  0.1937, -0.2723],
        [-0.3464,  0.0790, -0.1107, -0.5117,  0.0387,  0.8389,  0.0736],
        [ 0.1416,  0.2867,  0.0637,  0.0392, -0.0861,  0.1166, -0.1860],
        [-0.1506,  0.1386,  0.2109, -0.3218,  0.0980,  0.7958, -0.0432],
        [ 0.0234,  0.5302, -0.2681, -0.2902, -0.0368,  0.6210, -0.1478],
        [-0.5369, -0.0127, -0.3511,  0.2152, -0.1687,  0.5329, -0.4045],
        [-0.2835,  0.2277, -0.3892, -0.0887,  0.0804, -0.0453, -0.2014]],
       device='cuda:0', grad_fn=<AddmmBackward0>)
first output 8
first output 0 768
first output 1 768
output pooler
768
768
after dropout  2
after dropout ---   512
after dropout ---   512
after linear transformation

first output 8
first output 0 768
first output 1 768
output pooler
768
768
after dropout  2
after dropout ---   512
after dropout ---   512
after linear transformation  torch.Size([8, 7])
output is
tensor([[ 0.1998, -0.0340,  0.1166,  0.1042,  0.0360,  0.2208, -0.3306],
        [ 0.0378, -0.1835, -0.0943, -0.2911,  0.1807,  0.3315,  0.0857],
        [ 0.0999,  0.3059,  0.0187, -0.1486,  0.2664,  0.2839, -0.2769],
        [ 0.0552,  0.3255,  0.1112, -0.2846, -0.2466, -0.0445, -0.2265],
        [ 0.0227, -0.1901,  0.2537, -0.0518, -0.0637,  0.4876, -0.1751],
        [-0.1540, -0.1029,  0.1340, -0.4167, -0.3214,  0.6383, -0.0736],
        [ 0.2376,  0.2049, -0.4304,  0.0502,  0.1528,  0.3734,  0.0592],
        [-0.0920,  0.2091, -0.0658,  0.0077,  0.2018,  0.1111,  0.1494]],
       device='cuda:0', grad_fn=<AddmmBackward0>)
first output 8
first output 0 768
first output 1 768
output pooler
768
768
after dropout  2
after dropout ---   512
after dropout ---   512
after linear transformation

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


In [58]:
np.array([1,0,0]).dot(np.log([0.5,0.1,0.1]))

-0.6931471805599453

In [57]:
np.array([0,1,0]).dot(np.log([0.8,0.1,0.1]))

-2.3025850929940455

# test model

In [40]:
# Evaluate the model performance on the test dataset
trainer.test(model,datamodule=data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

first output 16
output pooler
768
after dropout  2
after linear transformation  torch.Size([16, 7])
output is
tensor([[ 3.1158e-01, -4.5311e-01, -5.6874e-02, -1.8313e-01, -3.6651e-01,
          9.0005e-02,  8.5200e-02],
        [ 1.4309e-01, -2.6659e-01, -2.6319e-01, -2.4824e-01, -3.1615e-01,
          8.7576e-02,  1.0758e-01],
        [ 2.9626e-01, -2.0970e-01, -2.5891e-01, -2.1066e-01, -2.7369e-01,
          2.1883e-01,  7.1913e-02],
        [ 3.0371e-01, -2.9100e-02, -1.1777e-01, -9.2428e-02, -2.9317e-01,
          2.4454e-01,  2.1991e-01],
        [ 2.9084e-01, -1.6524e-01, -1.5787e-01, -1.7299e-01, -2.8365e-01,
          5.8818e-02,  1.1024e-01],
        [ 1.7204e-01, -3.3301e-01, -2.4086e-01, -1.1848e-01, -4.6661e-01,
          7.9397e-02,  2.2292e-01],
        [ 1.4905e-01, -1.7121e-01, -4.1588e-02, -2.0829e-01, -2.2960e-01,
          1.9639e-01,  2.3637e-01],
        [ 3.7817e-01, -3.6942e-01, -2.1609e-01, -1.9435e-01, -2.5473e-01,
         -5.3759e-02,  2.1029e-01],
        [ 

tensor([[ 0.2462, -0.1453, -0.1236, -0.1280, -0.2110,  0.0092,  0.0677],
        [ 0.4052, -0.1661, -0.1855, -0.0345, -0.3857, -0.0686,  0.0965],
        [ 0.2844, -0.2099, -0.2007, -0.0443, -0.3813,  0.0628,  0.1055],
        [ 0.1701, -0.2581, -0.1142, -0.2046, -0.2564,  0.0904,  0.2659],
        [ 0.0741, -0.2576, -0.2398, -0.2107, -0.1959,  0.0893,  0.3093],
        [ 0.1885, -0.2198, -0.1756, -0.0753, -0.3552,  0.0997,  0.2413],
        [ 0.3092, -0.2334, -0.1007, -0.1257, -0.3630,  0.0440, -0.0056],
        [ 0.2982, -0.0477, -0.1370, -0.1554, -0.4577,  0.0992, -0.0542],
        [ 0.3219, -0.2411, -0.0811, -0.1568, -0.3486,  0.0777,  0.0395],
        [ 0.2901, -0.1914, -0.1714, -0.0893, -0.2598,  0.0521,  0.1209],
        [ 0.2717, -0.0454, -0.3258, -0.0177, -0.3072,  0.0833,  0.2270],
        [ 0.1875, -0.2250, -0.1711, -0.2238, -0.3945,  0.0876,  0.2015],
        [ 0.2791, -0.0986, -0.2997, -0.0316, -0.2798,  0.0962,  0.2317],
        [ 0.1393, -0.2504, -0.1529, -0.2555, -0.260

tensor([[ 0.2994, -0.2316, -0.2579, -0.1525, -0.3591,  0.0224,  0.0439],
        [ 0.3701, -0.1330, -0.1967, -0.1224, -0.2929, -0.0608,  0.0166],
        [ 0.1834, -0.2619, -0.1863, -0.0963, -0.3483,  0.1356,  0.2003],
        [ 0.2213, -0.3019, -0.1602, -0.1582, -0.2985, -0.0176,  0.2226],
        [ 0.2020, -0.3837, -0.2963, -0.1861, -0.4291, -0.0565,  0.0748],
        [ 0.3272, -0.2375, -0.2122, -0.1228, -0.3605,  0.0628,  0.1163],
        [ 0.3387, -0.1631, -0.1460, -0.1583, -0.3296,  0.0712,  0.0504],
        [ 0.2621, -0.1232, -0.0454, -0.0285, -0.1935,  0.0630,  0.0979],
        [ 0.3437, -0.2529, -0.0269, -0.1343, -0.3349, -0.0579, -0.0333],
        [ 0.2651, -0.2444, -0.1112, -0.1553, -0.2332,  0.1077,  0.1397],
        [ 0.2870, -0.2563, -0.0746,  0.0126, -0.4785,  0.2255, -0.0691],
        [ 0.2883, -0.0819, -0.1395, -0.2586, -0.2827,  0.1565, -0.0097],
        [ 0.2892, -0.1522, -0.2046, -0.1282, -0.3380,  0.0299,  0.0027],
        [ 0.1962, -0.2416, -0.1700, -0.1721, -0.241

tensor([[ 2.5390e-01, -1.8118e-01,  1.6996e-02, -8.3232e-02, -2.1133e-01,
          1.1528e-01,  9.4147e-02],
        [ 3.6496e-01,  6.5263e-02, -9.5154e-02, -2.9278e-01, -4.5829e-01,
          2.0662e-01, -5.3537e-02],
        [ 2.7138e-01,  1.1942e-02,  2.2219e-04, -2.2258e-01, -3.1216e-01,
         -7.0005e-02,  7.8048e-02],
        [ 1.7364e-01, -2.7446e-01, -1.7379e-02, -2.3706e-01, -2.0423e-01,
          4.7542e-02,  1.8480e-01],
        [ 2.7645e-01, -2.5093e-01, -1.7037e-01, -1.9409e-01, -3.3925e-01,
          3.8763e-02,  8.5916e-02],
        [ 2.6145e-01, -2.4202e-01, -7.6329e-02, -8.1567e-02, -2.3540e-01,
          3.9976e-02,  3.4762e-02],
        [ 3.3013e-01,  1.2392e-01, -5.7740e-02, -8.1737e-02, -3.2944e-01,
          4.4428e-02,  9.7522e-02],
        [ 2.3712e-01, -2.7147e-01, -2.7428e-03, -7.3397e-02, -1.2248e-01,
          7.3165e-02,  1.2975e-01],
        [ 3.0923e-01, -1.9293e-01, -2.1727e-01, -1.4150e-01, -3.5055e-01,
          2.6117e-02, -7.1998e-03],
        [ 

tensor([[ 0.3050, -0.0751, -0.0660, -0.1555, -0.4698,  0.1354,  0.0830],
        [ 0.3735, -0.2475, -0.1773, -0.1107, -0.3447, -0.0464,  0.0279],
        [ 0.3336, -0.2469, -0.1641, -0.0575, -0.3678,  0.0070,  0.0844],
        [ 0.2784, -0.2317, -0.1871, -0.0762, -0.2429,  0.1596,  0.1526],
        [ 0.2758, -0.2705, -0.1534, -0.2343, -0.3614,  0.0030,  0.1478],
        [ 0.2050, -0.2801, -0.1675, -0.0954, -0.2404,  0.2065,  0.2085],
        [ 0.2013, -0.2093, -0.1943, -0.0578, -0.3691,  0.0876,  0.2172],
        [ 0.2804, -0.3315, -0.0598, -0.1976, -0.3846,  0.0639,  0.0690],
        [ 0.3146, -0.0719, -0.3004,  0.0098, -0.2794,  0.0885,  0.1450],
        [ 0.2896, -0.0925, -0.1508, -0.1076, -0.2605,  0.0019,  0.2081],
        [ 0.1454, -0.0951, -0.1139, -0.3485, -0.3785,  0.1608,  0.1113],
        [ 0.2624, -0.2712, -0.0726, -0.1763, -0.4019,  0.1657,  0.1000],
        [ 0.3212, -0.1025, -0.0955, -0.1469, -0.4150,  0.1346, -0.0037],
        [ 0.1028, -0.2207, -0.1344, -0.1621, -0.241

[{'test_loss': 0.6836158633232117}]

In [267]:
# Visualize the logs using tensorboard. #lightning_logs/
%load_ext tensorboard
%tensorboard --logdir lightning_logs/

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


Reusing TensorBoard on port 6006 (pid 155923), started 1 day, 21:07:51 ago. (Use '!kill 155923' to kill it.)

In [216]:
len(y_test), len(x_test)

(424, 424)

In [217]:
# Retreive the checkpoint path for best model
model_path = checkpoint_callback.best_model_path
model_path

'/home/fb198/BA/classification/multi_class/lightning_logs/version_8/checkpoints/Labels-epoch=09-val_loss=0.25-train_loss=0.32.ckpt'

In [218]:
#print(len(y_test), len(x_test))
# Size of Test set
print(f'Number of test samples = {len(x_test)}')

Number of test samples = 424


In [219]:
from torch.utils.data import TensorDataset

# Tokenize all questions in x_test
input_ids = []
attention_masks = []


for description in x_test:
    encoded_quest = tokenizer.encode_plus(
                    description,
                    None,
                    add_special_tokens=True,
                    max_length= MAX_LEN,
                    padding = 'max_length',
                    return_token_type_ids= False,
                    return_attention_mask= True,
                    truncation=True,
                    return_tensors = 'pt'      
    )
    # Add the input_ids from encoded question to the list.    
    input_ids.append(encoded_quest['input_ids'])
    # Add its attention mask 
    attention_masks.append(encoded_quest['attention_mask'])
    
# Now convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(binarizer.transform(y_test)) #y_test

# Set the batch size.  
TEST_BATCH_SIZE = 8  

print(labels.shape)
print(attention_masks.shape)

torch.Size([424, 7])
torch.Size([424, 512])


In [220]:
# Create the DataLoader.
pred_data = TensorDataset(input_ids, attention_masks, labels)
pred_sampler = SequentialSampler(pred_data)
pred_dataloader = DataLoader(pred_data, sampler=pred_sampler, batch_size=TEST_BATCH_SIZE)

In [221]:
# Put model in evaluation mode
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device) # moving model to cuda
model.eval()

# Tracking variables 
pred_outs, true_labels = [], []
#i=0
# Predict 
for batch in pred_dataloader:
    # Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
  
    # Unpack the inputs from our dataloader
    b_input_ids, b_attn_mask, b_labels = batch
 
    with torch.no_grad():
        # Forward pass, calculate logit predictions
        pred_out = model(b_input_ids,b_attn_mask)
        pred_out = torch.sigmoid(pred_out)
        # Move predicted output and labels to CPU
        pred_out = pred_out.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        #i+=1
        # Store predictions and true labels
        #print(i)
        #print(outputs)
        #print(logits)
        #print(label_ids)
    pred_outs.append(pred_out)
    true_labels.append(label_ids)
    
# Combine the results across all batches. 
flat_pred_outs = np.concatenate(pred_outs, axis=0)

# Combine the correct labels for each batch into a single list.
flat_true_labels = np.concatenate(true_labels, axis=0)

print(flat_pred_outs.shape , flat_true_labels.shape)

(424, 7) (424, 7)


In [20]:
import numpy as np
from scipy.special import softmax
(softmax([[-3,4,5,8],[7,5,-6,9],[-3,2,1,5]])), np.exp([1,-2,-3])


(array([[3.92310761e-06, 4.30220989e-03, 1.16946190e-02, 2.34892701e-01],
        [8.64121955e-02, 1.16946190e-02, 1.95320027e-07, 6.38504560e-01],
        [3.92310761e-06, 5.82240793e-04, 2.14194418e-04, 1.16946190e-02]]),
 array([2.71828183, 0.13533528, 0.04978707]))

In [None]:
([[-3,4,5,8],[7,5,-6,9],[-3,2,1,5]])

In [49]:
softmax([-3,4,5,8])

array([1.56365484e-05, 1.71475574e-02, 4.66118937e-02, 9.36224912e-01])

In [16]:
from torch import nn
import torch

In [18]:
arr_exp = [[-0.2455, -0.1434, -0.0611,  0.2274, -0.1777, -0.4269,  0.0411],
        [ 0.1890, -0.1654, -0.3152,  0.0682,  0.0319,  0.0546,  0.5577],
        [-0.0895, -0.2367,  0.0864, -0.0883,  0.0559,  0.1175,  0.2676],
        [ 0.1260, -0.0741, -0.1121,  0.2380, -0.0779,  0.0131,  0.1981],
        [ 0.3605, -0.1398, -0.3145,  0.2203, -0.1846,  0.1293,  0.3266],
        [-0.2461, -0.2586,  0.0239, -0.2525, -0.0506, -0.1413,  0.4473],
        [-0.0562, -0.0878,  0.2782,  0.2046, -0.4587,  0.4339,  0.5150],
        [-0.2414, -0.2358,  0.2084, -0.1253, -0.4013,  0.1053,  0.6817]]

In [None]:
arr_exp

In [18]:
a = [[-3.6984,4.4987,5.9874645,8.6498],[7.2165,5.786,-6.648,9.38],[-3.19613,2.1597496,1.78564,5.34984]]
linear_dd = nn.Linear(4,3)

In [22]:
#exp_ = linear_dd(torch.tensor(a), dim=1)
thres_exp = 0.018
softed = softmax(arr_exp)

preds_exp = []
for row in softed:
    temp_exp = []
    for val in row:
        if(val > thres_exp):
            temp_exp.append(1)
        else:
            temp_exp.append(0)
    preds_exp.append(temp_exp)

In [25]:
softed.shape

(8, 7)

In [27]:
thresholdexp  = np.arange(0.2,0.51,0.01)
thresholdexp

array([0.2 , 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3 ,
       0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4 , 0.41,
       0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.5 ])

In [222]:
#define candidate threshold values
threshold  = np.arange(0.2,0.51,0.01)
threshold
# convert probabilities into 0 or 1 based on a threshold value
def classify(pred_prob,thresh):
    y_pred = []

    for tag_label_row in pred_prob:
        temp=[]
        for tag_label in tag_label_row:
            if tag_label >= thresh:
                temp.append(1) # Infer tag value as 1 (present)
            else:
                temp.append(0) # Infer tag value as 0 (absent)
        y_pred.append(temp)

    return y_pred

from sklearn import metrics
scores=[] # Store the list of f1 scores for prediction on each threshold

#convert labels to 1D array
y_true = flat_true_labels.ravel() 

for thresh in threshold:
    
    #classes for each threshold
    pred_bin_label = classify(flat_pred_outs,thresh) 

    #convert to 1D array
    y_pred = np.array(pred_bin_label).ravel()

    scores.append(metrics.f1_score(y_true,y_pred))
    
# find the optimal threshold
opt_thresh = threshold[scores.index(max(scores))]
print(f'Optimal Threshold Value = {opt_thresh}')

print(f"scores: \n{scores}")

Optimal Threshold Value = 0.42000000000000004
scores: 
[0.7362171331636981, 0.7377472055030095, 0.7383015597920277, 0.73665791776028, 0.7360988526037069, 0.7342222222222222, 0.7305282005371531, 0.7249774571686203, 0.7247956403269754, 0.7221206581352834, 0.7199265381083563]


# Performance Score Evaluation


In [223]:
#predictions for optimal threshold
y_pred_labels = classify(flat_pred_outs,opt_thresh)
y_pred = np.array(y_pred_labels).ravel() # Flatten

In [224]:
#y_test[:10], flat_true_labels[:10]

In [225]:
def convert_labels(array_2d):
    array_2d_ = array_2d.copy()
    for idx_2d , arr in enumerate(array_2d_):
        for idx_, value in enumerate(arr):
            
            if value == 0:
                array_2d_[idx_2d][idx_] = -1
            if value == 1:
                array_2d_[idx_2d][idx_] = labelname2index[label_pos[idx_]]
    return array_2d_

In [226]:
in_temp_true = flat_true_labels
in_temp_pred = np.array(y_pred_labels)

flat_true_labels_converted = convert_labels(in_temp_true)
flat_pred_labels_converted = convert_labels(in_temp_pred)

flat_true_labels_converted = flat_true_labels_converted.ravel()
flat_pred_labels_converted = flat_pred_labels_converted.ravel()

print(metrics.classification_report(flat_true_labels_converted, flat_pred_labels_converted))

              precision    recall  f1-score   support

          -1       0.93      0.94      0.94      2373
           1       0.97      0.97      0.97       147
           2       0.68      0.65      0.66        88
           3       0.62      0.16      0.25        51
           4       0.65      0.74      0.70       140
           5       0.43      0.16      0.23        19
           6       0.00      0.00      0.00        22
           7       0.75      0.88      0.81       128

    accuracy                           0.90      2968
   macro avg       0.63      0.56      0.57      2968
weighted avg       0.89      0.90      0.89      2968



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [227]:
## print(y_true.shape, y_pred.shape)
print(metrics.classification_report(y_true,y_pred))

              precision    recall  f1-score   support

           0       0.93      0.94      0.94      2373
           1       0.76      0.72      0.74       595

    accuracy                           0.90      2968
   macro avg       0.85      0.83      0.84      2968
weighted avg       0.90      0.90      0.90      2968



In [228]:
y_pred = binarizer.inverse_transform(np.array(y_pred_labels))
y_act = binarizer.inverse_transform(flat_true_labels)

df_pred_2 = pd.DataFrame({'Body':x_test,'Actual Tags':y_act,'Predicted Tags':y_pred})
df_pred_2

Unnamed: 0,Body,Actual Tags,Predicted Tags
0,"1. (Rechts Basis medial): 0,8cm lange Stanze ...","(carcinoma, prostate)","(carcinoma, prostate)"
1,"1 (15 mm Polyp IC-Klappe): Ein 1,1 x 0,7 x 0,...","(carcinoma,)","(colon,)"
2,"Dysplasiefreie, leicht hyperplastische Dickda...","(colon,)","(colon,)"
3,1-3 Regelrecht aufgebaute Duodenalschleimhaut...,"(inflammation, stomach)","(stomach,)"
4,1 (Dünndarmmetastase bei Mamma- und Sigma-Ca)...,"(colon,)","(colon,)"
...,...,...,...
419,"1. (Antrum): Fragmentiertes, zusammen 0,3 cm ...","(stomach,)","(inflammation, stomach)"
420,"1 (Prostata): Ein 50 g schweres, 6,0 x 4,2 x ...","(carcinoma, prostate)","(carcinoma, prostate)"
421,": 1 (Prostata): Ein 5,8 x 4,9 x 4,8 cm messend...","(prostate,)","(carcinoma, prostate)"
422,Nachbericht: Immunhistochemisch zeigt sich nu...,"(carcinoma, stomach)","(carcinoma,)"


In [229]:
## not really needed
"""actual_tag = df_pred_2["Actual Tags"].tolist()
actual_num = []
act_total = []
for act in actual_tag:
    true_temp = []
    for _x in act:
        true_temp.append(labelname2index[_x])
        act_total.append(labelname2index[_x])
    actual_num.append(true_temp)
    
pred_tag_ = df_pred_2["Predicted Tags"].tolist()
pred_num = []
pred_total = []
for pred_ in pred_tag_:
    pred_temp = []
    for _x in pred_:
        pred_temp.append(labelname2index[_x])
        pred_total.append(labelname2index[_x])
    pred_num.append(pred_temp)

df_pred_2['true_ids'] = actual_num
df_pred_2['pred_ids'] = pred_num
"""


'actual_tag = df_pred_2["Actual Tags"].tolist()\nactual_num = []\nact_total = []\nfor act in actual_tag:\n    true_temp = []\n    for _x in act:\n        true_temp.append(labelname2index[_x])\n        act_total.append(labelname2index[_x])\n    actual_num.append(true_temp)\n    \npred_tag_ = df_pred_2["Predicted Tags"].tolist()\npred_num = []\npred_total = []\nfor pred_ in pred_tag_:\n    pred_temp = []\n    for _x in pred_:\n        pred_temp.append(labelname2index[_x])\n        pred_total.append(labelname2index[_x])\n    pred_num.append(pred_temp)\n\ndf_pred_2[\'true_ids\'] = actual_num\ndf_pred_2[\'pred_ids\'] = pred_num\n'

In [230]:
def predict(description):
    text_enc = tokenizer.encode_plus(
            description,
            None,
            add_special_tokens=True,
            max_length= MAX_LEN,
            padding = 'max_length',
            return_token_type_ids= False,
            return_attention_mask= True,
            truncation=True,
            return_tensors = 'pt'      
    )
    model.to('cpu')
    outputs = model(text_enc['input_ids'], text_enc['attention_mask'])
    pred_out = outputs[0].detach().numpy()
    #print(f'Outputs = {outputs}')
    #print(f'Type = {type(outputs)}')
    #print(f'Pred Outputs = {pred_out}')
    #print(f'Type = {type(pred_out)}')
    #preds = np.round(pred_out)
    #print(pred_out, opt_thresh)
    preds = [(pred > opt_thresh) for pred in pred_out ]
    #pred_list = [ round(pred) for pred in pred_logits ]
    #print(preds)
    preds = np.asarray(preds)
    #print(f'Predictions = {preds}')
    #print(f'Type = {type(preds)}')
    #print(mlb.classes_)
    new_preds = preds.reshape(1,-1).astype(int)
    #print(new_preds)
    #print(new_preds)
    pred_tags = new_preds
    #print(mlb.inverse_transform(np.array(new_preds)))
    return pred_tags 

In [262]:
clinical_texts = df_mulitlabel.text.tolist()
for i in range(0,20):#,500,1000,1520]:
    tags = predict(clinical_texts[i])
    print("========== Test Result ==========")
    print("description: \n")
    print(clinical_texts[i][:500], "\n==========   ==========   ==========")
    print("real label: ", df_mulitlabel.iloc[i].label)
    print("==========   ==========   ==========")
    print("predicted labels: ")
    print(binarizer.inverse_transform(tags))
        
        
    

description: 

 Fragmentiert vorliegende 0,5 cm große Probeentnahmen aus dem Antrum (1), Korpus Pseudopolyp (2). . Nachbericht:  Wieangekündigt, haben wir von Position 2 noch ergänzende Sonderfärbungen angefertigt (Gastrin, Chromogranin A und Synaptophysin). Hierbei zeigt sich eine lineare ECL-Zell-Hyperplasie. Die Biopsie negativ für Gastrin, passend zu einer Entnahme aus dem Corpusbereich, es bestätigt sich somit auch das Vorliegen einer Typ-A-Gastritis. Kein Anhalt für Malignität. .  1. Magenschleimhaut vom 
real label:  ['stomach', 'inflammation']
predicted labels: 
[('stomach',)]
description: 

 1 (rechts Basis medial): Eine maximal 1,3 cm lange Stanze. Einbettung in toto. 2 (rechts Basis lateral): Eine maximal 1 cm lange Stanze und ein Bröckel. Einbettung in toto. 3 (Rechtsmitte medial): Zwei 1,0-0,9 cm lange Stanzen. Einbettung in toto. 4 (Rechtsmitte lateral): Eine 1 cm lange Stanze und multiple Bröckel. Einbettung in toto. 5 (rechts Apex medial): Zwei 0,8-0,3 cm lange Stanzen.

description: 

 als PE Magen zusammen max. 0,5 × 0,5 cm messende Gewebsfragmente. .  Histologisch Corpusschleimhaut mit leichtgradig chronischer, inaktiver Entzündung mit foveolärer Hyperplasie und netziger Stromafibrose. Keine Drüsenkörperatrophie. Kein H.p.-Nachweis in der mod. Giemsa-Färbung. Der Befund entspricht einer Typ C-Gastritis. Kein Anhalt für Malignität.  
real label:  ['stomach']
predicted labels: 
[('stomach',)]
description: 

 1 (Descendens 5 mm großer Polyp): Fragmentierte, zusammen max. 0,4 cm durchmessende Schleimhautprobe. Einbettung in toto. 2 (Transversum,ca. 1 cm großer, gestielter Polyp): Eine 1 x 0,6 x 0,3 cm große, bräunliche, annähernd fingerförmige Schleimhautprobe. Die vermeintlich basale Resektionskante wird blau getuscht. Einbettung in Längsschnitten in toto. 3 (Rechte Flexur, ca. 2 cm großer, flacher Polyp): Fragmentierte, zusammen max. 1 cm durchmessende Schleimhautprobe. Einbettung in toto. .  1. Ei 
real label:  ['colon', 'adenoma']
predicted labels: 

In [232]:
# load a model along with its weights, biases and hyperparameters
#m_name = "dbmdz/bert-base-german-cased"
#n_c = 7
#model = #LabelsClassifier(m_name, n_c)
#model = LabelsClassifier.load_from_checkpoint("/home/fb198/BA/classification/multi_class/lightning_logs/version_7_sp_v1000/checkpoints/Labels-epoch=06-val_loss=0.25-train_loss=0.25.ckpt")
model.eval()

LabelsClassifier(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(1000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

# del example BPH adenoma translate and original text

In [4]:
import datasets
translated_text_path = "/home/fb198/BA/DataNephroTexts/classification_data/disease/detsch_translated_data_disease_filtered.json"
dataset_translated = datasets.Dataset.from_json(translated_text_path)
dataset_translated

Using custom data configuration default-bcca669560a510d7
Reusing dataset json (/home/fb198/.cache/huggingface/datasets/json/default-bcca669560a510d7/0.0.0)


Dataset({
    features: ['text', 'label'],
    num_rows: 1728
})

In [3]:
original_text_path= "/home/fb198/BA/DataNephroTexts/classification_data/data_files_classification_data_hf_dataset_disorder.json"
original_texts = datasets.Dataset.from_json(original_text_path)
original_texts

Using custom data configuration default-95a779ba03b23aaa
Reusing dataset json (/home/fb198/.cache/huggingface/datasets/json/default-95a779ba03b23aaa/0.0.0)


Dataset({
    features: ['text', 'label'],
    num_rows: 1779
})

In [86]:
original_texts['label'][78], dataset_translated['label'][53]

([3, 0], 3)

In [92]:
original_bph_dx = [idx_ for idx_,val in enumerate(original_texts['label']) if val == 3]
(original_bph_dx[:5])

[18, 56, 73, 78, 139]

In [93]:
original_texts['text'][56][54:-70]

'e: 1. (Enukleationsgewebe, HOLEP Prostata): Fragmentiertes, insgesamt 75 g schweres Material. Einbettung in toto. Beurteilung: Fragmentiertes Prostataparenchym ohne Nachweis von Karzinominfiltraten, mit einer herdförmigen, teils follikelbildenden, lymphozytären chronischen, unspezifischen Entzündung. Miterfasstes Urothel der prostatischen Urehtra dysplasiefrei. Kein Anhalt für Malignitätim vorliegenden Material. Prof. Dr. me'

In [84]:
dataset_translated['text'][53]

'(Enukleationsgewebe HOLEP Prostata): Fragmentiert insgesamt 75 g schweres Material. Einbetten in Toto. • Fragmentiertes Prostataparenchym ohne Nachweis von Karzinomfiltraten mit einer herdenförmigen teilweise follikelbildenden lymphatischen chronischen unspezifischen Entzündung. Das Prostataurehtra ist frei von Dysplasie. Kein Anhalt für Malignität im verfügbaren Material'

In [96]:
"Fragmentiertes, insgesamt 75 g schweres Material. Einbettung in toto. Beurteilung: Fragmentiertes Prostataparenchym ohne Nachweis von Karzinominfiltraten, mit einer herdförmigen, teils follikelbildenden, lymphozytären chronischen, unspezifischen Entzündung. Miterfasstes Urothel der prostatischen Urehtra dysplasiefrei."

'Fragmentiertes, insgesamt 75 g schweres Material. Einbettung in toto. Beurteilung: Fragmentiertes Prostataparenchym ohne Nachweis von Karzinominfiltraten, mit einer herdförmigen, teils follikelbildenden, lymphozytären chronischen, unspezifischen Entzündung. Miterfasstes Urothel der prostatischen Urehtra dysplasiefrei.'

In [97]:
" Fragmentiert insgesamt 75 g schweres Material. Einbetten in Toto. • Fragmentiertes Prostataparenchym ohne Nachweis von Karzinomfiltraten mit einer herdenförmigen teilweise follikelbildenden lymphatischen chronischen unspezifischen Entzündung. Das Prostataurehtra ist frei von Dysplasie."

' Fragmentiert insgesamt 75 g schweres Material. Einbetten in Toto. • Fragmentiertes Prostataparenchym ohne Nachweis von Karzinomfiltraten mit einer herdenförmigen teilweise follikelbildenden lymphatischen chronischen unspezifischen Entzündung. Das Prostataurehtra ist frei von Dysplasie.'

In [98]:
model

NameError: name 'model' is not defined