In [73]:
!nvidia-smi

Sat Oct 19 17:31:06 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       On  | 00000000:00:04.0 Off |                    0 |
| N/A   76C    P0              31W /  70W |  14746MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [1]:
!export HUGGINGFACEHUB_API_TOKEN=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
!export HF_TOKEN=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

In [2]:
from huggingface_hub import login
access_token_read = "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"
access_token_write = "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"
login(token = access_token_read)

# Training data 
wget http://nlp.dmis.korea.edu/projects/biobert-2020-checkpoints/datasets.tar.gz -O

## BC2GM (BioCreative II Gene Mention Recognition):
This dataset is used for gene mention recognition in biomedical texts. It contains sentences annotated with gene mentions, helping to identify gene names in scientific literature1.
## BC4CHEMD (BioCreative IV Chemical Compound and Drug Name Recognition):
This dataset includes 10,000 PubMed abstracts with 84,355 manually annotated chemical entity mentions. It’s used for recognizing chemical compounds and drug names in biomedical literature2.
## BC5CDR (BioCreative V Chemical Disease Relation):
## BC5CDR-chem: Focuses on chemical entities, with 4409 annotated chemicals in 1500 PubMed articles3.
BC5CDR-disease: Focuses on disease entities, with 5818 annotated diseases in the same set of articles3.
The dataset also includes 3116 chemical-disease interactions, making it useful for studying relationships between chemicals and diseases3.
## JNLPBA (Joint Workshop on Natural Language Processing in Biomedicine and its Applications):
This dataset is derived from the GENIA corpus and contains 2000 abstracts annotated with biomedical entities like proteins, DNA, RNA, cell lines, and cell types. It’s widely used for named entity recognition in biomedical texts4.
Linnaeus:
This dataset is designed for species name recognition in biomedical literature. It includes a variety of document formats and aims to identify and classify species names with high precision and recall5.
## NCBI-disease:
The NCBI Disease corpus consists of 793 PubMed abstracts annotated with disease mentions. It includes concept identifiers from MeSH or OMIM, making it a valuable resource for disease name recognition and normalization6.
## S800:
The S800 corpus contains 800 PubMed abstracts annotated with organism mentions, mapped to NCBI Taxonomy identifiers. It covers a diverse range of species, including bacteria, fungi, plants, and animals7.
considering only /home/sridhanya_ganapathi_team_neustar/PMC-Patients/ddata/datasets/NER/BC5CDR-disease




In [199]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertConfig, BertForTokenClassification
import csv
import glob
from transformers import AutoTokenizer
import gc

In [200]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


In [201]:

def clear_cuda_cache_and_gc():
    torch.cuda.empty_cache()
    gc.collect()

# Call the function
clear_cuda_cache_and_gc()

In [5]:
basefolderpath='ddata/datasets/NER/'
folders = ['BC2GM','BC4CHEMD','BC5CDR-chem','BC5CDR-disease','JNLPBA','linnaeus','NCBI-disease','s800']
trainfilename='train_dev.tsv'
trainfilename2='train.tsv'
valfilename='devel.tsv'
testfilename='test.tsv'

In [6]:
train_file_paths = []
train_file_paths2=[]
val_file_paths=[]
test_file_paths=[]

for folder in folders:
    train_file_paths.append(basefolderpath+folder+"/"+trainfilename)
    train_file_paths2.append(basefolderpath+folder+"/"+trainfilename2)
    val_file_paths.append(basefolderpath+folder+"/"+valfilename)
    test_file_paths.append(basefolderpath+folder+"/"+testfilename)


## BC2GM (BioCreative II Gene Mention Recognition):
B-GENE: Beginning of a gene mention.
I-GENE: Inside a gene mention.
O: Outside any gene mention.
## BC4CHEMD (BioCreative IV Chemical Compound and Drug Name Recognition):
B-CHEMICAL: Beginning of a chemical compound or drug name.
I-CHEMICAL: Inside a chemical compound or drug name.
O: Outside any chemical mention.
## BC5CDR (BioCreative V Chemical Disease Relation):
BC5CDR-chem:
B-CHEMICAL: Beginning of a chemical entity.
I-CHEMICAL: Inside a chemical entity.
## BC5CDR-disease:
B-DISEASE: Beginning of a disease entity.
I-DISEASE: Inside a disease entity.
O: Outside any chemical or disease mention.
## JNLPBA (Joint Workshop on Natural Language Processing in Biomedicine and its Applications):
B-PROTEIN, I-PROTEIN: For protein mentions.
B-DNA, I-DNA: For DNA mentions.
B-RNA, I-RNA: For RNA mentions.
B-CELL_LINE, I-CELL_LINE: For cell line mentions.
B-CELL_TYPE, I-CELL_TYPE: For cell type mentions.
O: Outside any entity mention.
## Linnaeus:
B-SPECIES: Beginning of a species name.
I-SPECIES: Inside a species name.
O: Outside any species mention.
## NCBI-disease:
B-DISEASE: Beginning of a disease mention.
I-DISEASE: Inside a disease mention.
O: Outside any disease mention.
## S800:
B-ORGANISM: Beginning of an organism mention.
I-ORGANISM: Inside an organism mention.
O: Outside any organism mention.

In [7]:
# Initialize an empty list to hold DataFrames

dataframes = []
column_names = ['Value', 'Entity']
# Loop through the file paths and read each file
for file_path in train_file_paths:
    df = pd.read_csv(file_path, sep='\t', quoting=csv.QUOTE_NONE, header=None, names=column_names)  # Adjust header=None if no header
    if 'BC2GM' in file_path:
        df['Entity'] = df['Entity'].replace({'B': 'B-GENE', 'I': 'I-GENE', 'O': 'OUTSIDE'})
    elif 'BC4CHEMD' in file_path:
        df['Entity'] = df['Entity'].replace({'B': 'B-CHEMICAL', 'I': 'I-CHEMICAL', 'O': 'OUTSIDE'})
    elif 'BC5CDR-chem' in file_path:
        df['Entity'] = df['Entity'].replace({'B': 'B-CHEMICAL', 'I': 'I-CHEMICAL', 'O': 'OUTSIDE'})
    elif 'BC5CDR-disease' in file_path:
        df['Entity'] = df['Entity'].replace({'B': 'B-DISEASE', 'I': 'I-DISEASE', 'O': 'OUTSIDE'})
    elif 'JNLPBA' in file_path:
        df['Entity'] = df['Entity'].replace({'B': 'B-PROTEINPROTEIN', 'I': 'I-PROTEIN', 'O': 'OUTSIDE'})
    elif 'linnaeus' in file_path:
        df['Entity'] = df['Entity'].replace({'B': 'B-SPECIES', 'I': 'I-SPECIES', 'O': 'OUTSIDE'})
    elif 'NCBI-disease' in file_path:
        df['Entity'] = df['Entity'].replace({'B': 'B-DISEASE', 'I': 'I-DISEASE', 'O': 'OUTSIDE'})
    elif 's800' in file_path:
        df['Entity'] = df['Entity'].replace({'B': 'B-ORGANISM', 'I': 'I-ORGANISM', 'O': 'OUTSIDE'})
                           
    dataframes.append(df)

# Concatenate all DataFrames into one
combined_df = pd.concat(dataframes, ignore_index=True)

# Display the combined DataFrame
print(combined_df.head())

                 Value   Entity
0  Immunohistochemical  OUTSIDE
1             staining  OUTSIDE
2                  was  OUTSIDE
3             positive  OUTSIDE
4                  for  OUTSIDE


In [8]:
# Initialize an empty list to hold DataFrames

tdataframes = []
column_names = ['Entity', 'Value']
# Loop through the file paths and read each file
for file_path in val_file_paths:
    df = pd.read_csv(file_path, sep='\t', quoting=csv.QUOTE_NONE, header=None, names=column_names)  # Adjust header=None if no header
    if 'BC2GM' in file_path:
        df['Entity'] = df['Entity'].replace({'B': 'B-GENE', 'I': 'I-GENE', 'O': 'OUTSIDE'})
    elif 'BC4CHEMD' in file_path:
        df['Entity'] = df['Entity'].replace({'B': 'B-CHEMICAL', 'I': 'I-CHEMICAL', 'O': 'OUTSIDE'})
    elif 'BC5CDR-chem' in file_path:
        df['Entity'] = df['Entity'].replace({'B': 'B-CHEMICAL', 'I': 'I-CHEMICAL', 'O': 'OUTSIDE'})
    elif 'BC5CDR-disease' in file_path:
        df['Entity'] = df['Entity'].replace({'B': 'B-DISEASE', 'I': 'I-DISEASE', 'O': 'OUTSIDE'})
    elif 'JNLPBA' in file_path:
        df['Entity'] = df['Entity'].replace({'B': 'B-PROTEINPROTEIN', 'I': 'I-PROTEIN', 'O': 'OUTSIDE'})
    elif 'linnaeus' in file_path:
        df['Entity'] = df['Entity'].replace({'B': 'B-SPECIES', 'I': 'I-SPECIES', 'O': 'OUTSIDE'})
    elif 'NCBI-disease' in file_path:
        df['Entity'] = df['Entity'].replace({'B': 'B-DISEASE', 'I': 'I-DISEASE', 'O': 'OUTSIDE'})
    elif 's800' in file_path:
        df['Entity'] = df['Entity'].replace({'B': 'B-ORGANISM', 'I': 'I-ORGANISM', 'O': 'OUTSIDE'})
                           
    tdataframes.append(df)

# Concatenate all DataFrames into one
test_combined_df = pd.concat(dataframes, ignore_index=True)

# Display the combined DataFrame
print(test_combined_df.head())

                 Value   Entity
0  Immunohistochemical  OUTSIDE
1             staining  OUTSIDE
2                  was  OUTSIDE
3             positive  OUTSIDE
4                  for  OUTSIDE


In [9]:
test_combined_df.shape

(3944377, 2)

In [10]:
combined_df.shape

(3944377, 2)

In [11]:
combined_df.isnull().sum()

Value     389
Entity      0
dtype: int64

In [12]:
test_combined_df.isnull().sum()

Value     389
Entity      0
dtype: int64

In [13]:
cleaned_df = combined_df.dropna()

In [14]:
test_cleaned_df = test_combined_df.dropna()

In [15]:
cleaned_df.shape

(3943988, 2)

In [16]:
test_cleaned_df.shape

(3943988, 2)

In [17]:
cleaned_df.dtypes


Value     object
Entity    object
dtype: object

In [18]:
cleaned_df.groupby(['Entity']).size()


Entity
B-CHEMICAL            69514
B-DISEASE             14349
B-GENE                18258
B-ORGANISM             2941
B-PROTEINPROTEIN      40753
B-SPECIES              2830
I-CHEMICAL            74434
I-DISEASE             12846
I-GENE                26537
I-ORGANISM             3795
I-PROTEIN             81563
I-SPECIES              1493
OUTSIDE             3594675
dtype: int64

In [19]:
test_cleaned_df.groupby(['Entity']).size()

Entity
B-CHEMICAL            69514
B-DISEASE             14349
B-GENE                18258
B-ORGANISM             2941
B-PROTEINPROTEIN      40753
B-SPECIES              2830
I-CHEMICAL            74434
I-DISEASE             12846
I-GENE                26537
I-ORGANISM             3795
I-PROTEIN             81563
I-SPECIES              1493
OUTSIDE             3594675
dtype: int64

In [20]:
frequencies=cleaned_df['Entity'].unique()

In [28]:
labels_to_ids = {k: v for v, k in enumerate(frequencies)}
ids_to_labels = {v: k for v, k in enumerate(frequencies)}
labels_to_ids

{'OUTSIDE': 0,
 'B-GENE': 1,
 'I-GENE': 2,
 'B-CHEMICAL': 3,
 'I-CHEMICAL': 4,
 'B-DISEASE': 5,
 'I-DISEASE': 6,
 'B-PROTEINPROTEIN': 7,
 'I-PROTEIN': 8,
 'B-SPECIES': 9,
 'I-SPECIES': 10,
 'B-ORGANISM': 11,
 'I-ORGANISM': 12}

## Preparing the dataset and dataloader

In [221]:
# MAX_LEN = 128
# TRAIN_BATCH_SIZE = 512
# VALID_BATCH_SIZE = 128
# EPOCHS = 1
# LEARNING_RATE = 1e-05
# MAX_GRAD_NORM = 10
# # MAX_LEN = 128
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 1
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

In [222]:
def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """
    Word piece tokenization makes it difficult to match word labels
    back up with individual word pieces. This function tokenizes each
    word one at a time so that it is easier to preserve the correct
    label for each subword. It is, of course, a bit slower in processing
    time, but it will help our model achieve higher accuracy.
    """

    tokenized_sentence = []
    labels = []

    sentence = sentence.strip()

    for word, label in zip(sentence.split(), text_labels.split(",")):

        # Tokenize the word and count # of subwords the word is broken into
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        # Add the tokenized word to the final tokenized word list
        tokenized_sentence.extend(tokenized_word)

        # Add the same label to the new list of labels `n_subwords` times
        labels.extend([label] * n_subwords)

    return tokenized_sentence, labels

In [223]:

class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        # step 1: get the sentence and word labels 
        sentence = self.data.Value[index].strip().split()  
        word_labels = [self.data.Entity[index]]

        # step 2: use tokenizer to encode sentence (includes padding/truncation up to max length)
        encoding = self.tokenizer(sentence,
                                  is_split_into_words=True, 
                                  return_offsets_mapping=True, 
                                  padding='max_length', 
                                  truncation=True, 
                                  max_length=self.max_len)
        
        # step 3: create token labels only for first word pieces of each tokenized word
        labels = [labels_to_ids[label] for label in word_labels] 
        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100
        
        i = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
            if mapping[0] == 0 and mapping[1] != 0:
                encoded_labels[idx] = labels[i]
                i += 1

        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)
        
        return item

    def __len__(self):
        return self.len 

# Function to create DataLoader
def create_dataloader(file_path, tokenizer, max_len, batch_size):
    dataframe = pd.read_csv(file_path)
    dataset = CustomDataset(dataframe, tokenizer, max_len)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    return dataloader


In [224]:
drop_size = 0.5
drop_dataset = test_cleaned_df.sample(frac=drop_size,random_state=200)
test_dataset = test_cleaned_df.drop(drop_dataset.index).reset_index(drop=True)
train_dataset = cleaned_df.reset_index(drop=True)

print("FULL Dataset: {}".format(cleaned_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

# training_set = dataset(train_dataset, tokenizer, MAX_LEN)
# testing_set = dataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (3943988, 2)
TRAIN Dataset: (3943988, 2)
TEST Dataset: (1971994, 2)


In [225]:

# Save the datasets
train_dataset.to_csv('processeddata/train_dataset.csv', index=False)
test_dataset.to_csv('processeddata/test_dataset.csv', index=False)

print("Datasets saved in the current directory.")

Datasets saved in the current directory.


In [226]:

# Example usage
training_loader = create_dataloader('processeddata/train_dataset.csv', tokenizer, max_len=128, batch_size=TRAIN_BATCH_SIZE)
testing_loader = create_dataloader('processeddata/test_dataset.csv', tokenizer, max_len=128, batch_size=VALID_BATCH_SIZE)

In [227]:
labels_to_ids

{'OUTSIDE': 0,
 'B-GENE': 1,
 'I-GENE': 2,
 'B-CHEMICAL': 3,
 'I-CHEMICAL': 4,
 'B-DISEASE': 5,
 'I-DISEASE': 6,
 'B-PROTEINPROTEIN': 7,
 'I-PROTEIN': 8,
 'B-SPECIES': 9,
 'I-SPECIES': 10,
 'B-ORGANISM': 11,
 'I-ORGANISM': 12}

In [228]:
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[0]["input_ids"]), training_set[0]["labels"]):
  print('{0:10}  {1}'.format(token, label))

[CLS]       -100
im          0
##mu        -100
##no        -100
##his       -100
##to        -100
##chemical  -100
[SEP]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100
[PAD]       -100


In [229]:
# train_params = {'batch_size': TRAIN_BATCH_SIZE,
#                 'shuffle': True,
#                 'num_workers': 0
#                 }

# test_params = {'batch_size': VALID_BATCH_SIZE,
#                 'shuffle': True,
#                 'num_workers': 0
#                 }

# training_loader = DataLoader(training_set, **train_params)
# testing_loader = DataLoader(testing_set, **test_params)

In [230]:
model = BertForTokenClassification.from_pretrained('dmis-lab/biobert-base-cased-v1.1', 
                                                   num_labels=len(labels_to_ids), id2label=ids_to_labels,label2id=labels_to_ids)
model.to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [231]:
inputs = training_set[2]
input_ids = inputs["input_ids"].unsqueeze(0)
attention_mask = inputs["attention_mask"].unsqueeze(0)
labels = inputs["labels"].unsqueeze(0)

input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
labels = labels.to(device)

outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
initial_loss = outputs[0]
initial_loss

tensor(3.2635, device='cuda:0', grad_fn=<NllLossBackward0>)

In [232]:
len(labels)

1

In [233]:
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 128, 13])

In [234]:
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

In [235]:
# Defining the training function on the 80% of the dataset for tuning the bert model
def train(epoch):
    print("training_loader*****")
    print(training_loader)
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []
    # put model in training mode
    model.train()
    print(" model.train()*****")
    for idx, batch in enumerate(training_loader):
        
        ids = batch['input_ids'].to(device, dtype = torch.long)
        mask = batch['attention_mask'].to(device, dtype = torch.long)
        labels = batch['labels'].to(device, dtype = torch.long)


        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss, tr_logits = outputs.loss, outputs.logits
        tr_loss += loss.item()
        
        # loss, tr_logits = model(input_ids=ids, attention_mask=mask, labels=labels)
        # tr_loss += loss.item()

        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)
        
        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")
           
        # compute training accuracy
        flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
        
        # only compute accuracy at active labels
        active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)
        #active_labels = torch.where(active_accuracy, labels.view(-1), torch.tensor(-100).type_as(labels))
        
        labels = torch.masked_select(flattened_targets, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)
        
        tr_labels.extend(labels)
        tr_preds.extend(predictions)

        tmp_tr_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy
    
        # gradient clipping
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=MAX_GRAD_NORM
        )
        
        # backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

In [236]:
import torch
torch.cuda.empty_cache()

In [237]:
EPOCHS

1

In [238]:
!export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
training_loader*****
<torch.utils.data.dataloader.DataLoader object at 0x7ffa1db9a6d0>
 model.train()*****
Training loss per 100 training steps: 3.3056726455688477
Training loss per 100 training steps: 0.8076972484146014
Training loss per 100 training steps: 0.609671280825909
Training loss per 100 training steps: 0.544011575810339
Training loss per 100 training steps: 0.503787974039664
Training loss per 100 training steps: 0.4784373146167653
Training loss per 100 training steps: 0.460940418575895
Training loss per 100 training steps: 0.450889910641691
Training loss per 100 training steps: 0.4360365275986558
Training loss per 100 training steps: 0.4276676043686142
Training loss per 100 training steps: 0.41529766067520124
Training loss per 100 training steps: 0.4093970442825182
Training loss per 100 training steps: 0.401973235213836
Training loss per 100 training steps: 0.3955968199534017
Training loss per 100 training steps: 0.3904570037290337
Training loss per 100 tra

In [None]:
model.save_pretrained('model/sri_biobert-base-cased-v1.1')
tokenizer.save_pretrained('token/sri_biobert-base-cased-v1.1')

# PMC-Patients

Patient information are presented as a dataframe 
- `patient_id`: string. A continuous id of patients, starting from 0.
- `patient_uid`: string. Unique ID for each patient, with format PMID-x, where PMID is the PubMed Identifier of source article of the note and x denotes index of the note in source article.
- `PMID`: string. PMID for source article.
- `file_path`: string. File path of xml file of source article.
- `title`: string. Source article title.
- `patient`: string. Patient note.
- `age`: list of tuples. Each entry is in format `(value, unit)` where value is a float number and unit is in 'year', 'month', 'week', 'day' and 'hour' indicating age unit. For example, `[[1.0, 'year'], [2.0, 'month']]` indicating the patient is a one-year- and two-month-old infant.
- `gender`: 'M' or 'F'. Male or Female.
- `relevant_articles`: dict. The key is PMID of the relevant articles and the corresponding value is its relevance score (2 or 1 as defined in the ``Methods'' section).
- `similar_patients`: dict. The key is patient_uid of the similar patients and the corresponding value is its similarity score (2 or 1 as defined in the ``Methods'' section).

In [None]:
import torch
import pandas as pd
import numpy as np
from tqdm import tqdm, trange
import fhirclient.models.patient as fhir_patient
import fhirclient.models.condition as fhir_condition
from langchain_experimental.graph_transformers import LLMGraphTransformer
from langchain_community.llms import Ollama
from langchain_huggingface import HuggingFaceEndpoint
from langchain import PromptTemplate, LLMChain
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
import warnings
import re
import nltk
from autocorrect import Speller
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from autocorrect import Speller
from nltk.tokenize import word_tokenize

import spacy
from spacy import displacy


import gc


In [None]:
warnings.filterwarnings("ignore")
nltk.download("punkt")
nltk.download('stopwords')

In [None]:
nlp = spacy.blank('en')

In [None]:

biomodel = HuggingFaceEndpoint(
   # repo_id="dmis-lab/biobert-v1.1",
    repo_id="alvaroalon2/biobert_diseases_ner",
    temperature=0,
    model_kwargs={"max_length": 180, "device": "cuda"},
    use_auth_token="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"
)

In [None]:
ollama = Ollama(
    base_url='http://10.113.8.4:8086',
    model="cniongolo/biomistral:latest"
)

In [None]:
from datasets import load_dataset

ds = load_dataset("zhengyun21/PMC-Patients")

In [None]:
ds

In [None]:
ds['train'][0]

In [None]:
df = ds['train'].to_pandas()

In [None]:
df.head()

In [None]:
df.columns.to_list()

In [None]:
df.info()

In [None]:
df['gender'].value_counts()

In [None]:
df['age'].value_counts()

In [None]:
# age
extract_age = lambda x: re.findall(r"\d+\.\d+", x)[0]
df.age = df.age.apply(extract_age).astype(float)

In [None]:
df.age

In [None]:
#df.drop(columns=["file_path", "patient_id", "patient_uid", "relevant_articles"], inplace=True)

In [None]:
clinical_note = """
The patient is a 45-year-old male with a history of diabetes and hypertension.
He was prescribed metformin and lisinopril.
"""

In [None]:
# example didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't"
stop_words  = stopwords.words('english')


In [None]:
def to_lower(text):

    """
    Converting text to lower case as in, converting "Hello" to  "hello" or "HELLO" to "hello".
    """
    
    # Specll check the words
    spell  = Speller(lang='en')
    
    texts = spell(text)
    
    return ' '.join([w.lower() for w in word_tokenize(text)])

In [None]:
def clean_text(lower_case):
    # split text phrases into words
    words  = nltk.word_tokenize(lower_case)
    
    
    # Create a list of all the punctuations
    punctuations = [ '/', '!', '?', ';', ':', '(',')', '[',']', '-', '_', '%']
    
    # Remove all the special characters
    punctuations = re.sub(r'\W', ' ', str(lower_case))
    
    # Initialize the stopwords variable, which is a list of words ('and', 'the', 'i', 'yourself', 'is') that do not hold much values as key words
    stop_words  = stopwords.words('english')
    
    # Getting rid of all the words that contain numbers in them
    w_num = re.sub('\w*\d\w*', '', lower_case).strip()
    
    # remove all single characters
    lower_case = re.sub(r'\s+[a-zA-Z]\s+', ' ', lower_case)
    
    # Substituting multiple spaces with single space
    lower_case = re.sub(r'\s+', ' ', lower_case, flags=re.I)
    
    # Removing prefixed 'b'
    lower_case = re.sub(r'^b\s+', '', lower_case)
    
    # Removing non-english characters
    lower_case = re.sub(r'^b\s+', '', lower_case)
    
    # Return keywords which are not in stop words 
    keywords = [word for word in words if not word in stop_words  and word in punctuations and  word in w_num]
    
    return keywords

In [None]:
torch.cuda.empty_cache()
gc.collect()

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print('There are %d GPU(s) available.' % torch.cuda.device_count())
print('We will use the GPU:', torch.cuda.get_device_name(0))


In [None]:
data = pd.read_csv("/home/sridhanya_ganapathi_team_neustar/PMC-Patients/ddata/datasets/NER/BC5CDR-disease/train.tsv", sep="\t").fillna(method="ffill")
# data['Value'][0] = 'B-Chemical'
data.head()

In [None]:
# Install necessary libraries
# !pip install transformers
# !pip install torch
 
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
 
# Load the BioBERT model and tokenizer
model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)
 
# Define the NER pipeline using the BioBERT model
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
 
# Example biomedical text
text = """
A 45-year-old male patient was diagnosed with hypertension and prescribed Lisinopril.
He also has a history of diabetes and his blood test showed elevated glucose levels.
"""
text=clean_text(to_lower(clinical_note))
text=" ".join(text)
 
# Perform Named Entity Recognition (NER) on the text
entities = ner_pipeline(text)
 
# Print the detected entities
for entity in entities:
    print(f"Entity: {entity['word']}, Type: {entity['entity_group']}, Confidence: {entity['score']:.4f}")
#Label_1 typically represents medical text.
#Label_0 typically represents non-medical text.

In [None]:
!wget --load-cookies /tmp/cookies.txt "https://docs.google.com/uc?export=download&confirm=$(wget --quiet --save-cookies /tmp/cookies.txt --keep-session-cookies --no-check-certificate 'https://docs.google.com/uc?export=download&id=1R84voFKHfWV9xjzeLzWBbmY1uOMYpnyD' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1\n/p')&id=1R84voFKHfWV9xjzeLzWBbmY1uOMYpnyD" -O biobert_weights && rm -rf /tmp/cookies.txt


In [None]:
import transformers
import torch
tokenizer = transformers.LlamaTokenizer.from_pretrained('axiong/PMC_LLaMA_13B')
model = transformers.LlamaForCausalLM.from_pretrained('axiong/PMC_LLaMA_13B')
model.cuda()  # move the model to GPU

prompt_input = (
    'Below is an instruction that describes a task, paired with an input that provides further context.'
    'Write a response that appropriately completes the request.\n\n'
    '### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:'
)

example = {
    "instruction": "You're a doctor, kindly address the medical queries according to the patient's account. Answer with the best option directly.",
    "input": (
        "###Question: A 23-year-old pregnant woman at 22 weeks gestation presents with burning upon urination. "
        "She states it started 1 day ago and has been worsening despite drinking more water and taking cranberry extract. "
        "She otherwise feels well and is followed by a doctor for her pregnancy. "
        "Her temperature is 97.7°F (36.5°C), blood pressure is 122/77 mmHg, pulse is 80/min, respirations are 19/min, and oxygen saturation is 98% on room air."
        "Physical exam is notable for an absence of costovertebral angle tenderness and a gravid uterus. "
        "Which of the following is the best treatment for this patient?"
        "###Options: A. Ampicillin B. Ceftriaxone C. Doxycycline D. Nitrofurantoin"
    )
}
input_str = [prompt_input.format_map(example)]

model_inputs = tokenizer(
    input_str,
    return_tensors='pt',
    padding=True,
)
print( f"\033[32mmodel_inputs\033[0m: { model_inputs }" )


topk_output = model.generate(
    model_inputs.input_ids.cuda(),
    max_new_tokens=1000,
    top_k=50
)
output_str = tokenizer.batch_decode(topk_output)
print('model predict: ', output_str[0])

In [None]:
!pip install https://huggingface.co/kormilitzin/en_core_med7_lg/resolve/main/en_core_med7_lg-any-py3-none-any.whl

In [None]:
import medspacy
from medspacy.ner import TargetRule
from medspacy.visualization import visualize_ent

# Load medspacy model
nlp = medspacy.load()
print(nlp.pipe_names)

text = """
Past Medical History:
1. Atrial fibrillation
2. Type II Diabetes Mellitus

Assessment and Plan:
There is no evidence of pneumonia. Continue warfarin for Afib. Follow up for management of type 2 DM.
"""

# Add rules for target concept extraction
target_matcher = nlp.get_pipe("medspacy_target_matcher")
target_rules = [
    TargetRule("atrial fibrillation", "PROBLEM"),
    TargetRule("atrial fibrillation", "PROBLEM", pattern=[{"LOWER": "afib"}]),
    TargetRule("pneumonia", "PROBLEM"),
    TargetRule("Type II Diabetes Mellitus", "PROBLEM", 
              pattern=[
                  {"LOWER": "type"},
                  {"LOWER": {"IN": ["2", "ii", "two"]}},
                  {"LOWER": {"IN": ["dm", "diabetes"]}},
                  {"LOWER": "mellitus", "OP": "?"}
              ]),
    TargetRule("warfarin", "MEDICATION")
]
target_matcher.add(target_rules)

doc = nlp(text)
visualize_ent(doc)

In [None]:
!pip install https://huggingface.co/kormilitzin/en_core_med7_trf/blob/main/en_core_med7_trf-any-py3-none-any.whl


In [None]:
import medspacy
from medspacy.ner import TargetRule
from medspacy.visualization import visualize_ent

# Load medspacy model
nlp = medspacy.load()
print(nlp.pipe_names)

text = """
Past Medical History:
1. Atrial fibrillation
2. Type II Diabetes Mellitus

Assessment and Plan:
There is no evidence of pneumonia. Continue warfarin for Afib. Follow up for management of type 2 DM.
"""

# Add rules for target concept extraction
target_matcher = nlp.get_pipe("medspacy_target_matcher")
target_rules = [
    TargetRule("atrial fibrillation", "PROBLEM"),
    TargetRule("atrial fibrillation", "PROBLEM", pattern=[{"LOWER": "afib"}]),
    TargetRule("pneumonia", "PROBLEM"),
    TargetRule("Type II Diabetes Mellitus", "PROBLEM", 
              pattern=[
                  {"LOWER": "type"},
                  {"LOWER": {"IN": ["2", "ii", "two"]}},
                  {"LOWER": {"IN": ["dm", "diabetes"]}},
                  {"LOWER": "mellitus", "OP": "?"}
              ]),
    TargetRule("warfarin", "MEDICATION")
]
target_matcher.add(target_rules)

doc = nlp(text)
visualize_ent(doc)

In [None]:
import warnings
warnings.filterwarnings("ignore", message="numpy.dtype size changed")
warnings.filterwarnings("ignore", message="numpy.ufunc size changed")

In [None]:
from transformers import pipeline

# Load BioBERT NER model
ner_model = pipeline('ner', model='d4data/biomedical-ner-all')
 
# Input text
text = "The patient was diagnosed with glioblastoma and treated with temozolomide."
 
# Extract entities
entities = ner_model(text)
 
# Output named entities
for entity in entities:
    print(f"Entity: {entity['word']}, Label: {entity['entity']}")

In [None]:
!pip install transformers

In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118


In [None]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForTokenClassification

tokenizer = AutoTokenizer.from_pretrained("d4data/biomedical-ner-all")
model = AutoModelForTokenClassification.from_pretrained("d4data/biomedical-ner-all")

pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple", device=0) # pass device=0 if using gpu
pipe("""The patient reported no recurrence of palpitations at follow-up 6 months after the ablation.""")


In [None]:
!pip install scispacy

In [None]:
!python -m spacy download en_ner_bc5cdr_md

In [None]:
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz


In [None]:
!pip install --upgrade pip setuptools wheel


In [None]:
!pip install numpy==1.21

In [None]:
!pip install spacy==2.2

In [None]:
import spacy
import srsly

nlp = spacy.load("en_core_web_sm")

In [None]:
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bc5cdr_md-0.5.4.tar.gz


In [None]:
!pip install https://huggingface.co/kormilitzin/en_core_med7_lg/resolve/main/en_core_med7_lg-any-py3-none-any.whl


In [None]:
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scispacy.umls_linking import UmlsEntityLinker
from collections import OrderedDict
from spacy.tokens import Doc

# Load the spaCy model
nlp = spacy.load("en_ner_bc5cdr_md")

# Define the text
text = """
The patient is well known to me for a history of iron-deficiency anemia due to chronic blood loss from colitis. We corrected her hematocrit last year with intravenous (IV) iron. Ultimately, she had a total proctocolectomy done on 03/14/2007 to treat her colitis. Her course has been very complicated since then with needing multiple surgeries for removal of hematoma. This is partly because she was on anticoagulation for a right arm deep venous thrombosis (DVT) she had early this year, complicated by septic phlebitis. Chart was reviewed, and I will not reiterate her complex history. I am asked to see the patient again because of concerns for coagulopathy. She had surgery again last month to evacuate a pelvic hematoma, and was found to have vancomycin resistant enterococcus, for which she is on multiple antibiotics and followed by infectious disease now. She is on total parenteral nutrition (TPN) as well. LABORATORY DATA: Labs today showed a white blood cell count of 12,000.
"""

# Set up stopwords
stop_words = set(stopwords.words('english'))

# Tokenize and filter stopwords
word_tokens = word_tokenize(text)
filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]

# Ensure the filtered_sentence list is populated correctly
filtered_sentence = []
for w in word_tokens:
    if w.lower() not in stop_words:
        filtered_sentence.append(w)

# Load the UMLS Entity Linker
linker = UmlsEntityLinker(k=10, max_entities_per_mention=2)

# Register the extension attribute
if not Doc.has_extension("umls_ents"):
    Doc.set_extension("umls_ents", default=[])

# Add the linker to the pipeline
nlp.add_pipe("scispacy_linker", last=True)

# Process the text
doc = nlp(text)

# Extract entities
entities = doc.ents
entity_texts = [str(item) for item in entities]

# Create an ordered dictionary of entities
entity_dict = OrderedDict.fromkeys(entity_texts)

# Process the entities with spaCy
entity_doc = nlp(" ".join(entity_dict.keys()))

# Print the entities and their UMLS concepts
for entity in entity_doc.ents:
    if entity._.umls_ents:  # Check if umls_ents attribute is present
        for umls_ent in entity._.umls_ents:
            print("Entity_name:", entity.text)
            concept_id, score = umls_ent
            print("concept_id={} Score={}".format(concept_id, score))
    else:
        print(f"No UMLS entities found for: {entity.text}")


In [None]:

# Register the extension attribute
if Doc.has_extension("umls_ents"):
    print("not")


In [None]:
for entity in entities:
    print(entity)

In [None]:
doc

In [None]:
text

In [None]:
doc._.umls_ents

In [None]:
entity_doc._.umls_ents

In [None]:
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bc5cdr_md-0.5.4.tar.gz
!pip install https://huggingface.co/kormilitzin/en_core_med7_trf




In [None]:
entity_doc.ents

In [None]:
!pip install https://huggingface.co/kormilitzin/en_core_med7_lg/resolve/main/en_core_med7_lg-any-py3-none-any.whl

In [None]:
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_md-0.5.1.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bc5cdr_md-0.5.1.tar.gz