In [1]:
import json
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from transformers import AutoModel, AutoTokenizer, AdamW
from sklearn.model_selection import train_test_split
from collections import Counter

from tqdm import tqdm
import pandas as pd
from utils import *
from contrastive_utils import *
import random
# from class_balanced_loss import *



%load_ext autoreload
%autoreload 2

In [2]:
# Seed every RNG this notebook has imported so that runs are repeatable.
seed = 0
# Python's stdlib RNG: `random` is imported at the top of the notebook, and helpers
# such as `augment` may draw from it — TODO confirm which RNG the helpers use.
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

In [3]:
# Pick the compute device once: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    # Report how many GPUs are visible and the name of GPU 0.
    print('There are %d GPU(s) |available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) |available.
We will use the GPU: NVIDIA GeForce GTX 1080 Ti


In [4]:
def train_contrastive(model, dataloader, tokenizer, optimizer, device, epochs, augment, generation_loss_fn):
    """Pre-train `model` with a contrastive loss plus a long-answer generation loss.

    For every batch, `augment` is called twice on the same inputs to obtain two
    views. Each view is passed through `model`, which must return a
    `(projection, long_answer_logits)` pair. The optimized loss is
    `contrastive_loss(proj_1, proj_2)` plus `generation_loss_fn` evaluated on
    each view's logits against the batch's `long_answer_ids`.

    `contrastive_loss` and `tqdm` are resolved from the notebook's global
    namespace, not passed in.

    Args:
        model: Module called as `model(input_ids, attention_mask)`.
        dataloader: Iterable of dict batches with the keys `input_ids`,
            `attention_mask` and `long_answer_ids`; must support `len()`.
        tokenizer: Forwarded unchanged to `augment`.
        optimizer: Optimizer that is zeroed and stepped once per batch.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over `dataloader`.
        augment: Callable `(input_ids, attention_mask, tokenizer)` returning an
            `(input_ids, attention_mask)` pair.
        generation_loss_fn: Callable `(logits_2d, targets_1d)` returning a
            scalar loss.

    Returns:
        None. The per-batch loss is shown on the progress bar and the mean
        loss of each epoch is printed.
    """
    def _generation_loss(logits, targets):
        # Collapse every leading dimension so the loss sees one row per position.
        return generation_loss_fn(logits.view(-1, logits.size(-1)), targets.view(-1))

    model.train()
    # An empty loader would otherwise make the epoch summary divide by zero.
    num_batches = max(len(dataloader), 1)
    for epoch in range(epochs):
        total_loss = 0
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}")
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            long_answer_ids = batch['long_answer_ids'].to(device)

            # Create two views of the same batch
            aug_input_ids_1, aug_attention_mask_1 = augment(input_ids, attention_mask, tokenizer)
            aug_input_ids_2, aug_attention_mask_2 = augment(input_ids, attention_mask, tokenizer)

            proj_1, long_answer_logits_1 = model(aug_input_ids_1, aug_attention_mask_1)
            proj_2, long_answer_logits_2 = model(aug_input_ids_2, aug_attention_mask_2)

            contrastive = contrastive_loss(proj_1, proj_2)
            # Both views are scored against the same long-answer targets.
            generation_loss = (_generation_loss(long_answer_logits_1, long_answer_ids)
                               + _generation_loss(long_answer_logits_2, long_answer_ids))

            # Out-of-place sum: never mutates the tensor returned by contrastive_loss.
            loss = contrastive + generation_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            progress_bar.set_postfix({'loss': loss.item()})

        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/num_batches:.4f}")

# train_contrastive(model, train_loader, tokenizer, optimizer, device, epochs)

In [5]:
# Load the four data splits in one call.
# NOTE(review): neither `load_pubmedqa_data` nor `data_path` is defined in the cells above;
# presumably both arrive through the star imports (`utils` / `contrastive_utils`) — confirm.
expert_train_processed,artificial_train_processed,unlabeled_processed,expert_test_processed = load_pubmedqa_data(data_path)

In [6]:
# Checkpoint name used for both the tokenizer and the contrastive model, and also
# embedded in the saved weight file names. The commented lines are alternatives.
# model_name = "dmis-lab/biobert-base-cased-v1.1"
# model_name = "nlpie/bio-mobilebert"
# model_name = 'nlpie/bio-tinybert'
model_name = "nlpie/tiny-biobert"

In [7]:
# Build the tokenizer and the contrastive model from the same checkpoint name,
# then move the model to the selected device.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = PubMedQAContrastive(model_name).to(device)

Some weights of BertModel were not initialized from the model checkpoint at nlpie/tiny-biobert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Three datasets sharing one tokenizer and max_length=512:
#   train_dataset     - expert_train_processed and artificial_train_processed concatenated
#   unlabeled_dataset - the unlabeled split (fed to the contrastive pre-training loader)
#   test_dataset      - expert_test_processed
train_dataset = PubMedQADataset(expert_train_processed + artificial_train_processed, tokenizer,max_length = 512)
unlabeled_dataset = PubMedQADataset(unlabeled_processed, tokenizer,max_length = 512)
test_dataset = PubMedQADataset(expert_test_processed,tokenizer,max_length = 512)

In [9]:
# Read the raw test-set JSON; `.T` transposes the frame so that each example id is a row label.
# NOTE(review): `os` is never imported explicitly in this notebook — it presumably leaks in
# through a star import; an explicit `import os` in the import cell would be safer.
test_set_path = os.path.join(data_path,"test_set.json")
df_test = pd.read_json(test_set_path).T

In [10]:
# Total number of elements across the model's parameters that require gradients.
sum(p.numel() for p in model.parameters() if p.requires_grad)

23087604

In [11]:
def compute_class_counts(dataset):
    """Tally how often each label value occurs in `dataset`.

    Every item must be a mapping whose `'label'` entry exposes `.item()`; the
    value returned by `.item()` is the key being counted. Iteration is wrapped
    in `tqdm` for a progress bar.

    Returns:
        A `Counter` mapping label value -> number of occurrences.
    """
    return Counter(sample['label'].item() for sample in tqdm(dataset))

In [11]:

# Count the labels over the full labeled training set (counts only; the weights are derived in a later cell).
# The recorded run iterated 211,769 items and took about 7 minutes.
class_counts = compute_class_counts(train_dataset)

100%|██████████| 211769/211769 [07:02<00:00, 501.26it/s]


In [12]:
# Hard-coded per-class counts, indexed by label id. They sum to 211,769, the size of
# train_dataset shown by the progress bar of the counting cell, so they look like a cached
# copy of compute_class_counts(train_dataset) — NOTE(review): confirm, and prefer deriving
# them from `class_counts` so they cannot go stale.
class_count_list = [196420, 15294, 55]

In [13]:
# Inverse-frequency class weights, scaled so the most frequent class gets weight 1.0.
# class_count_list = [class_counts[i] for i in range(len(class_counts))]
class_weights = [max(class_count_list) / count for count in class_count_list]
# Follow the notebook-wide `device` instead of a hard-coded 'cuda' so the cell also runs on CPU.
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

In [14]:
# Inspect the resulting per-class weight tensor.
class_weights

tensor([1.0000e+00, 1.2843e+01, 3.5713e+03], device='cuda:0')

In [15]:
# The two training loaders shuffle; the test loader does not.
# NOTE(review): the prediction-saving cells pass df_test.index alongside predictions made over
# test_loader, so the unshuffled order presumably has to match df_test's row order — confirm.
labeled_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
unlabeled_loader = DataLoader(unlabeled_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset,batch_size=32, shuffle=False)

In [16]:
# Optimizer over the contrastive model's parameters (passed to train_contrastive).
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

In [14]:
# Number of epochs for the contrastive pre-training call.
num_epochs = 2

In [15]:
# Cross-entropy for the long-answer logits; targets equal to the tokenizer's pad token id are ignored.
generation_loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

In [38]:
# Phase 1: contrastive pre-training on the unlabeled loader.
# `augment` is not defined in this notebook; presumably it comes from a star import — TODO confirm.
train_contrastive(model, unlabeled_loader, tokenizer,optimizer, device, 
                  num_epochs, augment,generation_loss_fn)

Epoch 1/2: 100%|██████████| 7657/7657 [29:45<00:00,  4.29it/s, loss=0.000563]


Epoch 1/2, Loss: 1.0848


Epoch 2/2: 100%|██████████| 7657/7657 [29:45<00:00,  4.29it/s, loss=6.44e-5]

Epoch 2/2, Loss: 0.7442





In [39]:
# torch.save(model.state_dict(), 'pubmedqa_contrastive_model.pth')

# Checkpoint the contrastive pre-training run. The metadata is read from the live objects
# (`num_epochs`, the optimizer's first param group) rather than re-typed as literals, so it
# cannot drift from what was actually used.
# NOTE(review): torch.save does not create the "weights/" directory; it must already exist.
state = {
    'state_dict': model.state_dict(),
    'optimizer': optimizer.state_dict(),
    'epochs': num_epochs,
    'lr': optimizer.param_groups[0]['lr']
}

torch.save(state, f"weights/{model_name.split('/')[1]}_512_contrastive_model.pt")

In [17]:

# Load the trained contrastive model
model = PubMedQAContrastive(model_name).to(device)
# model.load_state_dict(torch.load('pubmedqa_contrastive_model.pth'))

# map_location remaps tensors saved on a GPU onto the current `device`, so the checkpoint
# also loads on a CPU-only machine.
checkpoint = torch.load(f"weights/{model_name.split('/')[1]}_512_contrastive_model.pt", map_location=device)
model.load_state_dict(checkpoint['state_dict'])
# `model` was re-created above, so bind a fresh optimizer to its parameters before restoring
# the saved optimizer state; load_state_dict then restores the saved hyper-parameters as well.
optimizer = torch.optim.AdamW(model.parameters(), lr=checkpoint.get('lr', 5e-5))
optimizer.load_state_dict(checkpoint['optimizer'])


Some weights of BertModel were not initialized from the model checkpoint at nlpie/tiny-biobert and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# Re-seed the torch RNGs immediately before constructing the classifier.
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Build the classifier from the pre-trained contrastive model (training happens in later cells).
classifier = PubMedQAClassifier(model).to(device)

Test accuracy before contrastive pre-training: 0.39

Test accuracy after contrastive pre-training for 2 epochs: 0.522

In [19]:
# Baseline: accuracy of the classifier on the test loader before any supervised fine-tuning.
print(f"Test Accuracy before finetuning : {get_acc(classifier,test_loader,device)}")


100%|██████████| 16/16 [00:03<00:00,  4.65it/s]

Test Accuracy before finetuning : 0.522





## Phase 2: supervised fine-tuning

In [20]:
# Phase-2 setup: an optimizer over the classifier's parameters, a class-weighted
# cross-entropy for the label logits, and a pad-ignoring cross-entropy for the
# long-answer logits (same definition as the one used during pre-training).
classifier_optimizer = torch.optim.AdamW(classifier.parameters(), lr=2e-5)
classification_loss_fn = nn.CrossEntropyLoss(weight = class_weights)
generation_loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

In [21]:
def train_classifier(model, dataloader, testloader, optimizer, classification_loss_fn, generation_loss_fn, device, epochs):
    """Fine-tune `model` on labeled batches with a classification + generation loss.

    For every batch, `model(input_ids, attention_mask)` must return a
    `(classification_logits, long_answer_logits)` pair. The optimized loss is
    `classification_loss_fn(classification_logits, label)` plus
    `generation_loss_fn` on the flattened long-answer logits and targets.
    After each epoch the mean loss and `get_acc(model, testloader, device)`
    are printed.

    `tqdm` and `get_acc` are resolved from the notebook's global namespace,
    not passed in.

    Args:
        model: Module being fine-tuned.
        dataloader: Iterable of dict batches with the keys `input_ids`,
            `attention_mask`, `label` and `long_answer_ids`; must support `len()`.
        testloader: Loader handed to `get_acc` at the end of every epoch.
        optimizer: Optimizer that is zeroed and stepped once per batch.
        classification_loss_fn: Callable `(logits, label)` returning a scalar loss.
        generation_loss_fn: Callable `(logits_2d, targets_1d)` returning a scalar loss.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over `dataloader`.

    Returns:
        None.
    """
    model.train()
    # An empty loader would otherwise make the epoch summary divide by zero.
    num_batches = max(len(dataloader), 1)
    for epoch in range(epochs):
        total_loss = 0
        progress_bar = tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}")
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            label = batch['label'].to(device)
            long_answer_ids = batch['long_answer_ids'].to(device)

            classification_logits, long_answer_logits = model(input_ids, attention_mask)

            # Compute losses
            classification_loss = classification_loss_fn(classification_logits, label)
            # Collapse every leading dimension so the loss sees one row per position.
            generation_loss = generation_loss_fn(
                long_answer_logits.view(-1, long_answer_logits.size(-1)),
                long_answer_ids.view(-1),
            )

            # Combine losses
            loss = classification_loss + generation_loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            progress_bar.set_postfix({'loss': loss.item()})

        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/num_batches:.4f}")
        print(f"Test Accuracy : {get_acc(model,testloader,device)}")
        # get_acc is defined elsewhere and may switch the model to eval mode;
        # put it back into training mode before the next epoch.
        model.train()



In [None]:
# One epoch of fine-tuning on the full labeled loader.
# NOTE(review): this cell has no execution count and its saved progress bar stops at 73%,
# so this particular run apparently did not finish.
train_classifier(classifier,labeled_loader,test_loader,classifier_optimizer,classification_loss_fn,
                 generation_loss_fn,device,1)

Epoch 1/1:  73%|███████▎  | 19254/26472 [40:29<15:10,  7.93it/s, loss=7.97]

In [46]:
# Checkpoint the fine-tuned classifier with its optimizer state.
# NOTE(review): 'epochs' and 'lr' are literals that mirror the values used in the training and
# optimizer cells; they must be kept in sync by hand.
state = {
    'state_dict': classifier.state_dict(),
    'optimizer': classifier_optimizer.state_dict(),
    'epochs': 1,
    'lr':2e-5
}

torch.save(state, f"weights/{model_name.split('/')[1]}_512_contrastive_QA_model.pt")

In [22]:
# Restore the fine-tuned classifier and its optimizer state. map_location remaps tensors
# saved on a GPU onto the current `device`, so the checkpoint also loads on a CPU-only machine.
checkpoint = torch.load(f"weights/{model_name.split('/')[1]}_512_contrastive_QA_model.pt", map_location=device)
classifier.load_state_dict(checkpoint['state_dict'])
classifier_optimizer.load_state_dict(checkpoint['optimizer'])


In [23]:
# Accuracy of the current classifier on the test loader.
get_acc(classifier,test_loader,device)

100%|██████████| 16/16 [00:03<00:00,  5.17it/s]


0.614

In [28]:
# Store the classifier's test predictions together with the test-set ids from df_test's index.
# NOTE(review): `save_path` is not defined in the cells above; presumably it comes from a
# star import — confirm.
save_preds(df_test.index.to_list(),
           get_pred(classifier,test_loader,device),
           "tinybiobert_phase_1_phase_2",
          pred_dir=save_path)

100%|██████████| 16/16 [00:03<00:00,  5.04it/s]


In [42]:
# One epoch of fine-tuning on the full labeled loader; test accuracy is printed at the end of the epoch.
train_classifier(classifier,labeled_loader,test_loader,classifier_optimizer,classification_loss_fn,
                 generation_loss_fn,device,1)

Epoch 1/1: 100%|██████████| 26472/26472 [55:44<00:00,  7.92it/s, loss=6.5] 


Epoch 1/1, Loss: 6.6541


100%|██████████| 16/16 [00:03<00:00,  5.19it/s]

Test Accuracy : 0.53





## Without artificial data

In [19]:
# Expert-only training set (no artificial examples).
# NOTE(review): this training loader uses shuffle=False, unlike labeled_loader (shuffle=True) —
# confirm that the fixed order is intentional.
expert_train = PubMedQADataset(expert_train_processed, tokenizer,max_length = 512)
expert_train_loader = DataLoader(expert_train,batch_size=8, shuffle=False)

In [20]:
# Recount labels on the expert-only set; this overwrites the earlier `class_counts`.
class_counts = compute_class_counts(expert_train)

100%|██████████| 500/500 [00:00<00:00, 556.28it/s]


In [21]:
# Per-class sample counts indexed by label id. The list is sized by the largest observed
# label rather than by the number of distinct labels, so it stays aligned when an
# intermediate class is absent (Counter returns 0 for a missing key). Classes above the
# largest observed label cannot be inferred here.
class_count_list = [class_counts[i] for i in range(int(max(class_counts, default=-1)) + 1)]
# Inverse-frequency weights scaled so the most frequent class gets 1.0; a class with no
# samples gets weight 0.0 instead of triggering a division by zero.
class_weights = [max(class_count_list) / count if count else 0.0 for count in class_count_list]
# Follow the notebook-wide `device` instead of a hard-coded 'cuda' so the cell also runs on CPU.
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

In [22]:
# Setup for the expert-only run: a fresh optimizer over the same `classifier` object, and a
# classification loss rebuilt with the class weights computed in the previous cell.
classifier_optimizer = torch.optim.AdamW(classifier.parameters(), lr=2e-5)
classification_loss_fn = nn.CrossEntropyLoss(weight = class_weights)
generation_loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

In [23]:
# One epoch of fine-tuning on the expert-only loader.
train_classifier(classifier,expert_train_loader,test_loader,classifier_optimizer,classification_loss_fn,
                 generation_loss_fn,device,1)

Epoch 1/1: 100%|██████████| 63/63 [00:08<00:00,  7.48it/s, loss=12]  


Epoch 1/1, Loss: 11.3995


100%|██████████| 16/16 [00:03<00:00,  5.07it/s]

Test Accuracy : 0.552





In [24]:
# Checkpoint the classifier after the expert-only epoch, with its optimizer state.
# NOTE(review): 'epochs' and 'lr' are literals that mirror the values used in the training and
# optimizer cells; they must be kept in sync by hand.
state = {
    'state_dict': classifier.state_dict(),
    'optimizer': classifier_optimizer.state_dict(),
    'epochs': 1,
    'lr':2e-5
}

torch.save(state, f"weights/{model_name.split('/')[1]}_512_contrastive_phase_1_phase_2_expert_QA_model.pt")

In [25]:
# Store the test predictions of the expert-only classifier under a separate name.
# NOTE(review): `save_path` is not defined in the cells above; presumably it comes from a
# star import — confirm.
save_preds(df_test.index.to_list(),
           get_pred(classifier,test_loader,device),
           "tinybiobert_phase_1_phase_2_expert_only",
          pred_dir=save_path)

100%|██████████| 16/16 [00:03<00:00,  5.14it/s]


## Inference

In [25]:
# Read a saved prediction file back from disk.
# NOTE(review): the directory is hard-coded as "predictions/" while the saving cells use
# `save_path`; presumably this is the file written for "tinybiobert_phase_1_phase_2" — confirm.
with open("predictions/tinybiobert_phase_1_phase_2.json",'r') as f:
    test_preds = json.load(f)

In [71]:
# First five (id, prediction) pairs; note that the ids come back from JSON as strings.
list(test_preds.items())[:5]

[('12377809', 'yes'),
 ('26163474', 'yes'),
 ('19100463', 'yes'),
 ('18537964', 'yes'),
 ('12913878', 'yes')]

In [70]:
# First five test-set ids from the frame's index (integers here, unlike the string keys in test_preds).
df_test.index[:5].values

array([12377809, 26163474, 19100463, 18537964, 12913878])

In [75]:
# Remove pandas' column-width limit so long text cells are displayed in full.
pd.set_option('display.max_colwidth', None)

In [76]:
# Show the question, contexts and the two label columns for the first five test rows.
df_test.head()[['QUESTION','CONTEXTS','reasoning_required_pred','final_decision']]

Unnamed: 0,QUESTION,CONTEXTS,reasoning_required_pred,final_decision
12377809,Is anorectal endosonography valuable in dyschesia?,"[Dyschesia can be provoked by inappropriate defecation movements. The aim of this prospective study was to demonstrate dysfunction of the anal sphincter and/or the musculus (m.) puborectalis in patients with dyschesia using anorectal endosonography., Twenty consecutive patients with a medical history of dyschesia and a control group of 20 healthy subjects underwent linear anorectal endosonography (Toshiba models IUV 5060 and PVL-625 RT). In both groups, the dimensions of the anal sphincter and the m. puborectalis were measured at rest, and during voluntary squeezing and straining. Statistical analysis was performed within and between the two groups., The anal sphincter became paradoxically shorter and/or thicker during straining (versus the resting state) in 85% of patients but in only 35% of control subjects. Changes in sphincter length were statistically significantly different (p<0.01, chi(2) test) in patients compared with control subjects. The m. puborectalis became paradoxically shorter and/or thicker during straining in 80% of patients but in only 30% of controls. Both the changes in length and thickness of the m. puborectalis were significantly different (p<0.01, chi(2) test) in patients versus control subjects.]",yes,yes
26163474,Is there a connection between sublingual varices and hypertension?,"[Sublingual varices have earlier been related to ageing, smoking and cardiovascular disease. The aim of this study was to investigate whether sublingual varices are related to presence of hypertension., In an observational clinical study among 431 dental patients tongue status and blood pressure were documented. Digital photographs of the lateral borders of the tongue for grading of sublingual varices were taken, and blood pressure was measured. Those patients without previous diagnosis of hypertension and with a noted blood pressure ≥ 140 mmHg and/or ≥ 90 mmHg at the dental clinic performed complementary home blood pressure during one week. Those with an average home blood pressure ≥ 135 mmHg and/or ≥ 85 mmHg were referred to the primary health care centre, where three office blood pressure measurements were taken with one week intervals. Two independent blinded observers studied the photographs of the tongues. Each photograph was graded as none/few (grade 0) or medium/severe (grade 1) presence of sublingual varices. Pearson's Chi-square test, Student's t-test, and multiple regression analysis were applied. Power calculation stipulated a study population of 323 patients., An association between sublingual varices and hypertension was found (OR = 2.25, p<0.002). Mean systolic blood pressure was 123 and 132 mmHg in patients with grade 0 and grade 1 sublingual varices, respectively (p<0.0001, CI 95 %). Mean diastolic blood pressure was 80 and 83 mmHg in patients with grade 0 and grade 1 sublingual varices, respectively (p<0.005, CI 95 %). Sublingual varices indicate hypertension with a positive predictive value of 0.5 and a negative predictive value of 0.80.]",yes,yes
19100463,Is the affinity column-mediated immunoassay method suitable as an alternative to the microparticle enzyme immunoassay method as a blood tacrolimus assay?,"[Tacrolimus is a potent immunosuppressive drug used in organ transplantation. Because of its substantial toxic effects, narrow therapeutic index, and interindividual pharmacokinetic variability, therapeutic drug monitoring of whole-blood tacrolimus concentrations has been recommended. We investigated the comparability of the results of 2 immunoassay systems, affinity column-mediated immunoassay (ACMIA) and microparticle enzyme immunoassay (MEIA), comparing differences in the tacrolimus concentrations measured by the 2 methods in relation to the hematologic and biochemical values of hepatic and renal functions., A total of 154 samples from kidney or liver transplant recipients were subjected to Dimension RxL HM with a tacrolimus Flex reagent cartilage for the ACMIA method and IMx tacrolimus II for the MEIA method., Tacrolimus concentrations measured by the ACMIA method (n = 154) closely correlated with those measured by the MEIA method (r = 0.84). The Bland-Altman plot using concentration differences between the 2 methods and the average of the 2 methods showed no specific trends. The tacrolimus levels determined by both the MEIA method and the ACMIA method were not influenced by hematocrit levels, but the difference between the 2 methods (ACMIA - MEIA) tended to be larger in low hematocrit samples (P<.001).]",maybe,yes
18537964,Does a physician's specialty influence the recording of medication history in patients' case notes?,"[To determine the impact of a physician's specialty on the frequency and depth of medication history documented in patient medical records., A cross-sectional assessment of the frequency and depth of medication history information documented by 123 physicians for 900 randomly selected patients stratified across Cardiology, Chest, Dermatology, Endocrine, Gastroenterology, Haematology, Neurology, Psychiatry and Renal specialties was carried out at a 900-bed teaching hospital located in Ibadan, Nigeria., Four hundred and forty-three (49.2%) of the cohort were males and 457 (50.8%) were females; with mean ages 43.2 +/- 18.6 and 43.1 +/- 17.9 years respectively. Physicians' specialties significantly influenced the depth of documentation of the medication history information across the nine specialties (P<0.0001). Post hoc pair-wise comparisons with Tukey's HSD test showed that the mean scores for adverse drug reactions and adherence to medicines was highest in the Cardiology specialty; while the Chest specialty had the highest mean scores for allergy to drugs, food, chemicals and cigarette smoking. Mean scores for the use of alcohol; illicit drugs; dietary restrictions was highest for Gastroenterology, Psychiatry and Endocrine specialties respectively. Physicians' specialties also significantly influenced the frequency of documentation of the medication history across the nine specialties (P<0.0001).]",yes,yes
12913878,Locoregional opening of the rodent blood-brain barrier for paclitaxel using Nd:YAG laser-induced thermo therapy: a new concept of adjuvant glioma therapy?,"[Nd:YAG laser-induced thermo therapy (LITT) of rat brains is associated with blood-brain barrier (BBB) permeability changes. We address the question of whether LITT-induced locoregional disruption of the BBB could possibly allow a locoregional passage of chemotherapeutic agents into brain tissue to treat malignant glioma.STUDY DESIGN/, CD Fischer rats were subject to LITT of the left forebrain. Disruption of the BBB was analyzed using Evans blue and immunohistochemistry (IH). Animals were perfused with paclitaxel, and high-pressure liquid chromatography (HPLC) was employed to analyze the content of paclitaxel in brain and plasma samples., LITT induces an opening of the BBB as demonstrated by locoregional extravasation of Evans blue, C3C, fibrinogen, and IgM. HPLC proved the passage of paclitaxel across the disrupted BBB.]",yes,yes
