In [None]:
# Stitch the per-batch arrays produced by test_DDI_model into flat vectors.
preds, real = np.concatenate(ret[0]), np.concatenate(ret[1])

print('test predictions: {}'.format(preds))
print('real values: {}'.format(real))

# Overall accuracy first, then precision / recall / F1 with average=None.
test_acc = accuracy_score(y_true=real, y_pred=preds)
test_prec = precision_score(y_true=real, y_pred=preds, average=None)
test_rec = recall_score(y_true=real, y_pred=preds, average=None)
test_f1 = f1_score(y_true=real, y_pred=preds, average=None)

print('\n'.join([
    'test accuracy: {}'.format(test_acc),
    'test precision: {}'.format(test_prec),
    'test recall: {}'.format(test_rec),
    'test f1: {}'.format(test_f1),
]))

In [None]:
# Score the held-out split; ret is (per-batch predictions, per-batch labels).
ret = test_DDI_model(model=model, data=test_data)

In [None]:

# Build the evaluation dataset from all_data_test_data.txt.
avg_test_accuracy = 0

test_data = load_and_process_DDI_test_data(
    filepath='all_data_test_data.txt',
    set_type='test',
    tokenizer=tokenizer,
)




In [None]:


def test_DDI_model(model, data, batch_size=12):
    model.load_state_dict(torch.load('bioelectra-base-discriminator-pubmed.pt'));

    model.eval();

    ds = data;

    test_dataloader = DataLoader(
        ds,
        sampler=SequentialSampler(ds),
        batch_size=batch_size,
    );

    preds_list, real_labels_list = [], [];

    for batch_nbr, batch in enumerate(test_dataloader):
        b_input_ids = batch[0].to(device);
        b_input_mask = batch[1].to(device);
        b_labels = batch[2].to(device);

        with torch.no_grad():
            output = model(
                b_input_ids,
                token_type_ids=None,
                attention_mask=b_input_mask
            );
        loss = output.loss;
        preds = torch.argmax(output.logits, dim=1).flatten()
        logits = preds.detach().cpu().numpy();
        label_ids = b_labels.to('cpu').numpy();
        

        preds_list.append(logits);
        real_labels_list.append(label_ids);

        print('finished batch {}'.format(batch_nbr));

    ret = (
        preds_list,
        real_labels_list
    );
    
    return ret;

In [None]:
from IPython.display import display;
def load_and_process_DDI_test_data(filepath,set_type, tokenizer, maxlen=512):
    """Read a DDI sample file and tokenise it into a ``TensorDataset``.

    The file holds samples separated by a blank line. Each sample has exactly
    three lines: the sentence, a tab-separated entity line (not used here) and
    the relation label, one of ``false``, ``mechanism``, ``effect``,
    ``advise`` or ``int``.

    Args:
        filepath: Path of the sample file.
        set_type: Split name; only used to identify samples in error messages.
        tokenizer: Object providing ``encode_plus`` that returns
            ``input_ids`` and ``attention_mask`` tensors.
        maxlen: Length every sentence is padded or truncated to.

    Returns:
        ``TensorDataset`` of ``(input_ids, attention_masks, labels)``.

    Raises:
        ValueError: If a sample does not have three lines or its relation is
            not one of the known labels.
    """
    relation_labels = ['false', 'mechanism', 'effect', 'advise', 'int']
    print( "Input File Reading")
    # The context manager closes the handle even if reading fails.
    with open(filepath, 'r') as fp:
        samples = fp.read().strip().split('\n\n')

    sent_contents = []
    labels = []
    for sample_idx, sample in enumerate(samples):
        fields = sample.strip().split('\n')
        if len(fields) != 3:
            raise ValueError(
                'sample %s-%s in %s has %d line(s); expected sentence, entities, relation'
                % (set_type, sample_idx, filepath, len(fields))
            )
        sent, _entities, relation = fields
        sent_contents.append(sent.lower())
        # list.index raises ValueError for an unknown relation name.
        labels.append(relation_labels.index(relation))

    input_ids = []
    attention_masks = []

    for sent in sent_contents:
        encoded_dict = tokenizer.encode_plus(
            sent,
            add_special_tokens=True,
            max_length=maxlen,  # honour the caller's limit (was hard-coded to 512)
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    # Stack the per-sentence (1, maxlen) tensors into (n_samples, maxlen).
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels_tensor = torch.tensor(labels)

    return TensorDataset(input_ids, attention_masks, labels_tensor)

In [None]:
# Fine-tune on the (train, validation) pair held in `data`.
# NOTE(review): the notebook-level `epochs` value is not forwarded here, so the
# function's own default epoch count applies — confirm that is intended.
train_DDI_model(model=model, data=data)

In [None]:
def train_DDI_model(model, data, optimizer=AdamW, batch_size=12, epochs=3,
                    checkpoint_path='bioelectra-base-discriminator-pubmed.pt'):
    """Fine-tune ``model`` and checkpoint the weights with the best validation loss.

    Args:
        model: Sequence-classification module. It is called with ``labels`` and
            must return an object exposing ``loss`` and ``logits``.
        data: Pair ``(train_dataset, val_dataset)``; items of both are
            ``(input_ids, attention_mask, label)``.
        optimizer: Optimizer class, instantiated as
            ``optimizer(model.parameters(), lr=2e-5, eps=1e-8)``.
        batch_size: Number of samples per batch for both splits.
        epochs: Number of passes over the training split.
        checkpoint_path: Where the best ``state_dict`` is written.

    Returns:
        None. Per-epoch average loss and accuracy (fraction of correct
        predictions over the whole split) are printed for both splits.

    Raises:
        ValueError: If the training or validation split is empty.
    """
    train_ds, val_ds = data[0], data[1]
    if len(train_ds) == 0 or len(val_ds) == 0:
        raise ValueError('train_DDI_model needs non-empty training and validation sets')

    # Batches follow the model, so no notebook-level `device` global is needed.
    target_device = next(model.parameters()).device
    # Builtin float: the `np.float` alias no longer exists in current NumPy.
    best_val_loss = float('inf')

    train_dataloader = DataLoader(
        train_ds,
        sampler=RandomSampler(train_ds),
        batch_size=batch_size
    )

    val_dataloader = DataLoader(
        val_ds,
        sampler=SequentialSampler(val_ds),
        batch_size=batch_size
    )

    # Created once for the whole run: the schedule spans every epoch, and
    # rebuilding the optimizer per epoch would discard its moment estimates.
    optim = optimizer(model.parameters(), lr=2e-5, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(
        optim,
        num_warmup_steps=0,
        num_training_steps=len(train_dataloader) * epochs
    )

    for _ in range(epochs):
        train_loss, train_correct, train_seen = 0.0, 0, 0

        model.train()

        for batch in train_dataloader:
            model.zero_grad()

            loss, n_correct, n_samples = _ddi_forward_batch(model, batch, target_device)
            train_loss += loss.item()
            train_correct += n_correct
            train_seen += n_samples

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optim.step()
            scheduler.step()

        avg_train_loss = train_loss / len(train_dataloader)
        avg_train_acc = train_correct / train_seen

        print('average training loss for epoch: {}'.format(avg_train_loss))
        print('average training accuracy for epoch: {}'.format(avg_train_acc))

        # validation
        val_loss, val_correct, val_seen = 0.0, 0, 0

        model.eval()

        with torch.no_grad():
            for batch in val_dataloader:
                loss, n_correct, n_samples = _ddi_forward_batch(model, batch, target_device)
                val_loss += loss.item()
                val_correct += n_correct
                val_seen += n_samples

        avg_val_loss = val_loss / len(val_dataloader)
        avg_val_acc = val_correct / val_seen

        # Keep only the weights of the epoch with the lowest validation loss.
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), checkpoint_path)

        print('average validation loss for epoch: {}'.format(avg_val_loss))
        print('average validation accuracy for epoch: {}'.format(avg_val_acc))


def _ddi_forward_batch(model, batch, target_device):
    """Run one labelled batch through ``model``; return (loss, n_correct, n_samples)."""
    b_input_ids = batch[0].to(target_device)
    b_input_mask = batch[1].to(target_device)
    b_labels = batch[2].to(target_device)

    output = model(
        b_input_ids,
        token_type_ids=None,
        attention_mask=b_input_mask,
        labels=b_labels
    )

    preds = torch.argmax(output.logits, dim=1).flatten()
    n_correct = (preds == b_labels.flatten()).sum().item()
    return output.loss, n_correct, b_labels.numel()

In [None]:

# Tokenise the training corpus (sequences capped at 256 tokens) and split it
# into the (train, validation) pair consumed by train_DDI_model.
data = load_and_process_DDI_train_data(
    filepath='all_data_train_data.txt',
    set_type='train',
    tokenizer=tokenizer,
    maxlen=256,
)




In [None]:

# BioELECTRA tokenizer (input is lower-cased) and a classifier with five output
# labels, matching the five relation names used by the data-loading functions.
tokenizer = ElectraTokenizer.from_pretrained('kamalkraj/bioelectra-base-discriminator-pubmed',do_lower_case=True)

model = ElectraForSequenceClassification.from_pretrained(
    'kamalkraj/bioelectra-base-discriminator-pubmed',
    num_labels=5,
    output_attentions=False,
    output_hidden_states=False
)

# Use the GPU when one is present instead of assuming CUDA, so this cell also
# runs on a CPU-only host and agrees with the availability check in the
# imports cell.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

epochs = 2

In [None]:
def load_and_process_DDI_train_data(filepath,set_type, tokenizer, maxlen=512, train_percentage=0.7, seed=None):
    """Read a DDI sample file, tokenise it and split it into train/validation sets.

    The file holds samples separated by a blank line. Each sample has exactly
    three lines: the sentence, a tab-separated entity line (not used here) and
    the relation label, one of ``false``, ``mechanism``, ``effect``,
    ``advise`` or ``int``.

    Args:
        filepath: Path of the sample file.
        set_type: Split name; only used to identify samples in error messages.
        tokenizer: Object providing ``encode_plus`` that returns
            ``input_ids`` and ``attention_mask`` tensors.
        maxlen: Length every sentence is padded or truncated to.
        train_percentage: Fraction of samples placed in the training subset;
            the remainder becomes the validation subset.
        seed: Optional integer making the random split reproducible. With the
            default ``None`` the split uses PyTorch's global random state.

    Returns:
        Tuple ``(train_dataset, val_dataset)`` of subsets over a
        ``TensorDataset`` of ``(input_ids, attention_masks, labels)``.

    Raises:
        ValueError: If a sample does not have three lines or its relation is
            not one of the known labels.
    """
    relation_labels = ['false', 'mechanism', 'effect', 'advise', 'int']
    print( "Input File Reading")
    # The context manager closes the handle even if reading fails.
    with open(filepath, 'r') as fp:
        samples = fp.read().strip().split('\n\n')

    sent_contents = []
    labels = []
    for sample_idx, sample in enumerate(samples):
        fields = sample.strip().split('\n')
        if len(fields) != 3:
            raise ValueError(
                'sample %s-%s in %s has %d line(s); expected sentence, entities, relation'
                % (set_type, sample_idx, filepath, len(fields))
            )
        sent, _entities, relation = fields
        sent_contents.append(sent.lower())
        # list.index raises ValueError for an unknown relation name.
        labels.append(relation_labels.index(relation))

    input_ids = []
    attention_masks = []

    for sent in sent_contents:
        encoded_dict = tokenizer.encode_plus(
            sent,
            add_special_tokens=True,
            max_length=maxlen,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    # Stack the per-sentence (1, maxlen) tensors into (n_samples, maxlen).
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels_tensor = torch.tensor(labels)

    dataset = TensorDataset(input_ids, attention_masks, labels_tensor)

    train_size = int(train_percentage * len(dataset))
    val_size = len(dataset) - train_size

    # Only pass a generator when a seed was requested, so the default call
    # behaves exactly as before.
    split_kwargs = {}
    if seed is not None:
        split_kwargs['generator'] = torch.Generator().manual_seed(seed)

    train_dataset, val_dataset = random_split(dataset, [train_size, val_size], **split_kwargs)

    return (train_dataset, val_dataset)


In [None]:
!pip install sentencepiece

!pip install transformers
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
import numpy as np;
import pandas as pd;

import torch;

from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertConfig;
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler;
from transformers import get_linear_schedule_with_warmup;

from transformers import (WEIGHTS_NAME, BertConfig, BertForSequenceClassification, BertTokenizer, BertModel, BertForTokenClassification,
                                  XLMConfig, XLMForSequenceClassification, XLMTokenizer, 
                                  XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer,
                                  RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer,
                                  AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer, AlbertModel,AlbertForMaskedLM,
                                  AutoTokenizer, AutoModelForTokenClassification, AutoConfig,
                           XLMRobertaTokenizer,XLMRobertaForSequenceClassification,
                          RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer,
                          ElectraTokenizer,ElectraForSequenceClassification
                          )
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score;
from transformers import AlbertTokenizer, AlbertForSequenceClassification, AlbertConfig, AdamW, DistilBertForSequenceClassification, DistilBertTokenizer, DistilBertConfig


import os
import re
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

import gc
    
gc.collect()
import torch
torch.cuda.empty_cache()

import torch

# Choose the compute device: CUDA when PyTorch reports one, otherwise the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(0))

import os
import csv
import copy
import json
import logging

import torch
from torch.utils.data import TensorDataset


logger = logging.getLogger(__name__)

class InputExample:
    """
    A single training/test example for simple sequence classification.
    Args:
        guid: Unique id for the example.
        text_a: string. The untokenized text of the sequence.
        label: (Optional) The label of the example. This should be
        specified for train and dev examples, but not for test examples;
        it defaults to None when omitted.
    """

    def __init__(self, guid, text_a, label=None):
        # label defaults to None so unlabelled examples can be built, as the
        # class documentation describes.
        self.guid = guid
        self.text_a = text_a
        self.label = label

    def __repr__(self):
        """Return the JSON serialization as the printable representation."""
        return str(self.to_json_string())

    def to_dict(self):
        """Serializes this instance to a Python dictionary."""
        # Deep copy so callers can modify the result without touching the instance.
        return copy.deepcopy(self.__dict__)

    def to_json_string(self):
        """Serializes this instance to a JSON string."""
        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"


from transformers import BertTokenizer

# Load the BioELECTRA tokenizer (input text is lower-cased).

tokenizer = ElectraTokenizer.from_pretrained(
    'kamalkraj/bioelectra-base-discriminator-pubmed',
    do_lower_case=True,
)



In [None]:
from google.colab import drive
# Mount Google Drive inside the Colab VM, switch into its root folder and list
# the contents. NOTE(review): presumably this is where the relative data and
# checkpoint paths used elsewhere in the notebook resolve — confirm.
drive.mount('/content/drive')
%cd /content/drive/My Drive/
!ls
