# Transformer 

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import tensorflow as tf 
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import os  
from sklearn.model_selection import train_test_split
import nltk 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import random 
from py_help import torch_helper as tc_help
from Model_CBT import CustomBertModel
from transformers import DistilBertModel, DistilBertTokenizerFast, AdamW,BertModel,BertTokenizerFast

In [1]:
## tiny grad setup
# ! if [ ! -d tinygrad/.git ]; then git clone https://github.com/geohot/tinygrad.git ; cd tinygrad ; python3.8 setup.py develop ; else echo "Tinygrad exists"; fi

! was unexpected at this time.


In [3]:
# check gpu 
# !python3.8 -c "import torch; print(torch.cuda.is_available())" 

zsh:1: command not found: python3.8


In [4]:
# Directory that holds the PubMed 20k RCT split files.
DATA_DIR = '../Dataset/pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/'
# os.listdir() returns entries in arbitrary, platform-dependent order, yet the
# cells below rely on files[0] / files[1] / files[2] being train / test / dev.
# Naming the splits explicitly guarantees those positions on every machine.
files = [DATA_DIR + name for name in ('train.txt', 'test.txt', 'dev.txt')]
files

['../Dataset/pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/train.txt',
 '../Dataset/pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/test.txt',
 '../Dataset/pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/dev.txt']

# Preparing the dataset for label encoding

In [5]:
# Read the lines of files[0] through the project helper and preview the first two.
# NOTE(review): `train_data` is re-bound to a CustomDataset in a later cell, so
# this value is only used for the preview here.
train_data = tc_help().get_lines(files[0])
train_data[:2]

['###24293578\n',
 'OBJECTIVE\tTo investigate the efficacy of @ weeks of daily low-dose oral prednisolone in improving pain , mobility , and systemic low-grade inflammation in the short term and whether the effect would be sustained at @ weeks in older adults with moderate to severe knee osteoarthritis ( OA ) .\n']

In [6]:
# Parse each split file with the project pre-processor; files[0], files[1] and
# files[2] are consumed as the train, test and validation splits respectively.
train_contents, test_contents, val_contents = [
    tc_help().pre_processor(path) for path in (files[0], files[1], files[2])
]

# One DataFrame per split, built from the parsed records.
train_df, test_df, val_df = map(
    pd.DataFrame, (train_contents, test_contents, val_contents)
)

In [7]:
# Preview the first rows of the parsed training frame.
train_df.head()

Unnamed: 0,target,text,line_number,total_lines
0,OBJECTIVE,to investigate the efficacy of @ weeks of dail...,0,11
1,METHODS,a total of @ patients with primary knee oa wer...,1,11
2,METHODS,outcome measures included pain reduction and i...,2,11
3,METHODS,pain was assessed using the visual analog pain...,3,11
4,METHODS,secondary outcome measures included the wester...,4,11


In [8]:
# Preview the first rows of the parsed test frame.
test_df.head()

Unnamed: 0,target,text,line_number,total_lines
0,BACKGROUND,this study analyzed liver function abnormaliti...,0,8
1,RESULTS,a post hoc analysis was conducted with the use...,1,8
2,RESULTS,liver function tests ( lfts ) were measured at...,2,8
3,RESULTS,survival analyses were used to assess the asso...,3,8
4,RESULTS,the percentage of patients with abnormal lfts ...,4,8


In [9]:
# Preview the first rows of the parsed validation frame.
val_df.head()

Unnamed: 0,target,text,line_number,total_lines
0,BACKGROUND,ige sensitization to aspergillus fumigatus and...,0,9
1,BACKGROUND,it is not clear whether these patients would b...,1,9
2,OBJECTIVE,we sought to determine whether a @-month cours...,2,9
3,METHODS,asthmatic patients who were ige sensitized to ...,3,9
4,METHODS,primary outcomes were improvement in quality o...,4,9


In [10]:
# Stop-word removal setup.

# Fetch the NLTK stop-word corpus (the recorded output shows it is already cached).
nltk.download("stopwords")
# English stop words; this is the word list nltk_preprocessor below uses by default.
swrds = stopwords.words("english")
print(swrds[:15])
# NOTE(review): `porter` is not referenced in any visible cell — confirm it is still needed.
porter = PorterStemmer()


def nltk_preprocessor(sentence, stopwords=None):
    """Normalise a sentence: lower-case it, drop stop words and strip punctuation.

    Args:
        sentence (string): The sentence to clean.
        stopwords (iterable of string, optional): Words removed as whole
            words. Defaults to the notebook-level ``swrds`` list. An empty
            collection disables stop-word removal.

    Returns:
        sentence (string): The cleaned sentence, made of ASCII letters and
            digits separated by single spaces.
    """
    if stopwords is None:
        # Resolved at call time so the function can be defined before `swrds`.
        stopwords = swrds

    sentence = sentence.lower()

    # Escape every word so regex metacharacters are matched literally, and skip
    # empty entries. An empty alternation would match at every word boundary
    # and swallow the following whitespace, gluing neighbouring words together.
    escaped = [re.escape(word) for word in stopwords if word]
    if escaped:
        pattern = re.compile(r"\b(?:" + "|".join(escaped) + r")\b\s*")
        sentence = pattern.sub("", sentence)

    # Drop parenthesised asides, e.g. "( oa )".
    sentence = re.sub(r"\([^)]*\)", "", sentence)
    # Replace every non-alphanumeric character with a space. (A separate pass
    # that padded punctuation with spaces was redundant: the punctuation is
    # turned into a space here anyway.)
    sentence = re.sub(r"[^A-Za-z0-9]", " ", sentence)
    # Collapse runs of spaces and trim the ends.
    sentence = re.sub(" +", " ", sentence)
    return sentence.strip()

# Clean the sentences on a copy so the original training frame stays untouched.
prep_df = train_df.copy()
prep_df['text'] = prep_df['text'].map(nltk_preprocessor)
# Show the first sentence before and after cleaning.
print(f"{train_df.text.values[0]}\n\n{prep_df.text.values[0]}")
# Report how many sentences each split contains.
n_train, n_val, n_test = [len(frame['text']) for frame in (train_df, val_df, test_df)]
print("The number of sentences for training are : {} \nThe number of sentences for vaildation are : {}\n The number of sentences for testing are : {}".format(n_train, n_val, n_test))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/markins/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours']
to investigate the efficacy of @ weeks of daily low-dose oral prednisolone in improving pain , mobility , and systemic low-grade inflammation in the short term and whether the effect would be sustained at @ weeks in older adults with moderate to severe knee osteoarthritis ( oa ) .

investigate efficacy weeks daily low dose oral prednisolone improving pain mobility systemic low grade inflammation short term whether effect would sustained weeks older adults moderate severe knee osteoarthritis
The number of sentences for training are : 180040 
The number of sentences for vaildation are : 30212
 The number of sentences for testing are : 30135


In [11]:
# Split the cleaned training sentences and their targets into train / validation / test subsets.
# NOTE(review): only train_sz is passed to data_splitter; val_sz and test_sz are
# never used. The recorded shapes show the remainder split roughly in half
# (27006 / 27007 of 180040 rows) rather than 0.2 / 0.1 — confirm the intended ratios.
train_sz,val_sz,test_sz = 0.7,0.2,0.1
x,y = prep_df['text'].values,prep_df['target'].values
x_train,x_val,x_test,y_train,y_val,y_test = tc_help().data_splitter(x,y,train_sz)
print('Trained Data shape ----> X_train : {} , Y_train : {} \nValidation Data Shape -----> X_val : {} , Y_val : {}\nTesting Data Shape -----> X_test : {} , Y_test : {}'.format(x_train.shape, y_train.shape, x_val.shape, y_val.shape, x_test.shape, y_test.shape))

Trained Data shape ----> X_train : (126027,) , Y_train : (126027,) 
Validation Data Shape -----> X_val : (27006,) , Y_val : (27006,)
Testing Data Shape -----> X_test : (27007,) , Y_test : (27007,)


# Label Encoding

In [12]:
# Fit the project label encoder on the training targets produced by the split above.
lb = tc_help().lb_encoder
lb.lb_fit(y_train)
classes = lb.__length__()
print('The nos label encoded classes : {}'.format(classes))
lb.encoded_classes
cl_names = lb.encoded_classes.keys()
print(cl_names)

# Targets to numbers, encoded from the full train / validation / test frames.
# NOTE(review): this re-binds y_train, y_val and y_test from the split subsets
# to the full frames, so they no longer align with x_train / x_val / x_test.
y_train = lb.lb_encoder(train_df['target'].values)
y_val = lb.lb_encoder(val_df['target'].values)
y_test = lb.lb_encoder(test_df['target'].values)

# Inverse-frequency weight for every encoded class.
cnts = np.bincount(y_train)
clw = {label: 1.0 / count for label, count in enumerate(cnts)}
print("Counts and weights of the classes respectively : {} and \n {} ".format(cnts,clw))

The nos label encoded classes : 5
dict_keys(['BACKGROUND', 'CONCLUSIONS', 'METHODS', 'OBJECTIVE', 'RESULTS'])
Counts and weights of the classes respectively : [21727 27168 59353 13839 57953] and 
 {0: 4.6025682330740555e-05, 1: 3.680800942285041e-05, 2: 1.684834801947669e-05, 3: 7.225955632632416e-05, 4: 1.7255362103773747e-05} 


# Tokenizing

In [13]:
# Load the pretrained DistilBERT encoder together with its matching fast tokenizer.
# NOTE(review): `tknzr` is re-bound to a bert-base-cased tokenizer in a later cell,
# so this tokenizer only serves the single-sentence check in between.
model="distilbert-base-uncased" # url --> https://huggingface.co/distilbert-base-uncased and https://huggingface.co/docs/transformers/model_doc/distilbert
bertmdl,tknzr = DistilBertModel.from_pretrained(model), DistilBertTokenizerFast.from_pretrained(model)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
# Plain Python lists of the `text` column of each split frame (train_df, not the stop-word-filtered prep_df).
train_dt,val_dt,test_dt = train_df['text'].tolist(), val_df['text'].tolist(), test_df['text'].tolist()

In [15]:
# Sanity check on one training sentence: tokenize it (padded / truncated to 128
# tokens), run it through DistilBERT, then take position 0 of the first model
# output and reshape it to (-1, 768).
tokenized_res = tknzr.encode_plus(train_dt[0], return_tensors='pt', max_length=128, padding='max_length',truncation=True)
result=bertmdl(tokenized_res['input_ids'],attention_mask=tokenized_res['attention_mask'])[0][:,0,:].view(-1,768)
result.shape

torch.Size([1, 768])

In [16]:
# Re-bind `tknzr` (previously the DistilBERT tokenizer) to the cased BERT tokenizer
# that the dataset cells below pass to CustomDataset.
# NOTE(review): the sentences previewed above are lower-cased while this vocabulary
# is cased — confirm this pairing is intended.
tknzr = BertTokenizerFast.from_pretrained('bert-base-cased')

In [17]:
class CustomDataset(Dataset):
    """Dataset of (sentence, line number, total lines, target) rows.

    Individual samples are returned untouched; tokenization and one-hot
    encoding happen per batch in :meth:`collation`, which is installed as the
    ``collate_fn`` of the loader built by :meth:`create_datald`.

    Args:
        text_seq: Indexable collection of sentences.
        l_num: Indexable collection with the line number of each sentence.
        tot_ln: Indexable collection with the total line count of the
            abstract each sentence belongs to.
        target: Indexable collection of integer class labels.
        toknzer: Callable tokenizer invoked as ``toknzer(list_of_str, ...)``.
        max_length (int): Padding / truncation length handed to the tokenizer.
        ln_depth (int): Width of the one-hot line-number vector.
        tot_depth (int): Width of the one-hot total-lines vector.
    """

    def __init__(self, text_seq, l_num, tot_ln, target, toknzer,
                 max_length=128, ln_depth=20, tot_depth=24):
        self.text_seq = text_seq
        self.l_num = l_num
        self.tot_ln = tot_ln
        self.target = target
        self.toknzer = toknzer
        self.max_length = max_length
        self.ln_depth = ln_depth
        self.tot_depth = tot_depth

    @staticmethod
    def _one_hot(indices, depth):
        """One-hot encode integer indices as a float32 (len(indices), depth) tensor.

        Indices outside ``[0, depth)`` give an all-zero row rather than an
        error, matching what ``tf.one_hot`` (used previously) produced.
        """
        idx = torch.as_tensor([int(i) for i in indices], dtype=torch.long)
        # Broadcasting against arange: an out-of-range index matches no column.
        return (idx.unsqueeze(-1) == torch.arange(depth)).to(torch.float32)

    @staticmethod
    def _at(seq, pos):
        """Return the item at integer position ``pos``.

        pandas objects are indexed through ``.iloc`` so that a non-default
        index cannot turn a position into a label lookup.
        """
        positional = getattr(seq, "iloc", None)
        return seq[pos] if positional is None else positional[pos]

    def collation(self, data):
        """Turn a batch of samples into model-ready tensors.

        Args:
            data (sequence): Batch of samples, each laid out as
                ``[text, line_number, total_lines, target]``.

        Returns:
            tuple: ``(tokenizer output, one-hot line numbers,
            one-hot total lines, int64 target tensor)``.
        """
        # Read the columns straight from the rows. Stacking the mixed-type rows
        # into one ndarray would coerce every number to a string first.
        texts = [str(row[0]) for row in data]
        ln_nums = self._one_hot([row[1] for row in data], self.ln_depth)
        total_lns = self._one_hot([row[2] for row in data], self.tot_depth)
        target = torch.as_tensor([int(row[3]) for row in data], dtype=torch.long)
        # Tokenize the whole batch at once, padded / truncated to a fixed length.
        toknzed_res = self.toknzer(texts, return_tensors='pt', max_length=self.max_length,
                                   padding='max_length', truncation=True)

        return toknzed_res, ln_nums, total_lns, target

    def create_datald(self, batch_size, shuffle=False, drop_last=False):
        """Build a DataLoader over this dataset that batches through :meth:`collation`."""
        # Pinned memory only helps host-to-GPU copies, so skip it without CUDA.
        dloader = DataLoader(dataset=self, batch_size=batch_size, collate_fn=self.collation,
                             shuffle=shuffle, drop_last=drop_last,
                             pin_memory=torch.cuda.is_available())
        return dloader

    def __len__(self):
        # The target collection defines how many samples are exposed.
        return len(self.target)

    def __getitem__(self, pos):
        return [self._at(self.text_seq, pos), self._at(self.l_num, pos),
                self._at(self.tot_ln, pos), self._at(self.target, pos)]

def _make_dataset(frame, labels):
    """Build a CustomDataset from one split frame and that split's encoded labels."""
    return CustomDataset(frame['text'].values, frame['line_number'].values,
                         frame['total_lines'].values, labels, tknzr)


# Every split pairs its OWN frame with its OWN labels. Previously the test and
# validation datasets reused the training sentences with test / validation
# labels, and the validation loader was built from the test dataset.
train_data = _make_dataset(train_df, y_train)
test_data = _make_dataset(test_df, y_test)
val_data = _make_dataset(val_df, y_val)
train_dl = train_data.create_datald(batch_size=64)
test_dl = test_data.create_datald(batch_size=64)
val_dl = val_data.create_datald(batch_size=64)
# Pull one training batch and keep everything except the target tensor.
batch = next(iter(train_dl))
token_text,ln_nos,total_lnos = batch[:-1]
token_text

2022-09-04 22:04:57.857489: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/lib/
2022-09-04 22:04:57.857512: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2022-09-04 22:04:57.858236: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


{'input_ids': tensor([[  101,  1106,  8242,  ...,     0,     0,     0],
        [  101,   170,  1703,  ...,     0,     0,     0],
        [  101,  9386,  5252,  ...,     0,     0,     0],
        ...,
        [  101,  2962, 16756,  ...,     0,     0,     0],
        [  101,  1107,  1241,  ...,     0,     0,     0],
        [  101,  1175,  1127,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [18]:
# keeping the data and model on the same device
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Move EVERY tensor the tokenizer returned. Moving only input_ids and
# attention_mask left token_type_ids on the CPU, which causes a device
# mismatch as soon as the model consumes it on a GPU.
for field in list(token_text.keys()):
    token_text[field] = token_text[field].to(device)
token_text

{'input_ids': tensor([[  101,  1106,  8242,  ...,     0,     0,     0],
        [  101,   170,  1703,  ...,     0,     0,     0],
        [  101,  9386,  5252,  ...,     0,     0,     0],
        ...,
        [  101,  2962, 16756,  ...,     0,     0,     0],
        [  101,  1107,  1241,  ...,     0,     0,     0],
        [  101,  1175,  1127,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

# Custom Bert Model

In [19]:
# Instantiate the project model with the number of encoded classes and move it to
# the selected device; the bare `mdl` expression displays its module tree.
mdl = CustomBertModel(classes=classes)
mdl = mdl.to(device)
mdl

Downloading pytorch_model.bin:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


CustomBertModel(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=T