In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
# import tensorflow as tf 
import collections
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import os  
from sklearn.model_selection import train_test_split
import nltk 
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import random 
from py_help import torch_helper as tc_help

In [2]:
## tiny grad setup
! if [ ! -d tinygrad/.git ]; then git clone https://github.com/geohot/tinygrad.git ; cd tinygrad ; python3.8 setup.py develop ; else echo "Tinygrad exists"; fi

Tinygrad exists


In [3]:
# check gpu 
!python3.8 -c "import torch; print(torch.cuda.is_available())" 

True


In [4]:
files = ['../Dataset/pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/' + file for file in os.listdir('../Dataset/pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/')]
files

['../Dataset/pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/train.txt',
 '../Dataset/pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/test.txt',
 '../Dataset/pubmed-rct/PubMed_20k_RCT_numbers_replaced_with_at_sign/dev.txt']

# Preparing the dataset for label encoding

In [5]:
train_data = tc_help().get_lines(files[0])
train_data[:2]

['###24293578\n',
 'OBJECTIVE\tTo investigate the efficacy of @ weeks of daily low-dose oral prednisolone in improving pain , mobility , and systemic low-grade inflammation in the short term and whether the effect would be sustained at @ weeks in older adults with moderate to severe knee osteoarthritis ( OA ) .\n']

In [6]:
train_contents = tc_help().pre_processor(files[0])
test_contents = tc_help().pre_processor(files[1])
val_contents = tc_help().pre_processor(files[2])

train_df = pd.DataFrame(train_contents)
test_df = pd.DataFrame(test_contents)
val_df = pd.DataFrame(val_contents)

In [7]:
train_df.head()

      target                                               text  line_number  \
0  OBJECTIVE  to investigate the efficacy of @ weeks of dail...            0   
1    METHODS  a total of @ patients with primary knee oa wer...            1   
2    METHODS  outcome measures included pain reduction and i...            2   
3    METHODS  pain was assessed using the visual analog pain...            3   
4    METHODS  secondary outcome measures included the wester...            4   

   total_lines  
0           11  
1           11  
2           11  
3           11  
4           11  

In [8]:
test_df.head()

       target                                               text  line_number  \
0  BACKGROUND  this study analyzed liver function abnormaliti...            0   
1     RESULTS  a post hoc analysis was conducted with the use...            1   
2     RESULTS  liver function tests ( lfts ) were measured at...            2   
3     RESULTS  survival analyses were used to assess the asso...            3   
4     RESULTS  the percentage of patients with abnormal lfts ...            4   

   total_lines  
0            8  
1            8  
2            8  
3            8  
4            8  

In [9]:
val_df.head()

       target                                               text  line_number  \
0  BACKGROUND  ige sensitization to aspergillus fumigatus and...            0   
1  BACKGROUND  it is not clear whether these patients would b...            1   
2   OBJECTIVE  we sought to determine whether a @-month cours...            2   
3     METHODS  asthmatic patients who were ige sensitized to ...            3   
4     METHODS  primary outcomes were improvement in quality o...            4   

   total_lines  
0            9  
1            9  
2            9  
3            9  
4            9  

In [10]:
# TODO STOP WORD REMOVAL

nltk.download("stopwords")
swrds = stopwords.words("english")
print(swrds[:15])
porter = PorterStemmer()


def nltk_preprocessor(sentence,stopwords=swrds):
        """preprocessing the data based on nltk STOPWORDS

        Args:
            sentence (string): The string or the sentence that is to be passed 

        Returns:
            sentence (string): The pre proceesed result from the function 
        """

        sentence = sentence.lower()
        # get rid of the stop words
        pt = re.compile(r"\b(" + r"|".join(stopwords) + r")\b\s*")
        sentence = pt.sub("", sentence)
        # paranthesis cases 
        sentence = re.sub(r"\([^)]*\)", "", sentence)
        # handling the spaces and the filters
        sentence = re.sub(r"([-;;.,!?<=>])", r" \1", sentence)
        sentence = re.sub(r"[^A-Za-z0-9]", " ", sentence) # removing all cases for non alpha numeric characters 
        sentence = re.sub(" +", " ", sentence)
        sentence = sentence.strip()

        return sentence 

prep_df = train_df.copy()
prep_df.text = prep_df.text.apply(nltk_preprocessor)
print(f"{train_df.text.values[0]}\n\n{prep_df.text.values[0]}")
print("The number of sentences for training are : {} \nThe number of sentences for vaildation are : {}\n The number of sentences for testing are : {}".format(len(train_df['text'].tolist()),len(val_df['text'].tolist()),len(test_df['text'].tolist())))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/markins/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours']
to investigate the efficacy of @ weeks of daily low-dose oral prednisolone in improving pain , mobility , and systemic low-grade inflammation in the short term and whether the effect would be sustained at @ weeks in older adults with moderate to severe knee osteoarthritis ( oa ) .

investigate efficacy weeks daily low dose oral prednisolone improving pain mobility systemic low grade inflammation short term whether effect would sustained weeks older adults moderate severe knee osteoarthritis
The number of sentences for training are : 180040 
The number of sentences for vaildation are : 30212
 The number of sentences for testing are : 30135


In [11]:
train_sz,val_sz,test_sz = 0.7,0.2,0.1
x,y = prep_df['text'].values,prep_df['target'].values
x_train,x_val,x_test,y_train,y_val,y_test = tc_help().data_splitter(x,y,train_sz)
print('Trained Data shape ----> X_train : {} , Y_train : {} \nValidation Data Shape -----> X_val : {} , Y_val : {}\nTesting Data Shape -----> X_test : {} , Y_test : {}'.format(x_train.shape, y_train.shape, x_val.shape, y_val.shape, x_test.shape, y_test.shape))

Trained Data shape ----> X_train : (126027,) , Y_train : (126027,) 
Validation Data Shape -----> X_val : (27006,) , Y_val : (27006,)
Testing Data Shape -----> X_test : (27007,) , Y_test : (27007,)


Label Encoding

In [12]:
lb = tc_help().lb_encoder
lb.lb_fit(y_train)
classes = lb.__length__()
print('The nos label encoded classes : {}'.format(classes))
lb.encoded_classes
cl_names = lb.encoded_classes.keys()
print(cl_names)
# train_df['target'].values

# targets to numbers
y_train,y_val,y_test = lb.lb_encoder(train_df['target'].values),lb.lb_encoder(val_df['target'].values),lb.lb_encoder(test_df['target'].values)
# weights of the classes
cnts = np.bincount(y_train)
clw = {index : 1.0/cnts for index , cnts in enumerate(cnts)}
print("Counts and weights of the classes respectively : {} and \n {} ".format(cnts,clw))

The nos label encoded classes : 5
dict_keys(['BACKGROUND', 'CONCLUSIONS', 'METHODS', 'OBJECTIVE', 'RESULTS'])
Counts and weights of the classes respectively : [21727 27168 59353 13839 57953] and 
 {0: 4.6025682330740555e-05, 1: 3.680800942285041e-05, 2: 1.684834801947669e-05, 3: 7.225955632632416e-05, 4: 1.7255362103773747e-05} 
