In [1]:
# pip install -U --user torchtext

In [4]:
import torch
import pandas as pd
import numpy as np
import os
import re
import torch.nn as nn
from collections import Counter
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import string
import spacy
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import mean_squared_error
from nltk.corpus import stopwords
from nltk import word_tokenize
import torchtext
from torchtext.data import get_tokenizer

In [5]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/ss4yd/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
print(torchtext.__version__)

0.11.0


In [7]:
data_dir = "/project/ds7003-fall22/team1/data/abstracts_data/20k_abstracts/"

In [8]:
filenames = [data_dir + filename for filename in os.listdir(data_dir)]
filenames

['/project/ds7003-fall22/team1/data/abstracts_data/20k_abstracts/train.txt',
 '/project/ds7003-fall22/team1/data/abstracts_data/20k_abstracts/test.txt',
 '/project/ds7003-fall22/team1/data/abstracts_data/20k_abstracts/dev.txt']

In [9]:
def get_lines(filename):
    with open(filename, 'r') as f:
        return f.readlines()

In [10]:
train_data_lines = get_lines(data_dir+'train.txt')
train_data_lines[:20]

['###24293578\n',
 'OBJECTIVE\tTo investigate the efficacy of 6 weeks of daily low-dose oral prednisolone in improving pain , mobility , and systemic low-grade inflammation in the short term and whether the effect would be sustained at 12 weeks in older adults with moderate to severe knee osteoarthritis ( OA ) .\n',
 'METHODS\tA total of 125 patients with primary knee OA were randomized 1:1 ; 63 received 7.5 mg/day of prednisolone and 62 received placebo for 6 weeks .\n',
 'METHODS\tOutcome measures included pain reduction and improvement in function scores and systemic inflammation markers .\n',
 'METHODS\tPain was assessed using the visual analog pain scale ( 0-100 mm ) .\n',
 'METHODS\tSecondary outcome measures included the Western Ontario and McMaster Universities Osteoarthritis Index scores , patient global assessment ( PGA ) of the severity of knee OA , and 6-min walk distance ( 6MWD ) .\n',
 'METHODS\tSerum levels of interleukin 1 ( IL-1 ) , IL-6 , tumor necrosis factor ( TNF )

In [11]:
def preprocess_data(filename):
    input_lines = get_lines(filename)
    abstract_lines = ""
    abstract_samples = []
    for line in input_lines:
        if line.startswith("###"):
            abstract_id = line
            abstract_lines = ""
        elif line.isspace():
            abstract_line_split = abstract_lines.splitlines()
            for abstract_line_number, abstract_line in enumerate(abstract_line_split):
                line_data={}
                target_text_split=abstract_line.split("\t")
                line_data["target"]=target_text_split[0]
                line_data["text"]=target_text_split[1].lower()
                line_data["line_number"] = abstract_line_number
                line_data["total_lines"] = len(abstract_line_split)-1
                abstract_samples.append(line_data)
        else:
            abstract_lines+=line
    return abstract_samples

In [12]:
train_samples = preprocess_data(data_dir + "train.txt")
test_samples = preprocess_data(data_dir + "test.txt")

In [13]:
len(train_samples), len(test_samples)

(180040, 30135)

In [14]:
train_samples[:2]

[{'target': 'OBJECTIVE',
  'text': 'to investigate the efficacy of 6 weeks of daily low-dose oral prednisolone in improving pain , mobility , and systemic low-grade inflammation in the short term and whether the effect would be sustained at 12 weeks in older adults with moderate to severe knee osteoarthritis ( oa ) .',
  'line_number': 0,
  'total_lines': 11},
 {'target': 'METHODS',
  'text': 'a total of 125 patients with primary knee oa were randomized 1:1 ; 63 received 7.5 mg/day of prednisolone and 62 received placebo for 6 weeks .',
  'line_number': 1,
  'total_lines': 11}]

In [15]:
train_df = pd.DataFrame(train_samples)
test_df = pd.DataFrame(test_samples)

In [16]:
train_df.head(11)

Unnamed: 0,target,text,line_number,total_lines
0,OBJECTIVE,to investigate the efficacy of 6 weeks of dail...,0,11
1,METHODS,a total of 125 patients with primary knee oa w...,1,11
2,METHODS,outcome measures included pain reduction and i...,2,11
3,METHODS,pain was assessed using the visual analog pain...,3,11
4,METHODS,secondary outcome measures included the wester...,4,11
5,METHODS,"serum levels of interleukin 1 ( il-1 ) , il-6 ...",5,11
6,RESULTS,there was a clinically relevant reduction in t...,6,11
7,RESULTS,the mean difference between treatment arms ( 9...,7,11
8,RESULTS,"further , there was a clinically relevant redu...",8,11
9,RESULTS,these differences remained significant at 12 w...,9,11


In [17]:
train_df.target.value_counts()

METHODS        59353
RESULTS        57953
CONCLUSIONS    27168
BACKGROUND     21727
OBJECTIVE      13839
Name: target, dtype: int64

In [18]:
print(train_df.shape)

(180040, 4)


In [19]:
train_df = train_df.reset_index(drop=True)

In [20]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text)
    text = text.replace('x', '')
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text

train_df["text"] = train_df["text"].apply(clean_text)

In [21]:
test_df["text"] = test_df["text"].apply(clean_text)

In [22]:
train_df.head(10)

Unnamed: 0,target,text,line_number,total_lines
0,OBJECTIVE,investigate efficacy 6 weeks daily lowdose ora...,0,11
1,METHODS,total 125 patients primary knee oa randomized ...,1,11
2,METHODS,outcome measures included pain reduction impro...,2,11
3,METHODS,pain assessed using visual analog pain scale 0...,3,11
4,METHODS,secondary outcome measures included western on...,4,11
5,METHODS,serum levels interleukin 1 il1 il6 tumor necro...,5,11
6,RESULTS,clinically relevant reduction intervention gro...,6,11
7,RESULTS,mean difference treatment arms 95 ci 109 4818 ...,7,11
8,RESULTS,clinically relevant reduction serum levels il1...,8,11
9,RESULTS,differences remained significant 12 weeks,9,11


In [23]:
test_df.head(10)

Unnamed: 0,target,text,line_number,total_lines
0,BACKGROUND,study analyzed liver function abnormalities he...,0,8
1,RESULTS,post hoc analysis conducted use data evaluatio...,1,8
2,RESULTS,liver function tests lfts measured 7 time poin...,2,8
3,RESULTS,survival analyses used assess association admi...,3,8
4,RESULTS,percentage patients abnormal lfts decreased si...,4,8
5,RESULTS,mean hemodynamic profiles compared patients ab...,5,8
6,RESULTS,multivariable analyses revealed patients eleva...,6,8
7,CONCLUSIONS,abnormal lfts common adhf population dynamic m...,7,8
8,CONCLUSIONS,elevated meldi scores associated poor outcomes...,8,8
9,BACKGROUND,minimally invasive endovascular aneurysm repai...,0,12


## PyTorch pre-processing

In [24]:
# Function which directly tokenize the tweet data
from nltk.tokenize import word_tokenize
nltk.download('punkt')

# tt = TweetTokenizer()
# train_df['text'].apply(word_tokenize)

[nltk_data] Downloading package punkt to /home/ss4yd/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [25]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

ps = PorterStemmer()

In [26]:
def tokenize(text):
    return word_tokenize(text)

def stemming(words):
    stem_words = []
    for w in words:
        w = ps.stem(w)
        stem_words.append(w)
    
    return stem_words

In [27]:
# apply tokenize function
train_df['text_tokens'] = train_df['text'].apply(tokenize)

In [28]:
test_df['text_tokens'] = test_df['text'].apply(tokenize)

In [29]:
# apply steming function
train_df['tokenized'] = train_df['text_tokens'].apply(stemming)
test_df['tokenized'] = test_df['text_tokens'].apply(stemming)

In [30]:
train_df.to_csv('../data/abstracts_data/20k_abstracts/processed_train.csv', index=False)
test_df.to_csv('../data/abstracts_data/20k_abstracts/processed_test.csv', index=False)

In [69]:
train_df.to_pickle('../data/abstracts_data/20k_abstracts/processed_train.pickle')
test_df.to_pickle('../data/abstracts_data/20k_abstracts/processed_test.pickle')

## Checkpoint 1

In [31]:
train_df.head()

Unnamed: 0,target,text,line_number,total_lines,text_tokens,tokenized
0,OBJECTIVE,investigate efficacy 6 weeks daily lowdose ora...,0,11,"[investigate, efficacy, 6, weeks, daily, lowdo...","[investig, efficaci, 6, week, daili, lowdos, o..."
1,METHODS,total 125 patients primary knee oa randomized ...,1,11,"[total, 125, patients, primary, knee, oa, rand...","[total, 125, patient, primari, knee, oa, rando..."
2,METHODS,outcome measures included pain reduction impro...,2,11,"[outcome, measures, included, pain, reduction,...","[outcom, measur, includ, pain, reduct, improv,..."
3,METHODS,pain assessed using visual analog pain scale 0...,3,11,"[pain, assessed, using, visual, analog, pain, ...","[pain, assess, use, visual, analog, pain, scal..."
4,METHODS,secondary outcome measures included western on...,4,11,"[secondary, outcome, measures, included, weste...","[secondari, outcom, measur, includ, western, o..."


In [32]:
test_df.head()

Unnamed: 0,target,text,line_number,total_lines,text_tokens,tokenized
0,BACKGROUND,study analyzed liver function abnormalities he...,0,8,"[study, analyzed, liver, function, abnormaliti...","[studi, analyz, liver, function, abnorm, heart..."
1,RESULTS,post hoc analysis conducted use data evaluatio...,1,8,"[post, hoc, analysis, conducted, use, data, ev...","[post, hoc, analysi, conduct, use, data, evalu..."
2,RESULTS,liver function tests lfts measured 7 time poin...,2,8,"[liver, function, tests, lfts, measured, 7, ti...","[liver, function, test, lft, measur, 7, time, ..."
3,RESULTS,survival analyses used assess association admi...,3,8,"[survival, analyses, used, assess, association...","[surviv, analys, use, assess, associ, admiss, ..."
4,RESULTS,percentage patients abnormal lfts decreased si...,4,8,"[percentage, patients, abnormal, lfts, decreas...","[percentag, patient, abnorm, lft, decreas, sig..."


In [33]:
index2word = ["<PAD>", "<SOS>", "<EOS>"]

In [34]:
for ds in [train_df, test_df]:
    for index, row in ds.iterrows():
        label=row['target']
        text=row['text_tokens']
        for token in text:
            if token not in index2word:
                index2word.append(token)

In [35]:
word2index = {token: idx for idx, token in enumerate(index2word)}

In [36]:
def label_map(label):
    if label == "RESULTS":
        return 0
    elif label == "METHODS":
        return 1
    elif label == "CONCLUSIONS":
        return 2
    elif label == "BACKGROUND":
        return 3
    else: #positive
        return 4

In [43]:
# max(test_df.text_tokens.apply(len))

128

In [44]:
seq_length = 194
def encode_and_pad(tweet, length):
    sos = [word2index["<SOS>"]]
    eos = [word2index["<EOS>"]]
    pad = [word2index["<PAD>"]]

    if len(tweet) < length - 2: # -2 for SOS and EOS
        n_pads = length - 2 - len(tweet)
        encoded = [word2index[w] for w in tweet]
        return sos + encoded + eos + pad * n_pads 
    else: # tweet is longer than possible; truncating
        encoded = [word2index[w] for w in tweet]
        truncated = encoded[:length - 2]
        return sos + truncated + eos

In [45]:
train_set = [(label, tokens) for label, tokens in zip(train_df['target'], train_df['text_tokens'])]
test_set = [(label, tokens) for label, tokens in zip(test_df['target'], test_df['text_tokens'])]

In [46]:
train_encoded = [(encode_and_pad(tweet, seq_length), label_map(label)) for label, tweet in train_set]
test_encoded = [(encode_and_pad(tweet, seq_length), label_map(label)) for label, tweet in test_set]

In [47]:
for i in train_encoded[:3]:
    print(i)

([1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 6, 24, 25, 26, 27, 28, 29, 30, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 4)
([1, 31, 32, 33, 34, 28, 30, 35, 36, 37, 38, 39, 40, 41, 10, 42, 38, 43, 5, 6, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [49]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset
batch_size = 50

train_x = np.array([tweet for tweet, label in train_encoded])
train_y = np.array([label for tweet, label in train_encoded])
test_x = np.array([tweet for tweet, label in test_encoded])
test_y = np.array([label for tweet, label in test_encoded])

train_ds = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
test_ds = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))


train_dl = DataLoader(train_ds, shuffle=True, batch_size=batch_size, drop_last=True)
test_dl = DataLoader(test_ds, shuffle=True, batch_size=batch_size, drop_last=True)

In [56]:
import pickle
with open('word2index.pickle', 'wb') as handle:
    pickle.dump(word2index, handle)
# with open('word2index.pickle', 'rb') as handle:
#     word2index = pickle.load(handle)

## Model

In [58]:
class BiLSTM_SentimentAnalysis(torch.nn.Module) :
    def __init__(self, vocab_size, embedding_dim, hidden_dim, dropout) :
        super().__init__()

        # The embedding layer takes the vocab size and the embeddings size as input
        # The embeddings size is up to you to decide, but common sizes are between 50 and 100.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # The LSTM layer takes in the the embedding size and the hidden vector size.
        # The hidden dimension is up to you to decide, but common values are 32, 64, 128
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)

        # We use dropout before the final layer to improve with regularization
        self.dropout = nn.Dropout(dropout)

        # The fully-connected layer takes in the hidden dim of the LSTM and
        #  outputs a a 3x1 vector of the class scores.
        self.fc = nn.Linear(hidden_dim, 3)

    def forward(self, x, hidden):
        """
        The forward method takes in the input and the previous hidden state 
        """

        # The input is transformed to embeddings by passing it to the embedding layer
        embs = self.embedding(x)

        # The embedded inputs are fed to the LSTM alongside the previous hidden state
        out, hidden = self.lstm(embs, hidden)

        # Dropout is applied to the output and fed to the FC layer
        out = self.dropout(out)
        out = self.fc(out)

        # We extract the scores for the final hidden state since it is the one that matters.
        out = out[:, -1]
        return out, hidden
    
    def init_hidden(self):
        return (torch.zeros(1, batch_size, 32), torch.zeros(1, batch_size, 32))

In [59]:
use_cuda = torch.cuda.is_available()
# use_mps = torch.backends.mps.is_available()
if use_cuda:
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
    
model = BiLSTM_SentimentAnalysis(len(word2index), 64, 32, 0.2)
model = model.to(device)

In [62]:
device

device(type='cuda')

In [60]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = 3e-4)

In [66]:
from tqdm.notebook import trange, tqdm

epochs = 50
losses = []
for e in range(epochs):
    
    print('\nEpoch: {}'.format(e))
    
    h0, c0 =  model.init_hidden()

    h0 = h0.to(device)
    c0 = c0.to(device)

    for batch_idx, batch in tqdm(enumerate(train_dl)):

        input = batch[0].to(device)
        target = batch[1].to(device)

        optimizer.zero_grad()
        with torch.set_grad_enabled(True):
            out, hidden = model(input, (h0, c0))
            loss = criterion(out, target)
            loss.backward()
            optimizer.step()
    losses.append(loss.item())


Epoch: 0


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.