In [1]:
import pandas as pd

df_train = pd.read_csv("data/train.dat",sep="\t",names=['label','text'])

In [2]:
df_train.head()

Unnamed: 0,label,text
0,4,Catheterization laboratory events and hospital...
1,5,Renal abscess in children. Three cases of rena...
2,2,Hyperplastic polyps seen at sigmoidoscopy are ...
3,5,Subclavian artery to innominate vein fistula a...
4,4,Effect of local inhibition of gamma-aminobutyr...


In [3]:
df=df_train

In [4]:
labels = {1:'digestive system diseases',2:'cardiovascular diseases',3:'neoplasms',4:'nervous system diseases',5:'general pathological conditions'}
df['label'] = df.label.replace(labels)

In [5]:
df.head()

Unnamed: 0,label,text
0,nervous system diseases,Catheterization laboratory events and hospital...
1,general pathological conditions,Renal abscess in children. Three cases of rena...
2,cardiovascular diseases,Hyperplastic polyps seen at sigmoidoscopy are ...
3,general pathological conditions,Subclavian artery to innominate vein fistula a...
4,nervous system diseases,Effect of local inhibition of gamma-aminobutyr...


In [6]:
possible_labels = df.label.unique()

label_dict = {}
for index, possible_label in enumerate(possible_labels):
    label_dict[possible_label] = index
label_dict

{'nervous system diseases': 0,
 'general pathological conditions': 1,
 'cardiovascular diseases': 2,
 'digestive system diseases': 3,
 'neoplasms': 4}

In [7]:
df['label'] = df.label.replace(label_dict)

In [8]:
import re
def clean_txt(text):
    text = re.sub("'", "",text)
    text=re.sub("(\\W)+"," ",text)    
    return text

In [9]:
df['text']  = df.text.apply(clean_txt)
df.text = df.text.apply(lambda x: x.strip())
df.head(30)

Unnamed: 0,label,text
0,0,Catheterization laboratory events and hospital...
1,1,Renal abscess in children Three cases of renal...
2,2,Hyperplastic polyps seen at sigmoidoscopy are ...
3,1,Subclavian artery to innominate vein fistula a...
4,0,Effect of local inhibition of gamma aminobutyr...
5,3,Infection during chronic epidural catheterizat...
6,1,Mediastinal tracheostomy using a pectoralis ma...
7,1,Tumefactive fibroinflammatory lesion of the ex...
8,4,Multiple representations contribute to body kn...
9,1,Increasing asthma prevalence in a rural New Ze...


In [10]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(df.index.values, 
                                                  df.label.values, 
                                                  test_size=0.15, 
                                                  random_state=42 
                                                  )
df['data_type'] = ['not_set']*df.shape[0]

df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

df.groupby(['label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,text
label,data_type,Unnamed: 2_level_1
0,train,2572
0,val,479
1,train,4117
1,val,688
2,train,1255
2,val,239
3,train,2679
3,val,484
4,train,1649
4,val,276


In [11]:
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(12272,) (2166,) (12272,) (2166,)


In [12]:

import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset
from torch import Tensor

from transformers import BertForSequenceClassification


tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)
                                          
encoded_data_train = tokenizer.batch_encode_plus(
    df[df.data_type=='train'].text.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=512, 
    return_tensors='pt'
)

encoded_data_val = tokenizer.batch_encode_plus(
    df[df.data_type=='val'].text.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=512, 
    return_tensors='pt'
)


input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].label.values)

dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [13]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

batch_size = 3

dataloader_train = DataLoader(dataset_train, 
                              sampler=RandomSampler(dataset_train), 
                              batch_size=batch_size)

dataloader_validation = DataLoader(dataset_val, 
                                   sampler=SequentialSampler(dataset_val), 
                                   batch_size=batch_size)

In [15]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                  lr=1e-5, 
                  eps=1e-8)
                  
epochs = 10

scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*epochs)



In [16]:
from sklearn.metrics import f1_score
import numpy as np

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==labels]
        y_true = labels_flat[labels_flat==labels]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==labels])}/{len(y_true)}\n')

In [17]:
device = "cuda:0"
model = model.to(device)

In [18]:
import random
import numpy as np

seed_val = 17
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

def evaluate(dataloader_val):

    model.eval()
    
    loss_val_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_val:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_val_total/len(dataloader_val) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals
    
for epoch in tqdm(range(1, epochs+1)):
    
    model.train()
    
    loss_train_total = 0

    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:

        model.zero_grad()
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }       

        outputs = model(**inputs)
        
        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})
                
    tqdm.write(f'\nEpoch {epoch}')
    
    loss_train_avg = loss_train_total/len(dataloader_train)            
    tqdm.write(f'Training loss: {loss_train_avg}')
    
    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')
    tqdm.write(f'F1 Score (Weighted): {val_f1}')

    torch.save(model.state_dict(), f'model_artifacts/newmodel-epoch-{epoch}.model')


  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/4091 [00:00<?, ?it/s]


Epoch 1
Training loss: 1.0211367580071693
Validation loss: 0.9101143399165773
F1 Score (Weighted): 0.6281253286097446


Epoch 2:   0%|          | 0/4091 [00:00<?, ?it/s]


Epoch 2
Training loss: 0.8689330916435611
Validation loss: 0.900071810936127
F1 Score (Weighted): 0.6350408620450676


Epoch 3:   0%|          | 0/4091 [00:00<?, ?it/s]


Epoch 3
Training loss: 0.8042488208935721
Validation loss: 0.9334259705045299
F1 Score (Weighted): 0.6285885199702894


Epoch 4:   0%|          | 0/4091 [00:00<?, ?it/s]


Epoch 4
Training loss: 0.7512758045189357
Validation loss: 1.078186763699458
F1 Score (Weighted): 0.6225305905160968


Epoch 5:   0%|          | 0/4091 [00:00<?, ?it/s]


Epoch 5
Training loss: 0.7070120469838553
Validation loss: 1.2303875117166705
F1 Score (Weighted): 0.6077964841033732


Epoch 6:   0%|          | 0/4091 [00:00<?, ?it/s]


Epoch 6
Training loss: 0.6694634552662352
Validation loss: 1.2659547452394362
F1 Score (Weighted): 0.5857950909222


Epoch 7:   0%|          | 0/4091 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [19]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.to(device)

model.load_state_dict(torch.load('model_artifacts/newmodel-epoch-2.model', map_location=torch.device('cpu')))

_, predictions, true_vals = evaluate(dataloader_validation)
accuracy_per_class(predictions, true_vals)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Class: nervous system diseases
Accuracy: 1415/2166

Class: general pathological conditions
Accuracy: 1415/2166

Class: cardiovascular diseases
Accuracy: 1415/2166

Class: digestive system diseases
Accuracy: 1415/2166

Class: neoplasms
Accuracy: 1415/2166



In [22]:
df_test = pd.read_csv("data/test.dat",sep="\t",names=['text','label'])

In [23]:
df_test.head()

Unnamed: 0,text,label
0,Excision of limbal dermoids. We reviewed the c...,
1,Bell's palsy. A diagnosis of exclusion. In cas...,
2,Retained endobronchial foreign body removal fa...,
3,Recurrent buccal space abscesses: a complicati...,
4,Intracranial fibromatosis. Fibromatoses are un...,


In [31]:
review_text = df_test['text'][2]

In [33]:
review_text

'Retained endobronchial foreign body removal facilitated by steroid therapy of an obstructing, inflammatory polyp. Oral and topical steroids were used to induce regression in an inflammatory, obstructing endobronchial polyp caused by a retained foreign body. The FB (a peanut half), which had been present for over six months, was then able to be easily and bloodlessly retrieved with fiberoptic bronchoscopy. '

In [34]:
encoded_data_pred = tokenizer.encode_plus(
    review_text, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=512, 
    return_tensors='pt'
)

input_ids_pred = encoded_data_pred['input_ids'].to(device)
attention_masks_pred = encoded_data_pred['attention_mask'].to(device)
dataset_test = TensorDataset(input_ids_pred,attention_masks_pred)



In [35]:
dataloader_test = DataLoader(dataset_test, 
                                   sampler=SequentialSampler(dataset_test), 
                                   batch_size=1)

def test(dataloader_test):

    model.eval()
    
    loss_val_total = 0
    predictionss= []
    
    for batch in dataloader_test:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            print(outputs[0].shape)
            
            
        #loss = outputs[0]
        logits = outputs[0]
        _,prediction = torch.max(logits,dim = 1)
        print(type(prediction))
        predictionss.append(prediction.item())
              
    return predictionss

predictions = test(dataloader_test)



torch.Size([1, 5])
<class 'torch.Tensor'>


In [36]:
label_dict_inverse = {v: k for k, v in label_dict.items()}
print(f'Test text: {review_text}')
print()
print(f'Class: {label_dict_inverse[predictions[0]]}')

Test text: Retained endobronchial foreign body removal facilitated by steroid therapy of an obstructing, inflammatory polyp. Oral and topical steroids were used to induce regression in an inflammatory, obstructing endobronchial polyp caused by a retained foreign body. The FB (a peanut half), which had been present for over six months, was then able to be easily and bloodlessly retrieved with fiberoptic bronchoscopy. 

Class: general pathological conditions
