In [1]:
import numpy as np
import pandas as pd
import torch
import pickle
import re
from itertools import chain
from collections import Counter
import torch.nn as nn
import glob
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
import transformers
from transformers import AdamW
from transformers import DistilBertTokenizer, DistilBertModel
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, Dataset, random_split
from tqdm import tqdm
import time

In [2]:
# specify device
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

# Load the BERT tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Bert mode
bert = DistilBertModel.from_pretrained('distilbert-base-uncased')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
class DescriptionDataset(Dataset):
    
    """Description dataset without species names."""
    
    def __init__(self, root_dir):
        
        self.root_dir = root_dir
        self.samples = []
        self._init_dataset()
        
    def __len__(self):
        return len(self.samples)
        
    def __getitem__(self, idx):
        return self.samples[idx]
     
    def _init_dataset(self):
        
        # Load the pickle list
        datalist = glob.glob(self.root_dir + 'train*.pkl')
        # Init list
        data_values = []
        # Loop over the pickles
        for data in datalist:
            # Open the pickles
            datadict = pickle.load(open(data, 'rb'))
            # Undict and append
            data_values += (list(chain.from_iterable(datadict.values())))
        
        # Drop double values 
        data_values = list(set(data_values))
        
        nested_values = [[tuple([1, span]) if text[0] == 1 else tuple([0, span]) 
                          for span in self.random_text_splitter(text[1])] 
                         for text in data_values]
        
        self.samples += list(chain.from_iterable(nested_values))
        
    def random_text_splitter(self, text):

        import random

        """
        Random breaks up a text into an X amount of sentences. 
        The output sentences consist of a minimum of 10 sentences.
        """

        # Split text
        words = text.split()
        # Get the amount of words
        word_amount = len(words)
        if word_amount <= 10:
            return [text]
        
        # Create counter
        remaining_word_amount = word_amount
        # Init list
        parts = []
        # While words remaining
        while remaining_word_amount > 0:
            if len(words) < 10:
                # Add last part if less then 10
                parts[-1] = parts[-1] + ' '.join(words)
                # exit
                remaining_word_amount = 0
            # Generate random int
            randint = random.randint(10, word_amount)
            # Append to list 
            parts.append(' '.join(words[:randint]))
            # Delete previous selection
            words = words[randint:]
            # Update counter
            remaining_word_amount -= randint

        return parts

In [4]:
try:
    # Colab
    from google.colab import drive
    root = '/content/gdrive/My Drive/'
    drive.mount('/content/gdrive')
    print('Mounted @Google')
except:
    # Local
    root = "../data/processed/"
    print('Mounted @Local')

start = time.time()
# Load data
data = DescriptionDataset(root)
end = time.time()
print("Time consumed in working: ",end - start)

Mounted @Local
Time consumed in working:  15.779551982879639


In [5]:
# Inspect the data
ones = Counter(ones[0] for ones in data if ones[0] == 1)
zeros = Counter(ones[0] for ones in data if ones[0] == 0)

print('{0} samples.'. format(len(data)))
print('{0} labels with 1 (true).'.format(ones[1]))
print('{0} labels with 0 (false).'.format(zeros[0]))

2257831 samples.
578314 labels with 1 (true).
1679517 labels with 0 (false).


In [6]:
total_count = len(data[0:500])
train_count = int(0.8 * total_count)
valid_count = int(0.1 * total_count)
test_count = total_count - train_count - valid_count
train_dataset, valid_dataset, test_dataset = random_split(data[0:500], (train_count, valid_count, test_count), 
                                                       generator=torch.Generator().manual_seed(33))

In [7]:
batch_size = 2

# Random sample (skewed set)
train_sampler = RandomSampler(train_dataset)
# DataLoader for train set
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

# Random sample
val_sampler = SequentialSampler(valid_dataset)
# DataLoader for validation set
val_dataloader = DataLoader(valid_dataset, sampler=val_sampler, batch_size=batch_size)

In [8]:
# Freeze all the parameters
for param in bert.parameters():
    param.requires_grad = False

In [9]:
class BERT(nn.Module):
    def __init__(self, bert):
        
        super(BERT, self).__init__()
        
        # Distil Bert model
        self.bert = bert
        ## Additional layers
        # Dropout layer
        self.dropout = nn.Dropout(0.1)
        # Relu 
        self.relu =  nn.ReLU()
        # Linear I 
        self.fc1 = nn.Linear(768, 512)
        # Linear II (Out)
        self.fc2 = nn.Linear(512, 2)
        # Softmax
        self.softmax = nn.LogSoftmax(dim=1)
        #self.sigmoid = nn.Sigmoid()


    # Forward pass
    def forward(self, sent_id, mask):

        # Pass data trough bert and extract 
        cls_hs = self.bert(sent_id, attention_mask=mask)
        hidden_state = cls_hs[0]
        pooler = hidden_state[:, 0]
        
        # Dense layer 1        
        x = self.fc1(pooler)
        # ReLU activation
        x = self.relu(x)
        # Drop out
        x = self.dropout(x)
        # Dense layer 2
        x = self.fc2(x)
        # Activation
        x = self.softmax(x)
        #x = self.sigmoid(x)

        return x

In [10]:
# Load the entire model
model = BERT(bert)

# Load trained model (colab)
try:
    try:
        model_save_name = 'saved_weights.pt'
        path = F"/content/gdrive/My Drive/{model_save_name}"
        model.load_state_dict(torch.load(path))
        print('Google Success')

    except:
        model_save_name = 'model_weights_splitted_reducednegatives.pt'
        path = "../models/" + model_save_name
        model.load_state_dict(torch.load(path, 
                                         map_location=torch.device('cpu')))
        print('Local Success')
except:
    print('No pretrained model found.')

# Push the model to GPU
model = model.to(device)

Local Success


In [None]:
model = BERT(bert)
# Push the model to GPU
model = model.to(device)

In [15]:
batch = next(iter(train_dataloader))

train_seq, train_mask, train_y = tokenize_batch(batch)
# Push to device
sent_id, mask, labels = [t.to(device) for t in [train_seq, train_mask, train_y]]

# Get predictions
preds = model(sent_id, mask)

In [16]:
beta=0.95

In [17]:
softmax = nn.Softmax(1)
CEloss = nn.CrossEntropyLoss()


start = time.time()
loss_entropy = (softmax(preds) * softmax(preds).log()).mean()
total_loss = CEloss(preds, labels) * beta + loss_entropy * (1-beta)

end = time.time()
print("Time consumed in working: ",end - start)

Time consumed in working:  0.0012750625610351562


In [18]:
loss_entropy

tensor(-0.0008, grad_fn=<MeanBackward0>)

In [19]:
predicted = preds
target = labels



sigmoid = nn.Sigmoid()
NLLLos = nn.NLLLoss()
CE = nn.CrossEntropyLoss()

import time
start = time.time()
cross_entropy = NLLLos(sigmoid(predicted).log(), target)
soft_reed = -predicted * torch.log(sigmoid(predicted) + 1e-8) # Roundings
print(beta * cross_entropy + (1 - beta) * torch.sum(soft_reed) / batch_size)

end = time.time()
print("Time consumed in working: ",end - start)

tensor(-3.1531, grad_fn=<AddBackward0>)
Time consumed in working:  0.0020418167114257812


In [None]:
CE(predicted, target)

In [None]:
cross_entropy

In [None]:
soft_reed

In [11]:
# Load optimizer (Adam best for bert)
optimizer = torch.optim.Adam(params = model.parameters(), lr=3e-5)
# Define loss function
softmax = nn.Softmax(1)
CEloss = nn.CrossEntropyLoss()

def soft_loss(predictions, targets, beta=0.95):
    
    """
    Uses bootstrapping to make the model more robust to noisy data labels.
    See https://arxiv.org/pdf/1412.6596.pdf (6) for the equation.
    """
    
    
    loss_entropy = (softmax(predictions) * softmax(predictions).log()).mean()
    return CEloss(predictions, targets) * beta + loss_entropy * (1 - beta)


def tokenize_batch(batch_set):
    
    """
    Tokenize a pytorch dataset using the hugging face tokenizer.
    """
    
    # Extract the labels and text
    y = batch_set[0]
    text = batch_set[1]
    
    # Tokenize the text
    tokens = tokenizer.batch_encode_plus(text,
                max_length = 512,
                padding=True,
                truncation=True)
    
    # Convert to tensors
    seq = torch.tensor(tokens['input_ids'])
    mask = torch.tensor(tokens['attention_mask'])
    
    return seq, mask, y

def train():
  
    """
    Function to train classification Bert model.
    """
    
    model.train()
    total_loss = 0
    
    # Iterate over batches
    for batch in tqdm(train_dataloader):
        
        # Tokenize batch
        train_seq, train_mask, train_y = tokenize_batch(batch)
        # Push to device
        sent_id, mask, labels = [t.to(device) for t in [train_seq, train_mask, train_y]]
        # Clear gradients 
        model.zero_grad()        
        # Get predictions
        preds = model(sent_id, mask)
        # Compute loss
        loss =  soft_loss(preds, labels) 
        #loss = cross_entropy(preds, labels)
        # Update total loss
        total_loss = total_loss + loss.item()
        # Backward pass to calculate the gradients
        loss.backward()
        # Clip the the gradients to 1.0. It helps in preventing the exploding gradient problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Update parameters
        optimizer.step()

    # Compute the training loss of the epoch
    avg_loss = total_loss / len(train_dataloader)

    return avg_loss


def evaluate():
    
    """
    Function to test classification Bert model.
    """
  
    # Deactivate dropout layers
    model.eval()
    total_loss = 0

    # Iterate over batches
    for batch in tqdm(val_dataloader):   
        # Tokenize batch
        val_seq, val_mask, val_y = tokenize_batch(batch)
        # Push to device
        sent_id, mask, labels = [t.to(device) for t in [val_seq, val_mask, val_y]]
        # Deactivate autograd
        with torch.no_grad():
            # Model predictions
            preds = model(sent_id, mask)
            # Compute the validation loss between actual and predicted values
            loss =  soft_loss(preds, labels) 
            #loss = cross_entropy(preds,labels)
            total_loss = total_loss + loss.item()

    # Compute the validation loss of the epoch
    avg_loss = total_loss / len(val_dataloader) 

    return avg_loss

In [12]:
# Epochs
epochs = 1

# Init loss
best_valid_loss = float('inf')

# data lists
train_losses=[]
valid_losses=[]

# Loop over epochs
for epoch in range(epochs):
     
    print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))
    
    # Train model
    train_loss = train() 
    # Evaluate model
    valid_loss  = evaluate()
        
    # Append training and validation loss
    train_losses.append(train_loss)
    valid_losses.append(valid_loss)
    
    print(f'\nTraining Loss: {train_loss:.6f}')
    print(f'Validation Loss: {valid_loss:.6f}')


 Epoch 1 / 1


100%|█████████████████████████████████████████| 200/200 [00:34<00:00,  5.74it/s]
100%|███████████████████████████████████████████| 25/25 [00:03<00:00,  7.80it/s]


Training Loss: 0.136058
Validation Loss: 0.184174





In [None]:
try:
    # Save @Google
    model_save_name = 'saved_weights.pt'
    path = F"/content/gdrive/My Drive/{model_save_name}" 
    torch.save(model.state_dict(), path)
    print('Saved @Google Drive')
except:
    # Save locally
    model_save_name = 'saved_weights.pt'
    path = '/notebooks/model/" + model_save_name
    torch.save(model.state_dict(), path)
    print('Saved @local drive')

In [13]:
test_batch = 10
test_dataloader = DataLoader(test_dataset, batch_size=test_batch)
pred_list = np.array([]).reshape(0, 2)
y_list = np.array([])

# Loop over test data
for test_batch in tqdm(test_dataloader):
    # Gradients off
    with torch.no_grad():
        # Extract text and label and tokenize
        test_seq, test_mask, test_y = tokenize_batch(test_batch)
        # Push to device
        sent_id, mask, labels = [t.to(device) for t in [test_seq, test_mask, test_y]]
        # Predictions
        preds = model(sent_id, mask)
        # Detach
        preds = torch.exp(preds).detach().cpu().numpy()
        test_y = test_y.detach().cpu().numpy()
        # append to array
        pred_list = np.vstack([pred_list, preds])
        y_list = np.hstack([y_list, test_y])

100%|█████████████████████████████████████████████| 5/5 [00:05<00:00,  1.11s/it]


In [14]:
y_list_model = np.argmax(pred_list, axis = 1)
print(classification_report(y_list, y_list_model))

              precision    recall  f1-score   support

         0.0       0.94      0.94      0.94        35
         1.0       0.87      0.87      0.87        15

    accuracy                           0.92        50
   macro avg       0.90      0.90      0.90        50
weighted avg       0.92      0.92      0.92        50



In [None]:
# Get incorrect bool list
misclass_bool = y_list!=y_list_model
# Get their index
misclass_idx = np.where(misclass_bool)
# Extract the misclassified spans from the data
misclass_sents = [test_dataset[i] for i in misclass_idx[0]]

In [None]:
misclass_sents

In [None]:
misclass_sents = [test_dataset[i] for i in misclass_idx[0]]

In [None]:
misclass_sents

In [None]:
'''
def soft_loss(predicted, target, beta=0.95):
    
    """Uses bootstrapping to make the model more robust to noisy data labels.
    See https://arxiv.org/pdf/1412.6596.pdf (6) for the equation.
    """
    
    sigmoid = nn.Sigmoid()
    NLLLos = nn.NLLLoss()
    cross_entropy = NLLLos(sigmoid(predicted).log(), target)
    soft_reed = -predicted * torch.log(sigmoid(predicted))
    return (beta * cross_entropy + (1 - beta) * torch.sum(soft_reed))/ batch_size


# Testing, Deactivate dropout layer
model.eval()
# Push a dataset trough the mode
BatchTest = next(iter(train_dataloader))

train_seq, train_mask, train_y = tokenize_batch(BatchTest)

# Push to device
sent_id, mask, labels = [t.to(device) for t in [train_seq, train_mask, train_y]]

# Push the data trough the model
preds = model(sent_id, mask)

# Check the prediction 

print(torch.exp(preds), labels)


# number of training epochs
epochs = 1

# set initial loss to infinite
best_valid_loss = float('inf')

# empty lists to store training and validation loss of each epoch
train_losses=[]
valid_losses=[]

#for each epoch
for epoch in range(epochs):
    
    with tqdm(train_dataloader, unit="batch") as tepoch:
        
        for batch in tepoch:
            
            tepoch.set_description(f"Epoch {epoch}")
         
            # Set model to train
            model.train()
            # empty list to save model predictions
            total_preds=[]

            # push the batch to gpu
            batch = [r.to(device) for r in batch]
            sent_id, mask, labels = batch

            # clear previously calculated gradients 
            model.zero_grad()        
            # get model predictions for the current batch
            preds = model(sent_id, mask)

            # compute the loss between actual and predicted values
            loss = cross_entropy(preds, labels)           
            # backward pass to calculate the gradients
            loss.backward()
            # clip the the gradients to 1.0. It helps in preventing the exploding gradient problem
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # update parameters
            optimizer.step()

            # Set taqaddum params   
            tepoch.set_postfix(loss=loss.item(), accuracy=total_accuracy)
            
            
# function to train the model
def train():
  
    model.train()

    total_loss, total_accuracy = 0, 0

    # empty list to save model predictions
    total_preds=[]

    # iterate over batches
    for step, batch in enumerate(train_dataloader):
    
        # progress update after every 50 batches.
        if step % 4 == 0 and not step == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

        # push the batch to gpu
        batch = [r.to(device) for r in batch]

        sent_id, mask, labels = batch

        # clear previously calculated gradients 
        model.zero_grad()        

        # get model predictions for the current batch
        preds = model(sent_id, mask)

        # compute the loss between actual and predicted values
        loss = cross_entropy(preds, labels)

        # add on to the total loss
        total_loss = total_loss + loss.item()

        # backward pass to calculate the gradients
        loss.backward()

        # clip the the gradients to 1.0. It helps in preventing the exploding gradient problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # update parameters
        optimizer.step()

        # model predictions are stored on GPU. So, push it to CPU
        preds=preds.detach().cpu().numpy()

        # append the model predictions
        total_preds.append(preds)

    # compute the training loss of the epoch
    avg_loss = total_loss / len(train_dataloader)

    # predictions are in the form of (no. of batches, size of batch, no. of classes).
    # reshape the predictions in form of (number of samples, no. of classes)
    total_preds  = np.concatenate(total_preds, axis=0)

    #returns the loss and predictions
    return avg_loss, total_preds


# function for evaluating the model
def evaluate():
  
    print("\nEvaluating:")

    # deactivate dropout layers
    model.eval()

    total_loss, total_accuracy = 0, 0

    # empty list to save the model predictions
    total_preds = []

    # iterate over batches
    for step, batch in enumerate(val_dataloader):
    
        # Progress update every 50 batches.
        if step % 4 == 0 and not step == 0:
      
            # Calculate elapsed time in minutes.
            #elapsed = format_time(time.time() - t0)

            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(val_dataloader)))

        # push the batch to gpu
        batch = [t.to(device) for t in batch]

        sent_id, mask, labels = batch

        # deactivate autograd
        with torch.no_grad():

            # model predictions
            preds = model(sent_id, mask)

            # compute the validation loss between actual and predicted values
            loss = cross_entropy(preds,labels)

            total_loss = total_loss + loss.item()

            preds = preds.detach().cpu().numpy()

            total_preds.append(preds)

    # compute the validation loss of the epoch
    avg_loss = total_loss / len(val_dataloader) 

    # reshape the predictions in form of (number of samples, no. of classes)
    total_preds  = np.concatenate(total_preds, axis=0)

    return avg_loss, total_preds

In [None]:
'''
sklearn version slightly faster
'''

'''
# Split
TotalCount = len(data)
TrainCount = int(0.8 * TotalCount)
ValidCount = int(0.1 * TotalCount)
TestCount = TotalCount - TrainCount - ValidCount
TrainDataset, ValidDataset, TestDataset = random_split(data, (TrainCount, ValidCount, TestCount), 
                                                       generator=torch.Generator().manual_seed(33))
train_label = [label[0] for label in TrainDataset]
train_text = [text[1] for text in TrainDataset]

valid_label = [label[0] for label in ValidDataset]
val_text = [text[1] for text in ValidDataset]

test_label = [label[0] for label in TestDataset]
test_text = [text[1] for text in TestDataset]
'''

In [None]:
'''

class DescriptionDataset(Dataset):
    
    """Description dataset without species names."""
    
    def __init__(self, root_dir):
        
        self.root_dir = root_dir
        self.samples = []
        self._init_dataset()
        
    def __len__(self):
        return len(self.samples)
        
    def __getitem__(self, idx):       
        return self.samples[idx]
    
    def _init_dataset(self):
        
        # Load the pickle list
        datalist = glob.glob(self.root_dir + 'train*.pkl')
        # Loop over the pickles
        for data in datalist:
            # Open the pickles
            datadict = pickle.load(open(data, 'rb'))
            # Undict and append
            self.samples += (list(chain.from_iterable(datadict.values())))
        
        # Drop double values 
        self.samples = list(set(self.samples))
def random_text_sampler(text):
    
    """
    Randomly breaks a piece of text into x pieces.
    """
    
    # Dont split short sentences
    if len(text.split()) <= 10:
        return [text]
    
    # Split text
    words = text.split()
    # Get length
    sentlength = len(words)
    # Random int
    randomint = random.randint(10, sentlength)
    # Check sentences from text
    parts = sentlength // randomint
    
    # Create sentences
    sentences = [' '.join(words[randomint*i:randomint*(i+1)]) for i in range(0, parts)]
    sentences += [' '.join(words[randomint*parts:])]
    
    return sentences
    
def random_text_sampler(text):
    
    """
    Randomly breaks a piece of text into 1 to 5 pieces.
    """
    
    if len(text.split()) < 10:
        return text
    
    # Get the length of the sentence
    n = int(len(text) / random.randint(1, 5))
    # Break up the text into spans
    spans = [text[i:i+n] for i in range(0, len(text), n)]
    
    # Check if end span is long enough
    if len(spans[-1].split()) < 2:
        spans_new = spans[:-2]
        spans_new.append(spans[-2] + spans[-1])
        
        return spans_new
    else:
        return spans
'''
