In [1]:
import os
from sklearn.model_selection import KFold
import numpy as np

class CFG:
    """Notebook-wide settings; every value lives on the nested `data` namespace."""

    class data:
        # --- optimisation ---
        batch_size = 32
        lr = 1e-4
        epochs = 10
        epsilon = 1e-8

        # --- data handling ---
        validation_size = 0.2
        MAX_LEN = 128      # max sentence length
        seed_val = 42      # random seed
        k_folds = 10

        # --- model ---
        hidden_size = 768  # hidden layer size (embedding size) for feedforward net

        # --- output location ---
        # Alternative used on Colab: "/content/drive/MyDrive/Minor Project1/"
        PATH = "./"

ModuleNotFoundError: No module named 'sklearn'

In [None]:
import torch
from torch import nn
import torch.nn.functional as F

# Choose the compute device once; later cells move tensors to `device`.
if not torch.cuda.is_available():
    # CPU fallback.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Tell PyTorch to use the GPU and report which one.
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab only); the dataset CSV is read
# from a path under this mount point in the next cell.
drive.mount('/content/drive')

In [None]:
import pandas as pd

# Read ETHOS_binary.csv from the mounted Drive folder.
df = pd.read_csv('/content/drive/MyDrive/Minor Project/ETHOS_binary.csv')

print('Number of training sentences: {:,}\n'.format(len(df)))

# Parallel NumPy arrays: the `text` column and its `label` column.
sentences, labels = df.text.values, df.label.values

# Preview the first ten rows.
df.head(10)

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW

# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

# Load the BERT model with a 2-label sequence-classification head.
print('Loading BERT Model...')
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",           # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2,                # The number of output labels--2 for binary classification.
    output_attentions = False,     # Whether the model returns attentions weights.
    output_hidden_states = False,  # Whether the model returns all hidden-states.
)

# Snapshot the freshly loaded weights once; an existing snapshot is left untouched.
filepath = CFG.data.PATH + 'original.pth'
if not os.path.exists(filepath):
    torch.save(model.state_dict(), filepath)

# Move the model to the device chosen earlier. `model.cuda()` would raise on a
# CPU-only machine even though `device` already falls back to the CPU.
model.to(device)

In [None]:
# Tokenize all of the sentences and map the tokens to their word IDs.
# For each sentence `encode` will:
#   (1) tokenize it,
#   (2) prepend the `[CLS]` token,
#   (3) append the `[SEP]` token,
#   (4) map tokens to their IDs.
# Truncation and tensor conversion are deliberately not requested here;
# padding/truncation to a fixed length happens in a later cell.
input_ids = [
    tokenizer.encode(sent, add_special_tokens = True)  # Add '[CLS]' and '[SEP]'
    for sent in sentences
]

# Print sentence 0, now as a list of IDs.
print('Original: ', sentences[0])
print('Token IDs:', input_ids[0])

In [None]:
# Longest encoded sentence, measured in token IDs (special tokens included).
print('Max sentence length: ', max(map(len, input_ids)))

In [None]:
# We'll borrow the `pad_sequences` utility function to do this.
from keras_preprocessing.sequence import pad_sequences

print('\nPadding/truncating all sentences to %d values...' % CFG.data.MAX_LEN)
print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

# Bring every sequence to exactly MAX_LEN IDs: shorter ones are filled with 0
# and longer ones are cut, both at the end ("post") rather than the start.
input_ids = pad_sequences(
    input_ids,
    maxlen=CFG.data.MAX_LEN,
    dtype="long",
    padding="post",
    truncating="post",
    value=0,
)

print('\nDone.')

In [None]:
# Build one attention mask per padded sentence:
#   - token ID 0 is padding      -> mask 0
#   - token ID > 0 is a real token -> mask 1
attention_masks = [
    [int(token_id > 0) for token_id in sent]
    for sent in input_ids
]

In [None]:
#one-hot encode labels
def one_hot(categorical_labels, num_categories=2):
    """One-hot encode a 1-D collection of integer class labels.

    Args:
        categorical_labels: 1-D array-like of class indices.
        num_categories: number of classes, i.e. columns in the result.
            Defaults to 2, the binary case used in this notebook.

    Returns:
        A float64 array of shape (len(categorical_labels), num_categories)
        with 1.0 in column c of every row whose label equals c. A label
        outside range(num_categories) yields an all-zero row.
    """
    # Accept plain Python sequences as well as NumPy arrays.
    categorical_labels = np.asarray(categorical_labels)
    labels = np.zeros((categorical_labels.shape[0], num_categories))
    for category in range(num_categories):
        labels[:, category] = (categorical_labels == category).astype(int)
    return labels

# Replace the integer labels with their one-hot form.
# NOTE(review): this overwrites `labels`, so the cell is not idempotent --
# running it a second time feeds the already-encoded array back into one_hot.
labels = one_hot(labels)

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import get_linear_schedule_with_warmup

# Convert all inputs and labels into torch tensors, the required datatype 
# for our model.
# train_inputs = torch.tensor(train_inputs)
# validation_inputs = torch.tensor(validation_inputs)

# train_labels = torch.tensor(train_labels)
# validation_labels = torch.tensor(validation_labels)

# train_masks = torch.tensor(train_masks)
# validation_masks = torch.tensor(validation_masks)

# Convert the ID matrix, the one-hot labels and the masks into torch tensors
# (in that order), rebinding the same names.
input_ids, labels, attention_masks = (
    torch.tensor(array) for array in (input_ids, labels, attention_masks)
)

# Bundle the three tensors so they can be indexed together.
dataset = TensorDataset(input_ids, labels, attention_masks)

In [None]:
# Number of training epochs, read from the shared config; `train1` uses this
# module-level name for its epoch loop and scheduler length.
# NOTE(review): the earlier comment here cited a 2-4 epoch recommendation,
# which does not match the configured CFG.data.epochs value -- confirm intent.
epochs = CFG.data.epochs

In [None]:
import numpy as np

# Function to calculate the accuracy of our predictions vs labels
# def flat_accuracy(preds, labels):
#     labels_flat = labels.flatten()
#     return np.sum(pred_flat == labels_flat) / len(labels_flat)

def flat_accuracy(preds, labels):
    """Count confusion-matrix cells for one batch of binary predictions.

    Args:
        preds: 2-D array of per-class scores; the predicted class is the
            argmax along axis 1.
        labels: 2-D one-hot array; column 1 holds the positive-class flag.

    Returns:
        Tuple `(tp, tn, fp, fn, total)` where `total` is `len(labels)`.
    """
    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels[:, 1].flatten()

    # Boolean views of each side, combined pairwise below.
    actual_pos, actual_neg = (actual == 1), (actual == 0)
    predicted_pos, predicted_neg = (predicted == 1), (predicted == 0)

    tp = (actual_pos & predicted_pos).sum()
    tn = (actual_neg & predicted_neg).sum()
    fp = (actual_neg & predicted_pos).sum()
    fn = (actual_pos & predicted_neg).sum()
    return tp, tn, fp, fn, len(labels)
    
def get_accuracy(tp, tn, fp, fn, total):
    """Turn confusion-matrix counts into accuracy, precision, recall and F1.

    `CFG.data.epsilon` is added to every denominator so that empty counts
    give 0 instead of a division by zero.

    Returns:
        dict with keys 'accuracy', 'precision', 'recall', 'f1-score'.
    """
    eps = CFG.data.epsilon

    precision_value = tp/(tp + fp + eps)
    recall_value = tp/(tp + fn + eps)
    f1_value = 2*precision_value*recall_value/(precision_value + recall_value + eps)

    return {
        'accuracy': (tp + tn)/(total + eps),
        'precision': precision_value,
        'recall': recall_value,
        'f1-score': f1_value,
    }

In [None]:
# Most recent LayerNorm output of each encoder layer, keyed by layer index.
activation = {}


def get_activation(name):
    """Return a forward hook that stores the hooked module's detached output in `activation[name]`."""
    def hook(model, input, output):
        activation[name] = output.detach()
    return hook


# Remove hooks left behind by an earlier registration so that re-running this
# cell does not stack duplicates that all fire on every forward pass.
for _stale_handle in globals().get("activation_hook_handles", []):
    _stale_handle.remove()

# One hook per encoder layer; iterate over the layers actually present instead
# of assuming there are exactly 12.
activation_hook_handles = [
    encoder_layer.output.LayerNorm.register_forward_hook(get_activation(layer_idx))
    for layer_idx, encoder_layer in enumerate(model.bert.encoder.layer)
]

In [None]:
# First five examples, moved to the active device, as a small probe batch.
input_ids_, input_mask_, input_labels_ = (
    tensor[:5].to(device) for tensor in (input_ids, attention_masks, labels)
)

In [None]:
# Display the shape of the probe batch of input IDs.
input_ids_.shape

In [None]:
# Run the probe batch through BERT; the forward hooks registered above fill
# `activation` with each encoder layer's LayerNorm output as a side effect.
# NOTE(review): this runs with autograd enabled; wrap in torch.no_grad() if
# the gradients are not needed here.
outputs = model(input_ids_, 
                token_type_ids=None, 
                attention_mask=input_mask_)

In [None]:
import copy
# Deep-copy the captured activations so later forward passes, which overwrite
# the entries of `activation`, cannot change this snapshot.
a1 = copy.deepcopy(activation)

In [None]:
# Shape of the captured output for layer key 11.
print(a1[11].shape)
# NOTE(review): only a cell's final expression is displayed, so the result of
# the next line (shape after averaging over dim 1) is computed but not shown.
a1[11].mean(dim=1).shape
# Shape after selecting index 0 along dim 1.
a1[11][:,0,:].shape

In [None]:
class Net(nn.Module):
    """Feed-forward head: Linear -> ReLU -> Dropout(0.2) -> Linear producing 2 logits.

    Both the input width and the hidden width are `CFG.data.hidden_size`.
    """

    def __init__(self):
        super().__init__()
        width = CFG.data.hidden_size
        self.dense = nn.Linear(width, width)
        self.tanh = nn.Tanh()  # kept as an attribute; forward() applies ReLU instead
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p = 0.2)
        self.fc1 = nn.Linear(width, 2)  # input_size, output_size

    def forward(self, x):
        """Return the raw (pre-softmax) logits for a batch of feature vectors."""
        hidden = self.dropout(self.relu(self.dense(x)))
        return self.fc1(hidden)


In [None]:
# Most recent LayerNorm output of each encoder layer, keyed by layer index.
activation = {}


def get_activation(name):
    """Return a forward hook that stores the hooked module's detached output in `activation[name]`."""
    def hook(model, input, output):
        activation[name] = output.detach()
    return hook


# Remove hooks left behind by an earlier registration so that re-running this
# cell does not stack duplicates that all fire on every forward pass.
for _stale_handle in globals().get("activation_hook_handles", []):
    _stale_handle.remove()

# One hook per encoder layer; iterate over the layers actually present instead
# of assuming there are exactly 12.
activation_hook_handles = [
    encoder_layer.output.LayerNorm.register_forward_hook(get_activation(layer_idx))
    for layer_idx, encoder_layer in enumerate(model.bert.encoder.layer)
]

def prepare_data(inputs, masks, labels, layers=(9, 10, 11)):
    """Turn a batch of token IDs into fixed-size features from BERT.

    Runs `model` on the batch so the registered forward hooks refresh
    `activation`, then averages the position-0 vector of the selected
    encoder layers.

    Args:
        inputs: token-ID tensor for the batch.
        masks: attention-mask tensor matching `inputs`.
        labels: label tensor; only moved to `device` and passed through.
        layers: non-empty sequence of `activation` keys to average over.
            Defaults to (9, 10, 11), the layers originally hard-coded.

    Returns:
        `[features, labels]`, both on `device`.
    """
    inputs = inputs.to(device)
    masks = masks.to(device)
    labels = labels.to(device)

    # The hooks store detached outputs, so no gradient ever reaches BERT;
    # disabling autograd here avoids building a graph that is never used.
    with torch.no_grad():
        model(inputs,
              token_type_ids=None,
              attention_mask=masks)

    # Plain mean of the position-0 vectors. The count is never zero, so the
    # 1e-8 previously folded into the divisor is not needed.
    first_token_per_layer = [activation[i][:, 0, :] for i in layers]
    features = torch.stack(first_token_per_layer).mean(dim=0)

    return [features, labels]

In [None]:
import time
import datetime

def format_time(elapsed):
    """Render a duration in seconds as an `h:mm:ss` string, rounded to the nearest second."""
    return str(datetime.timedelta(seconds=int(round(elapsed))))


In [None]:
# Mini-batch size shared by the training and validation DataLoaders that
# `train1` builds; read as a module-level name inside that function.
batch_size = CFG.data.batch_size

def train1(train_data, validation_data, fold_id, exp_no):
    '''
    Train a fresh `Net` head on features from `prepare_data` for one fold.

    Only `net`'s parameters are optimised. After every epoch the head is
    validated, and its weights are saved whenever the validation f1-score
    improves (or ties while accuracy improves).

    Args:
        train_data: (list) -> [inputs, masks, labels] tensors for training.
        validation_data: (list) -> [inputs, masks, labels] tensors for validation.
        fold_id: fold identifier, used in the checkpoint file name.
        exp_no: experiment number, used in the checkpoint file name.

    Returns:
        dict with keys 'accuracy', 'precision', 'recall', 'f1-score' for the
        best epoch (empty only if no epoch was run).

    Reads the module-level names `batch_size`, `epochs`, `device`, `filepath`
    and `CFG`.
    '''

    net = Net()
    # Use the device selected earlier; `net.cuda()` would raise on a CPU-only
    # machine even though `device` already falls back to the CPU.
    net.to(device)

    # Create the DataLoader for our training set (shuffled).
    train_data = TensorDataset(*train_data)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    # Create the DataLoader for our validation set (fixed order).
    validation_data = TensorDataset(*validation_data)
    validation_sampler = SequentialSampler(validation_data)
    validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

    # NOTE(review): `filepath` is the module-level path that already ends in
    # 'original.pth', so checkpoints land at '<PATH>original.pthnet_exp..'.
    # Kept unchanged because other cells may load from this exact name;
    # CFG.data.PATH was probably the intended prefix -- confirm before changing.
    checkpoint_path = filepath + f'net_exp{exp_no}_{fold_id}.pth'

    def validate_model(validation_dataloader):
        '''Evaluate `net` on the validation loader; print and return the metrics dict.'''
        t0 = time.time()

        # Evaluation mode: dropout layers behave differently than in training.
        net.eval()

        tp, tn, fp, fn, total = 0, 0, 0, 0, 0
        for batch in validation_dataloader:

            # Move the batch to the device and unpack it.
            b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

            # No gradients are needed for validation.
            with torch.no_grad():
                b_inputs, b_labels = prepare_data(b_input_ids, b_input_mask, b_labels)

                # Raw logits (values prior to any softmax).
                logits = net(b_inputs)

            # Move logits and labels to CPU
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            # Accumulate the confusion-matrix counts of this batch.
            tp_, tn_, fp_, fn_, tot_ = flat_accuracy(logits, label_ids)
            tp += tp_
            tn += tn_
            fp += fp_
            fn += fn_
            total += tot_

        # Report the final metrics for this validation run.
        acc_metrics = get_accuracy(tp, tn, fp, fn, total)
        for k, v in acc_metrics.items():
            print("{} : {:.5f},".format(k,v), end=" ")

        print("\nValidation took: {:}".format(format_time(time.time() - t0)))
        return acc_metrics

    best_metrics = {}
    best_f1_so_far = 0.0

    optimizer = AdamW(net.parameters(),
                      lr = CFG.data.lr,
                      eps = CFG.data.epsilon
                    )

    # define loss function
    criterion = nn.CrossEntropyLoss()

    # Total number of training steps is number of batches * number of epochs.
    total_steps = len(train_dataloader) * epochs

    # Linear learning-rate decay with no warmup.
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps = 0,
                                                num_training_steps = total_steps)

    print('Current Best f1-score: {:.5}'.format(best_f1_so_far))

    for epoch_i in range(0, epochs):

        # ========================================
        #               Training
        # ========================================

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))

        # Measure how long the training epoch takes.
        t0 = time.time()

        # Reset the total loss for this epoch. Starting from 0 (not 1) keeps
        # the reported average equal to the real mean batch loss.
        total_loss = 0

        # `train` only switches the mode (dropout active); it does not train.
        net.train()

        print('Training...')
        for step, batch in enumerate(train_dataloader):

            # Progress update every 40 batches.
            if step % 40 == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

            # `batch` contains three pytorch tensors:
            #   [0]: input ids
            #   [1]: attention masks
            #   [2]: labels
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            # Feature extraction needs no autograd graph: the optimizer only
            # holds `net`'s parameters and the hooked activations are detached.
            with torch.no_grad():
                b_inputs, b_labels = prepare_data(b_input_ids, b_input_mask, b_labels)

            # Clear gradients left over from the previous step.
            net.zero_grad()

            # Forward pass through the head, then the loss against the labels.
            outputs = net(b_inputs)
            loss = criterion(outputs, b_labels)

            # Accumulate the training loss so the epoch average can be reported.
            total_loss += loss.item()

            # Backward pass to calculate the gradients.
            loss.backward()

            # Clip the norm of the gradients to 1.0 to guard against
            # exploding gradients.
            torch.nn.utils.clip_grad_norm_(net.parameters(), 1.0)

            # Update parameters, then the learning rate.
            optimizer.step()
            scheduler.step()

        # Average loss over the training batches (guarded for an empty loader).
        avg_train_loss = total_loss / max(len(train_dataloader), 1)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

        # ========================================
        #               Validation
        # ========================================

        print("")
        print("Running Validation...")

        accuracy_metrics = validate_model(validation_dataloader)
        current_f1 = accuracy_metrics['f1-score']
        if current_f1>best_f1_so_far:
            print('f1 score improved, old f1 = {:.5}, new f1 = {:.5}'.format(best_f1_so_far, current_f1))
            best_f1_so_far = current_f1
            best_metrics = accuracy_metrics
            print('Saving new weights...')
            torch.save(net.state_dict(), checkpoint_path)

        elif current_f1 == best_f1_so_far:
            print('f1 score did not change')
            # `best_metrics` is still empty when the very first epoch scores an
            # f1 of exactly 0; treat that as "no accuracy recorded yet" instead
            # of raising KeyError.
            b_acc = best_metrics.get('accuracy', float('-inf'))
            cur_acc = accuracy_metrics['accuracy']
            if cur_acc>b_acc:
                print('accuracy score improved, old acc = {:.5}, new acc = {:.5}'.format(b_acc, cur_acc))
                best_metrics = accuracy_metrics
                print('Saving new weights...')
                torch.save(net.state_dict(), checkpoint_path)

    return best_metrics

In [None]:
# Seed the shuffle so the fold assignment is identical on every run;
# CFG.data.seed_val was defined for this but never passed anywhere.
kfold = KFold(n_splits=CFG.data.k_folds, shuffle=True, random_state=CFG.data.seed_val)

# Best validation metrics per fold, keyed by fold index.
best_f1_scores = {}

# BERT only supplies features here, so keep it in evaluation mode.
model.train(False)

for fold_id, (train_ids, val_ids) in enumerate(kfold.split(dataset)):

    print('------------------ Fold {} ------------------'.format(fold_id))

    # Slice the full tensors by this fold's index arrays.
    train_inputs, train_masks, train_labels = input_ids[train_ids], attention_masks[train_ids], labels[train_ids]
    validation_inputs, validation_masks, validation_labels = input_ids[val_ids], attention_masks[val_ids], labels[val_ids]

    # `train1` expects [inputs, masks, labels] in exactly this order.
    train_data = [train_inputs, train_masks, train_labels]
    validation_data = [validation_inputs, validation_masks, validation_labels]

    best_f1_scores[fold_id] = train1(train_data, validation_data, fold_id, 1)

print("")
print("Training complete!")

In [None]:
# Print the best metrics of every fold that was trained, one fold per line.
# Iterating over the dict itself avoids assuming exactly 10 folds finished.
for fold_id, fold_metrics in best_f1_scores.items():
    print(f"{fold_id}: ", end = " ")
    # `.items()` yields (name, value) pairs; iterating the dict directly
    # would yield only the key strings and fail to unpack.
    for k, v in fold_metrics.items():
        print(f"{k}: {v:.5f}", end=", ")
    print()
