In [2]:
!pip install transformers
import sys
import time
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, random_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
import transformers
from transformers import BertForSequenceClassification, AdamW, BertConfig, BertTokenizer, PreTrainedModel
from transformers import get_linear_schedule_with_warmup, get_constant_schedule_with_warmup
import datetime
from google.colab import drive
import os

drive.mount('/content/gdrive')





Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/00/92/6153f4912b84ee1ab53ab45663d23e7cf3704161cb5ef18b0c07e207cef2/transformers-4.7.0-py3-none-any.whl (2.5MB)
[K     |████████████████████████████████| 2.5MB 6.8MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 48.4MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 35.9MB/s 
Collecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbeac6b25

# New Section

In [46]:
def process_text_data(data, tokenizer):
    # I truncate the text if it's too large, might possibly cause issues not sure though
    tokenized = data['Question'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512, truncation=True)))
    max_len = max([len(i) for i in tokenized.values])
    padded = np.array([i + [0] * (max_len - len(i)) for i in tokenized.values])
    attention_mask = np.where(padded != 0, 1, 0)

    attention_masks = torch.tensor(attention_mask)
    input_ids = torch.tensor(padded, dtype=torch.int64)

    return input_ids, attention_masks


def dataLoaders(dataset):
    """this function splits dataset, and creates dataloaders for training and validation sets."""
    train_size = int(0.9 * len(dataset))
    val_size = len(dataset) - train_size
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
    batch_size = 16

    train_dataloader = DataLoader(
        train_dataset,  # The training samples.
        sampler=RandomSampler(train_dataset),  # Select batches randomly
        batch_size=batch_size  # Trains with this batch size.
    )

    validation_dataloader = DataLoader(
        val_dataset,  # The validation samples.
        sampler=SequentialSampler(val_dataset),  # Pull out batches sequentially.
        batch_size=batch_size  # Evaluate with this batch size.
    )

    return train_dataloader, validation_dataloader


def load_model():
    pass


def format_time(elapsed):
    elapsed_rounded = int(round((elapsed)))
    return str(datetime.timedelta(seconds=elapsed_rounded))  # Format as hh:mm:ss


def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def rmse(preds, actual):
  preds = preds.squeeze()
  return np.sqrt(np.sum((preds-actual)**2)/len(preds))


def train_validate(model, scheduler, optimizer, epochs, train_dataloader, validation_dataloader, save_location = None):

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    if device.type == 'cuda':
        print('We will use the GPU:', torch.cuda.get_device_name(0))

    if device.type == 'cuda':
        model.cuda()

    training_stats = []
    total_t0 = time.time()

    for epoch_i in range(0, epochs):

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')
        t0 = time.time()
        total_train_loss = 0
        total_train_rmse = 0

        # """what happens here with drop out rate?"""
        model.train()

        for step, batch in enumerate(train_dataloader):
            # """each batch contains three pytorch tensors: input ids, attention masks, labels)"""
            if step % 200 == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
            
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device).float()

            # one forward pass is performed on one epoch at the same time
            # gradients are set to zero every time
            # backward pass to capture gradients for back propagation"""
            model.zero_grad()

            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
            loss = outputs.loss
            logits = outputs.logits

            # print(logits)
            # print(b_labels)
            # print(loss)


            total_train_loss += loss.item()
            loss.backward()

            """ Clip the norm of the gradients to 1.0 to prevent the "exploding gradients" problem.
            update parameters and learning rate"""
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.cpu().numpy()
            total_train_rmse += rmse(logits, label_ids)

        """calculate average loss over all examples"""
        avg_train_loss = total_train_loss / len(train_dataloader)
        avg_train_rmse = total_train_rmse / len(train_dataloader)
        training_time = format_time(time.time() - t0)

        print("")
        print("  Average training loss: {0:.4f}".format(avg_train_loss))
        print("  Average training rmse: {0:.4f}".format(avg_train_rmse))

        print("  Training epcoh took: {:}".format(training_time))

        """measure our performance on our validation set"""
        print("")
        print("Running Validation...")
        t0 = time.time()

        """evaluation mode makes sure that you can still get to the gradients even if drop out"""
        model.eval()

        total_eval_accuracy = 0
        total_eval_loss = 0
        nb_eval_steps = 0

        for batch in validation_dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            with torch.no_grad():
                """forward pass, no grad as a graph is not necessary in forward prop
                Get the "logits" output : values prior to activation function like the softmax."""
                outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
                loss = outputs.loss
                logits = outputs.logits


            total_eval_loss += loss.item()

            """ Move logits and labels to CPU"""
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            """ calculate total accuracy over all batches."""
            total_eval_accuracy += rmse(logits, label_ids)

        """ final accuracy for this validation run."""
        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
        print("  RMSE: {0:.4f}".format(avg_val_accuracy))

        """ average loss over all of the batches."""
        avg_val_loss = total_eval_loss / len(validation_dataloader)
        validation_time = format_time(time.time() - t0)
        print("  Validation Loss: {0:.4f}".format(avg_val_loss))
        print("  Validation took: {:}".format(validation_time))

        if save_location:
          try:
            ckpoint_name = 'chk_point{}'.format(epoch_i)
            directory = os.path.join(save_location, ckpoint_name)
            os.mkdir(directory)
            model.save_pretrained(directory)
            print('Saved model checkpoint to \'{}\''.format(directory))
          except:
            print('Something went wrong saving the model!')


  

    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

    print("")
    print("Training complete!")
    print("Total training took {:} (h:mm:ss)".format(format_time(time.time() - total_t0)))
    if save_location:
      try:
        directory = os.path.join(save_location, 'final_model')
        os.mkdir(directory)
        model.save_pretrained(directory)
        print('Saved model to \'{}\''.format(directory))
      except:
        print('Something went wrong saving the model!')


if __name__ == '__main__':
    seed = 2021

    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    df = pd.read_csv(r'gdrive/MyDrive/nlp_bert/bert_input')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    labels = torch.tensor(df['conflict_score'])
    input_ids, attention_masks = process_text_data(df, tokenizer)

    dataset = TensorDataset(input_ids, attention_masks, labels)

    train_dl, val_dl = dataLoaders(dataset)

    config = BertConfig.from_pretrained("bert-base-uncased", num_labels=1)
    pre_trained_model = BertForSequenceClassification.from_pretrained("bert-base-uncased", config=config)

    # pre_trained_model = BertForSequenceClassification.from_pretrained(
    #     "bert-base-uncased",  # Use the 12-layer BERT model, with an uncased vocab.
    #     num_labels = 1 ,  # The number of output labels--2 for binary classification.
    #     # You can increase this for multi-class tasks.
    #     # I have to select True otherwise it will not compute cost etc) 
    # )

    #pre_trained_model = transformers.BertForSequenceClassification.from_pretrained('gdrive/MyDrive/nlp_bert/bert_input)

    optimizer_bert = AdamW(pre_trained_model.parameters(),
                           lr=2e-5,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
                           eps=1e-10)  # args.adam_epsilon  - default is 1e-8.

    num_epochs = 10  # Number of training epochs. Many epochs may be over-fitting training data.
    total_steps = len(input_ids) * num_epochs  # total number of training steps
    scheduler_bert = get_linear_schedule_with_warmup(optimizer_bert,
                                                     num_warmup_steps=0,  # Default value in run_glue.py
                                                     num_training_steps=total_steps)

    train_validate(pre_trained_model, scheduler_bert, optimizer_bert, num_epochs, train_dl, val_dl, save_location = 'gdrive/MyDrive/nlp_bert/trained_model')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

There are 1 GPU(s) available.
We will use the GPU: Tesla T4

Training...
  Batch   200  of    528.    Elapsed: 0:00:38.
  Batch   400  of    528.    Elapsed: 0:01:17.

  Average training loss: 0.0232
  Average training rmse: 0.1473
  Training epcoh took: 0:01:41

Running Validation...
  RMSE: 0.1017
  Validation Loss: 0.0113
  Validation took: 0:00:03
Something went wrong saving the model!

Training...
  Batch   200  of    528.    Elapsed: 0:00:38.
  Batch   400  of    528.    Elapsed: 0:01:16.

  Average training loss: 0.0101
  Average training rmse: 0.0971
  Training epcoh took: 0:01:40

Running Validation...
  RMSE: 0.0895
  Validation Loss: 0.0091
  Validation took: 0:00:03
Something went wrong saving the model!

Training...
  Batch   200  of    528.    Elapsed: 0:00:38.
  Batch   400  of    528.    Elapsed: 0:01:16.

  Average training loss: 0.0068
  Average training rmse: 0.0794
  Training epcoh took: 0:01:40

Running Validation...
  RMSE: 0.0899
  Validation Loss: 0.0088
  Valid

In [48]:
import gensim
from sklearn.linear_model import LinearRegression

from tqdm import tqdm
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

def baseline_prediction(x_train, x_test, y_train, y_test):
    mean_score = np.mean(y_train)
    mean_prediction = np.full(y_test.shape, mean_score)
    baseline_performance = np.sqrt(np.sum((mean_prediction-y_test)**2)/mean_prediction.shape[0])
    print('Baseline of predicting mean gives RMSE: {}'.format(baseline_performance))

def create_document_vec(input_text):
    documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(input_text)]
    model = Doc2Vec(documents, vector_size=300, window=16, min_count=1)
    return model


df = pd.read_csv(r'gdrive/MyDrive/nlp_bert/bert_input')
text_input = df['Question'].values
labels_input = df['conflict_score'].values
n_docs = text_input.shape[0]

doc_vecs = create_document_vec(text_input)
input_vectors = [doc_vecs[i] for i in range(n_docs)]
X_train, X_test, y_train, y_test = train_test_split(input_vectors, labels_input, test_size=0.1, random_state=2021)
baseline_prediction(X_train, X_test, y_train, y_test)

reg = LinearRegression().fit(X_train, y_train)
prediction = reg.predict(X_test)
baseline_performance_linear = np.sqrt(np.sum((prediction-y_test)**2)/y_test.shape[0])
print('Baseline of linear regression model: {}' .format(baseline_performance_linear))



Baseline of predicting mean gives RMSE: 0.17506008251424307
Baseline of linear regression model: 0.17643335638228275


In [47]:
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    config = BertConfig.from_pretrained("gdrive/MyDrive/nlp_bert/trained_model/chk_point5", num_labels=1)
    model = BertForSequenceClassification.from_pretrained("gdrive/MyDrive/nlp_bert/trained_model/chk_point5", config=config)

    questions = ['Why do you think people with vegan lifestyles are better?', 'Why do you think people with vegan lifestyles are annoying?', 'How would you cook brocolli?', 'Is milk vegan?']

    tokenized = [tokenizer.encode(question, add_special_tokens=True, max_length=512, truncation=True) for question in questions]
    max_len = max([len(i) for i in tokenized])
    padded = np.array([i + [0] * (max_len - len(i)) for i in tokenized])
    attention_mask = np.where(padded != 0, 1, 0)
    padded = torch.IntTensor(padded)
    attention_mask = torch.IntTensor(attention_mask)
    outputs = model.forward(padded, attention_mask=attention_mask)
    print(outputs)





SequenceClassifierOutput(loss=None, logits=tensor([[ 0.3627],
        [ 0.3327],
        [-0.0110],
        [-0.0085]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
