In [None]:
import os
import torch
print(torch.__version__)
import pandas as pd
!pip install transformers
from transformers import BertTokenizer
import sys, time, datetime, random
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
import numpy as np

# Metrics
from sklearn.metrics import classification_report
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

import gc
import matplotlib.pyplot as plt
import random
import seaborn as sns

In [None]:
# Select the compute device once; later cells move the model and every batch
# onto whatever `device` ends up being.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Report how many GPUs are visible and the name of the first one.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

# only use when using google colab
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Length (in token ids) that the preprocessing helpers pad/truncate to.
MAX_LEN = 512

# Running log of the token count of every text passed through
# tokenize_and_count (special tokens included).
sentence_lengths = []


def tokenize_and_count(s, lst, max_len):
    """Encode one text with the module-level `tokenizer` and record its length.

    Args:
        s: the text to encode.
        lst: list that receives the number of token ids produced for `s`.
        max_len: accepted for interface compatibility; not used here -- the
            callers truncate afterwards with `pad_sequences`.

    Returns:
        The list of token ids, including the tokenizer's special tokens.
    """
    token_ids = tokenizer.encode(s, add_special_tokens=True)
    lst.append(len(token_ids))
    return token_ids

def preprocess_context_bert(df, text_col_name, context_col_name, prefix='concat_'):
  """Add BERT inputs for (context, comment) pairs to `df` in place.

  Both columns are encoded with `tokenize_and_count` (so their token counts are
  appended to the module-level `sentence_lengths`). For every row the comment's
  first token is dropped and the rest is appended to the context's ids; with
  `add_special_tokens=True` that first token is the leading [CLS] of the
  comment.

  Columns written (each cell is a list of MAX_LEN ints):
    prefix + 'bert'      -- concatenated token ids, post-padded/truncated with 0
    prefix + 'type_id'   -- 0 for context positions, 1 for comment positions,
                            0 on padding
    prefix + 'attention' -- 1 where the token id is > 0, else 0

  Args:
    df: dataframe to extend; modified in place, nothing is returned.
    text_col_name: column holding the comment text.
    context_col_name: column holding the context (e.g. parent comment) text.
    prefix: prefix for the three new column names.
  """
  contexts = df[context_col_name].apply(lambda s : tokenize_and_count(s, sentence_lengths, MAX_LEN))
  comments = df[text_col_name].apply(lambda s : tokenize_and_count(s, sentence_lengths, MAX_LEN))
  # Positional access: the frame's index need not contain the label 0 (for
  # instance after a train/test split), in which case `contexts[0]` raises.
  print(len(contexts), len(comments), contexts.iloc[0] if len(contexts) else None)

  pair_ids = []
  pair_type_ids = []
  for context, comment in zip(contexts, comments):
    comment_tail = comment[1:]  # skip the comment's own leading token
    pair_ids.append(context + comment_tail)
    pair_type_ids.append([0] * len(context) + [1] * len(comment_tail))
  print(len(pair_ids))

  df[prefix + 'bert'] = pad_sequences(pair_ids, maxlen=MAX_LEN, dtype="long",
                          value=0, truncating="post", padding="post").tolist()
  df[prefix + 'type_id'] = pad_sequences(pair_type_ids, maxlen=MAX_LEN, dtype="long",
                          value=0, truncating="post", padding="post").tolist()
  # Padding uses id 0, so "id > 0" marks the real tokens.
  df[prefix + 'attention'] = df[prefix + 'bert'].apply(lambda arr : [int(token_id > 0) for token_id in arr])

def preprocess_bert(df, text_col_name, prefix=''):
  """Add single-text BERT inputs to `df` in place.

  Each text is encoded with `tokenize_and_count` (its token count is appended
  to the module-level `sentence_lengths`), then post-padded/truncated with 0 to
  MAX_LEN ids.

  Columns written (each cell is a list of MAX_LEN ints):
    prefix + 'bert'      -- token ids
    prefix + 'attention' -- 1 where the token id is > 0, else 0
    prefix + 'type_id'   -- all zeros (single segment)

  Args:
    df: dataframe to extend; modified in place, nothing is returned.
    text_col_name: column holding the text to encode.
    prefix: prefix for the three new column names.
  """
  token_ids = df[text_col_name].apply(lambda s : tokenize_and_count(s, sentence_lengths, MAX_LEN))
  df[prefix + 'bert'] = pad_sequences(token_ids.values, maxlen=MAX_LEN, dtype="long",
                          value=0, truncating="post", padding="post").tolist()
  # Padding uses id 0, so "id > 0" marks the real tokens.
  df[prefix + 'attention'] = df[prefix + 'bert'].apply(lambda arr : [int(token_id > 0) for token_id in arr])
  # The id lists are already MAX_LEN long, so the segment ids need no padding
  # pass of their own.
  df[prefix + 'type_id'] = df[prefix + 'bert'].apply(lambda arr : [0] * len(arr))

# 'bert-base-uncased' ships a lower-case-only vocabulary, so the text has to be
# lower-cased before WordPiece lookup; with do_lower_case=False capitalised
# words miss the vocabulary and degrade into sub-word pieces / [UNK].
# NOTE(review): models fine-tuned earlier with do_lower_case=False saw the old
# tokenization -- re-run preprocessing and training after this change.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

Civility Data

In [None]:
# Target columns; the training loop further down fits one binary classifier
# per entry.
categories = [
    'stereotype',
    'namecalling',
    'aspersion',
    'demeaning',
    'vulgarity',
    'personal_attack',
    'third_party_attack',
    'civility',
]

# Column names of the raw annotation data. Not referenced elsewhere in the
# visible notebook -- presumably kept as a record of the expected schema.
columns = [
    'id', 'comment', 'parent_comment', 'original_post',
    'divisiveness', 'divisiveness_keyword', 'human_incivility',
    'stereotype', 'namecalling', 'aspersion', 'demeaning', 'vulgarity', 'other',
]

In [None]:
# Path of the pickled, human-annotated dataframe (a file path, despite the
# `_dir` suffix in the variable name).
annotated_data_dir = 'data/labeled/final_annotated_data_incivility_3030.pickle'
# NOTE: unpickling executes whatever the file contains -- only load pickles
# that come from a trusted source.
annotated_df = pd.read_pickle(annotated_data_dir)
# Show the first rows as the cell output.
annotated_df.head()

In [None]:
# Aggregate Raw Annotations
# personal_attack: name-calling coded 1 or 2, or the demeaning flag set.
annotated_df['personal_attack'] = (annotated_df['namecalling'].isin([1, 2]) | (annotated_df['demeaning'] == 1)).astype(int)
# third_party_attack: name-calling coded 3.
annotated_df['third_party_attack'] = (annotated_df['namecalling'] == 3).astype(int)
# 'other' may hold free text; any string counts as one flagged annotation,
# every non-string value is kept as it is.
annotated_df['other'] = annotated_df['other'].apply(lambda x: 1 if isinstance(x, str) else x)
# civility: sum of the raw annotation codes (0 means nothing was flagged).
# Training later reduces every target to 0/1 via astype(bool).
annotated_df['civility'] = annotated_df[
    ['stereotype', 'aspersion', 'demeaning', 'vulgarity', 'namecalling', 'other']
].astype(int).sum(axis=1)

# special treatment for stereotype data: collapse any non-zero code to 1.
# Done after 'civility' so that the sum above still sees the raw codes.
annotated_df['stereotype'] = annotated_df['stereotype'].astype(bool).astype(int)

Optionally, a baseline model that concatenates the context (the parent comment) with the actual comment is available; uncomment the following cell to prepare its inputs.

In [None]:
# annotated_df['concat_comment'] = annotated_df['parent_comment'] + ' [SEP] ' + annotated_df['comment']
# preprocess_context_bert(annotated_df, 'comment_aug', 'parent_comment_aug', 'aug_concat_')
# preprocess_bert(annotated_df, 'comment_aug', 'aug_')

In [None]:
# aug_train_df, aug_test_df = train_test_split(df, random_state=42, test_size=test_size)
# aug_train_df

In [None]:
# Fraction of all rows held out from training, and the share of that hold-out
# that becomes the validation set (the remainder is the test set).
test_size = 0.2
validation_size = 0.5
# First split is stratified on the raw name-calling code.
train_df, test_df = train_test_split(annotated_df, random_state=42, test_size=test_size, stratify=annotated_df.namecalling.values)
# Second split halves the hold-out into test and validation (not stratified).
test_df, validation_df = train_test_split(test_df, random_state=42, test_size=validation_size)


# Summary is printed once; the original cell repeated the identical print.
print(f"""{1 - test_size}/{test_size * (1-validation_size)}/{test_size * validation_size} split
{train_df.shape[0]} lines of training data,
{test_df.shape[0]} lines of test data
{validation_df.shape[0]} lines of validation data""")

# Optional: persist the splits.
# temp_data_dir = './data/labeled/temp/'
# train_df.to_pickle(os.path.join(temp_data_dir, 'train_df.pickle'))
# validation_df.to_pickle(os.path.join(temp_data_dir, 'validation_df.pickle'))
# test_df.to_pickle(os.path.join(temp_data_dir, 'test_df.pickle'))

In [None]:
# split train, test for stereotypes
#
# Re-balances the training rows to (at most) four negatives per positive
# stereotype example and then carves test/validation sets out of that balanced
# pool. This cell rebinds train_df / test_df / validation_df, so re-run the
# split cell above before re-running this one.

train_df_pos = train_df[train_df['stereotype'] == 1]
train_df_neg = train_df[train_df['stereotype'] == 0]
# Seeded so the subsample is reproducible; capped so that asking for more
# negatives than exist does not raise.
train_df_neg = train_df_neg.sample(min(len(train_df_neg), len(train_df_pos) * 4), random_state=42)
train_df = pd.concat([train_df_pos, train_df_neg])


# Keep only the training part of the split. Previously the first return value
# was discarded, so every test/validation row was also a training row.
train_df, test_df = train_test_split(train_df, random_state=42, test_size=0.2, stratify=train_df.stereotype.values)
print(len(train_df), len(test_df))
validation_df, test_df = train_test_split(test_df, random_state=42, test_size=0.5)

print(len(train_df), len(test_df), len(validation_df))

validation_df.stereotype.value_counts()

In [None]:
def flat_accuracy(preds, labels):
    """Score arg-max predictions against integer labels.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: array of true class ids (flattened before use).

    Returns:
        Tuple ``(weighted_f1, roc_auc, accuracy)``. ``roc_auc`` is computed
        from the hard arg-max labels and is 0 when ``roc_auc_score`` raises
        ``ValueError`` (e.g. only one class present in ``labels``).
    """
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    # scikit-learn metrics take (y_true, y_pred); the arguments used to be
    # passed the other way round, which skews weighted F1 and ROC AUC.
    try:
        roc = roc_auc_score(labels_flat, pred_flat)
    except ValueError:
        roc = 0
    f1 = f1_score(labels_flat, pred_flat, average='weighted')
    accuracy = np.sum(pred_flat == labels_flat) / len(labels_flat)
    return f1, roc, accuracy

def _flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def format_time(elapsed):
    '''
    Render a duration given in seconds as an h:mm:ss string.

    The value is rounded to whole seconds first and then formatted through
    datetime.timedelta.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

def run_evaluation(model, test_x, test_labels, test_masks, test_type, batch_size, verbose=False):
    """Run `model` over a test set, print metrics and plot a confusion matrix.

    Args:
        model: sequence classifier called as
            ``model(input_ids, token_type_ids=..., attention_mask=...)`` whose
            first output holds the logits.
        test_x: tensor of token ids, one row per example.
        test_labels: tensor of integer class labels.
        test_masks: tensor of attention masks aligned with `test_x`.
        test_type: tensor of token type ids aligned with `test_x`.
        batch_size: number of examples per forward pass.
        verbose: when True, additionally print per-batch predictions, labels
            and Matthews correlation coefficient.

    Returns:
        List of positions (in input order) whose predicted class differs from
        the true label.

    Side effects: puts the model in eval mode, prints MCC / ROC AUC / weighted
    F1 / accuracy and a classification report, and shows a confusion-matrix
    heatmap on a new figure.
    """
    if verbose:
        print(f"{list(test_labels).count(1)} positive samples out of {len(test_labels)} total lines")
        print('Predicting labels for {:,} test sentences...'.format(len(test_x)))

    # Sequential sampling keeps predictions in input order, so the returned
    # positions can be used to look rows up in the source dataframe.
    test_data = TensorDataset(test_x, test_masks, test_labels, test_type)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

    # Put model in evaluation mode
    model.eval()

    # Per-batch logits and labels, as numpy arrays.
    predictions, true_labels = [], []

    for batch in test_dataloader:
        # Move the batch to the compute device and unpack it.
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels, b_type_ids = batch

        # No gradients are needed for prediction.
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=b_type_ids,
                            attention_mask=b_input_mask)

        logits = outputs[0]
        predictions.append(logits.detach().cpu().numpy())
        true_labels.append(b_labels.to('cpu').numpy())

    # The per-batch MCC is purely diagnostic, so it is only computed when it
    # is going to be printed.
    if verbose:
        print('Calculating Matthews Corr. Coef. for each batch...')
        for i in range(len(true_labels)):
            # Highest-scoring column of each row is the predicted class.
            pred_labels_i = np.argmax(predictions[i], axis=1).flatten()
            matthews = matthews_corrcoef(true_labels[i], pred_labels_i)
            print("Predicted Label for Batch " + str(i) + " is " + str(pred_labels_i))
            print("True Label for Batch " + str(i) + " is " + str(true_labels[i]))
            print("Matthew's correlation coefficient for Batch " + str(i) + " is " + str(matthews))

    # Merge the batches into one prediction / label vector.
    flat_predictions = np.argmax(np.concatenate(predictions, axis=0), axis=1).flatten()
    flat_true_labels = np.concatenate(true_labels, axis=0)

    # Positions of the misclassified examples.
    diff = np.flatnonzero(flat_true_labels != flat_predictions).tolist()

    # scikit-learn metrics take (y_true, y_pred).
    acc = accuracy_score(flat_true_labels, flat_predictions)
    mcc = matthews_corrcoef(flat_true_labels, flat_predictions)
    f1 = f1_score(flat_true_labels, flat_predictions, average='weighted')
    try:
        # Computed from hard labels, not from scores.
        ra = roc_auc_score(flat_true_labels, flat_predictions)
    except ValueError:
        # Raised e.g. when the labels contain a single class; report NaN
        # instead of aborting the whole evaluation.
        ra = float('nan')

    cm = confusion_matrix(flat_true_labels, flat_predictions)
    # Draw on a fresh figure and show it right away; otherwise the heatmap
    # stays the "current" figure and later plots are drawn on top of it.
    plt.figure()
    sns.heatmap(cm, annot=True, cmap='Blues', fmt='g')
    plt.show()

    print('MCC: %.3f' % mcc)
    print('ROC_AUC: %.3f' % ra)
    print('F1: %.3f' % f1)
    print('Accuracy: %.3f' % acc)
    print(classification_report(flat_true_labels, flat_predictions))

    return diff

def train(model, epochs, train_dataloader, test_dataloader, optimizer, scheduler, seed_val=42):
    """Fine-tune `model` and evaluate it on a held-out loader after each epoch.

    Args:
        model: classifier called as ``model(input_ids, token_type_ids=...,
            attention_mask=..., labels=...)`` returning the loss first and the
            logits second.
        epochs: number of passes over `train_dataloader`.
        train_dataloader: yields ``(input_ids, attention_mask, labels,
            token_type_ids)`` batches used for optimisation.
        test_dataloader: yields batches in the same layout, used for the
            per-epoch validation pass.
        optimizer: optimiser stepped once per training batch.
        scheduler: learning-rate scheduler stepped once per training batch.
        seed_val: seed applied to `random`, numpy and torch (CPU and CUDA).

    Returns:
        ``(model, training_losses, testing_losses, f1s)`` -- the model plus one
        average training loss, one average validation loss and one validation
        weighted F1 per epoch.
    """
    # This training code is based on the `run_glue.py` script here:
    # https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

    # Set the seed value all over the place to make this reproducible.
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    # Per-epoch history, returned to the caller for plotting.
    training_losses = []
    testing_losses = []
    f1s = []

    for epoch_i in range(0, epochs):
        # ========================================
        #               Training
        # ========================================
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        # Measure how long the training epoch takes.
        t0 = time.time()

        # Reset the total loss for this epoch.
        total_train_loss = 0

        # Put the model into training mode.
        model.train()

        for step, batch in enumerate(train_dataloader):

            # Progress report every 40 batches.
            if step % 40 == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)
                print(f'  Batch {step:>5,}  of  {len(train_dataloader):>5,}.    Elapsed: {elapsed:}.')

            # `batch` contains four pytorch tensors:
            #   [0]: input ids
            #   [1]: attention masks
            #   [2]: labels
            #   [3]: token type ids
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            b_type_ids = batch[3].to(device)

            # Clear gradients left over from the previous step.
            model.zero_grad()

            outputs = model(b_input_ids,
                        token_type_ids=b_type_ids,
                        attention_mask=b_input_mask,
                        labels=b_labels)

            loss = outputs[0]
            total_train_loss += loss.item()

            loss.backward()

            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters, then the learning rate.
            optimizer.step()
            scheduler.step()

        # Calculate the average loss over the training data.
        avg_train_loss = total_train_loss / len(train_dataloader)
        training_losses.append(avg_train_loss)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

        # ========================================
        #               Validation
        # ========================================
        print("")
        print("Running Validation...")

        # Measure how long the validation pass takes.
        t0 = time.time()

        # Reset the total loss for this epoch.
        total_test_loss = 0

        # Put the model in evaluation mode--the dropout layers behave differently
        # during evaluation.
        model.eval()

        # Logits and labels of the whole validation pass. Metrics are computed
        # once over all of them: averaging per-batch F1/AUC is biased, and a
        # batch containing a single class would contribute an AUC of 0.
        epoch_logits, epoch_labels = [], []

        for batch in test_dataloader:
            # Move the batch to the compute device and unpack it.
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_input_mask, b_labels, b_type_ids = batch

            # No gradients are needed for validation.
            with torch.no_grad():
                outputs = model(b_input_ids,
                                token_type_ids=b_type_ids,
                                attention_mask=b_input_mask,
                                labels=b_labels)

            total_test_loss += outputs[0].item()

            # Keep logits and labels on the CPU as numpy arrays.
            epoch_logits.append(outputs[1].detach().cpu().numpy())
            epoch_labels.append(b_labels.to('cpu').numpy())

        avg_test_loss = total_test_loss / len(test_dataloader)
        testing_losses.append(avg_test_loss)

        eval_f1, eval_auc, eval_accuracy = flat_accuracy(
            np.concatenate(epoch_logits, axis=0),
            np.concatenate(epoch_labels, axis=0))
        f1s.append(eval_f1)

        # Report the metrics for this validation run.
        print("  F1 Score: {0:.2f}".format(eval_f1))
        print("  ROC_AUC: {0:.2f}".format(eval_auc))
        print("  Accuracy: {0:.2f}".format(eval_accuracy))
        print("  Validation took: {:}".format(format_time(time.time() - t0)))
        print("  Average validation loss: {0:.2f}".format(avg_test_loss))

    return model, training_losses, testing_losses, f1s

def draw_test_train_curve(test_losses, train_losses):
    """Plot the per-epoch training and test losses on one chart and show it.

    Note the argument order: test losses first, training losses second.
    """
    # Seaborn styling, then a larger font scale and figure size.
    sns.set(style='darkgrid')
    sns.set(font_scale=1.5)
    plt.rcParams["figure.figsize"] = (12,6)

    # Training curve in blue, test curve in red, both with point markers.
    for losses, line_format, legend_name in (
        (train_losses, 'b-o', 'Train'),
        (test_losses, 'r-o', 'Test'),
    ):
        plt.plot(losses, line_format, label=legend_name)

    plt.title("Train/Test loss")
    plt.ylabel("Loss")
    plt.xlabel("Epoch")
    plt.legend()

    plt.show()

In [None]:
# Hyperparameters read as globals by the training cells below.
lr = 1e-5 # TODO: fine tuning the models were manual for me before
epochs = 10
batch_size = 16
gpu_id = 0  # not referenced elsewhere in the visible notebook

# def eval_test_site
def single_category_train(train_df, validation_df, test_df, category, prefix=''):
  """Fine-tune a fresh binary BERT classifier for one label column.

  Reads the module-level `batch_size`, `epochs`, `lr` and `device`.

  Args:
    train_df, validation_df, test_df: dataframes holding the columns
      prefix+'bert', prefix+'attention', prefix+'type_id' (equal-length lists
      of ints per row) and `category`.
    category: label column; any non-zero value is treated as the positive
      class.
    prefix: prefix of the three input columns.

  Returns:
    The fine-tuned model. As side effects the function prints progress, plots
    the loss curves and prints/plots the evaluation on `test_df`.
  """

  def _to_tensors(frame):
    # (input ids, attention masks, 0/1 labels, token type ids) for one split.
    # Going through .tolist() makes the labels int64 on every platform.
    ids = torch.tensor(frame[prefix+'bert'].values.tolist())
    masks = torch.tensor(frame[prefix+'attention'].values.tolist())
    labels = torch.tensor(frame[category].values.astype(int).astype(bool).astype(int).tolist())
    types = torch.tensor(frame[prefix+'type_id'].values.tolist())
    return ids, masks, labels, types

  train_x, train_masks, train_y, train_type = _to_tensors(train_df)
  val_x, val_masks, val_y, val_type = _to_tensors(validation_df)
  test_x, test_masks, test_y, test_type = _to_tensors(test_df)

  print(len(train_x), len(train_masks), len(train_y), len(train_type))

  # Training batches are shuffled; validation is read in order since shuffling
  # it has no benefit and only makes runs harder to compare.
  train_data = TensorDataset(train_x, train_masks, train_y, train_type)
  train_sampler = RandomSampler(train_data)
  train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

  val_data = TensorDataset(val_x, val_masks, val_y, val_type)
  val_sampler = SequentialSampler(val_data)
  val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

  # Number of training batches; len() avoids materialising every batch.
  print(len(train_dataloader))


  model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 2, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = True, # Whether the model returns all hidden-states.
  )

  print(device)
  # .to(device) works for both the GPU and the CPU fallback; .cuda() raises
  # when `device` is the CPU.
  model.to(device)

  optimizer = AdamW(model.parameters(),
                lr = lr, # args.learning_rate - default is 5e-5
                eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
              )

  # Total number of training steps is [number of batches] x [number of epochs].
  total_steps = len(train_dataloader) * epochs
  scheduler = get_linear_schedule_with_warmup(optimizer,
                                              num_warmup_steps = 0,
                                              num_training_steps = total_steps)

  model, train_losses, test_losses, f1s = train(model, epochs,
                                                  train_dataloader,
                                                  val_dataloader,
                                                  optimizer,
                                                  scheduler,
                                                  seed_val=42)

  # Visualize test and train curve
  draw_test_train_curve(test_losses, train_losses)

  # Evaluation results (run_evaluation builds its own dataloader).
  print("====================")
  print("====================")
  print("EVALUATION")
  diff = run_evaluation(model, test_x, test_y, test_masks,  test_type, batch_size, verbose=False)
  print(f"lr = {lr}: {f1s}")
  print(diff)

  return model

# Fine-tune one classifier per category and save each under ./models/.
for c in categories:
  # Collect garbage and release cached CUDA memory before the next model.
  gc.collect()
  torch.cuda.empty_cache()
  model = single_category_train(train_df, validation_df, test_df, c, '')
  # Saved name encodes category, learning rate and epoch count.
  model_name = f"{c}_{lr}_{epochs}_3000"
  model.save_pretrained(f'./models/{model_name}')

In [None]:
# Re-evaluate the model left over from the training loop, i.e. the classifier
# for the last entry of `categories`. The tensors built inside
# single_category_train are local to that function, so they are rebuilt here,
# and the token type ids are passed explicitly (run_evaluation requires them).
eval_category = categories[-1]
test_x = torch.tensor(test_df['bert'].values.tolist())
test_masks = torch.tensor(test_df['attention'].values.tolist())
test_type = torch.tensor(test_df['type_id'].values.tolist())
test_y = torch.tensor(test_df[eval_category].values.astype(int).astype(bool).astype(int).tolist())

diff = run_evaluation(model, test_x, test_y, test_masks, test_type, batch_size, verbose=False)
print(diff)

In [None]:
# Hard-coded positions of misclassified test rows. NOTE(review): this looks
# pasted from an earlier run and replaces whatever `diff` the previous cell
# computed -- confirm it still matches the current test_df.
diff = [9, 14, 15, 24, 25, 26, 27, 29, 30, 32, 50, 51, 56, 59, 64, 71, 78, 85, 86, 89, 91, 92, 94, 99, 102]

print(len(test_df), len(diff))
# Walk the test rows in order and dump every one whose position is listed.
for i in range(len(test_df)):
  if i not in diff:
    continue
  row = test_df.iloc[i]
  print("MISCLASSIFICATION")
  print(row['comment'])
  print(row['parent_comment'])
  print(row['civility'])
  for cat in ['stereotype', 'namecalling', 'demeaning', 'vulgarity', 'other']:
    print(cat, row[cat])
  print(row['id'])
  print('\n\n')

In [None]:
# Validation F1 per epoch recorded from earlier manual runs, keyed by the
# learning rate that was used.
lr_5e_05 = ('5e_05', [0.7776543005866313, 0.6067515817515817, 0.7134108164183353, 0.6517287751807875, 0.7459083633453382])
lr_3e_05 = ('3e_05', [0.6232893157262904, 0.7727551020408162, 0.7434926602219836, 0.6489188657919307, 0.7466314731020613])
lr_2e_05 = ('2e_05', [0.0, 0.49230769230769234, 0.458974358974359, 0.4025641025641026, 0.4717948717948718])
lr_1e_05 = ('1e_05', [0.6545212956977663, 0.7439080760509332, 0.7513644588270091, 0.6913832199546485, 0.8059093812963782])
lr_5e_06 = ('5e_06', [0.5877032324400746, 0.5989232989232989, 0.7007700049716857, 0.738501708151536, 0.7803312629399585])


# The loop uses its own names so the global `lr` (a float read by the training
# code) is not overwritten with a label string.
for lr_label, f1_per_epoch in [lr_5e_05, lr_3e_05, lr_2e_05, lr_1e_05, lr_5e_06]:
    # The x axis follows the recorded series (5 epochs each), not the current
    # `epochs` setting; mismatched lengths make plt.plot raise.
    x = list(range(1, len(f1_per_epoch) + 1))
    plt.plot(x, f1_per_epoch, label=lr_label)
plt.legend()
plt.xlabel('epochs')
plt.ylabel('f1 score')
plt.title('Hyperparameter Search')
plt.show()

In [None]:
# Name used by the (currently disabled) save calls below. This rebinds the
# `model_name` set by the per-category training loop.
model_name = 'bert_1015+5000_2e_05'

# Uncomment to save the current model and the tokenizer under ./models/.
# model.save_pretrained('./models/{}'.format(model_name))
# tokenizer.save_pretrained('./models/{}_tokenizer'.format(model_name))

The following function is separate from the model training: it experimentally uses BERT to create embeddings for the texts.

In [None]:
def get_post_bert_repr(model, test_x, test_labels, test_masks, batch_size, verbose=False, layer=1):
    """Return one mean-pooled hidden-state vector per input text.

    Args:
        model: classifier called as ``model(input_ids, token_type_ids=None,
            attention_mask=...)`` whose second output is the tuple of hidden
            states (the model must be configured to return them).
        test_x: tensor of token ids, one row per text.
        test_labels: tensor of labels; only used for the verbose message and
            to keep the dataset layout, not for the representations.
        test_masks: tensor of attention masks aligned with `test_x`.
        batch_size: number of texts per forward pass.
        verbose: when True, print how many positive labels / texts there are.
        layer: index into the hidden-states tuple. The default 1 keeps the
            previous behaviour; pass -1 for the last entry. In Hugging Face
            BERT entry 0 is the embedding output -- confirm against the model
            in use.

    Returns:
        2-D numpy array with one row per text, in input order.
    """
    if verbose:
        print(f"{list(test_labels).count(1)} positive samples out of {len(test_labels)} total lines")
        print('Predicting labels for {:,} test sentences...'.format(len(test_x)))

    # Sequential sampling keeps the output rows aligned with the input rows.
    test_data = TensorDataset(test_x, test_masks, test_labels)
    test_sampler = SequentialSampler(test_data)
    test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

    # Put model in evaluation mode
    model.eval()

    pooled_batches = []
    for batch in test_dataloader:
        # Move the batch to the compute device and unpack it.
        b_input_ids, b_input_mask, b_labels = tuple(t.to(device) for t in batch)

        # No gradients are needed to read out representations.
        with torch.no_grad():
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask)

        # Without labels the outputs are (logits, hidden_states, ...).
        hidden_states = outputs[1]
        layer_output = hidden_states[layer].cpu().numpy()
        # Plain mean over axis 1; padded positions are included in the mean.
        pooled_batches.append(layer_output.mean(axis=1))

    return np.concatenate(pooled_batches, axis=0)
        

In [None]:
# Build inputs for the whole annotated set (not just the test split) and save
# one pooled BERT representation per comment.
# NOTE(review): relies on `annotated_df` having 'bert' and 'attention' columns;
# the preprocessing call that creates them is commented out above -- confirm
# they are present in the loaded pickle.
test_x = torch.tensor(annotated_df['bert'].values.tolist())

test_masks = torch.tensor(annotated_df['attention'].values.tolist())

# NOTE(review): no 'label' column is created anywhere in the visible notebook
# -- confirm it exists in the loaded data.
test_y = torch.tensor(annotated_df.label.values.astype(int))

# These three objects are not passed on; get_post_bert_repr builds its own
# dataset and dataloader from the tensors.
test_data = TensorDataset(test_x, test_masks, test_y)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)

# `model` is whichever model is currently bound at module level.
reprs = get_post_bert_repr(model, test_x, test_y, test_masks, batch_size, verbose=False)
print(reprs.shape)
np.save('data/labeled_comment_reprs.npy', reprs)