In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import torch
from torch.nn import BCEWithLogitsLoss, BCELoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score
import pickle
from transformers import *
from tqdm import tqdm, trange
from ast import literal_eval
from transformers import AutoTokenizer, AutoModel

#clf
from transformers import  RobertaForSequenceClassification
from transformers import  BertForSequenceClassification

In [3]:
# Experiment configuration shared by every cell below.

# Size of the classifier output layer; matches the number of label columns
# reported by `num_labels` once the CSV is loaded.
NUM_LABELS = 4193

# Select a batch size for training. For fine-tuning with XLNet, the authors recommend
# a batch size of 32, 48, or 128. We will use 32 here to avoid memory issues.
batch_size = 32

# Maximum number of input tokens per document; longer texts are truncated by the tokenizer.
max_length = 512

# Name of the experiment model: "roberta" or "legalBert".
# It also selects the sub-directory used for cached data loaders and checkpoints.
model_name = "legalBert"

# Use the GPU when one is visible to this process, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

# Load the pretrained tokenizer that matches the selected model. An unknown
# model_name fails here with a clear message instead of surfacing later as a
# NameError on `tokenizer`.
if model_name == "roberta":
    tokenizer = AutoTokenizer.from_pretrained("/mnt/localdata/geng/model/legalRoberta/", do_lower_case=True)
elif model_name == "legalBert":
    tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
else:
    raise ValueError("Unsupported model_name: {!r} (expected 'roberta' or 'legalBert')".format(model_name))



In [3]:
import os

# Expose only GPUs 4-7 to this process.
# NOTE(review): CUDA_VISIBLE_DEVICES is normally only honoured if it is set before
# the process initialises CUDA -- confirm this cell runs before any torch.cuda call.
os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(gpu_id) for gpu_id in range(4, 8))

In [4]:
# Shell check: print the value of CUDA_VISIBLE_DEVICES as seen by a child process.
! echo $CUDA_VISIBLE_DEVICES

4,5,6,7


In [5]:
# Confirm that PyTorch can see at least one CUDA device.
torch.cuda.is_available()

True

In [12]:
# Load the training split; the first CSV column is used as the DataFrame index.
df = pd.read_csv('/mnt/localdata/geng/data/downstream/multiLabelClassification/train.csv',index_col=0)

In [13]:
# Whitespace-separated token count of every document, computed once and reused
# for both summary statistics.
tokens_per_doc = df["header+recital"].str.split().str.len()
print('average sentence length: ', tokens_per_doc.mean())
print('stdev sentence length: ', tokens_per_doc.std())

average sentence length:  360.11186274509805
stdev sentence length:  265.16069190156566


In [14]:
# Every column after the first two (celex_id, header+recital) is a binary label column.
# 'one_hot_labels' is excluded explicitly so that re-running this cell does not
# pick up the derived column as an extra label.
cols = df.columns
label_cols = [c for c in cols[2:] if c != 'one_hot_labels']
num_labels = len(label_cols)

# Shuffle the rows with a fixed seed so the later train/validation split is
# reproducible across kernel restarts.
df = df.sample(frac=1, random_state=2020).reset_index(drop=True)

# Store each row's label vector as a single array in one column.
df['one_hot_labels'] = list(df[label_cols].values)
df.head()

Unnamed: 0,celex_id,header+recital,10,1000,1002,1004,1005,1006,1007,1008,...,990,993,994,995,996,997,998,999,c_871b5612,one_hot_labels
0,32007D0333,15.5.2007 EN Official Journal of the European ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,32013R1283,11.12.2013 EN Official Journal of the European...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,32004R1756,12.10.2004 EN Official Journal of the European...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,32013R1178,21.11.2013 EN Official Journal of the European...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,32006D0076,8.2.2006 EN Official Journal of the European U...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [15]:
# Number of label columns found in the training CSV.
num_labels

4193

In [16]:
# Plain Python lists (one entry per document) so that individual rows can be
# popped and the lists can be passed straight to train_test_split.
labels = df["one_hot_labels"].tolist()
comments = df["header+recital"].tolist()

In [24]:
# Inspect which concrete tokenizer class AutoTokenizer returned.
type(tokenizer)

transformers.tokenization_bert.BertTokenizer

In [17]:
# Tokenize every training document, truncating or padding each one to exactly
# `max_length` tokens. `padding='max_length'` is the supported replacement for
# the deprecated `pad_to_max_length=True` flag.
encodings = tokenizer.batch_encode_plus(comments, max_length=max_length, truncation=True, padding='max_length')
print('tokenizer outputs: ', encodings.keys())



tokenizer outputs:  dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


In [36]:
input_ids = encodings['input_ids']  # tokenized and encoded documents
attention_masks = encodings['attention_mask']  # 1 for real tokens, 0 for padding

# Use the token type ids produced by the tokenizer. The previous call to
# `create_token_type_ids_from_sequences(input_id)` was given ids that already
# contain the special tokens, so it returned lists two entries longer than
# `input_ids`. Tokenizers that do not emit token type ids get an all-zero
# placeholder of matching length instead.
if 'token_type_ids' in encodings:
    token_type_ids = encodings['token_type_ids']
else:
    token_type_ids = [[0] * len(ids) for ids in input_ids]

In [37]:
# Sanity check: decode the second encoded document and show its first 20 characters.
tokenizer.decode(input_ids[1])[:20]

'[CLS] 11. 12. 2013 e'

In [38]:
# Find the rows whose stringified label vector occurs exactly once in the data;
# they are pulled out before the split and forced into the training set later.
# NOTE(review): str() of a NumPy array longer than the print threshold (1000 by
# default) is abbreviated with '...', so with 4193 labels this key probably only
# reflects the first and last few entries -- confirm that this is intended.
label_keys = df.one_hot_labels.astype(str)
label_counts = label_keys.value_counts()
one_freq = label_counts[label_counts==1].keys()
singleton_rows = label_keys[label_keys.isin(one_freq)]
# Descending order lets the indices be popped one by one without shifting the rest.
one_freq_idxs = sorted(singleton_rows.index, reverse=True)
print('df label indices with only one instance: ', one_freq_idxs)

df label indices with only one instance:  [16218]


In [39]:
# Remove the single-occurrence rows from all four parallel lists so they can be
# appended to the training set after the split. `one_freq_idxs` is in descending
# order, so each pop leaves the positions of the remaining indices untouched.
one_freq_input_ids = []
one_freq_token_types = []
one_freq_attention_masks = []
one_freq_labels = []
for row_idx in one_freq_idxs:
    one_freq_input_ids.append(input_ids.pop(row_idx))
    one_freq_token_types.append(token_type_ids.pop(row_idx))
    one_freq_attention_masks.append(attention_masks.pop(row_idx))
    one_freq_labels.append(labels.pop(row_idx))


In [40]:
# Split the four parallel lists with a single call so that rows stay aligned.
# train_test_split returns (train, validation) pairs in the order of its inputs.
split_parts = train_test_split(input_ids, labels, token_type_ids, attention_masks,
                               random_state=2020, test_size=0.10)
(train_inputs, validation_inputs,
 train_labels, validation_labels,
 train_token_types, validation_token_types,
 train_masks, validation_masks) = split_parts

# Append the rows that were held back earlier; they only ever go into training.
for train_part, held_back_rows in (
    (train_inputs, one_freq_input_ids),
    (train_labels, one_freq_labels),
    (train_masks, one_freq_attention_masks),
    (train_token_types, one_freq_token_types),
):
    train_part.extend(held_back_rows)

# Convert everything to torch tensors, the datatype required by the model.
train_inputs, train_labels, train_masks, train_token_types = (
    torch.tensor(part)
    for part in (train_inputs, train_labels, train_masks, train_token_types)
)
validation_inputs, validation_labels, validation_masks, validation_token_types = (
    torch.tensor(part)
    for part in (validation_inputs, validation_labels, validation_masks, validation_token_types)
)

In [41]:


# Wrap the tensors in datasets and batch them with DataLoaders.
# Each batch is a tuple (input_ids, attention_mask, labels, token_type_ids).

# Training batches are drawn in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels, train_token_types)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(dataset=train_data, batch_size=batch_size, sampler=train_sampler)

# Validation batches are read in dataset order.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels, validation_token_types)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(dataset=validation_data, batch_size=batch_size, sampler=validation_sampler)

In [43]:
# Cache both data loaders on disk under the directory of the current model so
# that the preprocessing above does not have to be repeated.
for loader, path_template in (
    (validation_dataloader, '/mnt/localdata/geng/data/downstream/multiLabelClassification/{}/validation_data_loader'),
    (train_dataloader, '/mnt/localdata/geng/data/downstream/multiLabelClassification/{}/train_data_loader'),
):
    torch.save(loader, path_template.format(model_name))

## Load Model & Set Params

In [6]:
# Restore the cached data loaders for the current model.
validation_loader_path = '/mnt/localdata/geng/data/downstream/multiLabelClassification/{}/validation_data_loader'.format(model_name)
train_loader_path = '/mnt/localdata/geng/data/downstream/multiLabelClassification/{}/train_data_loader'.format(model_name)

validation_dataloader = torch.load(validation_loader_path)
train_dataloader = torch.load(train_loader_path)

In [7]:
# Load the pretrained encoder with a freshly initialised linear classification
# head of NUM_LABELS outputs. An unknown model_name fails here with a clear
# message instead of surfacing later as a NameError on `model`.
if model_name == "roberta":
    model = RobertaForSequenceClassification.from_pretrained("/mnt/localdata/geng/model/legalRoberta/", num_labels=NUM_LABELS)
elif model_name == "legalBert":
    model = BertForSequenceClassification.from_pretrained("nlpaueb/legal-bert-base-uncased", num_labels=NUM_LABELS)
else:
    raise ValueError("Unsupported model_name: {!r} (expected 'roberta' or 'legalBert')".format(model_name))

# Wrap the model so each batch is split across the visible GPUs. `model` stays
# the underlying module and is what the optimizer and checkpoint cells use.
parallel_model = torch.nn.DataParallel(model)
parallel_model.cuda()


Some weights of the model checkpoint at nlpaueb/legal-bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification wer

DataParallel(
  (module): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=Tru

In [8]:
# Parameter groups for AdamW: apply weight decay to all weights except biases
# and LayerNorm weights. You may implement a scheduler here as well.
#
# The per-group key must be 'weight_decay'. The previous 'weight_decay_rate' key
# is not read by AdamW, so every group silently fell back to the optimizer
# default and no decay was applied. LayerNorm parameters are named
# 'LayerNorm.weight' / 'LayerNorm.bias' in this model (see the module printout
# above); the old 'gamma' / 'beta' patterns never matched anything.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

In [9]:
# Build the AdamW optimizer over the parameter groups defined above, with a
# learning rate of 2e-5 and Adam bias correction enabled.
optimizer = AdamW(optimizer_grouped_parameters,lr=2e-5,correct_bias=True)
# optimizer = AdamW(model.parameters(),lr=2e-5)  # Default optimization

## Train Model

In [10]:
# Name of CUDA device 0 as seen by this process.
torch.cuda.get_device_name(0)

'GeForce GTX TITAN X'

In [None]:
# Fine-tune the model and evaluate it on the validation set after every epoch.

# Per-step training losses, kept for plotting.
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 1

# Multi-label objective: an independent sigmoid + binary cross-entropy per label.
# The loss module is stateless, so it is built once rather than on every step.
loss_func = BCEWithLogitsLoss()

# trange is a tqdm wrapper around the normal python range
for epoch__ in trange(epochs, desc="Epoch"):

    # ------------------------------ Training ------------------------------

    # Set our model to training mode (as opposed to evaluation mode)
    parallel_model.train()

    # Tracking variables
    tr_loss = 0  # running sum of the batch losses of this epoch
    nb_tr_examples, nb_tr_steps = 0, 0

    # Train the data for one epoch
    for step, batch in enumerate(train_dataloader):
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels, b_token_types = batch
        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()

        # Forward pass for multilabel classification: only the logits are taken
        # from the model and the loss is computed here.
        outputs = parallel_model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs[0]
        # Labels are converted to the logits' float dtype for the loss.
        loss = loss_func(logits.view(-1, NUM_LABELS), b_labels.type_as(logits).view(-1, NUM_LABELS))

        # Backward pass; the loss is already a scalar (mean reduction).
        loss.backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()
        # scheduler.step()

        # Read the loss value back from the device once and reuse it.
        batch_loss = loss.item()
        train_loss_set.append(batch_loss)
        tr_loss += batch_loss
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

        print("Train loss: {}".format(tr_loss/nb_tr_steps))

    # ----------------------------- Validation -----------------------------

    # Put model in evaluation mode to evaluate on the validation set
    parallel_model.eval()

    # Variables to gather full output
    logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

    # Predict
    for i, batch in enumerate(validation_dataloader):
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels, b_token_types = batch
        with torch.no_grad():
            # Forward pass
            outs = parallel_model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            b_logit_pred = outs[0]
            pred_label = torch.sigmoid(b_logit_pred)

        # Move everything that is kept to the CPU so that no GPU tensors
        # accumulate in these lists across batches.
        tokenized_texts.append(b_input_ids.cpu())
        logit_preds.append(b_logit_pred.cpu().numpy())
        true_labels.append(b_labels.cpu().numpy())
        pred_labels.append(pred_label.cpu().numpy())

    # Flatten outputs: one array per document instead of one per batch
    pred_labels = [item for sublist in pred_labels for item in sublist]
    true_labels = [item for sublist in true_labels for item in sublist]

    # Calculate Accuracy: a label counts as predicted when its probability exceeds the threshold
    threshold = 0.50
    pred_bools = [pl>threshold for pl in pred_labels]
    true_bools = [tl==1 for tl in true_labels]
    val_f1_accuracy = f1_score(true_bools,pred_bools,average='micro')*100
    val_flat_accuracy = accuracy_score(true_bools, pred_bools)*100

    print('F1 Validation Accuracy: ', val_f1_accuracy)
    print('Flat Validation Accuracy: ', val_flat_accuracy)

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Train loss: 0.7054589986801147


In [11]:
# Persist the weights of the underlying (unwrapped) model; both the directory
# and the file name embed model_name.
torch.save(model.state_dict(), '/mnt/localdata/geng/model/lmtc_models/downstream/multiLabelClassification/{0}/clf_{0}'.format(model_name))

## Prediction 

### Preprocess test data

In [4]:
# Load the test split; the first CSV column is used as the DataFrame index.
test_df = pd.read_csv('/mnt/localdata/geng/data/downstream/multiLabelClassification/test.csv',index_col=0)

In [5]:
# Every column after the first two is treated as a binary label column.
# 'one_hot_labels' is excluded explicitly so that re-running this cell does not
# pick up the derived column as an extra label.
cols = test_df.columns
label_cols = [c for c in cols[2:] if c != 'one_hot_labels']
num_labels = len(label_cols)

# Shuffle the rows with a fixed seed so the saved predictions keep the same
# document order across kernel restarts.
test_df = test_df.sample(frac=1, random_state=2020).reset_index(drop=True)

# Store each row's label vector as a single array in one column.
test_df['one_hot_labels'] = list(test_df[label_cols].values)

In [6]:
# Plain Python lists with one entry per test document: label vectors and raw texts.
labels = test_df["one_hot_labels"].tolist()
comments = test_df["header+recital"].tolist()

In [7]:
# Tokenize every test document, truncating or padding each one to exactly
# `max_length` tokens. `padding='max_length'` is the supported replacement for
# the deprecated `pad_to_max_length=True` flag.
encodings = tokenizer.batch_encode_plus(comments, max_length=max_length, truncation=True, padding='max_length')
print('tokenizer outputs: ', encodings.keys())



tokenizer outputs:  dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


In [8]:
input_ids = encodings['input_ids']  # tokenized and encoded documents
attention_masks = encodings['attention_mask']  # 1 for real tokens, 0 for padding

# Use the token type ids produced by the tokenizer. Calling
# `create_token_type_ids_from_sequences` on ids that already contain the special
# tokens returns lists two entries longer than `input_ids`. Tokenizers that do
# not emit token type ids get an all-zero placeholder of matching length instead.
if 'token_type_ids' in encodings:
    token_type_ids = encodings['token_type_ids']
else:
    token_type_ids = [[0] * len(ids) for ids in input_ids]

In [9]:
# Convert all of our data into torch tensors, the required datatype for our model
test_inputs = torch.tensor(input_ids)
test_labels = torch.tensor(labels)
test_masks = torch.tensor(attention_masks)
test_token_types = torch.tensor(token_type_ids)

# Batch the test set. Each batch is a tuple
# (input_ids, attention_mask, labels, token_type_ids).
# Evaluation uses a SequentialSampler so that predictions come out in dataset
# order and stay aligned with the rows of `test_df`; random order adds nothing
# at inference time and makes the saved predictions differ between runs.
test_data = TensorDataset(test_inputs, test_masks, test_labels, test_token_types)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)


In [10]:
# Cache the test data loader on disk under the directory of the current model.
torch.save(test_dataloader,'/mnt/localdata/geng/data/downstream/multiLabelClassification/{}/test_data_loader'.format(model_name))

## Predict

In [4]:
# Restore the cached test data loader for the current model.
test_dataloader=torch.load('/mnt/localdata/geng/data/downstream/multiLabelClassification/{}/test_data_loader'.format(model_name))

In [5]:
# Sanity check: length of the first label vector in the loader's dataset
# (tensors[2] is the labels tensor of the TensorDataset).
len(test_dataloader.batch_sampler.sampler.data_source.tensors[2][0])

4193

In [7]:
# Rebuild the classifier architecture, then restore the fine-tuned weights.
# An unknown model_name fails here with a clear message instead of surfacing
# later as a NameError on `model`.
if model_name == "roberta":
    model = RobertaForSequenceClassification.from_pretrained("/mnt/localdata/geng/model/legalRoberta/", num_labels=NUM_LABELS)
elif model_name == "legalBert":
    model = BertForSequenceClassification.from_pretrained("nlpaueb/legal-bert-base-uncased", num_labels=NUM_LABELS)
else:
    raise ValueError("Unsupported model_name: {!r} (expected 'roberta' or 'legalBert')".format(model_name))

# Load the checkpoint onto the CPU first: the model is still on the CPU at this
# point, and this does not depend on the GPU ids the checkpoint was saved from.
state_dict_path = '/mnt/localdata/geng/model/lmtc_models/downstream/multiLabelClassification/{0}/clf_{0}'.format(model_name)
model.load_state_dict(torch.load(state_dict_path, map_location='cpu'))

# Wrap the model so each batch is split across the visible GPUs.
parallel_model = torch.nn.DataParallel(model)
parallel_model.cuda()

Some weights of the model checkpoint at /mnt/localdata/geng/model/legalRoberta/ were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at /mnt/localdata/geng/model/legalRoberta/ and are newly initialized: ['classifier.dense.w

DataParallel(
  (module): RobertaForSequenceClassification(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(50265, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0): RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(
                (dense): Linear(i

In [10]:
# Run the fine-tuned model over the test set and report thresholded metrics.

# Put model in evaluation mode
parallel_model.eval()

# Variables to gather full output
logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

# Predict
for i, batch in enumerate(test_dataloader):
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels, b_token_types = batch
    with torch.no_grad():
        # Forward pass
        outs = parallel_model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        b_logit_pred = outs[0]
        pred_label = torch.sigmoid(b_logit_pred)

    # Move everything that is kept to the CPU so that no GPU tensors
    # accumulate in these lists across batches.
    tokenized_texts.append(b_input_ids.cpu())
    logit_preds.append(b_logit_pred.cpu().numpy())
    true_labels.append(b_labels.cpu().numpy())
    pred_labels.append(pred_label.cpu().numpy())

# Flatten outputs: one array per document instead of one per batch
pred_labels = [item for sublist in pred_labels for item in sublist]
true_labels = [item for sublist in true_labels for item in sublist]

# Calculate Accuracy: a label counts as predicted when its probability exceeds the threshold
threshold = 0.50
pred_bools = [pl>threshold for pl in pred_labels]
true_bools = [tl==1 for tl in true_labels]
val_f1_accuracy = f1_score(true_bools,pred_bools,average='micro')*100
val_flat_accuracy = accuracy_score(true_bools, pred_bools)*100

print('F1 test Accuracy: ', val_f1_accuracy)
print('Flat test Accuracy: ', val_flat_accuracy)

F1 test Accuracy:  0.0
Flat test Accuracy:  0.0


In [None]:
# Display the path of the prediction pickle for the current model.
'/mnt/localdata/geng/model/lmtc_models/downstream/multiLabelClassification/{0}/prediction.pickle'.format(model_name)

In [None]:
import pickle

In [12]:

# Store the per-document probabilities together with the gold label vectors as
# one (pred_labels, true_labels) tuple.
prediction_path = '/mnt/localdata/geng/model/lmtc_models/downstream/multiLabelClassification/{0}/prediction.pickle'.format(model_name)
with open(prediction_path, "wb") as f:
    pickle.dump((pred_labels,true_labels), f)

## Metrics

In [None]:
from metrics import mean_recall_k,ranking_rprecision_score

In [None]:
# presumably recall within the top-5 scored labels, averaged over documents -- confirm against metrics.py
mean_recall_k(true_labels, pred_labels,k=5)

In [35]:
# presumably recall within the top-10 scored labels, averaged over documents -- confirm against metrics.py
mean_recall_k(true_labels, pred_labels,k=10)

0.13032218013468014

In [22]:
# Coverage error: how far down the ranked label scores one has to go, on average,
# to cover all true labels of a document (lower is better).
from sklearn.metrics import coverage_error
coverage_error(true_labels, pred_labels)

1484.1946666666668

In [23]:
# Label ranking average precision over the predicted scores (higher is better).
from sklearn.metrics import label_ranking_average_precision_score
label_ranking_average_precision_score(true_labels, pred_labels)

0.0929932930861894

In [24]:
# Label ranking loss: average fraction of incorrectly ordered label pairs (lower is better).
from sklearn.metrics import label_ranking_loss
label_ranking_loss(true_labels, pred_labels)

0.13625427027379472

In [30]:
# Average of the per-document ranking_rprecision_score over all documents.
sum([ranking_rprecision_score(doc_true, doc_pred) for doc_true, doc_pred in zip(true_labels, pred_labels)])/len(true_labels)

0.0822388888888884