In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
import torch
from torch.nn import BCEWithLogitsLoss, BCELoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score
import pickle
from transformers import *
from tqdm import tqdm, trange
from ast import literal_eval

In [12]:
NUM_LABELS=4108

In [3]:
! echo $CUDA_VISIBLE_DEVICES

4,5,6,7


In [7]:
df = pd.read_csv('/mnt/localdata/geng/data/downstream/multiLabelClassification/train.csv',index_col=0)

In [8]:
df.head(2)

Unnamed: 0,celex_id,header+recital,10,1000,1002,1004,1005,1006,1007,1008,...,99,990,993,994,995,996,997,998,999,c_871b5612
0,32014R0727,1.7.2014 EN Official Journal of the European U...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,31975R2481,REGULATION (EEC) No 2481/75 OF THE COUNCIL of...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
print('average sentence length: ', df["header+recital"].str.split().str.len().mean())
print('stdev sentence length: ', df["header+recital"].str.split().str.len().std())

average sentence length:  360.19755555555554
stdev sentence length:  265.21689110489


In [10]:
cols = df.columns
label_cols = list(cols[2:])
num_labels = len(label_cols)
# print('Label columns: ', label_cols)

df = df.sample(frac=1).reset_index(drop=True) #shuffle rows
df['one_hot_labels'] = list(df[label_cols].values)
df.head()

Unnamed: 0,celex_id,header+recital,10,1000,1002,1004,1005,1006,1007,1008,...,990,993,994,995,996,997,998,999,c_871b5612,one_hot_labels
0,32012R0279,29.3.2012 EN Official Journal of the European ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,31999R1644,COMMISSION REGULATION (EC) No 1644/1999\nof 27...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,32014D0861,2.12.2014 EN Official Journal of the European ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,32005D0118(05),18.1.2005 EN Official Journal of the European ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,32008R0348,19.4.2008 EN Official Journal of the European ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [11]:
num_labels

4108

In [12]:
labels = list(df.one_hot_labels.values)
comments = list(df["header+recital"].values)

In [16]:
max_length = 512 #max
tokenizer = RobertaTokenizer.from_pretrained("/mnt/localdata/geng/model/legalRoberta/", do_lower_case=True) # tokenizer
encodings = tokenizer.batch_encode_plus(comments,max_length=max_length,truncation=True, pad_to_max_length=True) # tokenizer's encoding method
print('tokenizer outputs: ', encodings.keys())



tokenizer outputs:  dict_keys(['input_ids', 'attention_mask'])


In [24]:
input_ids = encodings['input_ids'] # tokenized and encoded sentences
# token_type_ids = encodings['token_type_ids'] # token type ids
token_type_ids=[tokenizer.create_token_type_ids_from_sequences(input_id) for input_id in input_ids]
attention_masks = encodings['attention_mask'] # attention masks

In [26]:
tokenizer.decode(input_ids[1])[:20]

'<s>16.10.2007 en official journal of the european union l 271/16\ncommission decision\nof 15 october 2007\nauthorising the use of at risk bovine animals until the end of their productive lives in germany following official confirmation of the presence of bse\n(notified under document number c(2007) 4648)\n(only the german text is authentic)\n(2007/667/ec)\nthe commission of the european communities,\nhaving regard to the treaty establishing the european community,\nhaving regard to regulation (ec) no 999/2001 of the european parliament and of the council of 22 may 2001 laying down rules for the prevention, control and eradication of certain transmissible spongiform encephalopathies\xa0(1), and in particular the second subparagraph of article 13(1) thereof,\nwhereas:\n(1) regulation (ec) no 999/2001 lays down rules for the prevention, control and eradication of transmissible spongiform encephalopathies (tses) in animals. the first subparagraph of article 13(1) of that regulation provid

In [27]:
# Identifying indices of 'one_hot_labels' entries that only occur once - this will allow us to stratify split our training data later
label_counts = df.one_hot_labels.astype(str).value_counts()
one_freq = label_counts[label_counts==1].keys()
one_freq_idxs = sorted(list(df[df.one_hot_labels.astype(str).isin(one_freq)].index), reverse=True)
print('df label indices with only one instance: ', one_freq_idxs)

df label indices with only one instance:  [22864]


In [28]:
# Gathering single instance inputs to force into the training set after stratified split
one_freq_input_ids = [input_ids.pop(i) for i in one_freq_idxs]
one_freq_token_types = [token_type_ids.pop(i) for i in one_freq_idxs]
one_freq_attention_masks = [attention_masks.pop(i) for i in one_freq_idxs]
one_freq_labels = [labels.pop(i) for i in one_freq_idxs]


In [29]:
# Use train_test_split to split our data into train and validation sets

train_inputs, validation_inputs, train_labels, validation_labels, train_token_types, validation_token_types, train_masks, validation_masks = train_test_split(input_ids, labels, token_type_ids,attention_masks,
                                                            random_state=2020, test_size=0.10)

# Add one frequency data to train data
train_inputs.extend(one_freq_input_ids)
train_labels.extend(one_freq_labels)
train_masks.extend(one_freq_attention_masks)
train_token_types.extend(one_freq_token_types)

# Convert all of our data into torch tensors, the required datatype for our model
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)
train_token_types = torch.tensor(train_token_types)

validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks)
validation_token_types = torch.tensor(validation_token_types)

In [30]:
# Select a batch size for training. For fine-tuning with XLNet, the authors recommend a batch size of 32, 48, or 128. We will use 32 here to avoid memory issues.
batch_size = 32

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks, train_labels, train_token_types)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels, validation_token_types)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [31]:
torch.save(validation_dataloader,'/mnt/localdata/geng/data/downstream/multiLabelClassification/validation_data_loader')
torch.save(train_dataloader,'/mnt/localdata/geng/data/downstream/multiLabelClassification/train_data_loader')

## Load Model & Set Params¶

In [74]:
validation_dataloader=torch.load('/mnt/localdata/geng/data/downstream/multiLabelClassification/validation_data_loader')
train_dataloader=torch.load('/mnt/localdata/geng/data/downstream/multiLabelClassification/train_data_loader')

In [5]:
from transformers import  RobertaForSequenceClassification

In [13]:
# Load model, the pretrained model will include a single linear classification layer on top for classification. 
model = RobertaForSequenceClassification.from_pretrained("/mnt/localdata/geng/model/legalRoberta/", num_labels=NUM_LABELS)
parallel_model = torch.nn.DataParallel(model) # Encapsulate the model
parallel_model.cuda()


Some weights of the model checkpoint at /mnt/localdata/geng/model/legalRoberta/ were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at /mnt/localdata/geng/model/legalRoberta/ and are newly initialized: ['classifier.dense.w

DataParallel(
  (module): RobertaForSequenceClassification(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(50265, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0): RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(
                (dense): Linear(i

In [14]:
# setting custom optimization parameters. You may implement a scheduler here as well.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay_rate': 0.0}
]

In [15]:
optimizer = AdamW(optimizer_grouped_parameters,lr=2e-5,correct_bias=True)
# optimizer = AdamW(model.parameters(),lr=2e-5)  # Default optimization

## Train Model

In [16]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
torch.cuda.get_device_name(0)

'GeForce GTX TITAN X'

In [17]:
# Store our loss and accuracy for plotting
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 3

# trange is a tqdm wrapper around the normal python range
for epoch__ in trange(epochs, desc="Epoch"):

    # Training

    # Set our model to training mode (as opposed to evaluation mode)
    parallel_model.train()

    # Tracking variables
    tr_loss = 0 #running loss
    nb_tr_examples, nb_tr_steps = 0, 0



    # Train the data for one epoch
    for step, batch in enumerate(train_dataloader):
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels, b_token_types = batch
        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()

        # # Forward pass for multiclass classification
        # outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
        # loss = outputs[0]
        # logits = outputs[1]

        # Forward pass for multilabel classification
        outputs = parallel_model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        logits = outputs[0]
        loss_func = BCEWithLogitsLoss() 
        loss = loss_func(logits.view(-1,num_labels),b_labels.type_as(logits).view(-1,num_labels)) #convert labels to float for calculation
        # loss_func = BCELoss() 
        # loss = loss_func(torch.sigmoid(logits.view(-1,num_labels)),b_labels.type_as(logits).view(-1,num_labels)) #convert labels to float for calculation
        train_loss_set.append(loss.item())    

        # Backward pass
        loss.mean().backward()
        # Update parameters and take a step using the computed gradient
        optimizer.step()
        # scheduler.step()
        # Update tracking variables
        tr_loss += loss.item()
        nb_tr_examples += b_input_ids.size(0)
        nb_tr_steps += 1

        print("Train loss: {}".format(tr_loss/nb_tr_steps))

    ###############################################################################

    # Validation

    # Put model in evaluation mode to evaluate loss on the validation set
    parallel_model.eval()

    # Variables to gather full output
    logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

    # Predict
    for i, batch in enumerate(validation_dataloader):
        batch = tuple(t.to(device) for t in batch)
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels, b_token_types = batch
        with torch.no_grad():
            # Forward pass
            outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
            b_logit_pred = outs[0]
            pred_label = torch.sigmoid(b_logit_pred)

            b_logit_pred = b_logit_pred.detach().cpu().numpy()
            pred_label = pred_label.to('cpu').numpy()
            b_labels = b_labels.to('cpu').numpy()

        tokenized_texts.append(b_input_ids)
        logit_preds.append(b_logit_pred)
        true_labels.append(b_labels)
        pred_labels.append(pred_label)

    # Flatten outputs
    pred_labels = [item for sublist in pred_labels for item in sublist]
    true_labels = [item for sublist in true_labels for item in sublist]

    # Calculate Accuracy
    threshold = 0.50
    pred_bools = [pl>threshold for pl in pred_labels]
    true_bools = [tl==1 for tl in true_labels]
    val_f1_accuracy = f1_score(true_bools,pred_bools,average='micro')*100
    val_flat_accuracy = accuracy_score(true_bools, pred_bools)*100

    print('F1 Validation Accuracy: ', val_f1_accuracy)
    print('Flat Validation Accuracy: ', val_flat_accuracy)

Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Train loss: 0.695422351360321
Train loss: 0.6953473091125488
Train loss: 0.6951509714126587
Train loss: 0.6948357820510864
Train loss: 0.6943077921867371
Train loss: 0.6938337087631226
Train loss: 0.6934069905962262
Train loss: 0.6930036619305611
Train loss: 0.6925860179795159
Train loss: 0.6921306490898133
Train loss: 0.6917045224796642
Train loss: 0.6912492116292318
Train loss: 0.6907895482503451
Train loss: 0.6903769161020007
Train loss: 0.6898632963498433
Train loss: 0.6893489137291908
Train loss: 0.6888203094987309
Train loss: 0.6883254283004336
Train loss: 0.6879256304941679
Train loss: 0.6874163955450058
Train loss: 0.6868754909152076
Train loss: 0.686154303225604
Train loss: 0.685464708701424
Train loss: 0.6848226065436999
Train loss: 0.6840346503257752
Train loss: 0.6832625178190378
Train loss: 0.6826375082687095
Train loss: 0.6818427805389676
Train loss: 0.6809011307255975
Train loss: 0.6798566540082296
Train loss: 0.6787191302545609
Train loss: 0.6774125080555677
Train loss:

Train loss: 0.275221675944825
Train loss: 0.2744065487019296
Train loss: 0.2735973638027234
Train loss: 0.2727905719886335
Train loss: 0.27199199345352043
Train loss: 0.2711984569476883
Train loss: 0.27040855626541155
Train loss: 0.26962338875965436
Train loss: 0.26884017560137985
Train loss: 0.26806304929948555
Train loss: 0.2672876186641681
Train loss: 0.26651946516199543
Train loss: 0.26575576346637547
Train loss: 0.26499435574569424
Train loss: 0.26423950458494877
Train loss: 0.2634874104278489
Train loss: 0.2627403162419796
Train loss: 0.26199850691913285
Train loss: 0.26126038715084815
Train loss: 0.2605281542104672
Train loss: 0.2598003892516586
Train loss: 0.25907503018776573
Train loss: 0.258353243445183
Train loss: 0.25763807000546923
Train loss: 0.25692754017654806
Train loss: 0.2562193384452056
Train loss: 0.255515662092587
Train loss: 0.25481559087832767
Train loss: 0.25412086560709835
Train loss: 0.25342903988048077
Train loss: 0.2527398081990529
Train loss: 0.25205361411

Train loss: 0.15704331340813157
Train loss: 0.15679008858136206
Train loss: 0.156537436516512
Train loss: 0.1562843645671418
Train loss: 0.15603347991078922
Train loss: 0.1557839492505247
Train loss: 0.15553462620831618
Train loss: 0.15528561879863154
Train loss: 0.1550378007656437
Train loss: 0.15479187751398946
Train loss: 0.1545464788435995
Train loss: 0.15430135197574726
Train loss: 0.15405710500395187
Train loss: 0.1538144851506535
Train loss: 0.1535721481456881
Train loss: 0.15333037191247187
Train loss: 0.15308966893004686
Train loss: 0.1528497992945766
Train loss: 0.15260981412366886
Train loss: 0.15237086231913075
Train loss: 0.15213226220308088
Train loss: 0.15189611547804602
Train loss: 0.1516601810590663
Train loss: 0.15142464451832977
Train loss: 0.1511898815556952
Train loss: 0.15095612014487494
Train loss: 0.1507227015367731
Train loss: 0.15049013160507788
Train loss: 0.15025830743479426
Train loss: 0.15002666692505928
Train loss: 0.14979656718698267
Train loss: 0.149567

Train loss: 0.11154838790341964
Train loss: 0.11142686333702616
Train loss: 0.11130505977590993
Train loss: 0.11118403620832619
Train loss: 0.11106261720963914
Train loss: 0.11094093915079803
Train loss: 0.11081998060034941
Train loss: 0.11070019952588843
Train loss: 0.110579745682294
Train loss: 0.11046001565630865
Train loss: 0.11033994928661701
Train loss: 0.1102203420278701
Train loss: 0.11010077399815918
Train loss: 0.1099821857460575
Train loss: 0.1098643005901725
Train loss: 0.10974535620162947
Train loss: 0.10962714135899224
Train loss: 0.10950870117181935
Train loss: 0.10939076246905871
Train loss: 0.10927285053301602
Train loss: 0.10915612294507235
Train loss: 0.10904005151752968
Train loss: 0.10892384545935566
Train loss: 0.10880735019495275
Train loss: 0.10869183803775481
Train loss: 0.10857620374264922
Train loss: 0.1084605551831415
Train loss: 0.10834530426770228
Train loss: 0.10823035929973694
Train loss: 0.10811604100658938
Train loss: 0.10800171329188546
Train loss: 0.

Train loss: 0.08737746400844758
Train loss: 0.0873054729082601
Train loss: 0.08723338363387693
Train loss: 0.08716184560363391
Train loss: 0.08708987566025753
Train loss: 0.08701800560616207
Train loss: 0.08694625505642302
Train loss: 0.08687497106732792
Train loss: 0.08680293721860954
Train loss: 0.08673277647263854
Train loss: 0.08666129877794364
Train loss: 0.08659077431204008
Train loss: 0.08651999333695996
Train loss: 0.08644950559112881
Train loss: 0.08637873390724904
Train loss: 0.08630853246964677
Train loss: 0.08623826838729362
Train loss: 0.08616796776440472
Train loss: 0.08609827630271231
Train loss: 0.08602855442128829
Train loss: 0.08595916189672337
Train loss: 0.0858895436660989
Train loss: 0.08581958099637392
Train loss: 0.0857502019206636
Train loss: 0.08568079900540229
Train loss: 0.08561158017802944
Train loss: 0.08554231437514852
Train loss: 0.08547325038696539
Train loss: 0.08540392123288554
Train loss: 0.08533548088805382
Train loss: 0.08526765085645159
Train loss:

Epoch:  33%|███▎      | 1/3 [19:38<39:16, 1178.15s/it]

F1 Validation Accuracy:  0.0
Flat Validation Accuracy:  0.0
Train loss: 0.010774066671729088
Train loss: 0.01081632450222969
Train loss: 0.0107622096935908
Train loss: 0.010809098137542605
Train loss: 0.010818986780941486
Train loss: 0.010770975922544798
Train loss: 0.01074529638780015
Train loss: 0.010752347996458411
Train loss: 0.010775983850989077
Train loss: 0.01074698818847537
Train loss: 0.01079293213446032
Train loss: 0.010770009364932775
Train loss: 0.010800156742334366
Train loss: 0.010790806995438678
Train loss: 0.010771169327199459
Train loss: 0.010807317739818245
Train loss: 0.010838681064984378
Train loss: 0.010815860393146673
Train loss: 0.010829896862177472
Train loss: 0.01082539134658873
Train loss: 0.010836741177453882
Train loss: 0.010845509061420506
Train loss: 0.01083145812963662
Train loss: 0.010816611543608209
Train loss: 0.010845684409141541
Train loss: 0.0108577052776057
Train loss: 0.010858989976070545
Train loss: 0.010828605719975062
Train loss: 0.010825752312

Train loss: 0.010306253419613024
Train loss: 0.010305145580321551
Train loss: 0.010302775520905555
Train loss: 0.010299782224354289
Train loss: 0.010299338484293387
Train loss: 0.010296118486027314
Train loss: 0.01029380398857243
Train loss: 0.010293468389136251
Train loss: 0.010292575314119168
Train loss: 0.010290884658289973
Train loss: 0.010289079581592303
Train loss: 0.010284215577233297
Train loss: 0.010283346080945598
Train loss: 0.01028186279413013
Train loss: 0.010281322332729406
Train loss: 0.010278907971400204
Train loss: 0.010277537263508113
Train loss: 0.01027554672743593
Train loss: 0.010273538027586562
Train loss: 0.010273683753643018
Train loss: 0.010271923152598306
Train loss: 0.010270487148038767
Train loss: 0.010268970732869257
Train loss: 0.010267095585518023
Train loss: 0.010263371308617321
Train loss: 0.010262182563624895
Train loss: 0.010262069441378117
Train loss: 0.010258611959769674
Train loss: 0.010256181948478686
Train loss: 0.010252398123844065
Train loss: 0

Train loss: 0.009887280207657384
Train loss: 0.009886559391394257
Train loss: 0.009884324690657937
Train loss: 0.009883450366467594
Train loss: 0.009881951633255713
Train loss: 0.009880253418703518
Train loss: 0.009878687641703257
Train loss: 0.009878452191784565
Train loss: 0.009876654512768784
Train loss: 0.009874335647831986
Train loss: 0.009873927558071488
Train loss: 0.009872391489425711
Train loss: 0.009871119578640522
Train loss: 0.009868995446595363
Train loss: 0.009866442594887802
Train loss: 0.009866522047088884
Train loss: 0.00986541022085449
Train loss: 0.009864071539292732
Train loss: 0.0098641435138348
Train loss: 0.009861711826189834
Train loss: 0.009860620588092905
Train loss: 0.009859404650230247
Train loss: 0.009856809348926205
Train loss: 0.009856773097910186
Train loss: 0.009855343730511218
Train loss: 0.009854498420635821
Train loss: 0.009853784959940683
Train loss: 0.009852462726160034
Train loss: 0.009850918857886172
Train loss: 0.009849780459648393
Train loss: 0

Train loss: 0.00957858192178611
Train loss: 0.009577689306189616
Train loss: 0.009577581541148943
Train loss: 0.009575783554959963
Train loss: 0.009574770870535656
Train loss: 0.009573225195047788
Train loss: 0.009572061422644862
Train loss: 0.009571221432890053
Train loss: 0.009568875002867912
Train loss: 0.009568688049937026
Train loss: 0.009567458140240475
Train loss: 0.009566271048357809
Train loss: 0.00956519134648624
Train loss: 0.009564779830236096
Train loss: 0.009563995300039044
Train loss: 0.009563448019106312
Train loss: 0.009562401917270001
Train loss: 0.009561723321262873
Train loss: 0.009560601456112663
Train loss: 0.009560224621357824
Train loss: 0.009559201153530527
Train loss: 0.009558366072763289
Train loss: 0.009557458438612183
Train loss: 0.00955787036806789
Train loss: 0.009557366292239354
Train loss: 0.009555642504500134
Train loss: 0.009554212802960989
Train loss: 0.00955326283967468
Train loss: 0.009552427176194339
Train loss: 0.00955160503433706
Train loss: 0.0

Train loss: 0.009326952350166466
Train loss: 0.00932569531351328
Train loss: 0.00932501360483758
Train loss: 0.009324247586334537
Train loss: 0.009323014254688266
Train loss: 0.009322041520418458
Train loss: 0.009320436850356967
Train loss: 0.009319856393041745
Train loss: 0.009318774983623502
Train loss: 0.009316925169153523
Train loss: 0.009316324550802455
Train loss: 0.009315365932808064
Train loss: 0.009314662298790072
Train loss: 0.009313573856495703
Train loss: 0.009313383495466378
Train loss: 0.009312278500168457
Train loss: 0.00931120039757543
Train loss: 0.009310642384747997
Train loss: 0.009310280983844513
Train loss: 0.009309864284613395
Train loss: 0.00930902694915883
Train loss: 0.009308100410499702
Train loss: 0.009306720019218737
Train loss: 0.009305353518830587
Train loss: 0.009304940627700306
Train loss: 0.009303463839842152
Train loss: 0.009301960717704965
Train loss: 0.00930093474854982
Train loss: 0.009301046509397268
Train loss: 0.009300716582355942
Train loss: 0.0

Train loss: 0.00911771562591708
Train loss: 0.009116902232915162
Train loss: 0.009115776746160835
Train loss: 0.00911457059397615
Train loss: 0.009114071052277673
Train loss: 0.009113613454078398
Train loss: 0.009112822126033178
Train loss: 0.009113124081239128
Train loss: 0.009112804455722644
Train loss: 0.009112100651049438
Train loss: 0.009111128925368769
Train loss: 0.009110289201184752
Train loss: 0.009109999519580064
Train loss: 0.009109664502850306
Train loss: 0.009108987822317572
Train loss: 0.009108632730387882
Train loss: 0.009107649644004852
Train loss: 0.009107505992764098


Epoch:  67%|██████▋   | 2/3 [39:10<19:36, 1176.40s/it]

F1 Validation Accuracy:  0.0
Flat Validation Accuracy:  0.0
Train loss: 0.007871757261455059
Train loss: 0.00836703460663557
Train loss: 0.008422376587986946
Train loss: 0.008495883783325553
Train loss: 0.008347529359161854
Train loss: 0.008302639859418074
Train loss: 0.008252127628241266
Train loss: 0.008201389340683818
Train loss: 0.00821406746076213
Train loss: 0.00824202885851264
Train loss: 0.00822591290555217
Train loss: 0.008213868752742806
Train loss: 0.00821339045293056
Train loss: 0.008248548089925731
Train loss: 0.008239913980166118
Train loss: 0.008232766296714544
Train loss: 0.008220408276161727
Train loss: 0.008200935657239623
Train loss: 0.008240887405056702
Train loss: 0.008258527657017112
Train loss: 0.008253083076505434
Train loss: 0.008279017299752344
Train loss: 0.008257291279733181
Train loss: 0.00823424244299531
Train loss: 0.008215085901319981
Train loss: 0.008214908150526194
Train loss: 0.008218149585580384
Train loss: 0.008236870468993272
Train loss: 0.00821720

Train loss: 0.008090572231580096
Train loss: 0.00809087991155684
Train loss: 0.00808980946885459
Train loss: 0.008087147178778809
Train loss: 0.008085727954850248
Train loss: 0.008085439031151689
Train loss: 0.008081770187937746
Train loss: 0.00808266847889172
Train loss: 0.008082334832777541
Train loss: 0.008078823418378137
Train loss: 0.008077888835840014
Train loss: 0.008074338673255764
Train loss: 0.008073829596155676
Train loss: 0.008074682141931458
Train loss: 0.008073788925920375
Train loss: 0.00807371328824736
Train loss: 0.008073571971002615
Train loss: 0.008073155260539817
Train loss: 0.008071358204027464
Train loss: 0.00807192652666969
Train loss: 0.008072034068073261
Train loss: 0.008071442365784337
Train loss: 0.008069066322198983
Train loss: 0.008068442041643293
Train loss: 0.008069819475663997
Train loss: 0.00807130364123324
Train loss: 0.008073586206883192
Train loss: 0.008074252618193303
Train loss: 0.008075143820824714
Train loss: 0.008072872312819893
Train loss: 0.00

Train loss: 0.008037769431640246
Train loss: 0.008037316939793527
Train loss: 0.008036450804499332
Train loss: 0.008034619149302997
Train loss: 0.008034513142167278
Train loss: 0.008034508327014803
Train loss: 0.00803506346050613
Train loss: 0.008035316821298167
Train loss: 0.008034827366757851
Train loss: 0.008034505812619908
Train loss: 0.008034884747600802
Train loss: 0.008034098671530099
Train loss: 0.008032503469488101
Train loss: 0.008032638924305502
Train loss: 0.008032924531038568
Train loss: 0.008033010600395645
Train loss: 0.008031730573552037
Train loss: 0.008031590765002797
Train loss: 0.008031246159585982
Train loss: 0.008030916342123422
Train loss: 0.008031018536810644
Train loss: 0.008030645123933656
Train loss: 0.0080316876679759
Train loss: 0.008033286219006012
Train loss: 0.00803383630762735
Train loss: 0.008033743086413175
Train loss: 0.00803249778935597
Train loss: 0.008032701952921177
Train loss: 0.008031324248414564
Train loss: 0.008031486205502668
Train loss: 0.0

Train loss: 0.00798722363147979
Train loss: 0.00798739100744327
Train loss: 0.007986588197104266
Train loss: 0.007985736934487965
Train loss: 0.007986171198279102
Train loss: 0.00798536544734864
Train loss: 0.007985678364504253
Train loss: 0.00798542323157625
Train loss: 0.007985013452834663
Train loss: 0.007985073966797353
Train loss: 0.007984947908316322
Train loss: 0.007985541347634831
Train loss: 0.007986046376903682
Train loss: 0.007986054739247508
Train loss: 0.007985769439644887
Train loss: 0.007986129637617443
Train loss: 0.007986350232747451
Train loss: 0.007986893026916889
Train loss: 0.007986022082352925
Train loss: 0.007985394798500542
Train loss: 0.007984256574712084
Train loss: 0.0079834770598727
Train loss: 0.00798275063162242
Train loss: 0.007982837305877146
Train loss: 0.007982765981640768
Train loss: 0.007981883894979414
Train loss: 0.00798103282168027
Train loss: 0.00798024076994835
Train loss: 0.007980221084785315
Train loss: 0.007979601870020397
Train loss: 0.00798

Train loss: 0.007929924439638853
Train loss: 0.007929778035421234
Train loss: 0.00792907953109897
Train loss: 0.007928750332415193
Train loss: 0.007928674273931261
Train loss: 0.007928818686221221
Train loss: 0.007928566268884074
Train loss: 0.007928976100567052
Train loss: 0.007929026659685262
Train loss: 0.007929252568130244
Train loss: 0.007929378856286997
Train loss: 0.007929624355389828
Train loss: 0.00792969742158553
Train loss: 0.007929705483756173
Train loss: 0.007930087072120822
Train loss: 0.007929405777398589
Train loss: 0.007928948540526523
Train loss: 0.007929147037081311
Train loss: 0.007929204859769005
Train loss: 0.007928516133117167
Train loss: 0.00792829947746998
Train loss: 0.007927948149718367
Train loss: 0.00792781515557268
Train loss: 0.007928198337270893
Train loss: 0.00792809787162696
Train loss: 0.00792786750124722
Train loss: 0.007927356995860039
Train loss: 0.00792749158080446
Train loss: 0.00792785541875585
Train loss: 0.007927377134890987
Train loss: 0.0079

Train loss: 0.00788555913604796
Train loss: 0.007885486662512918
Train loss: 0.007885247815218857
Train loss: 0.007885367874537743
Train loss: 0.007885781290942426
Train loss: 0.00788513221573901
Train loss: 0.007885281978305549
Train loss: 0.007885225473583788
Train loss: 0.007885266142780338
Train loss: 0.00788524043557661
Train loss: 0.007885284464628923
Train loss: 0.007884904751103611
Train loss: 0.007884357002063429
Train loss: 0.007884458648313724
Train loss: 0.007884043111919743
Train loss: 0.007883484832382956
Train loss: 0.007883780642514122


Epoch: 100%|██████████| 3/3 [58:49<00:00, 1176.34s/it]

F1 Validation Accuracy:  0.0
Flat Validation Accuracy:  0.0





In [18]:
torch.save(model.state_dict(), '/mnt/localdata/geng/model/lmtc/clf_model')

## Prediction & Metrics

In [19]:
test_df = pd.read_csv('/mnt/localdata/geng/data/downstream/multiLabelClassification/test.csv',index_col=0)

In [20]:
test_df.head()

Unnamed: 0,celex_id,header+recital,10,1000,1005,1006,1007,1008,1012,1015,...,980,981,986,990,993,994,995,996,997,c_871b5612
0,32011D0690,15.10.2011 EN Official Journal of the European...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,31996R2052,COMMISSION REGULATION (EC) No 2052/96 of 25 Oc...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,32001R1868,Commission Regulation (EC) No 1868/2001\nof 24...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,31992R0601,11.3.1992 EN Official Journal of the European ...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,31992D0310,COMMISSION DECISION of 21 May 1992 adjusting...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
cols = test_df.columns
label_cols = list(cols[2:])
num_labels = len(label_cols)
# print('Label columns: ', label_cols)

test_df = test_df.sample(frac=1).reset_index(drop=True) #shuffle rows
test_df['one_hot_labels'] = list(test_df[label_cols].values)
test_df.head()

Unnamed: 0,celex_id,header+recital,10,1000,1005,1006,1007,1008,1012,1015,...,981,986,990,993,994,995,996,997,c_871b5612,one_hot_labels
0,31993D0033,COMMISSION DECISION of 16 December 1992 amendi...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,32011R1266,7.12.2011 EN Official Journal of the European ...,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,32003R0239,Commission Regulation (EC) No 239/2003\nof 7 F...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,32011R1153,15.11.2011 EN Official Journal of the European...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,31995R1366,COUNCIL REGULATION (EC) No 1366/95 of 12 June...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [22]:
labels = list(test_df.one_hot_labels.values)
comments = list(test_df["header+recital"].values)

In [25]:
max_length = 512 #max
tokenizer = RobertaTokenizer.from_pretrained("/mnt/localdata/geng/model/legalRoberta/", do_lower_case=True) # tokenizer
encodings = tokenizer.batch_encode_plus(comments,max_length=max_length,truncation=True, pad_to_max_length=True) # tokenizer's encoding method
print('tokenizer outputs: ', encodings.keys())



tokenizer outputs:  dict_keys(['input_ids', 'attention_mask'])


In [26]:
input_ids = encodings['input_ids'] # tokenized and encoded sentences
# token_type_ids = encodings['token_type_ids'] # token type ids
token_type_ids=[tokenizer.create_token_type_ids_from_sequences(input_id) for input_id in input_ids]
attention_masks = encodings['attention_mask'] # attention masks

In [33]:
# Convert all of our data into torch tensors, the required datatype for our model
test_inputs = torch.tensor(input_ids)
test_labels = torch.tensor(labels)
test_masks = torch.tensor(attention_masks)
test_token_types = torch.tensor(token_type_ids)

# Select a batch size for testing. For fine-tuning with XLNet, the authors recommend a batch size of 32, 48, or 128. We will use 32 here to avoid memory issues.
batch_size = 32

# Create an iterator of our data with torch DataLoader. This helps save on memory during testing because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory

test_data = TensorDataset(test_inputs, test_masks, test_labels, test_token_types)
test_sampler = RandomSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)


In [34]:
torch.save(test_dataloader,'/mnt/localdata/geng/data/downstream/multiLabelClassification/test_data_loader')

In [35]:
test_dataloader=torch.load('/mnt/localdata/geng/data/downstream/multiLabelClassification/test_data_loader')

In [49]:
test_dataloader.batch_size

32

In [77]:
len(test_dataloader.batch_sampler.sampler.data_source.tensors[2][0])

2517

In [None]:
model=torch.load( '/mnt/localdata/geng/model/lmtc/clf_model')
parallel_model = torch.nn.DataParallel(model) # Encapsulate the model
parallel_model.cuda()

In [36]:
# Put model in evaluation mode to evaluate loss on the test set
parallel_model.eval()

# Variables to gather full output
logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

# Predict
for i, batch in enumerate(test_dataloader):
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels, b_token_types = batch
    with torch.no_grad():
        # Forward pass
        outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
        b_logit_pred = outs[0]
        pred_label = torch.sigmoid(b_logit_pred)

        b_logit_pred = b_logit_pred.detach().cpu().numpy()
        pred_label = pred_label.to('cpu').numpy()
        b_labels = b_labels.to('cpu').numpy()

    tokenized_texts.append(b_input_ids)
    logit_preds.append(b_logit_pred)
    true_labels.append(b_labels)
    pred_labels.append(pred_label)

# Flatten outputs
pred_labels = [item for sublist in pred_labels for item in sublist]
true_labels = [item for sublist in true_labels for item in sublist]

# Calculate Accuracy
threshold = 0.50
pred_bools = [pl>threshold for pl in pred_labels]
true_bools = [tl==1 for tl in true_labels]
val_f1_accuracy = f1_score(true_bools,pred_bools,average='micro')*100
val_flat_accuracy = accuracy_score(true_bools, pred_bools)*100

print('F1 test Accuracy: ', val_f1_accuracy)
print('Flat test Accuracy: ', val_flat_accuracy)

ValueError: Multi-label binary indicator input with different numbers of labels

In [40]:
from sklearn.metrics import coverage_error
coverage_error(true_labels, pred_labels)

ValueError: y_true and y_score have different shape

In [41]:
len(pred_labels[0])

4108

In [73]:
set([len(true_label) for true_label in true_labels])

{2517}

In [44]:
pred_labels[0]

array([0.00045328, 0.00086889, 0.00056395, ..., 0.00063075, 0.00076824,
       0.00068135], dtype=float32)

In [47]:
model.config

RobertaConfig {
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "LABEL_26",
    "27": "LABEL_27",
    "28": "LABEL_28",
    "29": "LABEL_29",
    "30": "LABEL_30",
    "31": "LABEL_31",
    "32": "LABEL_32",
    "33": "LABEL_33

In [79]:
train.columns

Index(['celex_id', 'header+recital', '10', '1000', '1005', '1006', '1007',
       '1008', '1012', '1015',
       ...
       '981', '986', '990', '993', '994', '995', '996', '997', 'c_871b5612',
       'one_hot_labels'],
      dtype='object', length=2520)

In [80]:
df = pd.read_csv('/mnt/localdata/geng/data/downstream/multiLabelClassification/val.csv',index_col=0)

FileNotFoundError: [Errno 2] No such file or directory: '/mnt/localdata/geng/data/downstream/multiLabelClassification/val.csv'