In [1]:
!pip install datasets

[0m

In [2]:
!pip install transformers

[0m

In [3]:
import os

# Directory that holds the per-case CSV files (placeholder value kept as-is).
base_path = "**AMR-parsed test dataset**"
# os.path.join supplies the path separator when base_path has no trailing
# slash; plain string concatenation would glue directory and file name together.
files = sorted(os.path.join(base_path, f) for f in os.listdir(base_path))
files[:5]

['/kaggle/input/etchr-amr-dataset/NLTK-Sentence-Splitter/NLTK-Sentence-Splitter/TestCase_0000.csv',
 '/kaggle/input/etchr-amr-dataset/NLTK-Sentence-Splitter/NLTK-Sentence-Splitter/TestCase_0001.csv',
 '/kaggle/input/etchr-amr-dataset/NLTK-Sentence-Splitter/NLTK-Sentence-Splitter/TestCase_0002.csv',
 '/kaggle/input/etchr-amr-dataset/NLTK-Sentence-Splitter/NLTK-Sentence-Splitter/TestCase_0003.csv',
 '/kaggle/input/etchr-amr-dataset/NLTK-Sentence-Splitter/NLTK-Sentence-Splitter/TestCase_0004.csv']

In [4]:
from transformers import AutoTokenizer, AutoModel
import torch

# The tokenizer and the encoder are loaded from the same pretrained checkpoint.
# NOTE(review): the recorded output of this cell mentions a different
# checkpoint name - confirm which checkpoint the stored weights were trained with.
PRETRAINED_CHECKPOINT = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
bert_model = AutoModel.from_pretrained(PRETRAINED_CHECKPOINT)

class HierBERTClassifier(torch.nn.Module):
    """Two-level classifier: a token encoder followed by a segment-level Transformer encoder.

    Every row of ``input_ids`` is encoded by ``bert_model``; the hidden state at
    position 0 of each row is kept, the resulting sequence of row vectors is run
    through a 2-layer Transformer encoder, max-pooled over the rows, passed
    through dropout and projected to ``num_classes`` logits.

    Args:
        bert_model: Callable encoder accepting ``input_ids`` / ``attention_mask``
            keyword arguments and returning a sequence whose first item holds the
            hidden states (assumed shape ``(n_rows, seq_len, hidden_size)``). Its
            ``config`` must expose ``hidden_size``, ``num_attention_heads``,
            ``intermediate_size``, ``hidden_act``, ``hidden_dropout_prob`` and
            ``layer_norm_eps``.
        num_classes: Number of output logits (default 10).
    """

    def __init__(self, bert_model, num_classes=10):
        super().__init__()
        self.bert_model = bert_model
        self.num_classes = num_classes
        config = self.bert_model.config
        # Only the encoder half of torch.nn.Transformer is kept; the decoder is
        # built with zero layers and discarded.
        self.seg_encoder = torch.nn.Transformer(
            d_model=config.hidden_size,
            nhead=config.num_attention_heads,
            batch_first=True,
            dim_feedforward=config.intermediate_size,
            activation=config.hidden_act,
            dropout=config.hidden_dropout_prob,
            layer_norm_eps=config.layer_norm_eps,
            num_encoder_layers=2,
            num_decoder_layers=0,
        ).encoder
        self.dropout = torch.nn.Dropout(config.hidden_dropout_prob)
        # Size the head from the encoder config instead of a hard-coded 768 so
        # encoders with a different hidden size work as well.
        self.classifier = torch.nn.Linear(config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return logits of shape ``(1, num_classes)`` for one document.

        Args:
            input_ids: Token ids, one row per segment of the document.
            attention_mask: Mask forwarded unchanged to ``bert_model``.
        """
        token_states = self.bert_model(input_ids=input_ids, attention_mask=attention_mask)[0]
        # Keep the first-position vector of every row and add a leading batch
        # axis: (n_rows, hidden) -> (1, n_rows, hidden). No fixed sequence length
        # or hidden size is assumed.
        segment_vectors = token_states[:, 0].unsqueeze(0)
        seg_encoder_outputs = self.seg_encoder(segment_vectors)
        # Max-pool over the segment axis.
        pooled_output, _ = torch.max(seg_encoder_outputs, 1)
        output = self.dropout(pooled_output)
        return self.classifier(output)
        
        
# Wrap the pretrained encoder in the hierarchical classifier (num_classes defaults to 10).
model = HierBERTClassifier(bert_model)

Downloading (…)okenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/989 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt: 0.00B [00:00, ?B/s]

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


Downloading pytorch_model.bin:   0%|          | 0.00/141M [00:00<?, ?B/s]

Some weights of the model checkpoint at nlpaueb/legal-bert-small-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
import os
import pandas as pd
# Substring that marks where the kept part of a cell begins; anything in front
# of it is dropped.
GRAPH_START_MARKER = "\n(z0"

files = sorted(os.path.join(base_path, f) for f in os.listdir(base_path))

# case_files[k] is the list of extracted strings for the k-th CSV file
# (possibly empty), so the outer list stays aligned with `files`.
case_files = []
for file in files:
    rows = pd.read_csv(file).values
    cases = []
    for row in rows:
        first_cell = row[0] if len(row) else None
        # Skip missing / non-text cells explicitly instead of hiding every
        # possible error behind a bare `except`.
        if not isinstance(first_cell, str):
            continue
        marker_pos = first_cell.find(GRAPH_START_MARKER)
        if marker_pos == -1:
            # Rows without the marker are dropped.
            continue
        cases.append(first_cell[marker_pos:])
    case_files.append(cases)

In [6]:
from datasets import load_dataset
# Load the "ecthr" configuration of coastalcph/fairlex; the recorded output lists train, test and validation splits.
dataset = load_dataset("coastalcph/fairlex", "ecthr")

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading metadata: 0.00B [00:00, ?B/s]

Downloading and preparing dataset fairlex/ecthr (download: 30.44 MiB, generated: 107.98 MiB, post-processed: Unknown size, total: 138.43 MiB) to /root/.cache/huggingface/datasets/coastalcph___fairlex/ecthr/1.0.0/b755f714459ab788a8e3f9167fe7463f79981775296915d36ac10fc58ea93737...


Downloading data:   0%|          | 0.00/31.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Dataset fairlex downloaded and prepared to /root/.cache/huggingface/datasets/coastalcph___fairlex/ecthr/1.0.0/b755f714459ab788a8e3f9167fe7463f79981775296915d36ac10fc58ea93737. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
from tqdm.notebook import trange
# Fetch the text column once; the original looked up dataset['test']['text']
# again on every iteration of the comprehension.
test_texts = dataset['test']['text']
# Each document is split into its segments on the '</s>' separator.
test_case_files = [test_texts[i].split('</s>') for i in trange(len(test_texts))]

  0%|          | 0/1000 [00:00<?, ?it/s]

In [8]:
# NOTE: despite the name, these are the labels of the *test* split.
train_labels = dataset['test']['labels']
# Peek at the first five label groups and the number of examples.
print(train_labels[:5], len(train_labels))

[[6], [4], [3], [3], []] 1000


In [9]:
# Collect every distinct label id once and keep the vocabulary sorted. The
# original assigned the sorted list to a differently spelled name, so
# `label_vocab` itself was never sorted, and it rebuilt the set on every iteration.
label_vocab = sorted({label for label_group in train_labels for label in label_group})
# Backward-compatible alias for the name the original cell assigned.
labels_vocab = label_vocab
print(label_vocab)
print(len(label_vocab))

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
10


In [10]:
import numpy as np
# Multi-hot encode the label groups: one row per example, one column per
# entry of label_vocab, 1 where the example carries that label.
n_labels = len(label_vocab)
encoded_rows = []
for group in train_labels:
    multi_hot = np.zeros(n_labels)
    for column in map(label_vocab.index, group):
        multi_hot[column] = 1
    encoded_rows.append(multi_hot)
vectored_y = np.stack(encoded_rows)

In [11]:
# Convert the targets to a float32 tensor. torch.as_tensor accepts both the
# NumPy array and an existing tensor, so re-running this cell does not fail
# the way torch.from_numpy does on its own output.
vectored_y = torch.as_tensor(vectored_y, dtype=torch.float32)

In [12]:
from tqdm import tqdm

# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [13]:
# map_location="cpu" lets a checkpoint that was saved from a GPU load on any
# machine; the model is moved to `device` in a later cell.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
model.load_state_dict(torch.load('**model_path**', map_location="cpu"))

<All keys matched successfully>

In [14]:
# Move the model's parameters to the selected device.
model = model.to(device)

In [15]:
# Sanity check: the number of case files should equal the number of target rows,
# because the evaluation loop zips the two together.
len(case_files), vectored_y.shape

(1000, torch.Size([1000, 10]))

In [16]:
# Switch to evaluation mode (disables dropout) before scoring.
model.eval()

HierBERTClassifier(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 512, padding_idx=0)
      (position_embeddings): Embedding(512, 512)
      (token_type_embeddings): Embedding(2, 512)
      (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-5): 6 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=512, out_features=512, bias=True)
              (key): Linear(in_features=512, out_features=512, bias=True)
              (value): Linear(in_features=512, out_features=512, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=512, out_features=512, bias=True)
              (LayerNorm): LayerNorm((512,), eps=1e-12, elemen

In [17]:
batch_size=4  # NOTE(review): not used below - the loop scores one case per step.
MAX_SEGMENTS = 64          # only the last 64 segments of each case are scored
MAX_SEGMENT_TOKENS = 128   # every segment is padded / truncated to 128 tokens

bar = tqdm(zip(case_files, vectored_y), total=len(vectored_y))
running_loss, running_acc, running_prec, running_recall, running_f1 = 0, 0, 0, 0, 0
# Per-case targets and thresholded predictions, each of shape (1, n_labels).
y_true, y_pred = [], []

# Inference only: without autograd no graph is built for the forward passes.
with torch.no_grad():
    for batch_idx, (case_x, y) in enumerate(bar):
        case_x = case_x[-MAX_SEGMENTS:]
        y = y.to(device).unsqueeze(0)
        inputs = tokenizer(case_x, return_tensors="pt", padding='max_length',
                           truncation=True, max_length=MAX_SEGMENT_TOKENS).to(device)
        out = model(inputs['input_ids'], inputs['attention_mask'])
        loss = torch.nn.functional.binary_cross_entropy(torch.sigmoid(out), y)
        # A logit above 0 corresponds to a sigmoid probability above 0.5.
        y_ = (out > 0).float()
        y_pred.append(y_)
        y_true.append(y)
        acc = torch.mean((y == y_).float())
        running_acc += acc.item()
        running_loss += loss.item()

        # Per-case confusion counts, computed with tensor ops instead of a
        # Python loop over individual elements.
        truth, guess = y[0], y_[0]
        tp = int(((truth == 1) & (guess == 1)).sum())
        fp = int(((truth == 0) & (guess == 1)).sum())
        fn = int(((truth == 1) & (guess == 0)).sum())

        # Undefined ratios (no predicted / no true labels) count as 0.0.
        prec = tp / (tp + fp) if tp + fp > 0 else 0.0
        recall = tp / (tp + fn) if tp + fn > 0 else 0.0
        f1 = 2 * prec * recall / (prec + recall) if prec + recall > 0 else 0.0

        running_prec += prec
        running_recall += recall
        running_f1 += f1

        # Show the running per-case averages in the progress bar.
        seen = batch_idx + 1
        bar.set_description(str({"loss": round(running_loss/seen, 3), "acc": round(running_acc/seen, 3), "f1-score": round(running_f1/seen, 3), "precision": round(running_prec/seen, 3), "recall": round(running_recall/seen, 3)}))
bar.close()

{'loss': 0.169, 'acc': 0.933, 'f1-score': 0.574, 'precision': 0.599, 'recall': 0.581}: 100%|██████████| 1000/1000 [01:03<00:00, 15.84it/s]


In [18]:
# Drop the leading batch axis of every stored prediction and stack the rows
# into one (n_cases, n_labels) array.
y_pred_np = np.array([pred[0].cpu().tolist() for pred in y_pred])
y_pred_np.shape

(1000, 10)

In [19]:
# Drop the leading batch axis of every stored target and stack the rows
# into one (n_cases, n_labels) array.
y_true_np = np.array([target[0].cpu().tolist() for target in y_true])
y_true_np.shape

(1000, 10)

In [20]:
from sklearn.metrics import f1_score

In [21]:
# Macro-averaged F1 over all label columns for the whole test set.
# NOTE(review): unlike the Evaluator class defined later, this call does not pass zero_division=1.
f1_score(y_true_np, y_pred_np, average='macro')

0.5272588494570776

In [22]:
class Evaluator:
    """Break multi-label predictions down by an attribute column and score each group.

    Args:
        dataset: Object indexable by column name; it must provide a 'labels'
            column and every attribute column passed as ``group_label``.
        outputs: 2-D array of predictions, one row per example and one column
            per entry of ``label_vocab``.
        label_vocab: List of label ids; a label's position in the list is its
            column index.
    """

    def __init__(self, dataset, outputs, label_vocab):
        self.data = dataset
        self.out = outputs
        self.label_vocab = label_vocab
        # Display names per attribute, indexed by the group value found in the data.
        self.key_breakdown = {'applicant_gender': ['N/A', 'male', 'female'],
                              'applicant_age': ['N/A', '<=35', '<=65', '>65'],
                              'defendant_state': ['E.C.', 'West']}

    def breakdowner(self, group_label='applicant_gender'):
        """Print a per-group table for ``group_label`` and return summary statistics.

        For every distinct value of the attribute the table lists the macro-F1
        (``mF1``) of the examples in that group and ``WCI``: the share of the
        group's examples that carry the class with the lowest per-class score.

        Returns:
            Tuple ``(mean, std, min)`` of the per-group macro-F1 values, each
            multiplied by 100 and rounded to two decimals.
        """
        print(group_label)
        # Fetch each column once; the original looked both columns up again
        # for every example of every group.
        group_values = self.data[group_label]
        label_lists = self.data['labels']
        n_examples, n_classes = self.out.shape[0], self.out.shape[1]

        f1_scores = []
        table_rows = []
        for group in sorted(set(group_values)):
            member_idx = [i for i in range(n_examples) if group_values[i] == group]

            # Multi-hot ground truth for the members of this group.
            true_group_set = np.zeros((len(member_idx), n_classes))
            for row_pos, example_idx in enumerate(member_idx):
                for label in label_lists[example_idx]:
                    true_group_set[row_pos, self.label_vocab.index(label)] = 1
            out_group_set = np.stack([self.out[i] for i in member_idx])

            group_f1 = f1_score(true_group_set, out_group_set, average='macro', zero_division=1)
            f1_scores.append(group_f1)

            # Score every class column separately; argmin returns the first
            # lowest-scoring column, matching a strict "<" scan.
            per_class_f1 = [f1_score(col_true, col_out, average='macro', zero_division=1)
                            for col_true, col_out in zip(true_group_set.T, out_group_set.T)]
            worst_class = int(np.argmin(per_class_f1))
            worst_class_influence = np.sum(true_group_set[:, worst_class]) / len(member_idx)

            table_rows.append([self.key_breakdown[group_label][group],
                               round(group_f1, 3),
                               round(worst_class_influence, 3)])

        # Build the frame straight from the rows so the numeric columns keep
        # their dtype (routing through np.array turned every value into a string).
        display_df = pd.DataFrame(table_rows, columns=["attribute", 'mF1', 'WCI'])
        print(display_df)
        print()
        return round(np.mean(f1_scores)*100, 2), round(np.std(f1_scores)*100, 2), round(np.min(f1_scores)*100, 2)

    def label_runner(self):
        """Run ``breakdowner`` for age, gender and defendant state and return one summary DataFrame."""
        df = {"category": [], "Avg Performance Across Groups": [], "Group Disparity": [], "Worst Group Performance": []}
        for group_label in ['applicant_age', 'applicant_gender', 'defendant_state']:
            mean_f1, f1_spread, worst_f1 = self.breakdowner(group_label)
            df["category"].append(group_label)
            df["Avg Performance Across Groups"].append(mean_f1)
            df["Group Disparity"].append(f1_spread)
            df["Worst Group Performance"].append(worst_f1)
        return pd.DataFrame(df)

In [23]:
# Score the test-split predictions with label ids 0..9 as the column
# vocabulary, then display the per-attribute summary table.
evaluator = Evaluator(dataset=dataset['test'], outputs=y_pred_np, label_vocab=list(range(10)))
evaluator.label_runner()

applicant_age
  attribute    mF1    WCI
0       N/A  0.546  0.017
1      <=35  0.534  0.031
2      <=65  0.485  0.009
3       >65  0.621  0.031

applicant_gender
  attribute    mF1    WCI
0       N/A  0.552  0.018
1      male  0.485  0.011
2    female   0.43  0.029

defendant_state
  attribute    mF1    WCI
0      E.C.  0.532  0.017
1      West   0.62  0.013



Unnamed: 0,category,Avg Performance Across Groups,Group Disparity,Worst Group Performance
0,applicant_age,54.64,4.84,48.54
1,applicant_gender,48.91,4.97,43.04
2,defendant_state,57.6,4.41,53.19
