### Install dependencies

In [None]:
# Install the tokenizer / model / metric packages with pip (-q keeps pip's output short).
# NOTE(review): no versions are pinned, so a re-run may resolve different releases
# than the ones this notebook was originally executed with.
!pip install tokenizers -q
!pip install transformers -q
# NOTE(review): this installs transformers again, now with the [torch] extra;
# the plain install on the previous line looks redundant.
!pip install transformers[torch] -q
!pip install seqeval -q
!pip install evaluate -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m67.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m236.8/236.8 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m33.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import spacy
import numpy as np
import pandas as pd
import json
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from transformers import AutoTokenizer, BertTokenizerFast, DebertaTokenizerFast,  DebertaForSequenceClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification, TrainerCallback
from datasets import DatasetDict, Dataset, load_metric, load_from_disk
import traceback
from tqdm import tqdm
from evaluate import load
import matplotlib.pyplot as plt

import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForTokenClassification, AdamW
from google.colab import drive
import warnings
import pickle

In [None]:
class EmptyCacheCallback(TrainerCallback):
    """Trainer callback that calls ``torch.cuda.empty_cache()`` after each training step.

    The hook is implemented as ``on_step_end`` because the Hugging Face
    ``Trainer`` only dispatches the event names declared on ``TrainerCallback``;
    a method called ``on_train_batch_end`` is never invoked by it. Hooks are
    also called with keyword arguments (model, optimizer, ...), so the
    signature must accept ``**kwargs``.
    """

    def on_step_end(self, args, state, control, **kwargs) -> None:
        """Release cached CUDA memory once the current training step has finished."""
        torch.cuda.empty_cache()

    def on_train_batch_end(self, args, state, control, *_, **kwargs) -> None:
        """Alias kept for code that calls the old method name directly."""
        self.on_step_end(args, state, control, **kwargs)

# Instantiate the EmptyCacheCallback
empty_cache_callback = EmptyCacheCallback()

In [None]:
drive.mount('/content/drive')
%cd /content/drive/MyDrive/256_Project
PATH = '/content/drive/MyDrive/256_Project'

Mounted at /content/drive
/content/drive/MyDrive/256_Project


### Load dataset (skip when needed)

In [None]:
def _load_json(path):
    """Parse one JSON file, decoding it as UTF-8 regardless of the platform default."""
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Paths are relative to the current working directory.
train_judge = _load_json('data/NER_TRAIN_JUDGEMENT.json')
train_preamble = _load_json('data/NER_TRAIN_PREAMBLE.json')
test_judge = _load_json('data/NER_DEV_JUDGEMENT.json')
test_preamble = _load_json('data/NER_DEV_PREAMBLE.json')

In [None]:
# Record counts of the four loaded splits (train judgement, train preamble, dev judgement, dev preamble).
len(train_judge), len(train_preamble), len(test_judge), len(test_preamble)

(9435, 1560, 949, 125)

In [None]:
# Show one raw record to see its layout: the text sits under data.text and the entity
# spans (start, end, text, labels) under annotations[0].result[*].value.
test_judge[0]

{'id': '03f3901e95ed493b866bd7807f623bc0',
 'annotations': [{'result': [{'value': {'start': 10,
      'end': 22,
      'text': 'Constitution',
      'labels': ['STATUTE']},
     'id': '25TFDATV',
     'from_name': 'label',
     'to_name': 'text',
     'type': 'labels'},
    {'value': {'start': 108,
      'end': 155,
      'text': 'R.C. Cooper v. Union of India, (1970) 1 SCC 248',
      'labels': ['PRECEDENT']},
     'id': 'F706LMQM',
     'from_name': 'label',
     'to_name': 'text',
     'type': 'labels'},
    {'value': {'start': 160,
      'end': 209,
      'text': 'Maneka Gandhi v. Union of India, (1978) 1 SCC 248',
      'labels': ['PRECEDENT']},
     'id': '2EURBJSZ',
     'from_name': 'label',
     'to_name': 'text',
     'type': 'labels'}]}],
 'data': {'text': "True, our Constitution has no 'due process' clause or the VIII Amendment; but, in this branch of law, after R.C. Cooper v. Union of India, (1970) 1 SCC 248 and Maneka Gandhi v. Union of India, (1978) 1 SCC 248, the conseq

In [None]:
# test_preamble[0]

### Initializing tokenizer & Label Encoder

In [None]:
# Entity label names. The position in this list becomes the integer id when
# `label_map` is built from it with enumerate(), so 'NO_TAG' maps to 0.
ner_labels = ['NO_TAG', 'COURT', 'PETITIONER', 'RESPONDENT', 'JUDGE', 'LAWYER', 'DATE', 'ORG', 'GPE', 'STATUTE',
              'PROVISION', 'PRECEDENT', 'CASE_NUMBER', 'WITNESS', 'OTHER_PERSON']

# label_encoder = LabelEncoder()
# label_encoder.fit(ner_labels)

# encoded_labels = label_encoder.transform(ner_labels)
# print(label_encoder.inverse_transform([6])[0])
# # label_encoder.transform(['NO_TAG'])[0]

# print(encoded_labels)
# print(type(encoded_labels))


{'NO_TAG': 0, 'COURT': 1, 'PETITIONER': 2, 'RESPONDENT': 3, 'JUDGE': 4, 'LAWYER': 5, 'DATE': 6, 'ORG': 7, 'GPE': 8, 'STATUTE': 9, 'PROVISION': 10, 'PRECEDENT': 11, 'CASE_NUMBER': 12, 'WITNESS': 13, 'OTHER_PERSON': 14}


In [None]:
# tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base", is_split_into_words=True)
# tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", is_split_into_words=True)

# `model_name` is not assigned anywhere above this cell, so on a fresh kernel the
# call below raised NameError. Keep a value that is already defined, otherwise
# fall back to a default checkpoint.
# NOTE(review): "bert-base-uncased" is presumed from the commented-out line above
# and from the BERT-style vocab.txt this cell downloaded — confirm the intended checkpoint.
model_name = globals().get("model_name", "bert-base-uncased")

# NOTE(review): `tokenizer="basic"` is not a documented `from_pretrained` argument;
# it is kept unchanged here — confirm whether it has any effect.
tokenizer = BertTokenizerFast.from_pretrained(model_name, tokenizer="basic")


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

### Data Preprocessing (skip when needed)

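`start` and `end` in the dataset are 0-indexed character offsets into the record's text <br/>
`start` - index of the entity's first character (e.g. `Constitution` in the sample record above has `start` 10) <br/>
`end` - index one past the entity's last character, i.e. exclusive (22 for the same example)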

In [None]:
def get_token_start(start, end, offsets_mapping):
    """Return the index of the first interval that contains ``start``.

    Args:
        start: character offset to locate.
        end: accepted for symmetry with ``get_token_end``; not used here.
        offsets_mapping: iterable of ``(left, right)`` pairs; both bounds
            are treated as inclusive.

    Returns:
        Position of the first matching pair, or ``None`` when no pair matches.
    """
    return next(
        (position
         for position, (lower, upper) in enumerate(offsets_mapping)
         if lower <= start <= upper),
        None,
    )

def get_token_end(start, end, offsets_mapping):
    """Return the index of the first interval that contains ``end``.

    Args:
        start: accepted for symmetry with ``get_token_start``; not used here.
        end: character offset to locate.
        offsets_mapping: iterable of ``(left, right)`` pairs; both bounds
            are treated as inclusive.

    Returns:
        Position of the first matching pair, or ``None`` when no pair matches.
    """
    return next(
        (position
         for position, (lower, upper) in enumerate(offsets_mapping)
         if lower <= end <= upper),
        None,
    )

In [None]:
# Map every label name to its position in `ner_labels`.
label_map = dict(zip(ner_labels, range(len(ner_labels))))

label_map

{'NO_TAG': 0,
 'COURT': 1,
 'PETITIONER': 2,
 'RESPONDENT': 3,
 'JUDGE': 4,
 'LAWYER': 5,
 'DATE': 6,
 'ORG': 7,
 'GPE': 8,
 'STATUTE': 9,
 'PROVISION': 10,
 'PRECEDENT': 11,
 'CASE_NUMBER': 12,
 'WITNESS': 13,
 'OTHER_PERSON': 14}

In [None]:
def process_individual_record(rec):
    """Turn one annotated record into space-split words with one label id per word.

    The text is read from ``rec['data']['text']`` and the entity spans from
    ``rec['annotations'][0]['result']``; each span supplies
    ``value['start']``, ``value['end']`` and ``value['labels'][0]``.

    Args:
        rec: a single dataset record (a dict with the keys listed above).

    Returns:
        A tuple ``(word_list, label_ids, unql)``:
        ``word_list`` is ``text.split(' ')``; ``label_ids`` holds one integer
        per word (0 unless an annotation covers the word, otherwise the id
        from ``label_map``); ``unql`` is the set of label names applied.

    Raises:
        SystemExit: if anything goes wrong while processing the record
            (missing key, span that cannot be located, unknown label). The
            diagnostics are printed first and the original exception is
            attached as the cause.
    """
    # Bind everything the error handler prints up front, so an early failure
    # (e.g. a missing key) is reported instead of being masked by an
    # UnboundLocalError raised inside the handler.
    start = end = None
    word_list = []
    word_indices = []

    try:
        text = rec['data']['text']
        annotations = rec['annotations'][0]['result']

        word_list = text.split(' ')
        unql = set()

        # Character span (start, exclusive end) of every word in `text`.
        cursor = 0
        for word in word_list:
            cursor = text.find(word, cursor)
            word_end = cursor + len(word)
            word_indices.append((cursor, word_end))
            cursor = word_end + 1  # skip the separating space

        # Every word starts out with id 0.
        label_ids = [0] * len(word_list)

        # One pass over the annotations: remember the spans in order and, for
        # each distinct span, the label of the first annotation that has it.
        spans = []
        first_label_for_span = {}
        for ann in annotations:
            value = ann['value']
            span = (value['start'], value['end'])
            spans.append(span)
            first_label_for_span.setdefault(span, value['labels'][0])

        for start, end in spans:
            # Locate the words that contain the span's first and last offsets.
            token_start = get_token_start(start, end, word_indices)
            token_end = get_token_end(start, end, word_indices)

            assert token_start is not None
            assert token_end is not None

            label = first_label_for_span[(start, end)]
            label_id = label_map[label]
            label_ids[token_start:token_end + 1] = [label_id] * (token_end + 1 - token_start)
            unql.add(label)

    except Exception as e:
        print("Exception")
        print(e)
        print(start, end)
        print(word_indices)
        print("\n")
        print(word_list)
        print("\n")
        print(rec)

        raise SystemExit from e

    assert len(label_ids) == len(word_list)

    return word_list, label_ids, unql


In [None]:
# Prepare train
# Build one row per (record, label present in that record): the label name split on '_',
# the record's word list, and a 0/1 mask marking the words that carry that label.
combined_train_data = train_judge + train_preamble
rows = []
# NOTE(review): `count` and `ans` are assigned but never used in this cell.
count=0
for rec in tqdm(combined_train_data):
    # `text` is the list of space-split words returned by process_individual_record, not a string.
    text, label_ids, unql = process_individual_record(rec)
    ans = []
    for lbl in unql:
      # e.g. 'OTHER_PERSON' -> ['OTHER', 'PERSON']
      cls = lbl.split('_')
      # 1 where the word's label id equals this label's id, else 0.
      labels = [1 if value == label_map[lbl]  else 0 for value in label_ids]
      rows.append([cls, text, labels])

100%|██████████| 10995/10995 [00:03<00:00, 3595.88it/s]


In [None]:
# Collect the rows built by the "Prepare train" cell into a DataFrame.
# NOTE(review): `rows` is reassigned by the test-set cell further down, so this cell
# must run right after the train loop; re-running it later would pick up the test rows.
df_train = pd.DataFrame(rows, columns = ['class', 'text', 'labels'])
df_train

Unnamed: 0,class,text,labels
0,[ORG],"[\n\n(7), On, specific, query, by, the, Bench,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[OTHER, PERSON]","[He, was, also, asked, whether, Agya, <span, c...","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
2,[WITNESS],"[, \n5.2, CW3, Mr, Vijay, Mishra, ,, Deputy, M...","[0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,[ORG],"[, \n5.2, CW3, Mr, Vijay, Mishra, ,, Deputy, M...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, ..."
4,[GPE],"[, \n5.2, CW3, Mr, Vijay, Mishra, ,, Deputy, M...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ..."
...,...,...,...
19825,[PETITIONER],"[In, The, High, Court, Of, Judicature, At, Pat...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
19826,[RESPONDENT],"[Petitioner:\nThe, Automobile, Transport(Rajas...","[0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0]"
19827,[PETITIONER],"[Petitioner:\nThe, Automobile, Transport(Rajas...","[0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
19828,[RESPONDENT],"[Petitioner:\nEast, India, COMMERClAL, Co.,, L...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0]"


In [None]:
# Demo of the two split variants: split() splits on any whitespace and drops the trailing
# newline, while split(" ") splits on single spaces only and keeps "\n" attached to the
# last word. process_individual_record uses the split(' ') form.
# NOTE(review): this rebinds the notebook-level name `text`.
text= "Hey there how are ya\n"
text.split(), text.split(" ")

(['Hey', 'there', 'how', 'are', 'ya'], ['Hey', 'there', 'how', 'are', 'ya\n'])

In [None]:
# Prepare test set
# Same row layout as the train cell: the label name split on '_', the record's word list,
# and a 0/1 mask marking the words that carry that label.
combined_test_data = test_judge + test_preamble
# NOTE(review): this reuses (and resets) the `rows` list that the train cell filled.
rows = []
# NOTE(review): `count` and `ans` are assigned but never used in this cell.
count=0
for rec in tqdm(combined_test_data):
    # `text` is the list of space-split words returned by process_individual_record, not a string.
    text, label_ids, unql = process_individual_record(rec)
    ans = []
    for lbl in unql:
      # e.g. 'OTHER_PERSON' -> ['OTHER', 'PERSON']
      cls = lbl.split('_')
      # 1 where the word's label id equals this label's id, else 0.
      labels = [1 if value == label_map[lbl]  else 0 for value in label_ids]
      rows.append([cls, text, labels])

100%|██████████| 1074/1074 [00:00<00:00, 4868.87it/s]


In [None]:
# Collect the rows built by the "Prepare test set" cell into a DataFrame.
# NOTE(review): relies on `rows` still holding the test rows from the cell directly above.
df_test = pd.DataFrame(rows, columns = ['class', 'text', 'labels'])
df_test

Unnamed: 0,class,text,labels
0,[STATUTE],"[True,, our, Constitution, has, no, 'due, proc...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,[PRECEDENT],"[True,, our, Constitution, has, no, 'due, proc...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,[JUDGE],"[(See, Principles, of, Statutory, Interpretati...","[0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0]"
3,"[OTHER, PERSON]","[Their, Lordships, have, said, --, , ""It, is, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,[GPE],"[Their, Lordships, have, said, --, , ""It, is, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
1976,[JUDGE],"[High, Court, Of, Judicature, At, Allahabad\n,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1977,[LAWYER],"[High, Court, Of, Judicature, At, Allahabad\n,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1978,[RESPONDENT],"[High, Court, Of, Judicature, At, Allahabad\n,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1979,[COURT],"[High, Court, Of, Judicature, At, Allahabad\n,...","[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
# Display the project folder that the CSV files are written to.
PATH

'/content/drive/MyDrive/256_Project'

In [None]:
# Write both splits as CSV files into the project folder.
df_train.to_csv(f'{PATH}/train.csv')
df_test.to_csv(f'{PATH}/test.csv')

In [None]:
ls

 [0m[01;34mdata[0m/                                    deberta_kfold.ipynb
'DeBERTa(1).ipynb'                        Electra_Fine_Tuning.ipynb
'DeBERTa(2).ipynb'                        RoBERTa_existing.ipynb
 [01;34mdeberta-fine-tune[0m/                       test.csv
 DeBERTa_hyperparam.ipynb                 train.csv
 [01;34mdeberta-kfold[0m/                           trainer_log_history.pkl
 deberta-kfold-20230613T220642Z-001.zip   zero-shot-data.ipynb
 deberta-kfold-20230613T220642Z-002.zip
