# Dependencies

In [4]:
!pip install huggingface_hub
!pip install datasets



In [5]:
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): 
Add token as git credential? (Y/n) n
Token is valid (permission: read).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [8]:
!pip install transformers
!pip install nltk

import numpy as np
import pandas as pd

import nltk
from nltk.tokenize import word_tokenize

nltk.download("punkt")
nltk.download("punkt_tab")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

# Loading and transforming dataset into the required form

In [9]:
from datasets import load_dataset

# Load the dataset a single time; the individual splits are then read from
# the returned split mapping instead of calling load_dataset once per split.
ds = load_dataset("opennyaiorg/InLegalNER")

train_dataset = ds["train"]
dev_dataset = ds["dev"]
test_dataset = ds["test"]

README.md:   0%|          | 0.00/3.14k [00:00<?, ?B/s]

(…)-00000-of-00001-de25c1ae1db42f79.parquet:   0%|          | 0.00/3.53M [00:00<?, ?B/s]

(…)-00000-of-00001-b148266485fd7aeb.parquet:   0%|          | 0.00/389k [00:00<?, ?B/s]

(…)-00000-of-00001-814a730def5e8488.parquet:   0%|          | 0.00/1.52M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10995 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/1074 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/4501 [00:00<?, ? examples/s]

In [32]:
dev_df = pd.DataFrame(dev_dataset)
train_df = pd.DataFrame(train_dataset)
test_df = pd.DataFrame(test_dataset)

In [11]:
dev_df["annotations"][0]

[{'result': [{'from_name': 'label',
    'id': '25TFDATV',
    'to_name': 'text',
    'type': 'labels',
    'value': {'end': 22,
     'labels': ['STATUTE'],
     'start': 10,
     'text': 'Constitution'}},
   {'from_name': 'label',
    'id': 'F706LMQM',
    'to_name': 'text',
    'type': 'labels',
    'value': {'end': 155,
     'labels': ['PRECEDENT'],
     'start': 108,
     'text': 'R.C. Cooper v. Union of India, (1970) 1 SCC 248'}},
   {'from_name': 'label',
    'id': '2EURBJSZ',
    'to_name': 'text',
    'type': 'labels',
    'value': {'end': 209,
     'labels': ['PRECEDENT'],
     'start': 160,
     'text': 'Maneka Gandhi v. Union of India, (1978) 1 SCC 248'}}]}]

In [12]:
dev_df["data"][0]

{'text': "True, our Constitution has no 'due process' clause or the VIII Amendment; but, in this branch of law, after R.C. Cooper v. Union of India, (1970) 1 SCC 248 and Maneka Gandhi v. Union of India, (1978) 1 SCC 248, the consequence is the same."}

In [13]:
# Entity types used by the annotations; the order here fixes the tag ids below.
tags = [
    "COURT",
    "PETITIONER",
    "RESPONDENT",
    "JUDGE",
    "LAWYER",
    "DATE",
    "ORG",
    "GPE",
    "STATUTE",
    "PROVISION",
    "PRECEDENT",
    "CASE_NUMBER",
    "WITNESS",
    "OTHER_PERSON",
]

# BIO tag inventory: "O" first, then a B-/I- pair for every entity type.
ner_tags = ["O"]
ner_tags.extend(
    bio_tag for name in tags for bio_tag in (f"B-{name}", f"I-{name}")
)

# Two-way lookup between a tag string and its integer id (its list position).
index2tag = dict(enumerate(ner_tags))
tag2index = dict(zip(ner_tags, range(len(ner_tags))))

In [16]:
ner_tags

['O',
 'B-COURT',
 'I-COURT',
 'B-PETITIONER',
 'I-PETITIONER',
 'B-RESPONDENT',
 'I-RESPONDENT',
 'B-JUDGE',
 'I-JUDGE',
 'B-LAWYER',
 'I-LAWYER',
 'B-DATE',
 'I-DATE',
 'B-ORG',
 'I-ORG',
 'B-GPE',
 'I-GPE',
 'B-STATUTE',
 'I-STATUTE',
 'B-PROVISION',
 'I-PROVISION',
 'B-PRECEDENT',
 'I-PRECEDENT',
 'B-CASE_NUMBER',
 'I-CASE_NUMBER',
 'B-WITNESS',
 'I-WITNESS',
 'B-OTHER_PERSON',
 'I-OTHER_PERSON']

In [17]:
index2tag

{0: 'O',
 1: 'B-COURT',
 2: 'I-COURT',
 3: 'B-PETITIONER',
 4: 'I-PETITIONER',
 5: 'B-RESPONDENT',
 6: 'I-RESPONDENT',
 7: 'B-JUDGE',
 8: 'I-JUDGE',
 9: 'B-LAWYER',
 10: 'I-LAWYER',
 11: 'B-DATE',
 12: 'I-DATE',
 13: 'B-ORG',
 14: 'I-ORG',
 15: 'B-GPE',
 16: 'I-GPE',
 17: 'B-STATUTE',
 18: 'I-STATUTE',
 19: 'B-PROVISION',
 20: 'I-PROVISION',
 21: 'B-PRECEDENT',
 22: 'I-PRECEDENT',
 23: 'B-CASE_NUMBER',
 24: 'I-CASE_NUMBER',
 25: 'B-WITNESS',
 26: 'I-WITNESS',
 27: 'B-OTHER_PERSON',
 28: 'I-OTHER_PERSON'}

In [18]:
tag2index

{'O': 0,
 'B-COURT': 1,
 'I-COURT': 2,
 'B-PETITIONER': 3,
 'I-PETITIONER': 4,
 'B-RESPONDENT': 5,
 'I-RESPONDENT': 6,
 'B-JUDGE': 7,
 'I-JUDGE': 8,
 'B-LAWYER': 9,
 'I-LAWYER': 10,
 'B-DATE': 11,
 'I-DATE': 12,
 'B-ORG': 13,
 'I-ORG': 14,
 'B-GPE': 15,
 'I-GPE': 16,
 'B-STATUTE': 17,
 'I-STATUTE': 18,
 'B-PROVISION': 19,
 'I-PROVISION': 20,
 'B-PRECEDENT': 21,
 'I-PRECEDENT': 22,
 'B-CASE_NUMBER': 23,
 'I-CASE_NUMBER': 24,
 'B-WITNESS': 25,
 'I-WITNESS': 26,
 'B-OTHER_PERSON': 27,
 'I-OTHER_PERSON': 28}

In [52]:
def tokenize_with_positions(text):
    """Tokenize ``text`` with NLTK and record where each token starts.

    Args:
        text: The raw string to tokenize.

    Returns:
        A ``(tokens, tokens_with_positions)`` tuple. ``tokens`` is the list of
        token strings, with NLTK's converted quote tokens mapped back to ``"``.
        ``tokens_with_positions`` maps a token's start character offset in
        ``text`` to ``{"token": <token string>, "index": <position in tokens>}``.
        A token that cannot be located in ``text`` stays in ``tokens`` but gets
        no entry in the mapping.
    """
    tokens_with_positions = {}
    tokens = word_tokenize(text)

    # NLTK converts double quotes during tokenization
    tokens = ['"' if token == "''" or token == "``" else token for token in tokens]

    start_pos = 0
    for index, token in enumerate(tokens):
        # A restored '"' may stand for a literal '' or `` in the source text,
        # so try those spellings as well and keep the earliest match.
        candidates = ('"', "''", "``") if token == '"' else (token,)

        best = None
        for candidate in candidates:
            found = text.find(candidate, start_pos)
            if found != -1 and (best is None or found < best[0]):
                best = (found, len(candidate))

        if best is None:
            # str.find returned -1: do not store a bogus -1 offset and do not
            # move the cursor, so the following tokens are still located
            # correctly.
            continue

        found, width = best
        tokens_with_positions[found] = {
            "token": token,
            "index": index
        }
        start_pos = found + width

    return tokens, tokens_with_positions

def assign_labels(tokens, tokens_with_positions, annotations):
    """Turn character-span annotations into per-token BIO labels.

    Args:
        tokens: List of token strings; only its length is used here.
        tokens_with_positions: Mapping from a token's start offset to a dict
            whose ``"index"`` entry is that token's position in ``tokens``.
        annotations: Iterable of dicts, each with a ``"value"`` dict holding
            ``"labels"`` (the first entry is used), ``"start"`` and ``"end"``.

    Returns:
        A ``(labels, indices)`` tuple of lists, both as long as ``tokens``.
        ``labels`` holds the tag strings, ``indices`` the matching ids looked
        up in the module-level ``tag2index``. A token is labelled when its
        start offset satisfies ``start <= offset < end``; every other token
        stays ``"O"``. If annotations overlap, the later one overwrites.

    Raises:
        KeyError: If a prefixed label is missing from ``tag2index``.
    """
    # Sort the offsets so the B- prefix always lands on the left-most token of
    # a span, regardless of the order in which the mapping was filled.
    positions = sorted(tokens_with_positions)
    labels = ["O"] * len(tokens)
    indices = [tag2index["O"]] * len(tokens)

    for annotation in annotations:
        annotation_value = annotation["value"]
        label = annotation_value["labels"][0]
        start = annotation_value["start"]
        end = annotation_value["end"]

        covered = [pos for pos in positions if start <= pos < end]

        for rank, position in enumerate(covered):
            # First covered token opens the entity, the rest continue it.
            prefixed_label = f"B-{label}" if rank == 0 else f"I-{label}"

            token_index = tokens_with_positions[position]["index"]
            labels[token_index] = prefixed_label
            indices[token_index] = tag2index[prefixed_label]

    return labels, indices

def process_row(row):
    """Convert one raw dataset row into its token, tag and tag-id lists.

    The sentence is read from ``row["data"]["text"]`` and the span annotations
    from ``row["annotations"][0]["result"]``.

    Returns:
        A ``pd.Series`` indexed by ``"tokens"``, ``"tags"`` and ``"indices"``.
    """
    sentence = row["data"]["text"]
    spans = row["annotations"][0]["result"]

    tokens, offsets = tokenize_with_positions(sentence)
    tag_names, tag_ids = assign_labels(tokens, offsets, spans)

    columns = ["tokens", "tags", "indices"]
    return pd.Series([tokens, tag_names, tag_ids], index=columns)

def convert_format(df):
    """Apply ``process_row`` to every row of ``df``.

    Only the ``"annotations"`` and ``"data"`` columns are passed on; the
    result is the frame built from the Series returned for each row.
    """
    return df[["annotations", "data"]].apply(process_row, axis=1)

In [53]:
dev_df_formatted = convert_format(dev_df)
train_df_formatted = convert_format(train_df)
test_df_formatted = convert_format(test_df)

In [64]:
from collections import Counter

def count_tags_in_df(df):
    """Count every tag string found in the lists of ``df['tags']``.

    Returns:
        A ``Counter`` keyed by the full tag (e.g. ``"B-ORG"``), in
        first-seen order.
    """
    return Counter(tag for row_tags in df['tags'] for tag in row_tags)

In [65]:
from collections import Counter

def count_unique_tags_in_df(df):
    """Count tags in ``df['tags']`` with the two-character prefix removed.

    ``'O'`` is counted as-is; any other tag is counted under ``tag[2:]``, so
    ``"B-ORG"`` and ``"I-ORG"`` both add to ``"ORG"``.

    Returns:
        A ``Counter`` keyed by the stripped tag, in first-seen order.
    """
    return Counter(
        tag if tag == 'O' else tag[2:]
        for row_tags in df['tags']
        for tag in row_tags
    )

In [66]:
from tabulate import tabulate

def tabulate_and_print(df, function):
    """Print a two-column Tag/Count table for ``df``.

    Args:
        df: Frame handed unchanged to ``function``.
        function: Callable returning a mapping of tag to count for ``df``.
    """
    rows = [[tag, count] for tag, count in function(df).items()]
    print(tabulate(rows, headers=["Tag", "Count"]))

In [67]:
def tabulate_all(df):
    """Print the count of every full tag (prefix kept) found in ``df``."""
    tabulate_and_print(df, function=count_tags_in_df)

def tabulate_unique(df):
    """Print the tag counts of ``df`` with the B-/I- prefix stripped."""
    tabulate_and_print(df, function=count_unique_tags_in_df)

In [73]:
tabulate_all(dev_df_formatted)

Tag               Count
--------------  -------
O                 48377
B-STATUTE           222
B-PRECEDENT         177
I-PRECEDENT        2315
B-JUDGE             172
I-JUDGE             236
B-GPE               181
B-OTHER_PERSON      273
B-DATE              218
B-PROVISION         258
I-PROVISION         770
I-STATUTE           463
B-CASE_NUMBER       121
I-CASE_NUMBER       437
B-COURT             296
I-COURT            1071
I-DATE              133
I-OTHER_PERSON      203
B-ORG               159
I-ORG               347
B-PETITIONER        206
I-PETITIONER        483
B-WITNESS            54
I-WITNESS            58
I-GPE                48
B-RESPONDENT        307
I-RESPONDENT       1097
B-LAWYER            545
I-LAWYER            656


In [74]:
tabulate_unique(dev_df_formatted)

Tag             Count
------------  -------
O               48377
STATUTE           685
PRECEDENT        2492
JUDGE             408
GPE               229
OTHER_PERSON      476
DATE              351
PROVISION        1028
CASE_NUMBER       558
COURT            1367
ORG               506
PETITIONER        689
WITNESS           112
RESPONDENT       1404
LAWYER           1201


In [75]:
tabulate_all(train_df_formatted)

Tag               Count
--------------  -------
O                490506
B-ORG              1440
I-ORG              2971
B-OTHER_PERSON     2598
I-OTHER_PERSON     2155
B-WITNESS           862
I-WITNESS           768
B-GPE              1393
B-STATUTE          1804
B-DATE             1885
I-DATE             1937
B-PROVISION        2378
I-PROVISION        6606
I-STATUTE          4005
B-COURT            2367
I-COURT           10192
B-PRECEDENT        1351
I-PRECEDENT       14631
B-CASE_NUMBER      1038
I-CASE_NUMBER      4870
I-GPE               288
B-PETITIONER       3031
I-PETITIONER       5538
B-JUDGE            2291
I-JUDGE            2290
B-RESPONDENT       3811
I-RESPONDENT      11044
B-LAWYER           2897
I-LAWYER           3189


In [76]:
tabulate_unique(train_df_formatted)

Tag             Count
------------  -------
O              490506
ORG              4411
OTHER_PERSON     4753
WITNESS          1630
GPE              1681
STATUTE          5809
DATE             3822
PROVISION        8984
COURT           12559
PRECEDENT       15982
CASE_NUMBER      5908
PETITIONER       8569
JUDGE            4581
RESPONDENT      14855
LAWYER           6086


In [77]:
tabulate_all(test_df_formatted)

Tag               Count
--------------  -------
O                202810
B-COURT            1221
I-COURT            3845
B-JUDGE             580
B-PETITIONER        847
I-PETITIONER       1709
B-LAWYER           1585
I-LAWYER           1711
B-RESPONDENT       1061
I-RESPONDENT       4406
I-JUDGE             693
B-OTHER_PERSON     1082
B-PROVISION        1215
I-PROVISION        3317
B-STATUTE           973
I-STATUTE          2048
B-PRECEDENT         650
I-PRECEDENT        8154
I-OTHER_PERSON      941
B-ORG               912
I-ORG              1698
B-WITNESS           398
I-WITNESS           363
B-DATE             1049
I-DATE              684
B-GPE               713
B-CASE_NUMBER       665
I-CASE_NUMBER      2470
I-GPE               131


In [78]:
tabulate_unique(test_df_formatted)

Tag             Count
------------  -------
O              202810
COURT            5066
JUDGE            1273
PETITIONER       2556
LAWYER           3296
RESPONDENT       5467
OTHER_PERSON     2023
PROVISION        4532
STATUTE          3021
PRECEDENT        8804
ORG              2610
WITNESS           761
DATE             1733
GPE               844
CASE_NUMBER      3135


In [80]:
def count_all_os(df):
    """Return the number of rows whose ``'tags'`` list holds only ``'O'``.

    A row with an empty tag list is counted as well, because ``all`` of an
    empty sequence is true.
    """
    return sum(
        1
        for row_tags in df['tags']
        if all(tag == 'O' for tag in row_tags)
    )

In [81]:
print(f"Count of all Os in dev_df_formatted: {count_all_os(dev_df_formatted)}")
print(f"Count of all Os in train_df_formatted: {count_all_os(train_df_formatted)}")
print(f"Count of all Os in test_df_formatted: {count_all_os(test_df_formatted)}")

Count of all Os in dev_df_formatted: 184
Count of all Os in train_df_formatted: 2197
Count of all Os in test_df_formatted: 565


In [82]:
def drop_rows_with_only_o_tags(df):
    """Return a copy of ``df`` without the rows tagged exclusively ``'O'``.

    The input frame is left untouched. Rows are removed by index label, and
    the surviving rows keep their original labels.
    """
    only_o_labels = [
        label
        for label, row_tags in zip(df.index, df['tags'])
        if all(tag == 'O' for tag in row_tags)
    ]
    return df.copy().drop(only_o_labels)

In [85]:
dev_df_cleaned = drop_rows_with_only_o_tags(dev_df_formatted)
train_df_cleaned = drop_rows_with_only_o_tags(train_df_formatted)
test_df_cleaned = drop_rows_with_only_o_tags(test_df_formatted)

In [86]:
# Sanity check on the cleaned frames: each count is expected to be 0 now.
# The messages name the *_cleaned frames that are actually being counted.
print(f"Count of all Os in dev_df_cleaned: {count_all_os(dev_df_cleaned)}")
print(f"Count of all Os in train_df_cleaned: {count_all_os(train_df_cleaned)}")
print(f"Count of all Os in test_df_cleaned: {count_all_os(test_df_cleaned)}")

Count of all Os in dev_df_formatted: 0
Count of all Os in train_df_formatted: 0
Count of all Os in test_df_formatted: 0


In [87]:
print(f"Number of rows in dev_df_cleaned: {len(dev_df_cleaned)}")
print(f"Number of rows in train_df_cleaned: {len(train_df_cleaned)}")
print(f"Number of rows in test_df_cleaned: {len(test_df_cleaned)}")

Number of rows in dev_df_cleaned: 890
Number of rows in train_df_cleaned: 8798
Number of rows in test_df_cleaned: 3936


In [88]:
tabulate_all(dev_df_cleaned)

Tag               Count
--------------  -------
O                 43041
B-STATUTE           222
B-PRECEDENT         177
I-PRECEDENT        2315
B-JUDGE             172
I-JUDGE             236
B-GPE               181
B-OTHER_PERSON      273
B-DATE              218
B-PROVISION         258
I-PROVISION         770
I-STATUTE           463
B-CASE_NUMBER       121
I-CASE_NUMBER       437
B-COURT             296
I-COURT            1071
I-DATE              133
I-OTHER_PERSON      203
B-ORG               159
I-ORG               347
B-PETITIONER        206
I-PETITIONER        483
B-WITNESS            54
I-WITNESS            58
I-GPE                48
B-RESPONDENT        307
I-RESPONDENT       1097
B-LAWYER            545
I-LAWYER            656


In [89]:
tabulate_unique(dev_df_cleaned)

Tag             Count
------------  -------
O               43041
STATUTE           685
PRECEDENT        2492
JUDGE             408
GPE               229
OTHER_PERSON      476
DATE              351
PROVISION        1028
CASE_NUMBER       558
COURT            1367
ORG               506
PETITIONER        689
WITNESS           112
RESPONDENT       1404
LAWYER           1201


In [90]:
tabulate_all(train_df_cleaned)

Tag               Count
--------------  -------
O                427058
B-ORG              1440
I-ORG              2971
B-OTHER_PERSON     2598
I-OTHER_PERSON     2155
B-WITNESS           862
I-WITNESS           768
B-GPE              1393
B-STATUTE          1804
B-DATE             1885
I-DATE             1937
B-PROVISION        2378
I-PROVISION        6606
I-STATUTE          4005
B-COURT            2367
I-COURT           10192
B-PRECEDENT        1351
I-PRECEDENT       14631
B-CASE_NUMBER      1038
I-CASE_NUMBER      4870
I-GPE               288
B-PETITIONER       3031
I-PETITIONER       5538
B-JUDGE            2291
I-JUDGE            2290
B-RESPONDENT       3811
I-RESPONDENT      11044
B-LAWYER           2897
I-LAWYER           3189


In [91]:
tabulate_unique(train_df_cleaned)

Tag             Count
------------  -------
O              427058
ORG              4411
OTHER_PERSON     4753
WITNESS          1630
GPE              1681
STATUTE          5809
DATE             3822
PROVISION        8984
COURT           12559
PRECEDENT       15982
CASE_NUMBER      5908
PETITIONER       8569
JUDGE            4581
RESPONDENT      14855
LAWYER           6086


In [92]:
tabulate_all(test_df_cleaned)

Tag               Count
--------------  -------
O                186789
B-COURT            1221
I-COURT            3845
B-JUDGE             580
B-PETITIONER        847
I-PETITIONER       1709
B-LAWYER           1585
I-LAWYER           1711
B-RESPONDENT       1061
I-RESPONDENT       4406
I-JUDGE             693
B-OTHER_PERSON     1082
B-PROVISION        1215
I-PROVISION        3317
B-STATUTE           973
I-STATUTE          2048
B-PRECEDENT         650
I-PRECEDENT        8154
I-OTHER_PERSON      941
B-ORG               912
I-ORG              1698
B-WITNESS           398
I-WITNESS           363
B-DATE             1049
I-DATE              684
B-GPE               713
B-CASE_NUMBER       665
I-CASE_NUMBER      2470
I-GPE               131


In [93]:
tabulate_unique(test_df_cleaned)

Tag             Count
------------  -------
O              186789
COURT            5066
JUDGE            1273
PETITIONER       2556
LAWYER           3296
RESPONDENT       5467
OTHER_PERSON     2023
PROVISION        4532
STATUTE          3021
PRECEDENT        8804
ORG              2610
WITNESS           761
DATE             1733
GPE               844
CASE_NUMBER      3135


In [113]:
dev_df_cleaned.to_json("dev_df.json", orient="records")
train_df_cleaned.to_json("train_df.json", orient="records")
test_df_cleaned.to_json("test_df.json", orient="records")

In [114]:
# Pool the three cleaned splits under a fresh 0..n-1 index.
merged_df = pd.concat([dev_df_cleaned, train_df_cleaned, test_df_cleaned], ignore_index=True)

# Keep only the first row for each distinct token sequence. The token lists
# are turned into tuples because lists are not hashable.
merged_df = merged_df[~merged_df['tokens'].apply(tuple).duplicated()]

In [118]:
print(f"Number of rows in merged_df: {len(merged_df)}")

Number of rows in merged_df: 13612


In [115]:
tabulate_all(merged_df)

Tag               Count
--------------  -------
O                656147
B-STATUTE          2993
B-PRECEDENT        2176
I-PRECEDENT       25075
B-JUDGE            3039
I-JUDGE            3214
B-GPE              2286
B-OTHER_PERSON     3952
B-DATE             3150
B-PROVISION        3847
I-PROVISION       10681
I-STATUTE          6503
B-CASE_NUMBER      1823
I-CASE_NUMBER      7774
B-COURT            3878
I-COURT           15090
I-DATE             2754
I-OTHER_PERSON     3298
B-ORG              2510
I-ORG              5015
B-PETITIONER       4082
I-PETITIONER       7726
B-WITNESS          1314
I-WITNESS          1189
I-GPE               467
B-RESPONDENT       5175
I-RESPONDENT      16520
B-LAWYER           5025
I-LAWYER           5556


In [116]:
tabulate_unique(merged_df)

Tag             Count
------------  -------
O              656147
STATUTE          9496
PRECEDENT       27251
JUDGE            6253
GPE              2753
OTHER_PERSON     7250
DATE             5904
PROVISION       14528
CASE_NUMBER      9597
COURT           18968
ORG              7525
PETITIONER      11808
WITNESS          2503
RESPONDENT      21695
LAWYER          10581


In [117]:
merged_df.to_json("merged_df.json", orient="records")