In [1]:
from datasets import load_dataset
import tiktoken
import pandas as pd
from tqdm import tqdm
tqdm.pandas()

In [2]:
# load dataset
dataset = load_dataset("refugee-law-lab/luck-of-the-draw-iii", split="train")

# convert dataset to df
df = pd.DataFrame(dataset)
df.head(1)

Unnamed: 0,citation,year,name,date_filed,city_filed,nature,class,track,documents,source_url,scraped_timestamp
0,IMM-10085-12,2012,EDITH VICTORIA CASTRO RODRIGUES v. MCI,2012-10-01,Toronto,Imm - Appl. for leave & jud. review - IRB - Re...,Non-Action,Immigration Leave & Judicial Review,"[{'DOCNO': None, 'DOC_DT': '2013-04-25', 'RECO...",https://www.fct-cf.gc.ca/en/court-files-and-de...,2022-11-23


In [3]:
# example of docket
docket = df.iloc[0].documents
for docket_entry in docket:
    print(docket_entry)

{'DOCNO': None, 'DOC_DT': '2013-04-25', 'RECORDED_ENTRY': " Memorandum to file from Ann Murphy dated 25-APR-2013 further to phone conversations with the Law Society concerning the death of Applicant's counsel and the Applicant's dismissed order, I have been advised that the Law Society has advised the Applicant that she should retain new counsel and contact the Federal.  The Law Society will not provide the registry with the address of the Applicant in order for the registry to send out the dismissed order.  The Law Society will not advise  the Applicant that her Application was dismissed. placed on file.", 'RE_NO': 14}
{'DOCNO': None, 'DOC_DT': '2013-04-16', 'RECORDED_ENTRY': " Memorandum to file from Ann Murphy dated 16-APR-2013 I have contacted the Law Society of Upper Canada concering status of Mr. Makepeace's legal file, for this Applicant in light of the fact he is now deceased.  They will call me back. BF 25-apr-2013 placed on file.", 'RE_NO': 13}
{'DOCNO': None, 'DOC_DT': '2013

In [4]:
# Count tokens
def count_tokens(text, model = "cl100k_base"):
    encoding = tiktoken.get_encoding(model)
    num_tokens = len(encoding.encode(text))
    return num_tokens

def list_dict_to_token_count(documents):
    text = ""
    for doc in documents:
        text = text + " ".join([str(v) for k,v in doc.items()])
    return count_tokens(text)   

df['num_tokens'] = df['documents'].progress_apply(list_dict_to_token_count)

100%|██████████| 218639/218639 [01:16<00:00, 2868.34it/s]


In [5]:
# Summary stats
print("Number of dockets:",len(df))
print()
print("Total number of tokens in all dockets:",f"{df['num_tokens'].sum():,}")
print()
print("Number of tokens per docket:")
print()
print("Average number of tokens:",round(df['num_tokens'].mean(),1))
print("Median number of tokens:",df['num_tokens'].median())
print("Max number of tokens:",df['num_tokens'].max())
print("Min number of tokens:",df['num_tokens'].min())
print("Quartiles for number of tokens:")
print(df['num_tokens'].quantile([.25, .5, .75]))
print("Top 10% of tokens:",df['num_tokens'].quantile([.9]))
print("Top 5% of tokens:",df['num_tokens'].quantile([.95]))
print("Top 1% of tokens:",df['num_tokens'].quantile([.99]))
print("Top 0.1% of tokens:",df['num_tokens'].quantile([.999]))
print("Percentage of dockets with more than 4000 tokens:",round((df['num_tokens']>4000).mean()*100,2),"%")



Number of dockets: 218639

Total number of tokens in all dockets: 181,371,722

Number of tokens per docket:

Average number of tokens: 829.5
Median number of tokens: 538.0
Max number of tokens: 39388
Min number of tokens: 23
Quartiles for number of tokens:
0.25     411.0
0.50     538.0
0.75    1057.0
Name: num_tokens, dtype: float64
Top 10% of tokens: 0.9    1682.0
Name: num_tokens, dtype: float64
Top 5% of tokens: 0.95    2145.0
Name: num_tokens, dtype: float64
Top 1% of tokens: 0.99    3432.0
Name: num_tokens, dtype: float64
Top 0.1% of tokens: 0.999    6619.724
Name: num_tokens, dtype: float64
Percentage of dockets with more than 4000 tokens: 0.58 %


In [8]:
# main groups
df['nature'].value_counts()

nature
Imm - Appl. for leave & jud. review - IRB - Refugee                       105774
Imm - Appl. for leave & jud. review - Other Arising in Canada              37804
Imm - Appl. for leave & jud. review - Arising outside Canada               30698
Imm - Appl. for leave & jud. review - Pre-removal risk assessment          10856
Imm - Appl. for leave & jud. review - IRB - Refugee Appeal Division         9855
Imm - Appl. for leave & jud. review - IRB -Immigration Appeal Division      6757
Imm - Application for Judicial Review - Visa Officer                        4707
Imm - Appl. for leave & jud. review - IRB - Refugee Protection Div.         2943
Imm - Appl. for leave & judicial review & extension - CRDD                  2574
Imm - Appl. for leave & jud. review - IRB - Immigration Division            1776
Imm - Appl. for leave & judicial review - H&C                               1340
Imm - Appl. for leave & judicial review & extension - Others                1020
Imm - Appl. for leave

In [10]:
# Summary stats: Refugee cases only
df = df[df['nature'].str.contains("Refugee")]
print("Number of refugee law JR dockets:",len(df))
print()
print("Total number of tokens in all refugee law dockets:",f"{df['num_tokens'].sum():,}")
print()
print("Number of tokens per refugee law docket:")
print()
print("Average number of tokens:",round(df['num_tokens'].mean(),1))
print("Median number of tokens:",df['num_tokens'].median())
print("Max number of tokens:",df['num_tokens'].max())
print("Min number of tokens:",df['num_tokens'].min())
print("Quartiles for number of tokens:")
print(df['num_tokens'].quantile([.25, .5, .75]))
print("Top 10% of tokens:",df['num_tokens'].quantile([.9]))
print("Top 5% of tokens:",df['num_tokens'].quantile([.95]))
print("Top 1% of tokens:",df['num_tokens'].quantile([.99]))
print("Top 0.1% of tokens:",df['num_tokens'].quantile([.999]))
print("Percentage of dockets with more than 4000 tokens:",round((df['num_tokens']>4000).mean()*100,2),"%")

Number of refugee law JR dockets: 118572

Total number of tokens in all refugee law dockets: 83,823,756

Number of tokens per refugee law docket:

Average number of tokens: 706.9
Median number of tokens: 500.0
Max number of tokens: 18222
Min number of tokens: 99
Quartiles for number of tokens:
0.25    406.0
0.50    500.0
0.75    787.0
Name: num_tokens, dtype: float64
Top 10% of tokens: 0.9    1416.0
Name: num_tokens, dtype: float64
Top 5% of tokens: 0.95    1772.0
Name: num_tokens, dtype: float64
Top 1% of tokens: 0.99    2746.58
Name: num_tokens, dtype: float64
Top 0.1% of tokens: 0.999    4733.003
Name: num_tokens, dtype: float64
Percentage of dockets with more than 4000 tokens: 0.2 %


### Conclusion

181 million tokens for pre-training (all dockets)
84 million tokens for pre-training (refugee law dockets)

118k refugee law dockets for fine-tuning

While some dockets are lengthy (e.g. >20k tokens), the vast majority are under 4000 tokens, especially 
in the refugee law setting (i.e. 99.8% of dockets). That being said a substantial portion (50%) are over 500 tokens. It's fine if we have to discard the 0.2% of too long dockets (or truncate them in some way)

Upshot: Typical BERT model will not work. Need either to pre-process the dockets in some way to reduce length or work with model that can accommodate a larger context window. 