# Create Hugging Face dataset 

Code to create a Hugging Face dataset from downloaded Federal Court (FC) dockets.

Data posted to: https://huggingface.co/datasets/refugee-law-lab/luck-of-the-draw-iii

In [1]:
# Setup
import datetime
import os
import re

import pandas as pd
from tqdm import tqdm
# Hook tqdm into pandas; the cleaning cell below relies on DataFrame.progress_apply
tqdm.pandas()



In [2]:
# Load everything

# local parquet file
parquet_file = 'D:/RLLPDATA/fc_dockets_2022_12_01.parquet.gzip'

# Load dockets and renumber the rows 0..n-1, discarding the stored index.
# `drop` must be the boolean True: the string 'True' only worked because any
# non-empty string is truthy (the string 'False' would have dropped the index too).
df_full = pd.read_parquet(parquet_file, engine='pyarrow')
df_full = df_full.reset_index(drop=True)
print(len(df_full))


218639


In [3]:
# Clean the dockets

DOCKETdatestr = re.compile(r'-?\d+')

# Optional UTC offset ("+hhmm" / "-hhmm") sitting directly after the millisecond count
_DOCKET_UTC_OFFSET = re.compile(r'([+-])(\d{2})(\d{2})(?!\d)')

def parse_datetime(ts: str):
    """Convert a docket timestamp string into a 'YYYY-MM-DD' date string.

    The first (optionally negative) integer found in ``ts`` is read as
    milliseconds since the Unix epoch.

    If that integer is immediately followed by a ``+hhmm`` / ``-hhmm`` offset,
    the offset is applied to the UTC instant, so the resulting date does not
    depend on the timezone of the machine running the notebook and dates
    before 1970 work on every platform.
    NOTE(review): assumes such a suffix is a UTC offset, as in
    '/Date(1349064000000-0400)/' -- confirm against the raw docket data.

    Without an offset the previous behaviour is kept: the instant is converted
    using the machine's local timezone.

    Args:
        ts: raw timestamp string.

    Returns:
        The calendar date formatted as 'YYYY-MM-DD'.

    Raises:
        ValueError: if ``ts`` contains no integer at all.
    """
    first_number = DOCKETdatestr.search(ts)
    if first_number is None:
        raise ValueError(f'Unable to parse timestamp {ts} (pattern: {DOCKETdatestr.pattern})')
    millis = int(first_number.group())

    # Only accept an offset that starts exactly where the millisecond count ends
    offset = _DOCKET_UTC_OFFSET.match(ts, first_number.end())
    if offset:
        sign, hours, minutes = offset.groups()
        shift = datetime.timedelta(hours=int(hours), minutes=int(minutes))
        if sign == '-':
            shift = -shift
        # Pure arithmetic from the epoch: no dependency on the local timezone
        moment = datetime.datetime(1970, 1, 1) + datetime.timedelta(milliseconds=millis) + shift
    else:
        # No offset information available: fall back to local time as before
        moment = datetime.datetime.fromtimestamp(millis / 1000)
    return moment.date().strftime('%Y-%m-%d')
    
def get_documents(documents):
    """Build a slimmed-down list of docket entries.

    Each input entry is reduced to the keys RE_NO, DOCNO, DOC_DT and
    RECORDED_ENTRY; DOC_DT is passed through parse_datetime, the other
    values are copied unchanged. Input order is preserved.
    """
    return [
        {
            'RE_NO': entry['RE_NO'],
            'DOCNO': entry['DOCNO'],
            'DOC_DT': parse_datetime(entry['DOC_DT']),
            'RECORDED_ENTRY': entry['RECORDED_ENTRY'],
        }
        for entry in documents
    ]

def clean_row(row):
    """Fill in the file-level fields of one docket row and return the row.

    The file-level values are read from the last entry of row['documents'].
    The documents list is then replaced by its trimmed form (get_documents)
    and the date part of row['timestamp'] is stored as a string in
    'scraped_timestamp'. The row is modified in place.
    """
    last_entry = row['documents'][-1]

    row['name'] = last_entry['STYLE_OF_CAUSE']
    row['date_filed'] = parse_datetime(last_entry['FILING_DATE'])

    # Remaining file-level fields are plain copies: (target column, source key)
    plain_copies = (
        ('city_filed', 'ENGLISH_OFFICE_NAME'),
        ('nature', 'ENGLISH_NATURE_DESC'),
        ('class', 'ENGLISH_PROCEEDING_CLASS'),
        ('track', 'ENGLISH_TRACK_NAME'),
    )
    for column, source_key in plain_copies:
        row[column] = last_entry[source_key]

    row['documents'] = get_documents(row['documents'])
    row['scraped_timestamp'] = str(row['timestamp'].date())
    return row

# Run the row cleaner over every docket (tqdm shows progress)
df_full = df_full.progress_apply(clean_row, axis=1)

# Remove the raw 'timestamp' column and add the constant source_url column
df_full = (
    df_full
    .drop(columns=['timestamp'])
    .assign(source_url='https://www.fct-cf.gc.ca/en/court-files-and-decisions/court-files')
)

# Final column layout
col_order = [
    'citation', 'year', 'name',
    'date_filed', 'city_filed',
    'nature', 'class', 'track',
    'documents',
    'source_url', 'scraped_timestamp',
]
df_full = df_full.loc[:, col_order]

df_full.head()



  0%|          | 0/218639 [00:00<?, ?it/s]

100%|██████████| 218639/218639 [03:26<00:00, 1058.07it/s]


Unnamed: 0,citation,year,name,date_filed,city_filed,nature,class,track,documents,source_url,scraped_timestamp
0,IMM-10085-12,2012,EDITH VICTORIA CASTRO RODRIGUES v. MCI,2012-10-01,Toronto,Imm - Appl. for leave & jud. review - IRB - Re...,Non-Action,Immigration Leave & Judicial Review,"[{'RE_NO': 14, 'DOCNO': None, 'DOC_DT': '2013-...",https://www.fct-cf.gc.ca/en/court-files-and-de...,2022-11-23
1,IMM-10182-12,2012,ABDOU KHADIR SECK c. MCI,2012-10-04,Montréal,Imm - Appl. for leave & jud. review - IRB - Re...,Non-Action,Immigration Leave & Judicial Review,"[{'RE_NO': 7, 'DOCNO': None, 'DOC_DT': '2013-0...",https://www.fct-cf.gc.ca/en/court-files-and-de...,2022-11-23
2,IMM-10196-12,2012,CYRIL JOHN DA SILVA v. MCI,2012-10-04,Toronto,Imm - Appl. for leave & jud. review - IRB -Imm...,Non-Action,Immigration Leave & Judicial Review,"[{'RE_NO': 37, 'DOCNO': None, 'DOC_DT': '2017-...",https://www.fct-cf.gc.ca/en/court-files-and-de...,2022-11-23
3,IMM-10211-12,2012,ALISA POGORELOVSKY ET AL v. MCI,2012-10-05,Toronto,Imm - Appl. for leave & jud. review - IRB - Re...,Non-Action,Immigration Leave & Judicial Review,"[{'RE_NO': 9, 'DOCNO': None, 'DOC_DT': '2013-0...",https://www.fct-cf.gc.ca/en/court-files-and-de...,2022-11-23
4,IMM-10212-12,2012,DARIUSZ GLOWACKI ET AL v. MCI,2012-10-05,Toronto,Imm - Appl. for leave & jud. review - IRB - Re...,Non-Action,Immigration Leave & Judicial Review,"[{'RE_NO': 30, 'DOCNO': 17.0, 'DOC_DT': '2014-...",https://www.fct-cf.gc.ca/en/court-files-and-de...,2022-11-23


In [None]:
# export cleaned df to jsonl (one JSON record per line)
# create the target directory first so a fresh checkout does not fail on a missing folder
os.makedirs("output", exist_ok=True)
df_full.to_json("output/data.jsonl", orient='records', lines=True)


In [None]:
# export cleaned df to parquet
# create the target directory first so a fresh checkout does not fail on a missing folder
os.makedirs("output", exist_ok=True)
df_full.to_parquet("output/train.parquet")

In [9]:
# write a second parquet copy into the alternative directory
df_full.to_parquet(path="D:/AI-Projects/luck-of-the-draw-iii-data/luck-of-the-draw-iii/train.parquet")