### Notebook to parse text files to produce cleaned text of RPD (Legacy) decisions

Sean Rehaag

License: Creative Commons Attribution-NonCommercial 4.0 International (CC BY-NC 4.0). 

Dataset & Code to be cited as:

Sean Rehaag, "Refugee Protection Division (Legacy) Bulk Decisions Dataset" (2023), online: Refugee Law Laboratory <https://refugeelab.ca/bulk-data/rpd/>.

Notes:

(1) Data Source: In the fall of 2022 the Immigration and Refugee Board provided the RLL with a full backlog of approximately 116k published decisions from all divisions (RAD, RPD, ID, IAD). Because the IRB no longer regularly publishes RPD decisions, the dataset is no longer being updated, which is why we refer to it as a legacy dataset. For more recent RPD decisions (obtained via Access to Information Requests), see the RLLR dataset.

(2) Unofficial Data: The data are unofficial reproductions. For official versions, please contact the Immigration and Refugee Board. 

(3) Non-Affiliation / Endorsement: The data has been collected and reproduced without any affiliation or endorsement from the Immigration and Refugee Board.

(4) Non-Commercial Use: As indicated in the license, data may be used for non-commercial use (with attribution) only. For commercial use, please contact the Immigration and Refugee Board. 

(5) Accuracy: Data was collected and processed programmatically for the purposes of academic research. While we make best efforts to ensure accuracy, data gathering of this kind inevitably involves errors. As such the data should be viewed as preliminary information aimed to prompt further research and discussion, rather than as definitive information.

Acknowledgements: Thanks to Rafael Dolores, who coded the initial parsing scripts for the Refugee Appeal Division Bulk Decisions Dataset, which were modified for this dataset.


# Installing Libraries

In [1]:
#!pip install langdetect
#!pip install regex
#!pip install dask

# Importing Libraries

In [2]:
import os
import regex as re 
import pandas as pd
from datetime import datetime
from langdetect import detect, DetectorFactory
from difflib import get_close_matches
import json
from tqdm import tqdm
import chardet
import dask.bag as db
from dask.diagnostics import ProgressBar
import pathlib
from pathlib import Path

## Declaring Constants
Here, we specify the directories containing our data files.

In [3]:
# Directories that hold the decision text files (path relative to this notebook)
DATA_DIRS = ["../IRB Decisions - Initial Request - TEXT"]

# For SR:
# NOTE: this second assignment replaces the value set above, so only this
# directory is actually used when the cell runs as written.
DATA_DIRS = ["d:/IRB Decisions - Initial Request - TEXT/"]

In [4]:
# Fix the langdetect seed so that repeated runs give consistent,
# reproducible language detection results
DetectorFactory.seed = 42



## Language Detection
This function determines the language of a given text.

In [5]:
def detect_language(text):
    """Return the language detected for *text*, or None if detection fails.

    Args:
        text: The text to pass to langdetect's ``detect``.

    Returns:
        Whatever ``detect`` returns for the text, or None when ``detect``
        raises an exception.
    """
    try:
        return detect(text)
    except Exception:
        # Best effort: any detection failure simply yields None. Catching
        # Exception (rather than using a bare except) still lets
        # KeyboardInterrupt / SystemExit propagate so a run can be stopped.
        return None

## Decision Maker Extraction
This function searches the given file for the decision maker using regular expressions.

In [6]:
def extract_decision_maker(content):
    """Return the decision maker's name found next to 'Panel'/'Tribunal' labels.

    Three label layouts are tried in order. The first one that matches and
    whose captured text does not itself end in 'Tribunal' or 'Panel' wins.

    Args:
        content: Full text of a decision.

    Returns:
        The captured name with surrounding whitespace removed, or None when
        no layout yields an acceptable capture.
    """
    label_layouts = (
        # Text after a line-initial 'Panel', followed by a 'Tribunal' label
        r"^Panel\s*([^\n]+?)\s*\n\s*Tribunal\s*$",
        # Text after a line-initial 'Tribunal', followed by a 'Panel' label
        r"^Tribunal\s*([^\n]+?)\s*\n\s*Panel\s*$",
        # Text after a line-initial 'Tribunal', followed by another 'Tribunal'
        r"^Tribunal\s*([^\n]+?)\s*\n\s*Tribunal\s*$",
    )
    # MULTILINE lets ^ and $ anchor on individual lines; labels match in any case
    search_flags = re.IGNORECASE | re.MULTILINE

    for layout in label_layouts:
        found = re.search(layout, content, search_flags)
        if found is None:
            continue
        candidate = found.group(1).strip()
        # A capture that ends with a label word is rejected (case-sensitive check)
        if candidate.endswith(("Tribunal", "Panel")):
            continue
        return candidate
    return None



## Regular Expression Detector
Functions to parse the date from text files while accounting for several different formats

In [7]:
def match_date_patterns(content):
    """Find the decision date in *content* and return it as a list of tokens.

    The layouts are tried in the order listed below and the first regular
    expression that matches (case-insensitively) decides the result.

    Args:
        content: Full text of a decision.

    Returns:
        A list of string tokens taken from the matched date (for example
        ['January', '5', '2010'] or ['12', 'mars', '2009']), or None when no
        layout matches.
    """
    def groups_as_list(m):
        # Month, day and year are captured as three separate groups
        return [m.group(1), m.group(2), m.group(3)]

    def split_without_commas(m):
        # The whole date is one group; drop commas and split on whitespace
        return m.group(1).replace(',', '').split()

    def month_day_then_year(m):
        # 'Month day' is group 1 and the year is group 2
        return m.group(1).strip().split() + [m.group(2).strip()]

    layouts = {
        "custom": (
            r"Date (?:of decision|de la décision)\s*\n\s*([A-Za-z]+)\s+(\d{1,2})\.\s*(\d{4})",
            groups_as_list,
        ),
        "primary": (
            r"Date (?:of decision|de la décision)\s*(?:Le )?\s*((?:(?:\d{1,2}|1er)\s+[\w]+\s*,?\s*\d{1,4})|\w+\s+\d{1,2}(?:st|nd|rd|th)?\s*,?\s*\d{1,4}|\d{1,2}-\d{1,2}-\d{1,4})",
            split_without_commas,
        ),
        "original_decision": (
            r"Date of decision\s+([\w\s]+),\s+(\d{4})\s+\(original decision\)",
            month_day_then_year,
        ),
        "tribunal": (
            r"Tribunal\s*\n\s*([\w\s]+?)\s*\n\s*Date of decision",
            split_without_commas,
        ),
        "original": (
            r"Original\s+([\w]+\s+\d{1,2}(?:st|nd|rd|th)?,?\s+\d{4})",
            split_without_commas,
        ),
    }

    for expression, to_parts in layouts.values():
        hit = re.search(expression, content, re.IGNORECASE)
        if hit:
            return to_parts(hit)

    return None

## Date Formatter
Takes the date parts captured by the regular expressions above and converts them into one common format (YYYY-MM-DD).

In [8]:
# Lower-case, accent-free French month names mapped to their English names
french_to_english = {
    'janvier': 'January', 'fevrier': 'February', 'mars': 'March', 'avril': 'April',
    'mai': 'May', 'juin': 'June', 'juillet': 'July', 'aout': 'August',
    'septembre': 'September', 'octobre': 'October', 'novembre': 'November', 'decembre': 'December'
}


def correct_month_name(
    misspelled_month,
    possibilities=(
        'Janvier', 'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December',
    ),
    cutoff=0.6,
):
    """Return the closest known month name for a possibly misspelled month.

    Args:
        misspelled_month: Month name as found in the text.
        possibilities: Candidate names to fuzzy-match against. The default is
            an immutable tuple so it cannot be altered between calls.
        cutoff: Minimum similarity (0-1) passed to ``get_close_matches``.

    Returns:
        The best-matching candidate, translated to English when its
        lower-case form is a key of ``french_to_english``. If no candidate
        reaches the cutoff, *misspelled_month* is returned unchanged.
    """
    correct_months = get_close_matches(misspelled_month, possibilities, n=1, cutoff=cutoff)
    if not correct_months:
        return misspelled_month

    corrected_month = correct_months[0]
    # A French candidate (e.g. 'Janvier') is mapped to its English name
    return french_to_english.get(corrected_month.lower(), corrected_month)

def correct_year_typo(year):
    """Expand a three-character year string starting with '0' to a '20xx' year.

    For example '012' becomes '2012'. Any other string is returned unchanged.

    Args:
        year: Year as a string.

    Returns:
        The corrected year string.
    """
    return "20" + year[1:] if len(year) == 3 and year.startswith("0") else year

def process_numeric_format(parts):
    """Convert a numeric 'dd-mm-yyyy' date into a 'YYYY-MM-DD' string.

    Args:
        parts: Token list whose first element is the hyphen-separated date.

    Returns:
        The formatted date string, or None when the token does not split into
        exactly three pieces or does not describe a valid calendar date.
    """
    try:
        day, month, year = parts[0].split('-')
        year = correct_year_typo(year)
        return datetime(int(year), int(month), int(day)).date().strftime('%Y-%m-%d')
    except ValueError as e:
        # Report and return None, like the other date formatters in this
        # file, instead of letting one malformed date abort the run
        print(f"Error parsing date: {e}")
        return None

def process_day_first_format(parts, french_month_mapping):
    """Convert a 'day month year' date (French or English) to 'YYYY-MM-DD'.

    Args:
        parts: Date tokens, day first. The month and year may be separate
            tokens (['12', 'mars', '2009']) or glued together
            (['12', 'mars2009']). The day may be '1er'.
        french_month_mapping: Maps lower-case, accent-free French month names
            to month numbers.

    Returns:
        The formatted date string, or None when the tokens hold no month or
        no year, or when they do not describe a valid date.
    """
    if len(parts) < 2:
        # Only a day token: there is no month to work with
        return None

    def fold_accents(name):
        # Lower-case and drop the accents that occur in French month names
        return name.lower().replace('é', 'e').replace('û', 'u').replace('ô', 'o')

    try:
        day = 1 if parts[0].lower() == '1er' else int(parts[0])

        month = ''
        year = None

        if len(parts) == 2 and not parts[1].isdigit():
            # Month and year are concatenated: split at the first position
            # from which the rest of the token is all digits
            month_year_str = parts[1]
            for i in range(1, len(month_year_str)):
                if month_year_str[i:].isdigit():
                    month_str, year_str = month_year_str[:i], month_year_str[i:]
                    year = correct_year_typo(year_str)
                    # French names become a month number; anything else is
                    # kept as a capitalized name for the fuzzy lookup below
                    month = french_month_mapping.get(fold_accents(month_str), month_str.capitalize())
                    break
        else:
            month = fold_accents(parts[1])
            if len(parts) >= 3:
                year = correct_year_typo(parts[2])

        if year is None:
            # No year token was found, so no date can be built
            return None

        if month in french_month_mapping:
            return datetime(int(year), french_month_mapping[month], day).date().strftime('%Y-%m-%d')
        if isinstance(month, int):
            return datetime(int(year), month, day).date().strftime('%Y-%m-%d')

        corrected_month = correct_month_name(month.capitalize())
        return datetime.strptime(f"{corrected_month} {day} {year}", '%B %d %Y').date().strftime('%Y-%m-%d')
    except ValueError as e:
        print(f"Error parsing date: {e}")
        return None

def process_month_first_format(parts):
    """Convert a month-first date (optional ordinal suffix) to 'YYYY-MM-DD'.

    Handles these token layouts:
      * ['March', '5th', '2010'] - month, day, year as separate tokens
      * ['March', '52010']       - day and 4-digit year glued together
      * ['5mars', '2009']        - day and month glued, year separate

    Args:
        parts: Date tokens. The list is not modified.

    Returns:
        The formatted date string, or None when there are fewer than two
        tokens or the tokens do not describe a valid date.
    """
    if len(parts) < 2:
        # A lone token cannot hold both a month and a day
        return None

    month_token = parts[0]
    trailing = parts[1]
    day = 0
    year = ''

    try:
        if len(parts) == 2 and trailing.isdigit() and len(trailing) > 2:
            if len(trailing) > 4:
                # Day and year are concatenated: the last four digits are the year
                day = int(trailing[:-4])
                year = trailing[-4:]
            elif len(trailing) > 3:
                # The second token is the year; the first token is day + month
                glued = parts[0]
                for i in range(1, len(glued)):
                    if not glued[i].isdigit():
                        day_str, month_str = glued[:i], glued[i:]
                        if day_str.isdigit():
                            day = int(day_str)
                        month_token = french_to_english.get(
                            month_str.lower().replace('é', 'e').replace('û', 'u').replace('ô', 'o'),
                            month_str,
                        )
                        break
                year = trailing
            else:
                # Three digits: nothing is left for the day once the year is
                # taken, so int() raises ValueError and None is returned below
                day = int(trailing[:-4])
                year = correct_year_typo(trailing[-4:])
        else:
            # Keep only the digits of the day token (drops 'st', 'nd', ...);
            # a token without digits falls back to day 1
            digits = re.sub(r"[^\d]", "", trailing)
            day = int(digits) if digits.isdigit() else 1
            if len(parts) >= 3:
                year = correct_year_typo(parts[2])

        corrected_month = correct_month_name(month_token.capitalize())
        return datetime.strptime(f"{corrected_month} {day} {year}", '%B %d %Y').date().strftime('%Y-%m-%d')
    except ValueError as e:
        print(f"Error parsing date: {e}")
        return None


## Document Date Extraction
This function searches the given file for the document date using regular expressions, taking into account both French and English texts.

In [9]:
def process_date_parts(parts, french_month_mapping):
    """Pick the date formatter that fits the first token and return its result.

    Args:
        parts: Non-empty list of date tokens.
        french_month_mapping: Passed through to the day-first formatter.

    Returns:
        Whatever the selected formatter returns.
    """
    leading = parts[0]

    # A hyphen in the first token means a numeric date
    if '-' in leading:
        return process_numeric_format(parts)

    # A number (or the French ordinal '1er') first means the day comes first
    day_comes_first = leading.isdigit() or leading.lower() == '1er'
    if day_comes_first:
        return process_day_first_format(parts, french_month_mapping)

    # Anything else is treated as a month-first date
    return process_month_first_format(parts)

def extract_document_date(content):
    """Find the decision date in *content* and return it in the common format.

    Args:
        content: Full text of a decision.

    Returns:
        The result of ``process_date_parts`` for the matched date tokens, or
        None when no date pattern matches.
    """
    parts = match_date_patterns(content)
    if not parts:
        return None

    # Accent-free French month names numbered 1-12 in calendar order
    french_names = (
        'janvier', 'fevrier', 'mars', 'avril', 'mai', 'juin',
        'juillet', 'aout', 'septembre', 'octobre', 'novembre', 'decembre',
    )
    french_month_mapping = {
        name: number for number, name in enumerate(french_names, start=1)
    }

    return process_date_parts(parts, french_month_mapping)

## File Processor Helpers

In [10]:
def extract_rpd_number(content):
    """Extract the RPD file number from *content*, ignoring IAD/ID/RAD files.

    The text is scanned line by line. As soon as a line contains one of the
    ignore markers the file is rejected. On a line mentioning 'RPD File' the
    number is looked for on that line and then on the line right after it.
    If the scan finds nothing, texts that name the Refugee Protection
    Division fall back to the first number-like match anywhere in the text.

    Args:
        content: Full text of a decision.

    Returns:
        The file number (e.g. 'TA6-12745'), or None when the file is to be
        ignored or no number is found.
    """
    # Markers indicating the file belongs to another division
    ignore_lines = (
        "IAD File",
        "dossier de la SAI",
        "IMMIGRATION APPEAL DIVISION",
        "ID File",
        "IMMIGRATION DIVISION",
        "RAD File",
        "REFUGEE APPEAL DIVISION",
        "REFUGEE DIVISION",
    )

    # Split once; the position from enumerate identifies the following line
    # correctly even when the same line text occurs more than once
    lines = content.splitlines()

    for position, line in enumerate(lines):
        # Drop non-printable characters before testing the line
        sanitized_line = ''.join(c for c in line if c.isprintable()).strip()

        if any(ignore_line in sanitized_line for ignore_line in ignore_lines):
            return None

        if "RPD File" in sanitized_line or "RPD file" in sanitized_line:
            # The separator here is '.', i.e. any single character
            rpd_number_match = re.search(r"([A-Z]{2}\d+.\d+)", sanitized_line)
            if rpd_number_match:
                return rpd_number_match.group(1)

            # Otherwise the number may sit on the line that follows
            if position + 1 < len(lines):
                rpd_number_match = re.search(r"([A-Z]{2}\d+-\d+)", lines[position + 1])
                if rpd_number_match:
                    return rpd_number_match.group(1)

    if "REFUGEE PROTECTION DIVISION" in content or "SECTION DE LA PROTECTION DES RÉFUGIÉS" in content:
        rpd_number_match = re.search(r"([A-Z]{2}\d+-\d+)", content)
        if rpd_number_match:
            return rpd_number_match.group(1)

    return None

def process_file(file_path):
    """Read one decision file and build its dataset record.

    Args:
        file_path: Path of the text file to process.

    Returns:
        A dict with the record's fields, or None when the file is on the
        manual exclusion list or no RPD number can be extracted from it.
    """
    # Manually excluded problematic files, identified by file name without extension
    if Path(file_path).stem in ["1821419", "636351"]:
        return None

    # Detect the encoding from the raw bytes with chardet
    raw_bytes = Path(file_path).read_bytes()
    detected_encoding = chardet.detect(raw_bytes)['encoding']

    # Decode with the detected encoding; undecodable bytes are replaced
    with open(file_path, 'r', errors='replace', encoding=detected_encoding) as handle:
        content = handle.read()

    rpd_number = extract_rpd_number(content)
    if not rpd_number:
        return None

    lang = detect_language(content)
    decision_maker_name = extract_decision_maker(content)
    document_date = extract_document_date(content)

    # The year is the first component of the 'YYYY-MM-DD' date, when there is one
    if document_date:
        year = int(document_date.split('-')[0])
    else:
        year = None

    # The timestamp is the file's modification time, not the time of this run
    modified_at = datetime.fromtimestamp(os.path.getmtime(file_path))
    other_payload = json.dumps({'decision-maker_name': decision_maker_name}, ensure_ascii=False)

    return {
        'citation': rpd_number,
        'citation2': '',
        'dataset': 'RPD',
        'name': '',
        'source_url': os.path.basename(file_path),
        'scraped_timestamp': modified_at.strftime('%Y-%m-%d %H:%M:%S'),
        'document_date': document_date,
        'year': year,
        'unofficial_text': content,
        'language': lang,
        'other': other_payload,
    }

## Processing Files
This block of code reads each file in the dataset directories, extracts the needed information using the previously defined functions, and forms a Pandas dataframe.

In [11]:
# Main data processing loop (run in parallel using Dask)

def process_file_wrapper(file_path):
    """Process *file_path* unless its file name starts with '~'.

    Returns:
        The result of ``process_file``, or None for skipped files.
    """
    file_name = os.path.basename(file_path)
    if file_name.startswith('~'):
        return None
    return process_file(file_path)

# Collect the path of every entry found in each data directory that exists
file_paths = [
    os.path.join(data_dir, entry)
    for data_dir in DATA_DIRS
    if os.path.isdir(data_dir)
    for entry in os.listdir(data_dir)
]

# Wrap the paths in a Dask Bag so they can be processed in parallel
file_bag = db.from_sequence(file_paths)

# Run the per-file processing, discarding files that produced no record
with ProgressBar():
    processed_bag = file_bag.map(process_file_wrapper)
    results = processed_bag.filter(lambda x: x is not None).compute()

# One row per processed record
df = pd.DataFrame(results)

# # Process files using a regular loop for easier debugging if needed
# results = []
# for file_path in file_paths:
#     try:
#         result = process_file_wrapper(file_path)
#         if result is not None:
#             results.append(result)
#     except Exception as e:
#         print(f"Error processing file {file_path}: {e}")

# # Convert results to a Pandas DataFrame
# df = pd.DataFrame(results)

[########################################] | 100% Completed | 201.94 s


### Data cleaning
Cleans the data to match the Hugging Face dataset

In [12]:
# Manually fix some dates:

# Drop every row whose document_date is one of these known problem dates
# (the years are implausible - presumably typos in the source documents)
problem_dates = ["0013-10-11",
                 "0013-05-31", 
                 "3006-03-03", 
                 "1005-10-04", 
                 "3013-08-13",
                 "1015-06-03",
                 "1006-03-31"
                 ]
df = df[~df['document_date'].isin(problem_dates)]

# Normalise both date columns to 'YYYY-MM-DD' strings
# (this removes the time of day from scraped_timestamp)
df['document_date'] = pd.to_datetime(df['document_date']).dt.strftime('%Y-%m-%d')
df['scraped_timestamp'] = pd.to_datetime(df['scraped_timestamp']).dt.strftime('%Y-%m-%d')

print (len(df))
# drop rows where year is NaN (no document date was extracted for the file)
df = df.dropna(subset=['year'])
print (len(df))
# convert year to int
df['year'] = df['year'].astype(int)

# Remove rows where unofficial text is duplicated, keeping the last one
df = df.drop_duplicates(subset=['unofficial_text'], keep='last')
print (len(df))

# Keep only rows with 2001 < year <= 2020, i.e. years 2002 through 2020.
# NOTE(review): the comment here used to say "before 2001", but the strict
# comparison also removes 2001 itself - confirm which is intended.
df = df[(df['year'] > 2001) & (df['year'] <= 2020)]

# reset index
df = df.reset_index(drop=True)

# Display the cleaned dataframe as the cell output
df



13833
12472
12470


Unnamed: 0,citation,citation2,dataset,name,source_url,scraped_timestamp,document_date,year,unofficial_text,language,other
0,MA6-05104,,RPD,,1005371.txt,2023-11-12,2008-04-04,2008,Commission de l'immigration et du statut de ré...,fr,"{""decision-maker_name"": null}"
1,TA6-12745,,RPD,,1005377.txt,2023-11-12,2008-07-03,2008,\n\n\nRPD File No. / N° de dossier de la SPR :...,en,"{""decision-maker_name"": ""Joanna Bedard""}"
2,TA8-21155,,RPD,,1005397.txt,2023-11-12,2009-10-27,2009,\n\n\n\nN° de dossier de la SPR/RPD File No.: ...,fr,"{""decision-maker_name"": ""Cliff Berry""}"
3,MA6-00156,,RPD,,1005435.txt,2023-11-12,2008-06-06,2008,Commission de l'immigration\net du statut de r...,fr,"{""decision-maker_name"": null}"
4,TA7-02522,,RPD,,1005448.txt,2023-11-12,2009-09-17,2009,\n\nRPD File No. / N° de dossier de la SPR : ...,en,"{""decision-maker_name"": ""M. McCaffrey""}"
...,...,...,...,...,...,...,...,...,...,...,...
12462,TA6-11195,,RPD,,993958.txt,2023-11-13,2008-06-04,2008,\n\n\nN° de dossier de la SPR/RPD File No.: TA...,fr,"{""decision-maker_name"": ""E. Joanne Sajtos""}"
12463,TA7-14136,,RPD,,993959.txt,2023-11-13,2008-12-02,2008,\n\n\nRPD File No. / N° de dossier de la SPR :...,en,"{""decision-maker_name"": ""Roslyn Ahara""}"
12464,TA7-10081,,RPD,,993961.txt,2023-11-13,2008-08-15,2008,\n\n\nN° de dossier de la SPR/RPD File No.: TA...,fr,"{""decision-maker_name"": ""Ken Atkinson""}"
12465,MA7-01235,,RPD,,997313.txt,2023-11-13,2010-12-13,2010,Immigration and\nRefugee Board\nRefugee Protec...,en,"{""decision-maker_name"": null}"


In [13]:
# Clean text of cases
def clean_text(text):
    """Normalise whitespace in a decision's text and strip page markers.

    Args:
        text: Raw text of a decision.

    Returns:
        The text with non-breaking spaces replaced, runs of whitespace inside
        each line collapsed to one space, blank lines removed, and '[Page N]'
        marker lines replaced by a space.
    """
    # Non-breaking spaces become ordinary spaces
    normalized = text.replace('\xa0', ' ')

    # Trim each line and squeeze its inner whitespace, keeping the line breaks
    normalized = '\n'.join(
        re.sub(r'\s+', ' ', raw_line.strip())
        for raw_line in normalized.split('\n')
    )

    # Collapse consecutive newlines into a single newline
    normalized = re.sub(r'\n+', '\n', normalized)

    # Replace '[Page N]' markers (N of one to three digits) that sit alone
    # on a line, together with the surrounding newlines, by a single space
    return re.sub(r'\n\[Page \d{1,3}\]\n', ' ', normalized)

# Enable tqdm's progress_apply on pandas objects, then clean the text of
# every decision while showing a progress bar
tqdm.pandas()
df['unofficial_text'] = df.unofficial_text.progress_apply(clean_text)

100%|██████████| 12467/12467 [00:16<00:00, 762.76it/s]


### Exports

In [14]:
# export cleaned df to jsonl (one JSON record per line)
# NOTE(review): assumes the DATA/ directory already exists - confirm
out_path_parsed = pathlib.Path('DATA/rpd_cases.jsonl')
df.to_json(out_path_parsed, orient='records', lines=True)

In [15]:
# get start and end year
start_year = df['year'].min()
end_year = df['year'].max()

# set output dir
# NOTE(review): assumes the DATA/YEARLY/ directory already exists - confirm
out_path_yearly = 'DATA/YEARLY/'

# export cleaned df to yearly / language json files, named '<year>_<language>.json'.
# Only rows detected as 'en' or 'fr' are written; rows with any other
# (or no) detected language do not appear in any yearly file.
for year in tqdm(range(start_year, end_year+1)):
    for language in ['en', 'fr']:
        out_path_yearly_lang = out_path_yearly + f'{year}_{language}.json'
        df[(df.year == year) & (df.language == language)].to_json(out_path_yearly_lang, orient='records', indent=4)

100%|██████████| 19/19 [00:00<00:00, 31.26it/s]


In [16]:
# replace 'other' column with empty string for Hugging Face
# (this overwrites the decision-maker JSON in the in-memory dataframe)
df['other'] = ''

# export to parquet for Hugging Face
out_path_parquet = pathlib.Path('DATA/rpd_cases.parquet')
df.to_parquet(out_path_parquet)

In [17]:
# count the number of rows for each value of df.year
df['year'].value_counts()

year
2012    1639
2011    1557
2010    1265
2013    1262
2009     938
2014     897
2007     872
2006     850
2008     744
2005     551
2015     455
2018     340
2017     255
2003     238
2016     194
2004     135
2019     133
2002     121
2020      21
Name: count, dtype: int64