## Notebook to scrape all cases from the Refugee Law Lab Reporter website

Requirements:
    
    pip install requests
    pip install bs4
    pip install pandas
    pip install tqdm

(Produced on Python 3.9.12)

### Setup

In [1]:
import io
import json
import pathlib
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# set up progress bar
from tqdm import tqdm
tqdm.pandas()

# set up paths
out_path_yearly = pathlib.Path('DATA/YEARLY/')
# create the output tree up front: later cells write into DATA/ and DATA/YEARLY/,
# and pandas does not create missing parent directories when saving
out_path_yearly.mkdir(parents=True, exist_ok=True)

# set up years sought (inclusive range used for the yearly JSON export)
# NOTE(review): the scraped table also lists 2023 citations, which fall outside
# this range and are therefore not written to a yearly file -- confirm intended
start_year = 2019
end_year = 2022

### Scrape table of cases from RLL website

In [2]:
# get table from RLLR Website
url = 'https://refugeelab.ca/rllr/'
# a timeout keeps the cell from hanging forever on a stalled connection
r = requests.get(url, timeout=30)
print(r.status_code)
# stop here on an HTTP error instead of trying to parse an error page
r.raise_for_status()

# get first table
soup = BeautifulSoup(r.text, 'html.parser')
table = soup.find('table')
if table is None:
    raise ValueError(f'No <table> element found at {url}')

# put table into dataframe
# (wrap the markup in StringIO: passing a literal HTML string to read_html is deprecated)
df = pd.read_html(io.StringIO(str(table)))[0]

# add link to each row: the case page URL is the citation, lower-cased with spaces removed
base_url = 'https://refugeelab.ca/rllr/'
df['Link'] = df['Citation & Link'].apply(lambda x: base_url + x.replace(' ', '').lower())

# rename 'Citation & Link' to 'citation' and 'Date of Decision' to 'document_date'
df = df.rename(columns={'Citation & Link': 'citation', 'Date of Decision': 'document_date'})

# for all column names, replace spaces with underscores and lower
df.columns = [col.lower().replace(' ', '_') for col in df.columns]

df


200


Unnamed: 0,citation,country,case_type,document_date,rpd_number,link
0,2023 RLLR 90,Hong Kong,PSG: SOGIE,11/7/2023,VC3-07423,https://refugeelab.ca/rllr/2023rllr90
1,2023 RLLR 89,Colombia,Race/Ethnicity/Nationality,12/7/2023,VC3-07174,https://refugeelab.ca/rllr/2023rllr89
2,2023 RLLR 88,Israel,PSG: Gender Based Violence,12/8/2023,VC3-06509,https://refugeelab.ca/rllr/2023rllr88
3,2023 RLLR 87,Israel,No Nexus: Criminality/Corruption,6/8/2023,VC3-00122,https://refugeelab.ca/rllr/2023rllr87
4,2023 RLLR 86,Colombia,PSG: SOGIE,8/22/2023,VC2-10309,https://refugeelab.ca/rllr/2023rllr86
...,...,...,...,...,...,...
724,2019 RLLR 5,Haiti,PSG: SOGIE,10/17/2019,MB7-21566,https://refugeelab.ca/rllr/2019rllr5
725,2019 RLLR 4,Haiti,PSG: Gender Based Violence,7/4/2019,MB7-18975,https://refugeelab.ca/rllr/2019rllr4
726,2019 RLLR 3,Haiti,No Nexus: Criminality/Corruption,9/10/2019,MB7-18354,https://refugeelab.ca/rllr/2019rllr3
727,2019 RLLR 2,Nigeria,PSG: SOGIE,12/23/2019,TB9-01394,https://refugeelab.ca/rllr/2019rllr2


### Scrape html for each case in the table

In [3]:
# function to get the html from the link
def get_html(link, timeout=30, delay=.25):
    """Download one case page and return its HTML.

    Args:
        link: URL of the page to fetch.
        timeout: seconds to wait for the server before giving up.
        delay: seconds to pause after every request (politeness throttle).

    Returns:
        The response body as text when the server answers 200, otherwise None
        (non-200 status, timeout or connection error).
    """
    try:
        r = requests.get(link, timeout=timeout)
    except requests.RequestException:
        # one unreachable page should not abort the whole scrape;
        # treat it like any other failed download
        return None
    finally:
        # throttle after every attempt, not only the successful ones
        time.sleep(delay)
    if r.status_code == 200:
        return r.text
    return None

# download every case page (one request per row; failed downloads are stored as None)
df['html'] = df['link'].progress_apply(get_html)


100%|██████████| 729/729 [11:13<00:00,  1.08it/s]


In [4]:
# save the raw scrape as JSON Lines (one record per line)
df.to_json('DATA/rllr_cases_raw.jsonl', lines=True, orient='records')

df

Unnamed: 0,citation,country,case_type,document_date,rpd_number,link,html
0,2023 RLLR 90,Hong Kong,PSG: SOGIE,11/7/2023,VC3-07423,https://refugeelab.ca/rllr/2023rllr90,"<!DOCTYPE html>\n<html class=""html"" dir=""ltr"" ..."
1,2023 RLLR 89,Colombia,Race/Ethnicity/Nationality,12/7/2023,VC3-07174,https://refugeelab.ca/rllr/2023rllr89,"<!DOCTYPE html>\n<html class=""html"" dir=""ltr"" ..."
2,2023 RLLR 88,Israel,PSG: Gender Based Violence,12/8/2023,VC3-06509,https://refugeelab.ca/rllr/2023rllr88,"<!DOCTYPE html>\n<html class=""html"" dir=""ltr"" ..."
3,2023 RLLR 87,Israel,No Nexus: Criminality/Corruption,6/8/2023,VC3-00122,https://refugeelab.ca/rllr/2023rllr87,"<!DOCTYPE html>\n<html class=""html"" dir=""ltr"" ..."
4,2023 RLLR 86,Colombia,PSG: SOGIE,8/22/2023,VC2-10309,https://refugeelab.ca/rllr/2023rllr86,"<!DOCTYPE html>\n<html class=""html"" dir=""ltr"" ..."
...,...,...,...,...,...,...,...
724,2019 RLLR 5,Haiti,PSG: SOGIE,10/17/2019,MB7-21566,https://refugeelab.ca/rllr/2019rllr5,"<!DOCTYPE html>\n<html class=""html"" dir=""ltr"" ..."
725,2019 RLLR 4,Haiti,PSG: Gender Based Violence,7/4/2019,MB7-18975,https://refugeelab.ca/rllr/2019rllr4,"<!DOCTYPE html>\n<html class=""html"" dir=""ltr"" ..."
726,2019 RLLR 3,Haiti,No Nexus: Criminality/Corruption,9/10/2019,MB7-18354,https://refugeelab.ca/rllr/2019rllr3,"<!DOCTYPE html>\n<html class=""html"" dir=""ltr"" ..."
727,2019 RLLR 2,Nigeria,PSG: SOGIE,12/23/2019,TB9-01394,https://refugeelab.ca/rllr/2019rllr2,"<!DOCTYPE html>\n<html class=""html"" dir=""ltr"" ..."


### Parse html for each case

In [5]:
# reload the raw scrape from the JSON Lines file
df = pd.read_json('DATA/rllr_cases_raw.jsonl', lines=True, orient='records')

In [6]:
# function to extract text from html
def get_text(html):
    """Extract the readable text of a case page.

    Args:
        html: HTML source of the page, or a missing value (None/NaN) when the
            download failed.

    Returns:
        The text inside the div with id 'content', with runs of whitespace
        collapsed, at most one blank line between paragraphs and no leading
        newlines; None when html is not a string or the page has no such div.
    """
    # failed downloads are stored as None and may come back as NaN after a
    # round trip through a file, so accept only real strings
    if not isinstance(html, str):
        return None

    soup = BeautifulSoup(html, 'html.parser')

    # convert <br> to new line to preserve paragraphs
    for br in soup.find_all('br'):
        br.replace_with('\n')

    # insert a newline after each <p> tag to preserve paragraphs
    for p in soup.find_all('p'):
        p.insert_after('\n')

    # the text is taken from the div with id 'content'
    content = soup.find('div', {'id': 'content'})
    if content is None:
        # unexpected page layout: nothing to extract
        return None

    # replace non-breaking spaces (\xa0) with regular spaces
    text = content.text.replace('\xa0', ' ')

    # collapse whitespace within each line while keeping the line breaks
    text = '\n'.join(re.sub(r'\s+', ' ', line.strip()) for line in text.split('\n'))

    # allow at most two consecutive newlines (one blank line)
    text = re.sub(r'\n{3,}', '\n\n', text)

    # drop leading newlines
    return re.sub(r'^\n+', '', text)

df['text'] = df['html'].progress_apply(get_text)

# the raw html column is dropped once the text has been extracted
df.drop(columns=['html'], inplace=True)

100%|██████████| 729/729 [00:04<00:00, 160.47it/s]


In [7]:
# print the extracted text of the first case as a sanity check
print(df['text'][0])

2023 RLLR 90

Citation: 2023 RLLR 90
Tribunal: Refugee Protection Division
Date of Decision: November 7, 2023
Panel: Lorna Farmer
Counsel for the Claimant(s): Nalini Reddy, Carolina Fridman
Country: Hong Kong Special Administrative Region
RPD Number: VC3-07423
Associated RPD Number(s): N/A
ATIP Number: A-2023-01721
ATIP Pages: N/A

DECISION

[1] MEMBER: This is the decision of the Refugee Protection Division in the claim of XXXX XXXX XXXX, as a citizen of Hong Kong Special Administrative Region who is claiming refugee protection pursuant to section 96 and subsection 97(1) of the Immigration and Refugee Protection Act. In rendering this decision, I have considered and applied the Chairperson’s Guidelines on proceedings before the IRB involving sexual orientation, gender identity, and expression, and sex characteristics that offers guidance to promote greater understanding of cases involving gender identity and expression and the harm individuals may face due to their non-conformity with

In [8]:
# function to extract RPD_member from text
def get_member(text):
    """Extract the panel member's name from the text of a decision.

    Args:
        text: plain text of the decision, or a missing value (None/NaN).

    Returns:
        Whatever follows the first 'Panel:' label up to the end of that line,
        stripped of surrounding whitespace; None when text is not a string or
        contains no 'Panel:' label.
    """
    if not isinstance(text, str):
        return None
    # '.' does not match a newline, so the capture stops at the end of the line
    # and also works when 'Panel:' is on the last line without a trailing newline
    match = re.search(r'Panel:(.*)', text)
    if match is None:
        return None
    return match.group(1).strip()
    
# pull the panel member out of each decision's text
df['member'] = df['text'].progress_apply(get_member)

100%|██████████| 729/729 [00:00<00:00, 727838.04it/s]


In [9]:
# put in standard format for RLL (except "other" field)

# rename rpd_number -> citation2, link -> source_url, text -> unofficial_text
df = df.rename(columns={
    'rpd_number': 'citation2',
    'link': 'source_url',
    'text': 'unofficial_text',
})

# add the constant columns, the year (first 4 characters of the citation, as int)
# and the scrape date (today's date as a YYYY-MM-DD string)
df = df.assign(
    language="en",
    dataset="RLLR",
    year=df['citation'].str[:4].astype(int),
    name="",
    scraped_timestamp=pd.Timestamp.today().strftime('%Y-%m-%d'),
)

# "other" holds country, case_type and member for each row,
# serialised as a JSON object string
df['other'] = [
    json.dumps(record)
    for record in df[['country', 'case_type', 'member']].to_dict(orient='records')
]

# keep only the standard columns, in the standard order
list_cols = [
    'citation',
    'citation2',
    'dataset',
    'year',
    'name',
    'language',
    'document_date',
    'source_url',
    'scraped_timestamp',
    'unofficial_text',
    'other',
]

df = df[list_cols]


In [10]:
# export all cases to a single JSON Lines file (one record per line)
df.to_json('DATA/rllr_cases.jsonl', lines=True, orient='records')

df

Unnamed: 0,citation,citation2,dataset,year,name,language,document_date,source_url,scraped_timestamp,unofficial_text,other
0,2023 RLLR 90,VC3-07423,RLLR,2023,,en,11/7/2023,https://refugeelab.ca/rllr/2023rllr90,2024-09-08,2023 RLLR 90\n\nCitation: 2023 RLLR 90\nTribun...,"{""country"": ""Hong Kong"", ""case_type"": ""PSG: SO..."
1,2023 RLLR 89,VC3-07174,RLLR,2023,,en,12/7/2023,https://refugeelab.ca/rllr/2023rllr89,2024-09-08,2023 RLLR 89\n\nCitation: 2023 RLLR 89\nTribun...,"{""country"": ""Colombia"", ""case_type"": ""Race/Eth..."
2,2023 RLLR 88,VC3-06509,RLLR,2023,,en,12/8/2023,https://refugeelab.ca/rllr/2023rllr88,2024-09-08,2023 RLLR 88\n\nCitation: 2023 RLLR 88\nTribun...,"{""country"": ""Israel"", ""case_type"": ""PSG: Gende..."
3,2023 RLLR 87,VC3-00122,RLLR,2023,,en,6/8/2023,https://refugeelab.ca/rllr/2023rllr87,2024-09-08,2023 RLLR 87\n\nCitation: 2023 RLLR 87\nTribun...,"{""country"": ""Israel"", ""case_type"": ""No Nexus: ..."
4,2023 RLLR 86,VC2-10309,RLLR,2023,,en,8/22/2023,https://refugeelab.ca/rllr/2023rllr86,2024-09-08,2023 RLLR 86\n\nCitation: 2023 RLLR 86\nTribun...,"{""country"": ""Colombia"", ""case_type"": ""PSG: SOG..."
...,...,...,...,...,...,...,...,...,...,...,...
724,2019 RLLR 5,MB7-21566,RLLR,2019,,en,10/17/2019,https://refugeelab.ca/rllr/2019rllr5,2024-09-08,2019 RLLR 5\n\nCitation: 2019 RLLR 5\nTribunal...,"{""country"": ""Haiti"", ""case_type"": ""PSG: SOGIE""..."
725,2019 RLLR 4,MB7-18975,RLLR,2019,,en,7/4/2019,https://refugeelab.ca/rllr/2019rllr4,2024-09-08,2019 RLLR 4\n\nCitation: 2019 RLLR 4\nTribunal...,"{""country"": ""Haiti"", ""case_type"": ""PSG: Gender..."
726,2019 RLLR 3,MB7-18354,RLLR,2019,,en,9/10/2019,https://refugeelab.ca/rllr/2019rllr3,2024-09-08,2019 RLLR 3\n\nCitation: 2019 RLLR 3\nTribunal...,"{""country"": ""Haiti"", ""case_type"": ""No Nexus: C..."
727,2019 RLLR 2,TB9-01394,RLLR,2019,,en,12/23/2019,https://refugeelab.ca/rllr/2019rllr2,2024-09-08,2019 RLLR 2\n\nCitation: 2019 RLLR 2\nTribunal...,"{""country"": ""Nigeria"", ""case_type"": ""PSG: SOGI..."


In [11]:
# write the cleaned cases to a parquet file as well
df.to_parquet(path="DATA/rllr_cases.parquet")

In [12]:
# export cleaned df to yearly json files
# (only years in start_year..end_year inclusive get a file)
# make sure the target folder exists: pandas does not create missing directories
out_path_yearly.mkdir(parents=True, exist_ok=True)
for year in tqdm(range(start_year, end_year+1)):
    df[df.year == year].to_json(out_path_yearly / f'{year}.json', orient='records', indent=4)


100%|██████████| 4/4 [00:00<00:00, 249.95it/s]


In [13]:
# Direct to HF
# NOTE(review): absolute, machine-specific path -- this cell only works where this
# folder exists; presumably a local checkout of the Hugging Face dataset repo (confirm)
out_path_HF = pathlib.Path('d:/AI-Projects/canadian-legal-data/RLLR/train.parquet')
df.to_parquet(out_path_HF)


### Data verification   

In [14]:
# load the exported JSON Lines file
df = pd.read_json('DATA/rllr_cases.jsonl', lines=True, orient='records')

# decode the JSON string in "other" and spread its keys into their own columns
df['other'] = df['other'].apply(json.loads)
df[['country', 'case_type', 'member']] = df['other'].apply(pd.Series)
df

Unnamed: 0,citation,citation2,dataset,year,name,language,document_date,source_url,scraped_timestamp,unofficial_text,other,country,case_type,member
0,2023 RLLR 90,VC3-07423,RLLR,2023,,en,11/7/2023,https://refugeelab.ca/rllr/2023rllr90,2024-09-08,2023 RLLR 90\n\nCitation: 2023 RLLR 90\nTribun...,"{'country': 'Hong Kong', 'case_type': 'PSG: SO...",Hong Kong,PSG: SOGIE,Lorna Farmer
1,2023 RLLR 89,VC3-07174,RLLR,2023,,en,12/7/2023,https://refugeelab.ca/rllr/2023rllr89,2024-09-08,2023 RLLR 89\n\nCitation: 2023 RLLR 89\nTribun...,"{'country': 'Colombia', 'case_type': 'Race/Eth...",Colombia,Race/Ethnicity/Nationality,Ademiju Olatunji
2,2023 RLLR 88,VC3-06509,RLLR,2023,,en,12/8/2023,https://refugeelab.ca/rllr/2023rllr88,2024-09-08,2023 RLLR 88\n\nCitation: 2023 RLLR 88\nTribun...,"{'country': 'Israel', 'case_type': 'PSG: Gende...",Israel,PSG: Gender Based Violence,Stefan Martens
3,2023 RLLR 87,VC3-00122,RLLR,2023,,en,6/8/2023,https://refugeelab.ca/rllr/2023rllr87,2024-09-08,2023 RLLR 87\n\nCitation: 2023 RLLR 87\nTribun...,"{'country': 'Israel', 'case_type': 'No Nexus: ...",Israel,No Nexus: Criminality/Corruption,Zonia M. Ouchakov
4,2023 RLLR 86,VC2-10309,RLLR,2023,,en,8/22/2023,https://refugeelab.ca/rllr/2023rllr86,2024-09-08,2023 RLLR 86\n\nCitation: 2023 RLLR 86\nTribun...,"{'country': 'Colombia', 'case_type': 'PSG: SOG...",Colombia,PSG: SOGIE,Frank Fowlie
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
724,2019 RLLR 5,MB7-21566,RLLR,2019,,en,10/17/2019,https://refugeelab.ca/rllr/2019rllr5,2024-09-08,2019 RLLR 5\n\nCitation: 2019 RLLR 5\nTribunal...,"{'country': 'Haiti', 'case_type': 'PSG: SOGIE'...",Haiti,PSG: SOGIE,Ethan McMonagle
725,2019 RLLR 4,MB7-18975,RLLR,2019,,en,7/4/2019,https://refugeelab.ca/rllr/2019rllr4,2024-09-08,2019 RLLR 4\n\nCitation: 2019 RLLR 4\nTribunal...,"{'country': 'Haiti', 'case_type': 'PSG: Gender...",Haiti,PSG: Gender Based Violence,Nicole Ginsberg
726,2019 RLLR 3,MB7-18354,RLLR,2019,,en,9/10/2019,https://refugeelab.ca/rllr/2019rllr3,2024-09-08,2019 RLLR 3\n\nCitation: 2019 RLLR 3\nTribunal...,"{'country': 'Haiti', 'case_type': 'No Nexus: C...",Haiti,No Nexus: Criminality/Corruption,Me Jean-Guy Jam
727,2019 RLLR 2,TB9-01394,RLLR,2019,,en,12/23/2019,https://refugeelab.ca/rllr/2019rllr2,2024-09-08,2019 RLLR 2\n\nCitation: 2019 RLLR 2\nTribunal...,"{'country': 'Nigeria', 'case_type': 'PSG: SOGI...",Nigeria,PSG: SOGIE,Marcelle Bourassa


In [15]:
# function to extract rpd_number from text
def get_rpd_number(text):
    """Extract the RPD file number from the text of a decision.

    Args:
        text: plain text of the decision, or a missing value (None/NaN).

    Returns:
        Whatever follows the first 'RPD Number:' label up to the end of that
        line, stripped of surrounding whitespace; None when text is not a
        string or contains no 'RPD Number:' label.
    """
    if not isinstance(text, str):
        return None
    # '.' does not match a newline, so the capture stops at the end of the line
    # and also works when the label is on the last line without a trailing newline
    match = re.search(r'RPD Number:(.*)', text)
    if match is None:
        return None
    return match.group(1).strip()

df['rpd_number'] = df['unofficial_text'].progress_apply(get_rpd_number)

# show the rows where the RPD number found in the text differs from citation2
df.loc[df['rpd_number'] != df['citation2']]

100%|██████████| 729/729 [00:00<00:00, 729052.84it/s]


Unnamed: 0,citation,citation2,dataset,year,name,language,document_date,source_url,scraped_timestamp,unofficial_text,other,country,case_type,member,rpd_number


In [16]:
# preview the first five rows
df.head(n=5)


Unnamed: 0,citation,citation2,dataset,year,name,language,document_date,source_url,scraped_timestamp,unofficial_text,other,country,case_type,member,rpd_number
0,2023 RLLR 90,VC3-07423,RLLR,2023,,en,11/7/2023,https://refugeelab.ca/rllr/2023rllr90,2024-09-08,2023 RLLR 90\n\nCitation: 2023 RLLR 90\nTribun...,"{'country': 'Hong Kong', 'case_type': 'PSG: SO...",Hong Kong,PSG: SOGIE,Lorna Farmer,VC3-07423
1,2023 RLLR 89,VC3-07174,RLLR,2023,,en,12/7/2023,https://refugeelab.ca/rllr/2023rllr89,2024-09-08,2023 RLLR 89\n\nCitation: 2023 RLLR 89\nTribun...,"{'country': 'Colombia', 'case_type': 'Race/Eth...",Colombia,Race/Ethnicity/Nationality,Ademiju Olatunji,VC3-07174
2,2023 RLLR 88,VC3-06509,RLLR,2023,,en,12/8/2023,https://refugeelab.ca/rllr/2023rllr88,2024-09-08,2023 RLLR 88\n\nCitation: 2023 RLLR 88\nTribun...,"{'country': 'Israel', 'case_type': 'PSG: Gende...",Israel,PSG: Gender Based Violence,Stefan Martens,VC3-06509
3,2023 RLLR 87,VC3-00122,RLLR,2023,,en,6/8/2023,https://refugeelab.ca/rllr/2023rllr87,2024-09-08,2023 RLLR 87\n\nCitation: 2023 RLLR 87\nTribun...,"{'country': 'Israel', 'case_type': 'No Nexus: ...",Israel,No Nexus: Criminality/Corruption,Zonia M. Ouchakov,VC3-00122
4,2023 RLLR 86,VC2-10309,RLLR,2023,,en,8/22/2023,https://refugeelab.ca/rllr/2023rllr86,2024-09-08,2023 RLLR 86\n\nCitation: 2023 RLLR 86\nTribun...,"{'country': 'Colombia', 'case_type': 'PSG: SOG...",Colombia,PSG: SOGIE,Frank Fowlie,VC2-10309
