## Notebook to scrape all cases from Refugee Law Lab Reporter Website

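Requirements:
    
    pip install requests
    pip install bs4
    pip install pandas
    pip install tqdm
    pip install lxml       # parser backend needed by pandas.read_html
    pip install pyarrow    # engine needed by DataFrame.to_parquet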

(Produced on Python 3.9.12)

### Setup

In [1]:
import io
import json
import pathlib
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# set up progress bar
from tqdm import tqdm
tqdm.pandas()

# set up paths
# Create the output folders up front: every export below writes into
# 'DATA/' or 'DATA/YEARLY/', and pandas does not create missing directories.
out_path_yearly = pathlib.Path('DATA/YEARLY/')
out_path_yearly.mkdir(parents=True, exist_ok=True)

# set up years sought (inclusive range used for the yearly export)
start_year = 2019
end_year = 2022

### Scrape table of cases from RLL website

In [3]:
# get table from RLLR Website
url = 'https://refugeelab.ca/rllr/'
r = requests.get(url, timeout=30)
print(r.status_code)
# stop on an HTTP error instead of parsing an error page as if it were the index
r.raise_for_status()

# get first table
soup = BeautifulSoup(r.text, 'html.parser')
table = soup.find('table')
if table is None:
    raise ValueError(f'No <table> element found at {url}')

# put table into dataframe (wrapped in StringIO because passing literal
# html to read_html is deprecated in recent pandas versions)
df = pd.read_html(io.StringIO(str(table)))[0]

# add link to each row, e.g. '2022 RLLR 140' -> base_url + '2022rllr140'
base_url = url
df['Link'] = base_url + df['Citation & Link'].str.replace(' ', '', regex=False).str.lower()

# change 'Citation & Link' to 'citation' and 'Date of Decision' to 'document_date'
df = df.rename(columns={
    'Citation & Link': 'citation',
    'Date of Decision': 'document_date',
})

# for all column names, replace spaces with underscores and lower
df.columns = [col.lower().replace(' ', '_') for col in df.columns]

df


200


Unnamed: 0,citation,country,case_type,document_date,rpd_number,link
0,2022 RLLR 140,Bahamas,PSG: Gender Based Violence,5/13/2022,TC0-01092,https://refugeelab.ca/rllr/2022rllr140
1,2022 RLLR 138,Iran,Political Opinion,8/9/2022,TB9-35046,https://refugeelab.ca/rllr/2022rllr138
2,2022 RLLR 137,Iran,Religion,9/26/2022,TB9-30696,https://refugeelab.ca/rllr/2022rllr137
3,2022 RLLR 136,Russia,Political Opinion,1/18/2022,TB9-27796,https://refugeelab.ca/rllr/2022rllr136
4,2022 RLLR 135,China,Political Opinion,5/4/2022,TB9-27542,https://refugeelab.ca/rllr/2022rllr135
...,...,...,...,...,...,...
636,2019 RLLR 5,Haiti,PSG: SOGIE,10/17/2019,MB7-21566,https://refugeelab.ca/rllr/2019rllr5
637,2019 RLLR 4,Haiti,PSG: Gender Based Violence,7/4/2019,MB7-18975,https://refugeelab.ca/rllr/2019rllr4
638,2019 RLLR 3,Haiti,No Nexus: Criminality/Corruption,9/10/2019,MB7-18354,https://refugeelab.ca/rllr/2019rllr3
639,2019 RLLR 2,Nigeria,PSG: SOGIE,12/23/2019,TB9-01394,https://refugeelab.ca/rllr/2019rllr2


### Scrape html for each case in the table

In [4]:
# function to get the html from the link
def get_html(link, timeout=30, delay=.25):
    """Download one case page and return its html, or None on failure.

    Parameters
    ----------
    link : str
        URL of the case page.
    timeout : float, optional
        Seconds to wait for the server before giving up on the request.
    delay : float, optional
        Seconds to pause after every attempt (rate limiting).

    Returns
    -------
    str or None
        The response body when the server answers 200; None for any other
        status code or when the request itself fails.
    """
    try:
        r = requests.get(link, timeout=timeout)
    except requests.RequestException:
        # one dropped connection should not abort the whole scrape;
        # the row simply gets no html
        return None
    finally:
        # pause after every attempt, successful or not, so failed requests
        # do not hammer the server back-to-back
        time.sleep(delay)

    if r.status_code == 200:
        return r.text
    return None

df['html'] = df.link.progress_apply(get_html)


100%|██████████| 641/641 [11:07<00:00,  1.04s/it]


In [5]:
# write the scraped table, html column included, as one JSON record per line
raw_jsonl_path = 'DATA/rllr_cases_raw.jsonl'
df.to_json(raw_jsonl_path, lines=True, orient='records')

df

Unnamed: 0,citation,country,case_type,document_date,rpd_number,link,html
0,2022 RLLR 140,Bahamas,PSG: Gender Based Violence,5/13/2022,TC0-01092,https://refugeelab.ca/rllr/2022rllr140,"<!DOCTYPE html>\n<html class=""html"" dir=""ltr"" ..."
1,2022 RLLR 138,Iran,Political Opinion,8/9/2022,TB9-35046,https://refugeelab.ca/rllr/2022rllr138,"<!DOCTYPE html>\n<html class=""html"" dir=""ltr"" ..."
2,2022 RLLR 137,Iran,Religion,9/26/2022,TB9-30696,https://refugeelab.ca/rllr/2022rllr137,"<!DOCTYPE html>\n<html class=""html"" dir=""ltr"" ..."
3,2022 RLLR 136,Russia,Political Opinion,1/18/2022,TB9-27796,https://refugeelab.ca/rllr/2022rllr136,"<!DOCTYPE html>\n<html class=""html"" dir=""ltr"" ..."
4,2022 RLLR 135,China,Political Opinion,5/4/2022,TB9-27542,https://refugeelab.ca/rllr/2022rllr135,"<!DOCTYPE html>\n<html class=""html"" dir=""ltr"" ..."
...,...,...,...,...,...,...,...
636,2019 RLLR 5,Haiti,PSG: SOGIE,10/17/2019,MB7-21566,https://refugeelab.ca/rllr/2019rllr5,"<!DOCTYPE html>\n<html class=""html"" dir=""ltr"" ..."
637,2019 RLLR 4,Haiti,PSG: Gender Based Violence,7/4/2019,MB7-18975,https://refugeelab.ca/rllr/2019rllr4,"<!DOCTYPE html>\n<html class=""html"" dir=""ltr"" ..."
638,2019 RLLR 3,Haiti,No Nexus: Criminality/Corruption,9/10/2019,MB7-18354,https://refugeelab.ca/rllr/2019rllr3,"<!DOCTYPE html>\n<html class=""html"" dir=""ltr"" ..."
639,2019 RLLR 2,Nigeria,PSG: SOGIE,12/23/2019,TB9-01394,https://refugeelab.ca/rllr/2019rllr2,"<!DOCTYPE html>\n<html class=""html"" dir=""ltr"" ..."


### Parse html for each case

In [31]:
# read the raw scrape back from its JSON Lines file
raw_jsonl_path = 'DATA/rllr_cases_raw.jsonl'
df = pd.read_json(raw_jsonl_path, lines=True, orient='records')

In [32]:
# function to extract text from html
def get_text(html):
    """Extract the cleaned, paragraph-preserving text of one case page.

    Parameters
    ----------
    html : str or None
        Full html of a case page. Anything that is not a string (None, NaN)
        is treated as a missing page.

    Returns
    -------
    str or None
        Text of the ``<div id="content">`` element with non-breaking spaces
        replaced by spaces, whitespace collapsed within each line, at most
        one blank line between paragraphs and no leading newlines. None when
        the page is missing or contains no such element.
    """
    # failed downloads are stored as None; depending on how pandas loaded
    # the column they may also show up as NaN, so reject every non-string
    if not isinstance(html, str):
        return None

    soup = BeautifulSoup(html, 'html.parser')

    # convert <br> to new line to preserve paragraphs
    for br in soup.find_all('br'):
        br.replace_with('\n')

    # Insert newline characters after each <p> tag to preserve paragraphs
    for p in soup.find_all('p'):
        p.insert_after('\n')

    # the case text is taken from the div with id 'content'; a page without
    # it yields no text rather than an AttributeError
    content = soup.find('div', {'id': 'content'})
    if content is None:
        return None

    # remove \xa0 (non-breaking spaces)
    text = content.text.replace('\xa0', ' ')

    # collapse runs of whitespace inside each line while keeping the line breaks
    text = '\n'.join(re.sub(r'\s+', ' ', line.strip()) for line in text.split('\n'))

    # remove more than two newlines in a row
    text = re.sub(r'\n{3,}', '\n\n', text)

    # if text starts with one or more newlines, remove them
    return text.lstrip('\n')

# parse every downloaded page into plain text
df['text'] = df['html'].progress_apply(get_text)

# drop the html column, keeping only the extracted text
df = df.drop(columns='html')

100%|██████████| 641/641 [00:03<00:00, 163.41it/s]


In [33]:
# spot check: show the extracted text of the row labelled 0
first_case_text = df.loc[0, 'text']
print(first_case_text)

2022 RLLR 140

Citation: 2022 RLLR 140
Tribunal: Refugee Protection Division
Date of Decision: May 13, 2022
Panel: Joseph Berkovits
Counsel for the Claimant(s): Tamal Whitfield
Country: Bahamas
RPD Number: TC0-01092
Associated RPD Number(s): TC0-01121
ATIP Number: A-2023-01023
ATIP Pages: N/A

REASONS FOR DECISION

[1] XXXX (the principal claimant), and her son, XXXX (the minor claimant), citizens of the Bahamas, claim refugee protection pursuant to section 96 and 97(1) of the Immigration and Refugee Protection Act, (the IRPA).

[2] The principal claimant was the designated representative for the minor claimant.[1]

[3] The panel heard the claims jointly, pursuant to Refugee Protection Division Rule 55.[2]

[4] The panel has considered and applied the Chairperson’s Guideline 4: Women Claimants Fearing Gender-Related Persecution.[3]

DECISION

[5] For the reasons given below, the panel finds that the claimants are Convention refugees because they have established that they will face a s

In [34]:
# function to extract RPD_member from text
def get_member(text):
    """Return the name that follows 'Panel:' in a case text.

    Parameters
    ----------
    text : str or None
        Extracted case text. Non-string values (None, NaN) mean no text.

    Returns
    -------
    str or None
        The rest of the first line containing 'Panel:', stripped of
        surrounding whitespace; None when there is no text or no such line.
    """
    if not isinstance(text, str):
        return None

    # '.' never matches a newline, so the capture stops at the end of the
    # 'Panel:' line whether or not a newline follows it (a trailing '\n' in
    # the pattern would fail when 'Panel:' is on the very last line)
    match = re.search(r'Panel:(.*)', text)
    if match is None:
        return None
    return match.group(1).strip()
    
# pull the panel member's name out of every case text
df['member'] = df['text'].progress_apply(get_member)

100%|██████████| 641/641 [00:00<00:00, 641811.62it/s]


In [35]:
# put in standard format for RLL (except "other" field)

# rename to the standard column names:
#   rpd_number -> citation2, link -> source_url, text -> unofficial_text
df = df.rename(columns={
    'rpd_number': 'citation2',
    'link': 'source_url',
    'text': 'unofficial_text',
})

# create language column with value "en"
df['language'] = "en"

# create dataset column with value "RLLR"
df['dataset'] = "RLLR"

# create year column from the first 4 characters of citation
df['year'] = df['citation'].str[:4].astype(int)

# add a name column with value ""
df['name'] = ""

# add a scraped timestamp column with today's date in YYYY-MM-DD format as a string
# NOTE: this is the date this cell runs; when the raw file was reloaded from
# disk it can differ from the day the pages were actually downloaded
df['scraped_timestamp'] = pd.Timestamp.today().strftime('%Y-%m-%d')

# add a column for other that takes "country", "case_type", and "member" columns
# and puts them in a dictionary using valid json format (as a string).
# Missing values are written as null: json.dumps would otherwise emit the
# bare token NaN, which is not valid JSON.
other_cols = ['country', 'case_type', 'member']
df['other'] = [
    json.dumps({key: (None if pd.isna(value) else value) for key, value in record.items()})
    for record in df[other_cols].to_dict(orient='records')
]

# reorder columns
list_cols = ['citation',
             'citation2',
             'dataset',
             'year',
             'name',
             'language',
             'document_date',
             'source_url',
             'scraped_timestamp',
             'unofficial_text',
             'other',
             ]

# copy so the result is an independent frame rather than a view-like selection
df = df[list_cols].copy()


In [36]:
# write all cleaned cases to a single JSON Lines file
cases_jsonl_path = 'DATA/rllr_cases.jsonl'
df.to_json(cases_jsonl_path, lines=True, orient='records')

df

Unnamed: 0,citation,citation2,dataset,year,name,language,document_date,source_url,scraped_timestamp,unofficial_text,other
0,2022 RLLR 140,TC0-01092,RLLR,2022,,en,5/13/2022,https://refugeelab.ca/rllr/2022rllr140,2024-08-06,2022 RLLR 140\n\nCitation: 2022 RLLR 140\nTrib...,"{""country"": ""Bahamas"", ""case_type"": ""PSG: Gend..."
1,2022 RLLR 138,TB9-35046,RLLR,2022,,en,8/9/2022,https://refugeelab.ca/rllr/2022rllr138,2024-08-06,2022 RLLR 138\n\nCitation: 2022 RLLR 138\nTrib...,"{""country"": ""Iran"", ""case_type"": ""Political Op..."
2,2022 RLLR 137,TB9-30696,RLLR,2022,,en,9/26/2022,https://refugeelab.ca/rllr/2022rllr137,2024-08-06,2022 RLLR 137\n\nCitation: 2022 RLLR 137\nTrib...,"{""country"": ""Iran"", ""case_type"": ""Religion"", ""..."
3,2022 RLLR 136,TB9-27796,RLLR,2022,,en,1/18/2022,https://refugeelab.ca/rllr/2022rllr136,2024-08-06,2022 RLLR 136\n\nCitation: 2022 RLLR 136\nTrib...,"{""country"": ""Russia"", ""case_type"": ""Political ..."
4,2022 RLLR 135,TB9-27542,RLLR,2022,,en,5/4/2022,https://refugeelab.ca/rllr/2022rllr135,2024-08-06,2022 RLLR 135\n\nCitation: 2022 RLLR 135\nTrib...,"{""country"": ""China"", ""case_type"": ""Political O..."
...,...,...,...,...,...,...,...,...,...,...,...
636,2019 RLLR 5,MB7-21566,RLLR,2019,,en,10/17/2019,https://refugeelab.ca/rllr/2019rllr5,2024-08-06,2019 RLLR 5\n\nCitation: 2019 RLLR 5\nTribunal...,"{""country"": ""Haiti"", ""case_type"": ""PSG: SOGIE""..."
637,2019 RLLR 4,MB7-18975,RLLR,2019,,en,7/4/2019,https://refugeelab.ca/rllr/2019rllr4,2024-08-06,2019 RLLR 4\n\nCitation: 2019 RLLR 4\nTribunal...,"{""country"": ""Haiti"", ""case_type"": ""PSG: Gender..."
638,2019 RLLR 3,MB7-18354,RLLR,2019,,en,9/10/2019,https://refugeelab.ca/rllr/2019rllr3,2024-08-06,2019 RLLR 3\n\nCitation: 2019 RLLR 3\nTribunal...,"{""country"": ""Haiti"", ""case_type"": ""No Nexus: C..."
639,2019 RLLR 2,TB9-01394,RLLR,2019,,en,12/23/2019,https://refugeelab.ca/rllr/2019rllr2,2024-08-06,2019 RLLR 2\n\nCitation: 2019 RLLR 2\nTribunal...,"{""country"": ""Nigeria"", ""case_type"": ""PSG: SOGI..."


In [37]:
# write the cleaned cases to a parquet file as well
cases_parquet_path = "DATA/rllr_cases.parquet"
df.to_parquet(cases_parquet_path)

In [38]:
# write one indented JSON file per year in the inclusive range start_year..end_year
years_sought = range(start_year, end_year + 1)
for year in tqdm(years_sought):
    cases_for_year = df.loc[df['year'] == year]
    yearly_file = out_path_yearly / f'{year}.json'
    cases_for_year.to_json(yearly_file, orient='records', indent=4)


100%|██████████| 4/4 [00:00<00:00, 235.35it/s]


In [39]:
# Direct to HF
# NOTE: this is a hard-coded absolute local path. When its folder is not
# present, skip the export with a message instead of failing the run.
out_path_HF = pathlib.Path('d:/AI-Projects/canadian-legal-data/RLLR/train.parquet')
if out_path_HF.parent.is_dir():
    df.to_parquet(out_path_HF)
else:
    print(f'Skipping HF export: folder {out_path_HF.parent} does not exist')


### Data verification   

In [28]:
#load json
df = pd.read_json('DATA/rllr_cases.jsonl', orient='records', lines=True)

# convert each item in other dictionary to column
df['other'] = df['other'].apply(json.loads)

# Build the three columns in one step and select them by name: assigning a
# DataFrame to several columns is positional, so relying on the key order
# inside each dict could silently mislabel columns. This also avoids
# constructing one Series per row.
other_cols = ['country', 'case_type', 'member']
df[other_cols] = pd.DataFrame(df['other'].tolist(), index=df.index)[other_cols]
df

Unnamed: 0,citation,citation2,dataset,year,name,language,document_date,source_url,scraped_timestamp,unofficial_text,other,country,case_type,member
0,2022 RLLR 140,TC0-01092,RLLR,2022,,en,5/13/2022,https://refugeelab.ca/rllr/2022rllr140,2024-08-06,2022 RLLR 140\n\nCitation: 2022 RLLR 140\nTrib...,"{'country': 'Bahamas', 'case_type': 'PSG: Gend...",Bahamas,PSG: Gender Based Violence,Joseph Berkovits
1,2022 RLLR 138,TB9-35046,RLLR,2022,,en,8/9/2022,https://refugeelab.ca/rllr/2022rllr138,2024-08-06,2022 RLLR 138\n\nCitation: 2022 RLLR 138\nTrib...,"{'country': 'Iran', 'case_type': 'Political Op...",Iran,Political Opinion,Victoria Bragues
2,2022 RLLR 137,TB9-30696,RLLR,2022,,en,9/26/2022,https://refugeelab.ca/rllr/2022rllr137,2024-08-06,2022 RLLR 137\n\nCitation: 2022 RLLR 137\nTrib...,"{'country': 'Iran', 'case_type': 'Religion', '...",Iran,Religion,Milton Israel
3,2022 RLLR 136,TB9-27796,RLLR,2022,,en,1/18/2022,https://refugeelab.ca/rllr/2022rllr136,2024-08-06,2022 RLLR 136\n\nCitation: 2022 RLLR 136\nTrib...,"{'country': 'Russia', 'case_type': 'Political ...",Russia,Political Opinion,Cristina De Leon
4,2022 RLLR 135,TB9-27542,RLLR,2022,,en,5/4/2022,https://refugeelab.ca/rllr/2022rllr135,2024-08-06,2022 RLLR 135\n\nCitation: 2022 RLLR 135\nTrib...,"{'country': 'China', 'case_type': 'Political O...",China,Political Opinion,Suraj Balakrishnan
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
636,2019 RLLR 5,MB7-21566,RLLR,2019,,en,10/17/2019,https://refugeelab.ca/rllr/2019rllr5,2024-08-06,2019 RLLR 5\n\nCitation: 2019 RLLR 5\nTribunal...,"{'country': 'Haiti', 'case_type': 'PSG: SOGIE'...",Haiti,PSG: SOGIE,Ethan McMonagle
637,2019 RLLR 4,MB7-18975,RLLR,2019,,en,7/4/2019,https://refugeelab.ca/rllr/2019rllr4,2024-08-06,2019 RLLR 4\n\nCitation: 2019 RLLR 4\nTribunal...,"{'country': 'Haiti', 'case_type': 'PSG: Gender...",Haiti,PSG: Gender Based Violence,Nicole Ginsberg
638,2019 RLLR 3,MB7-18354,RLLR,2019,,en,9/10/2019,https://refugeelab.ca/rllr/2019rllr3,2024-08-06,2019 RLLR 3\n\nCitation: 2019 RLLR 3\nTribunal...,"{'country': 'Haiti', 'case_type': 'No Nexus: C...",Haiti,No Nexus: Criminality/Corruption,Me Jean-Guy Jam
639,2019 RLLR 2,TB9-01394,RLLR,2019,,en,12/23/2019,https://refugeelab.ca/rllr/2019rllr2,2024-08-06,2019 RLLR 2\n\nCitation: 2019 RLLR 2\nTribunal...,"{'country': 'Nigeria', 'case_type': 'PSG: SOGI...",Nigeria,PSG: SOGIE,Marcelle Bourassa


In [29]:
# function to extract rpd_number from text
def get_rpd_number(text):
    """Return the value that follows 'RPD Number:' in a case text.

    Parameters
    ----------
    text : str or None
        Extracted case text. Non-string values (None, NaN) mean no text.

    Returns
    -------
    str or None
        The rest of the first line containing 'RPD Number:', stripped of
        surrounding whitespace; None when there is no text or no such line.
    """
    if not isinstance(text, str):
        return None

    # '.' never matches a newline, so the capture stops at the end of the
    # 'RPD Number:' line even when it is the last line of the text
    match = re.search(r'RPD Number:(.*)', text)
    if match is None:
        return None
    return match.group(1).strip()

# re-derive the RPD number from each case text
df['rpd_number'] = df['unofficial_text'].progress_apply(get_rpd_number)

# show the rows where the number parsed from the text differs from citation2
rpd_mismatch = df['rpd_number'] != df['citation2']
df.loc[rpd_mismatch]

100%|██████████| 641/641 [00:00<00:00, 640588.24it/s]


Unnamed: 0,citation,citation2,dataset,year,name,language,document_date,source_url,scraped_timestamp,unofficial_text,other,country,case_type,member,rpd_number


In [30]:
# preview the first five rows
df.head(5)


Unnamed: 0,citation,citation2,dataset,year,name,language,document_date,source_url,scraped_timestamp,unofficial_text,other,country,case_type,member,rpd_number
0,2022 RLLR 140,TC0-01092,RLLR,2022,,en,5/13/2022,https://refugeelab.ca/rllr/2022rllr140,2024-08-06,2022 RLLR 140\n\nCitation: 2022 RLLR 140\nTrib...,"{'country': 'Bahamas', 'case_type': 'PSG: Gend...",Bahamas,PSG: Gender Based Violence,Joseph Berkovits,TC0-01092
1,2022 RLLR 138,TB9-35046,RLLR,2022,,en,8/9/2022,https://refugeelab.ca/rllr/2022rllr138,2024-08-06,2022 RLLR 138\n\nCitation: 2022 RLLR 138\nTrib...,"{'country': 'Iran', 'case_type': 'Political Op...",Iran,Political Opinion,Victoria Bragues,TB9-35046
2,2022 RLLR 137,TB9-30696,RLLR,2022,,en,9/26/2022,https://refugeelab.ca/rllr/2022rllr137,2024-08-06,2022 RLLR 137\n\nCitation: 2022 RLLR 137\nTrib...,"{'country': 'Iran', 'case_type': 'Religion', '...",Iran,Religion,Milton Israel,TB9-30696
3,2022 RLLR 136,TB9-27796,RLLR,2022,,en,1/18/2022,https://refugeelab.ca/rllr/2022rllr136,2024-08-06,2022 RLLR 136\n\nCitation: 2022 RLLR 136\nTrib...,"{'country': 'Russia', 'case_type': 'Political ...",Russia,Political Opinion,Cristina De Leon,TB9-27796
4,2022 RLLR 135,TB9-27542,RLLR,2022,,en,5/4/2022,https://refugeelab.ca/rllr/2022rllr135,2024-08-06,2022 RLLR 135\n\nCitation: 2022 RLLR 135\nTrib...,"{'country': 'China', 'case_type': 'Political O...",China,Political Opinion,Suraj Balakrishnan,TB9-27542
