## Notebook to scrape all cases from Refugee Law Lab Reporter Website

Requirements:
    
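    pip install requests
    pip install bs4
    pip install lxml       # parser backend needed by pandas.read_html
    pip install pandas
    pip install pyarrow    # engine needed by DataFrame.to_parquet
    pip install tqdm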

(Produced on Python 3.9.12)

### Setup

In [1]:
import io
import json
import os
import pathlib
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# set up progress bar
# tqdm.pandas() registers `progress_apply` on pandas objects; the scraping and
# parsing cells below call it, so this must run before them.
from tqdm import tqdm
tqdm.pandas()

# set up paths
# Every export below is written under DATA/, and the yearly JSON files go in
# DATA/YEARLY/. Create both up front so a fresh checkout survives
# Restart Kernel -> Run All (to_json / to_parquet do not create directories).
out_path_yearly = pathlib.Path('DATA/YEARLY/')
out_path_yearly.mkdir(parents=True, exist_ok=True)

# set up years sought
# (inclusive range, used when splitting the cleaned data into yearly files)
start_year = 2019
end_year = 2023

### Scrape table of cases from RLL website

In [2]:
# get table from RLLR Website
url = 'https://refugeelab.ca/rllr/'
# requests has no default timeout, so set one to avoid hanging forever
r = requests.get(url, timeout=30)
print(r.status_code)
# stop here on a 4xx/5xx response instead of failing later on a missing table
r.raise_for_status()

# get first table
soup = BeautifulSoup(r.text, 'html.parser')
table = soup.find('table')
if table is None:
    raise ValueError(f'No <table> found at {url}')

# put table into dataframe
# (wrap the markup in StringIO: passing literal HTML to read_html is deprecated)
df = pd.read_html(io.StringIO(str(table)))[0]

# add link to each row, e.g. '2023 RLLR 120' -> base_url + '2023rllr120'
base_url = url
df['Link'] = base_url + df['Citation & Link'].str.replace(' ', '', regex=False).str.lower()

# change 'Citation & Link' to 'citation' and 'Date of Decision' to 'document_date'
df = df.rename(columns={'Citation & Link': 'citation',
                        'Date of Decision': 'document_date'})

# for all column names, replace spaces with underscores and lower
df.columns = [col.lower().replace(' ', '_') for col in df.columns]

df


200


Unnamed: 0,citation,country,case_type,document_date,rpd_number,link
0,2023 RLLR 120,Cameroon,Political Opinion,12/20/2023,VC3-09637,https://refugeelab.ca/rllr/2023rllr120
1,2023 RLLR 119,Russia,Political Opinion,12/19/2023,VC3-07126,https://refugeelab.ca/rllr/2023rllr119
2,2023 RLLR 118,Lebanon,PSG: SOGIE,10/31/2023,VC3-06815,https://refugeelab.ca/rllr/2023rllr118
3,2023 RLLR 117,USA,PSG: SOGIE,11/2/2023,VC3-06662,https://refugeelab.ca/rllr/2023rllr117
4,2023 RLLR 116,Ukraine,PSG: SOGIE,6/26/2023,VC3-04003,https://refugeelab.ca/rllr/2023rllr116
...,...,...,...,...,...,...
752,2019 RLLR 5,Haiti,PSG: SOGIE,10/17/2019,MB7-21566,https://refugeelab.ca/rllr/2019rllr5
753,2019 RLLR 4,Haiti,PSG: Gender Based Violence,7/4/2019,MB7-18975,https://refugeelab.ca/rllr/2019rllr4
754,2019 RLLR 3,Haiti,No Nexus: Criminality/Corruption,9/10/2019,MB7-18354,https://refugeelab.ca/rllr/2019rllr3
755,2019 RLLR 2,Nigeria,PSG: SOGIE,12/23/2019,TB9-01394,https://refugeelab.ca/rllr/2019rllr2


### Scrape html for each case in the table

In [3]:
# function to get the html from the link
def get_html(link, timeout=30, delay=.25):
    """Download one case page and return its HTML, or None if it is unavailable.

    Args:
        link: URL of the case page.
        timeout: Seconds to wait for the server before giving up on the request.
        delay: Seconds to pause after every request, to go easy on the site.

    Returns:
        The response body as text when the status code is 200; None for any
        other status code or when the request itself fails (timeout,
        connection error, ...).
    """
    try:
        r = requests.get(link, timeout=timeout)
    except requests.RequestException:
        # one unreachable page should not abort the whole scrape; it is
        # recorded as None, exactly like a non-200 response
        r = None

    # throttle after every attempt, not only the successful ones
    time.sleep(delay)

    if r is not None and r.status_code == 200:
        return r.text
    return None

# download every case page (one request per row, throttled inside get_html)
df['html'] = df['link'].progress_apply(get_html)


100%|██████████| 757/757 [09:59<00:00,  1.26it/s]


In [4]:
# export to jsonl raw (one JSON record per line, html included)
raw_jsonl_path = 'DATA/rllr_cases_raw.jsonl'
df.to_json(raw_jsonl_path, lines=True, orient='records')

df

Unnamed: 0,citation,country,case_type,document_date,rpd_number,link,html
0,2023 RLLR 120,Cameroon,Political Opinion,12/20/2023,VC3-09637,https://refugeelab.ca/rllr/2023rllr120,"<!DOCTYPE html>\n<html class=""html"" dir=""ltr"" ..."
1,2023 RLLR 119,Russia,Political Opinion,12/19/2023,VC3-07126,https://refugeelab.ca/rllr/2023rllr119,"<!DOCTYPE html>\n<html class=""html"" dir=""ltr"" ..."
2,2023 RLLR 118,Lebanon,PSG: SOGIE,10/31/2023,VC3-06815,https://refugeelab.ca/rllr/2023rllr118,"<!DOCTYPE html>\n<html class=""html"" dir=""ltr"" ..."
3,2023 RLLR 117,USA,PSG: SOGIE,11/2/2023,VC3-06662,https://refugeelab.ca/rllr/2023rllr117,"<!DOCTYPE html>\n<html class=""html"" dir=""ltr"" ..."
4,2023 RLLR 116,Ukraine,PSG: SOGIE,6/26/2023,VC3-04003,https://refugeelab.ca/rllr/2023rllr116,"<!DOCTYPE html>\n<html class=""html"" dir=""ltr"" ..."
...,...,...,...,...,...,...,...
752,2019 RLLR 5,Haiti,PSG: SOGIE,10/17/2019,MB7-21566,https://refugeelab.ca/rllr/2019rllr5,"<!DOCTYPE html>\n<html class=""html"" dir=""ltr"" ..."
753,2019 RLLR 4,Haiti,PSG: Gender Based Violence,7/4/2019,MB7-18975,https://refugeelab.ca/rllr/2019rllr4,"<!DOCTYPE html>\n<html class=""html"" dir=""ltr"" ..."
754,2019 RLLR 3,Haiti,No Nexus: Criminality/Corruption,9/10/2019,MB7-18354,https://refugeelab.ca/rllr/2019rllr3,"<!DOCTYPE html>\n<html class=""html"" dir=""ltr"" ..."
755,2019 RLLR 2,Nigeria,PSG: SOGIE,12/23/2019,TB9-01394,https://refugeelab.ca/rllr/2019rllr2,"<!DOCTYPE html>\n<html class=""html"" dir=""ltr"" ..."


### Parse html for each case

In [5]:
# load raw cases back from disk, so parsing can be re-run without re-scraping
df = pd.read_json(path_or_buf='DATA/rllr_cases_raw.jsonl', lines=True, orient='records')

In [6]:
# function to extract text from html
def get_text(html):
    """Extract the readable text of a case page, keeping paragraph breaks.

    Args:
        html: Full HTML of the page, or a missing value (None / NaN) when the
            page could not be downloaded.

    Returns:
        The text of the <div> with id 'content', with non-breaking spaces
        replaced by spaces, runs of whitespace inside a line collapsed to a
        single space, at most one blank line in a row and no leading newlines.
        None when html is missing or the page has no such <div>.
    """
    # missing pages arrive as None, or possibly as NaN after a round trip
    # through pandas; neither can be parsed
    if not isinstance(html, (str, bytes)):
        return None

    soup = BeautifulSoup(html, 'html.parser')

    # convert <br> to new line to preserve paragraphs
    for br in soup.find_all('br'):
        br.replace_with('\n')

    # insert a newline after each <p> tag to preserve paragraphs
    for p in soup.find_all('p'):
        p.insert_after('\n')

    # the text is read from the <div> with id 'content'; a page without it
    # (e.g. an error page) has no text to extract
    content = soup.find('div', {'id': 'content'})
    if content is None:
        return None

    # remove \xa0 (non-breaking spaces)
    text = content.text.replace('\xa0', ' ')

    # collapse runs of whitespace inside each line while keeping the line breaks
    text = '\n'.join([re.sub(r'\s+', ' ', line.strip()) for line in text.split('\n')])

    # remove more than two newlines in a row
    text = re.sub(r'\n{3,}', '\n\n', text)

    # if text starts with one or more newlines, remove them
    text = re.sub(r'^\n+', '', text)

    return text

# parse each page into plain text, then discard the bulky html column
df['text'] = df['html'].progress_apply(get_text)
df = df.drop(columns='html')

100%|██████████| 757/757 [00:04<00:00, 153.88it/s]


In [7]:
# print the extracted text of the first case as a sanity check
first_case_text = df['text'][0]
print(first_case_text)

2023 RLLR 120

Citation: 2023 RLLR 120
Tribunal: Refugee Protection Division
Date of Decision: December 20, 2023
Panel: Kate Bilkevitch
Counsel for the Claimant(s): N/A
Country: Cameroon
RPD Number: VC3-09637
Associated RPD Number(s): N/A
ATIP Number: A-2024-00593
ATIP Pages: N/A

DECISION

[1] MEMBER: This is the decision of the Refugee Protection Division, the RPD, in the claim of XXXX XXXX XXXX as a citizen of Cameroon, who is claiming refugee protection pursuant to section 96 and subsection 97(1) of the Immigration and Refugee Protection Act.

ALLEGATIONS

[2] The claimant’s allegations are contained in his Basis of Claim form. To summarise only briefly, the claimant has a fear of persecution at the hands of the Cameroonian authorities because of his background as an anglophone and his perceived support of the separatist movement due to his employment as an XXXX in the conflict zone.

DETERMINATION

[3] I find that the claimant is a Convention refugee pursuant to section 96 of the 

In [8]:
# function to extract RPD_member from text
def get_member(text):
    """Return the panel member named on the 'Panel:' line of a decision.

    Args:
        text: Plain text of the decision, or a missing value (None / NaN).

    Returns:
        Whatever follows the first 'Panel:' up to the end of that line,
        stripped of surrounding whitespace; None when text is missing or
        contains no 'Panel:'.
    """
    if not isinstance(text, str):
        return None

    # '.' never matches a newline, so the group stops at the end of the line,
    # or at the end of the text when 'Panel:' sits on a last, unterminated line
    match = re.search(r'Panel:(.*)', text)
    return match.group(1).strip() if match else None
    
# pull the panel member out of every decision's header
df['member'] = df['text'].progress_apply(get_member)

100%|██████████| 757/757 [00:00<00:00, 757416.06it/s]


In [9]:
# put in standard format for RLL; country, case_type and member are packed
# into the "other" field

# final column order
list_cols = [
    'citation', 'citation2', 'dataset', 'year', 'name', 'language',
    'document_date', 'source_url', 'scraped_timestamp', 'unofficial_text',
    'other',
]

# one JSON object (as a string) per row holding country, case_type and member
other_records = df[['country', 'case_type', 'member']].to_dict(orient='records')
other_json = [json.dumps(record) for record in other_records]

df = (
    df
    .rename(columns={
        'rpd_number': 'citation2',
        'link': 'source_url',
        'text': 'unofficial_text',
    })
    .assign(
        language="en",
        dataset="RLLR",
        # year is the first 4 characters of the citation, as an integer
        year=lambda frame: frame['citation'].str[:4].astype(int),
        name="",
        # today's date in YYYY-MM-DD format as a string
        scraped_timestamp=pd.Timestamp.today().strftime('%Y-%m-%d'),
        other=other_json,
    )
    [list_cols]
)


In [10]:
# export the cleaned cases to a single JSON Lines file
clean_jsonl_path = 'DATA/rllr_cases.jsonl'
df.to_json(clean_jsonl_path, lines=True, orient='records')

df

Unnamed: 0,citation,citation2,dataset,year,name,language,document_date,source_url,scraped_timestamp,unofficial_text,other
0,2023 RLLR 120,VC3-09637,RLLR,2023,,en,12/20/2023,https://refugeelab.ca/rllr/2023rllr120,2024-11-02,2023 RLLR 120\n\nCitation: 2023 RLLR 120\nTrib...,"{""country"": ""Cameroon"", ""case_type"": ""Politica..."
1,2023 RLLR 119,VC3-07126,RLLR,2023,,en,12/19/2023,https://refugeelab.ca/rllr/2023rllr119,2024-11-02,2023 RLLR 119\n\nCitation: 2023 RLLR 119\nTrib...,"{""country"": ""Russia"", ""case_type"": ""Political ..."
2,2023 RLLR 118,VC3-06815,RLLR,2023,,en,10/31/2023,https://refugeelab.ca/rllr/2023rllr118,2024-11-02,2023 RLLR 118\n\nCitation: 2023 RLLR 118\nTrib...,"{""country"": ""Lebanon"", ""case_type"": ""PSG: SOGI..."
3,2023 RLLR 117,VC3-06662,RLLR,2023,,en,11/2/2023,https://refugeelab.ca/rllr/2023rllr117,2024-11-02,2023 RLLR 117\n\nCitation: 2023 RLLR 117\nTrib...,"{""country"": ""USA"", ""case_type"": ""PSG: SOGIE"", ..."
4,2023 RLLR 116,VC3-04003,RLLR,2023,,en,6/26/2023,https://refugeelab.ca/rllr/2023rllr116,2024-11-02,2023 RLLR 116\n\nCitation: 2023 RLLR 116\nTrib...,"{""country"": ""Ukraine"", ""case_type"": ""PSG: SOGI..."
...,...,...,...,...,...,...,...,...,...,...,...
752,2019 RLLR 5,MB7-21566,RLLR,2019,,en,10/17/2019,https://refugeelab.ca/rllr/2019rllr5,2024-11-02,2019 RLLR 5\n\nCitation: 2019 RLLR 5\nTribunal...,"{""country"": ""Haiti"", ""case_type"": ""PSG: SOGIE""..."
753,2019 RLLR 4,MB7-18975,RLLR,2019,,en,7/4/2019,https://refugeelab.ca/rllr/2019rllr4,2024-11-02,2019 RLLR 4\n\nCitation: 2019 RLLR 4\nTribunal...,"{""country"": ""Haiti"", ""case_type"": ""PSG: Gender..."
754,2019 RLLR 3,MB7-18354,RLLR,2019,,en,9/10/2019,https://refugeelab.ca/rllr/2019rllr3,2024-11-02,2019 RLLR 3\n\nCitation: 2019 RLLR 3\nTribunal...,"{""country"": ""Haiti"", ""case_type"": ""No Nexus: C..."
755,2019 RLLR 2,TB9-01394,RLLR,2019,,en,12/23/2019,https://refugeelab.ca/rllr/2019rllr2,2024-11-02,2019 RLLR 2\n\nCitation: 2019 RLLR 2\nTribunal...,"{""country"": ""Nigeria"", ""case_type"": ""PSG: SOGI..."


In [11]:
# write the cleaned cases as a single parquet file
df.to_parquet(path="DATA/rllr_cases.parquet")

In [12]:
# write one JSON file per year sought (start_year through end_year, inclusive)
years_sought = range(start_year, end_year + 1)
for year in tqdm(years_sought):
    cases_in_year = df[df['year'] == year]
    cases_in_year.to_json(out_path_yearly / f'{year}.json', orient='records', indent=4)


100%|██████████| 5/5 [00:00<00:00, 242.64it/s]


In [13]:
# Direct to HF
# The default destination is a folder on one specific machine (presumably a
# local clone of the Hugging Face dataset repo -- confirm). Set the
# RLLR_HF_PARQUET environment variable to write somewhere else.
out_path_HF = pathlib.Path(
    os.environ.get('RLLR_HF_PARQUET',
                   'd:/AI-Projects/canadian-legal-data/RLLR/train.parquet')
)
if out_path_HF.parent.is_dir():
    df.to_parquet(out_path_HF)
else:
    # skip instead of aborting Run All before the verification cells
    print(f'Skipping Hugging Face export: {out_path_HF.parent} does not exist')


### Data verification   

In [None]:
# load the exported JSON Lines file
df = pd.read_json('DATA/rllr_cases.jsonl', lines=True, orient='records')

# parse each "other" JSON string back into a dict, then spread its items
# into their own columns
df['other'] = df['other'].apply(json.loads)
other_columns = df['other'].apply(pd.Series)
df[['country', 'case_type', 'member']] = other_columns
df

In [None]:
# function to extract rpd_number from text
def get_rpd_number(text):
    """Return the value on the 'RPD Number:' line of a decision.

    Args:
        text: Plain text of the decision, or a missing value (None / NaN).

    Returns:
        Whatever follows the first 'RPD Number:' up to the end of that line,
        stripped of surrounding whitespace; None when text is missing or
        contains no 'RPD Number:'.
    """
    if not isinstance(text, str):
        return None

    # '.' never matches a newline, so the group stops at the end of the line,
    # or at the end of the text when the label sits on a last, unterminated line
    match = re.search(r'RPD Number:(.*)', text)
    return match.group(1).strip() if match else None

# re-extract the RPD number from the text of every decision
df['rpd_number'] = df['unofficial_text'].progress_apply(get_rpd_number)

# show rows where the number found in the text differs from citation2
mismatched = df['rpd_number'] != df['citation2']
df[mismatched]

In [None]:
# preview the first rows of df
df.head()
