## Notebook to scrape all cases from the Refugee Law Lab Reporter website

Requirements:
    
    pip install requests
    pip install bs4
    pip install pandas
    pip install tqdm

(Produced on Python 3.9.12)

### Setup

In [1]:
import io
import pathlib
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# set up progress bar
from tqdm import tqdm
tqdm.pandas()

# set up paths
out_path_yearly = pathlib.Path('DATA/YEARLY/')

# create the output folders up front: the export cells in this notebook write
# into 'DATA/' and 'DATA/YEARLY/' and fail if those directories do not exist
# (parents=True also creates 'DATA/'; exist_ok=True makes re-runs harmless)
out_path_yearly.mkdir(parents=True, exist_ok=True)

# set up years sought (both ends inclusive; used by the yearly export)
start_year = 2019
end_year = 2022

### Scrape table of cases from RLL website

In [2]:
# get table from RLLR Website
url = 'https://refugeelab.ca/rllr/'
# timeout so a stalled connection cannot hang the notebook indefinitely
r = requests.get(url, timeout=30)
print(r.status_code)
# stop on an HTTP error instead of trying to parse an error page
r.raise_for_status()

# get first table
soup = BeautifulSoup(r.text, 'html.parser')
table = soup.find('table')
if table is None:
    raise ValueError(f'No <table> element found at {url}')

# put table into dataframe
# (wrapped in StringIO because passing a literal html string to read_html is deprecated)
df = pd.read_html(io.StringIO(str(table)))[0]

# add link to each row: case pages live under the same url as the index table,
# at the citation with spaces removed and lower-cased (e.g. '2022 RLLR 1' -> '2022rllr1')
base_url = url
df['Link'] = df['Citation & Link'].apply(lambda x: base_url + x.replace(' ', '').lower())

# drop 'Citation Sort (Hidden)' column
df.drop('Citation Sort (Hidden)', axis=1, inplace=True)

# change 'Citation & Link' to 'citation' and 'Date of Decision' to 'document_date'
df.rename(columns={'Citation & Link': 'citation',
                   'Date of Decision': 'document_date'}, inplace=True)

# for all column names, replace spaces with underscores and lower
df.columns = [col.lower().replace(' ', '_') for col in df.columns]

df


200


Unnamed: 0,citation,country,case_type,document_date,rpd_number,link
0,2022 RLLR 1,Nigeria,PSG: Gender Based Violence,2022/04/29,TB8-20107,https://refugeelab.ca/rllr/2022rllr1
1,2021 RLLR 76,Nigeria,PSG: SOGIE,2021/12/21,VC1-06497,https://refugeelab.ca/rllr/2021rllr76
2,2021 RLLR 75,South Korea,PSG: SOGIE,2021/11/19,VC1-05121,https://refugeelab.ca/rllr/2021rllr75
3,2021 RLLR 74,Malawi,PSG: SOGIE,2021/06/14,VC1-02500,https://refugeelab.ca/rllr/2021rllr74
4,2021 RLLR 73,Iran,PSG: SOGIE,2021/10/30,VC1-02405,https://refugeelab.ca/rllr/2021rllr73
...,...,...,...,...,...,...
475,2019 RLLR 5,Haiti,PSG: SOGIE,2019/10/17,MB7-21566,https://refugeelab.ca/rllr/2019rllr5
476,2019 RLLR 4,Haiti,PSG: Gender Based Violence,2019/07/04,MB7-18975,https://refugeelab.ca/rllr/2019rllr4
477,2019 RLLR 3,Haiti,No Nexus: Criminality/Corruption,2019/09/10,MB7-18354,https://refugeelab.ca/rllr/2019rllr3
478,2019 RLLR 2,Nigeria,PSG: SOGIE,2019/12/23,TB9-01394,https://refugeelab.ca/rllr/2019rllr2


### Scrape html for each case in the table

In [3]:
# function to get the html from the link
def get_html(link, timeout=30, delay=.25):
    """Download the page at ``link`` and return its HTML as a string.

    Parameters
    ----------
    link : str
        URL of the page to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up on the request.
    delay : float, optional
        Seconds to pause after every attempt, to avoid hammering the server.

    Returns
    -------
    str or None
        The response body when the server answers with status 200; ``None``
        for any other status code or when the request itself fails.
    """
    try:
        r = requests.get(link, timeout=timeout)
    except requests.RequestException:
        # one unreachable page should not abort the whole scrape and throw
        # away every page already downloaded; report it as "no html" instead
        return None
    finally:
        # pause after every attempt, successful or not, so a run of failing
        # pages does not turn into back-to-back requests
        time.sleep(delay)

    if r.status_code == 200:
        return r.text
    return None

# download every case page (with a progress bar) and keep its html next to the row
df['html'] = df['link'].progress_apply(get_html)


100%|██████████| 480/480 [08:14<00:00,  1.03s/it]


In [4]:
# save the raw scrape (table columns + page html) as JSON Lines, one case per line
df.to_json('DATA/rllr_cases_raw.jsonl', lines=True, orient='records')

df

Unnamed: 0,citation,country,case_type,document_date,rpd_number,link,html
0,2022 RLLR 1,Nigeria,PSG: Gender Based Violence,2022/04/29,TB8-20107,https://refugeelab.ca/rllr/2022rllr1,<!DOCTYPE html><html\nclass=no-js dir=ltr lang...
1,2021 RLLR 76,Nigeria,PSG: SOGIE,2021/12/21,VC1-06497,https://refugeelab.ca/rllr/2021rllr76,<!DOCTYPE html><html\nclass=no-js dir=ltr lang...
2,2021 RLLR 75,South Korea,PSG: SOGIE,2021/11/19,VC1-05121,https://refugeelab.ca/rllr/2021rllr75,<!DOCTYPE html><html\nclass=no-js dir=ltr lang...
3,2021 RLLR 74,Malawi,PSG: SOGIE,2021/06/14,VC1-02500,https://refugeelab.ca/rllr/2021rllr74,<!DOCTYPE html><html\nclass=no-js dir=ltr lang...
4,2021 RLLR 73,Iran,PSG: SOGIE,2021/10/30,VC1-02405,https://refugeelab.ca/rllr/2021rllr73,<!DOCTYPE html><html\nclass=no-js dir=ltr lang...
...,...,...,...,...,...,...,...
475,2019 RLLR 5,Haiti,PSG: SOGIE,2019/10/17,MB7-21566,https://refugeelab.ca/rllr/2019rllr5,<!DOCTYPE html><html\nclass=no-js dir=ltr lang...
476,2019 RLLR 4,Haiti,PSG: Gender Based Violence,2019/07/04,MB7-18975,https://refugeelab.ca/rllr/2019rllr4,<!DOCTYPE html><html\nclass=no-js dir=ltr lang...
477,2019 RLLR 3,Haiti,No Nexus: Criminality/Corruption,2019/09/10,MB7-18354,https://refugeelab.ca/rllr/2019rllr3,<!DOCTYPE html><html\nclass=no-js dir=ltr lang...
478,2019 RLLR 2,Nigeria,PSG: SOGIE,2019/12/23,TB9-01394,https://refugeelab.ca/rllr/2019rllr2,<!DOCTYPE html><html\nclass=no-js dir=ltr lang...


### Parse html for each case

In [20]:
# reload the raw scrape saved earlier (JSON Lines, one case per line)
df = pd.read_json('DATA/rllr_cases_raw.jsonl', lines=True, orient='records')

In [21]:
# function to extract text from html
def get_text(html):
    """Return the plain text of a case page's 'entry-content' div.

    Paragraph structure is kept as newlines: every <br> becomes a newline and
    a newline is inserted after every <p>. Within each resulting line, runs of
    whitespace (including non-breaking spaces) are collapsed to a single space.

    Parameters
    ----------
    html : str, bytes or None
        Raw html of the page. Anything else (None, NaN, ...) is treated as a
        page that could not be downloaded.

    Returns
    -------
    str or None
        The cleaned text, or ``None`` when there is no html to parse or the
        page has no ``<div class="entry-content">``.
    """
    # missing pages are stored as None; a NaN placeholder is treated the same
    # way -- either way there is nothing to parse
    if not isinstance(html, (str, bytes)):
        return None

    soup = BeautifulSoup(html, 'html.parser')

    # convert <br> to new line to preserve paragraphs
    for br in soup.find_all('br'):
        br.replace_with('\n')

    # insert a newline after each <p> tag to preserve paragraphs
    for p in soup.find_all('p'):
        p.insert_after('\n')

    # find() returns None when the div is absent; calling .text on that
    # would raise AttributeError and abort the whole apply
    content = soup.find('div', {'class': 'entry-content'})
    if content is None:
        return None

    # extract the text, replacing non-breaking spaces (\xa0) with plain spaces
    text = content.text.replace('\xa0', ' ')

    # collapse whitespace inside each line while keeping the line breaks
    text = '\n'.join(re.sub(r'\s+', ' ', line.strip()) for line in text.split('\n'))
    return text

# parse every downloaded page into plain text (with a progress bar)
df['text'] = df['html'].progress_apply(get_text)

# the raw html is no longer needed once the text has been extracted
df.drop(columns=['html'], inplace=True)

100%|██████████| 480/480 [00:08<00:00, 59.40it/s]


In [22]:
# function to extract RPD_member from text
def get_member(text):
    """Return the value on the first 'Panel:' line of a case text.

    Parameters
    ----------
    text : str or None
        Plain text of a case; anything that is not a string is treated as
        missing.

    Returns
    -------
    str or None
        Whatever follows 'Panel:' up to the end of that line, stripped of
        surrounding whitespace; ``None`` when the text is missing or contains
        no 'Panel:' marker.
    """
    if not isinstance(text, str):
        return None

    # '.' never matches a newline, so the capture stops at the end of the
    # line by itself; not requiring a trailing '\n' means a 'Panel:' line at
    # the very end of the text is still matched instead of raising
    match = re.search(r'Panel:(.*)', text)
    if match is None:
        return None
    return match.group(1).strip()
    
# pull the panel member out of each case text (with a progress bar)
df['member'] = df['text'].progress_apply(get_member)

100%|██████████| 480/480 [00:00<00:00, 480149.28it/s]


In [23]:
# put in standard format for RLL (except "other" field)

# constant columns: language is always "en", dataset is always "RLLR",
# and name is left as an empty string
df['language'] = "en"
df['dataset'] = "RLLR"
df['name'] = ""

# year is the first 4 characters of the citation, stored as an integer
df['year'] = df['citation'].str[:4].astype(int)

# rename to the standard column names:
# rpd_number -> citation2, link -> source_url, text -> unofficial_text
df.rename(
    columns={
        'rpd_number': 'citation2',
        'link': 'source_url',
        'text': 'unofficial_text',
    },
    inplace=True,
)

# scraped timestamp: today's date as a YYYY-MM-DD string
df['scraped_timestamp'] = pd.Timestamp.today().strftime('%Y-%m-%d')

# "other" holds country, case_type and member for each row as one dictionary
other_fields = ['country', 'case_type', 'member']
df['other'] = df[other_fields].to_dict(orient='records')

# keep only the standard columns, in the standard order
list_cols = [
    'citation',
    'citation2',
    'dataset',
    'year',
    'name',
    'language',
    'document_date',
    'source_url',
    'scraped_timestamp',
    'unofficial_text',
    'other',
]

df = df[list_cols]


In [24]:
# write all cleaned cases to one JSON Lines file (one case per line)
df.to_json('DATA/rllr_cases.jsonl', lines=True, orient='records')

df

Unnamed: 0,citation,citation2,dataset,year,name,language,document_date,source_url,scraped_timestamp,unofficial_text,other
0,2022 RLLR 1,TB8-20107,RLLR,2022,,en,2022/04/29,https://refugeelab.ca/rllr/2022rllr1,2023-07-17,Citation: 2022 RLLR 1\nTribunal: Refugee Prote...,"{'country': 'Nigeria', 'case_type': 'PSG: Gend..."
1,2021 RLLR 76,VC1-06497,RLLR,2021,,en,2021/12/21,https://refugeelab.ca/rllr/2021rllr76,2023-07-17,Citation: 2021 RLLR 76\nTribunal: Refugee Prot...,"{'country': 'Nigeria', 'case_type': 'PSG: SOGI..."
2,2021 RLLR 75,VC1-05121,RLLR,2021,,en,2021/11/19,https://refugeelab.ca/rllr/2021rllr75,2023-07-17,Citation: 2021 RLLR 75\nTribunal: Refugee Prot...,"{'country': 'South Korea', 'case_type': 'PSG: ..."
3,2021 RLLR 74,VC1-02500,RLLR,2021,,en,2021/06/14,https://refugeelab.ca/rllr/2021rllr74,2023-07-17,Citation: 2021 RLLR 74\nTribunal: Refugee Prot...,"{'country': 'Malawi', 'case_type': 'PSG: SOGIE..."
4,2021 RLLR 73,VC1-02405,RLLR,2021,,en,2021/10/30,https://refugeelab.ca/rllr/2021rllr73,2023-07-17,Citation: 2021 RLLR 73\nTribunal: Refugee Prot...,"{'country': 'Iran', 'case_type': 'PSG: SOGIE',..."
...,...,...,...,...,...,...,...,...,...,...,...
475,2019 RLLR 5,MB7-21566,RLLR,2019,,en,2019/10/17,https://refugeelab.ca/rllr/2019rllr5,2023-07-17,Citation: 2019 RLLR 5\nTribunal: Refugee Prote...,"{'country': 'Haiti', 'case_type': 'PSG: SOGIE'..."
476,2019 RLLR 4,MB7-18975,RLLR,2019,,en,2019/07/04,https://refugeelab.ca/rllr/2019rllr4,2023-07-17,Citation: 2019 RLLR 4\nTribunal: Refugee Prote...,"{'country': 'Haiti', 'case_type': 'PSG: Gender..."
477,2019 RLLR 3,MB7-18354,RLLR,2019,,en,2019/09/10,https://refugeelab.ca/rllr/2019rllr3,2023-07-17,Citation: 2019 RLLR 3\nTribunal: Refugee Prote...,"{'country': 'Haiti', 'case_type': 'No Nexus: C..."
478,2019 RLLR 2,TB9-01394,RLLR,2019,,en,2019/12/23,https://refugeelab.ca/rllr/2019rllr2,2023-07-17,Citation: 2019 RLLR 2\nTribunal: Refugee Prote...,"{'country': 'Nigeria', 'case_type': 'PSG: SOGI..."


In [25]:
# also save the cleaned cases as a single parquet file
df.to_parquet(path="DATA/rllr_cases.parquet")

In [26]:
# write one indented JSON file per year, start_year through end_year inclusive
for year in tqdm(range(start_year, end_year+1)):
    cases_for_year = df[df['year'] == year]
    cases_for_year.to_json(out_path_yearly / f'{year}.json', orient='records', indent=4)


100%|██████████| 4/4 [00:00<00:00, 181.84it/s]


### Data verification   

In [12]:
# reload the cleaned cases from the JSON Lines export
df = pd.read_json('DATA/rllr_cases.jsonl', lines=True, orient='records')

# expand the "other" dictionaries into one column per key,
# then swap them in for the original "other" column
other_columns = df['other'].apply(pd.Series)
df = pd.concat([df.drop(columns=['other']), other_columns], axis=1)
df

Unnamed: 0,citation,citation2,dataset,year,name,language,document_date,source_url,scraped_timestamp,unofficial_text,country,case_type,member
0,2022 RLLR 1,TB8-20107,RLLR,2022,,en,2022/04/29,https://refugeelab.ca/rllr/2022rllr1,2023-07-17,Citation: 2022 RLLR 1\nTribunal: Refugee Prote...,Nigeria,PSG: Gender Based Violence,M. Gayda
1,2021 RLLR 76,VC1-06497,RLLR,2021,,en,2021/12/21,https://refugeelab.ca/rllr/2021rllr76,2023-07-17,Citation: 2021 RLLR 76\nTribunal: Refugee Prot...,Nigeria,PSG: SOGIE,Lesley Stalker
2,2021 RLLR 75,VC1-05121,RLLR,2021,,en,2021/11/19,https://refugeelab.ca/rllr/2021rllr75,2023-07-17,Citation: 2021 RLLR 75\nTribunal: Refugee Prot...,South Korea,PSG: SOGIE,David Jones
3,2021 RLLR 74,VC1-02500,RLLR,2021,,en,2021/06/14,https://refugeelab.ca/rllr/2021rllr74,2023-07-17,Citation: 2021 RLLR 74\nTribunal: Refugee Prot...,Malawi,PSG: SOGIE,Jennifer Smith
4,2021 RLLR 73,VC1-02405,RLLR,2021,,en,2021/10/30,https://refugeelab.ca/rllr/2021rllr73,2023-07-17,Citation: 2021 RLLR 73\nTribunal: Refugee Prot...,Iran,PSG: SOGIE,Isis Marianne van Loon
...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,2019 RLLR 5,MB7-21566,RLLR,2019,,en,2019/10/17,https://refugeelab.ca/rllr/2019rllr5,2023-07-17,Citation: 2019 RLLR 5\nTribunal: Refugee Prote...,Haiti,PSG: SOGIE,Ethan McMonagle
476,2019 RLLR 4,MB7-18975,RLLR,2019,,en,2019/07/04,https://refugeelab.ca/rllr/2019rllr4,2023-07-17,Citation: 2019 RLLR 4\nTribunal: Refugee Prote...,Haiti,PSG: Gender Based Violence,Nicole Ginsberg
477,2019 RLLR 3,MB7-18354,RLLR,2019,,en,2019/09/10,https://refugeelab.ca/rllr/2019rllr3,2023-07-17,Citation: 2019 RLLR 3\nTribunal: Refugee Prote...,Haiti,No Nexus: Criminality/Corruption,Me Jean-Guy Jam
478,2019 RLLR 2,TB9-01394,RLLR,2019,,en,2019/12/23,https://refugeelab.ca/rllr/2019rllr2,2023-07-17,Citation: 2019 RLLR 2\nTribunal: Refugee Prote...,Nigeria,PSG: SOGIE,Marcelle Bourassa


In [15]:
# function to extract rpd_number from text
def get_rpd_number(text):
    """Return the value on the first 'RPD Number:' line of a case text.

    Parameters
    ----------
    text : str or None
        Plain text of a case; anything that is not a string is treated as
        missing.

    Returns
    -------
    str or None
        Whatever follows 'RPD Number:' up to the end of that line, stripped
        of surrounding whitespace; ``None`` when the text is missing or
        contains no 'RPD Number:' marker.
    """
    if not isinstance(text, str):
        return None

    # '.' never matches a newline, so the capture stops at the end of the
    # line by itself; not requiring a trailing '\n' means an 'RPD Number:'
    # line at the very end of the text is still matched instead of raising
    match = re.search(r'RPD Number:(.*)', text)
    if match is None:
        return None
    return match.group(1).strip()

# parse the RPD number out of each case text (with a progress bar)
df['rpd_number'] = df['unofficial_text'].progress_apply(get_rpd_number)

# show the rows where the RPD number parsed from the text differs from citation2
df[df['rpd_number'].ne(df['citation2'])]

100%|██████████| 480/480 [00:00<00:00, 492481.88it/s]


Unnamed: 0,citation,citation2,dataset,year,name,language,document_date,source_url,scraped_timestamp,unofficial_text,country,case_type,member,rpd_number
