## Notebook to scrape all cases from Refugee Law Lab Reporter Website

Requirements:
    
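    pip install requests
    pip install bs4
    pip install lxml
    pip install pandas
    pip install pyarrow
    pip install tqdm

(`lxml` is used by `pandas.read_html` to parse the table of cases; `pyarrow` is used by `DataFrame.to_parquet` for the parquet export.)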

(Produced on Python 3.9.12)

### Setup

In [1]:
import io
import json
import pathlib
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# set up progress bar
from tqdm import tqdm
# tqdm.pandas() is what makes the `progress_apply` calls used below available on pandas objects
tqdm.pandas()

# set up paths
out_path_yearly = pathlib.Path('DATA/YEARLY/')
# create the output folders up front (parents=True also creates the 'DATA'
# folder that the other exports write to) so the notebook runs top to bottom
# on a fresh checkout; exist_ok=True makes re-running this cell harmless
out_path_yearly.mkdir(parents=True, exist_ok=True)

# set up years sought (both bounds are inclusive in the yearly export)
start_year = 2019
end_year = 2022

### Scrape table of cases from RLL website

In [2]:
# get table from RLLR Website
url = 'https://refugeelab.ca/rllr/'
# a timeout keeps the notebook from hanging forever if the server stops responding
r = requests.get(url, timeout=30)
print(r.status_code)
# fail fast on an HTTP error instead of trying to parse an error page
r.raise_for_status()

# get first table
soup = BeautifulSoup(r.text, 'html.parser')
table = soup.find('table')
if table is None:
    raise ValueError(f'No <table> element found at {url}')

# put table into dataframe (the markup is wrapped in StringIO because passing
# a literal html string to read_html is deprecated in recent pandas versions)
df = pd.read_html(io.StringIO(str(table)))[0]

# add link to each row, built from the citation text
# (e.g. '2022 RLLR 31' -> base_url + '2022rllr31')
base_url = url
df['Link'] = df['Citation & Link'].apply(lambda x: base_url + x.replace(' ', '').lower())

# drop 'Citation Sort (Hidden)' column, then
# change 'Citation & Link' to 'citation' and 'Date of Decision' to 'document_date'
df = (
    df
    .drop(columns='Citation Sort (Hidden)')
    .rename(columns={
        'Citation & Link': 'citation',
        'Date of Decision': 'document_date',
    })
)

# for all column names, replace spaces with underscores and lower
df.columns = [col.lower().replace(' ', '_') for col in df.columns]

df


200


Unnamed: 0,citation,country,case_type,document_date,rpd_number,link
0,2022 RLLR 31,Sudan,Political Opinion,2022/02/03,VC1-06798,https://refugeelab.ca/rllr/2022rllr31
1,2022 RLLR 30,Barbados,PSG: Gender Based Violence,2022/11/28,VC2-07212,https://refugeelab.ca/rllr/2022rllr30
2,2022 RLLR 29,Ukraine,PSG: Gender Based Violence,2022/11/09,VC2-06405,https://refugeelab.ca/rllr/2022rllr29
3,2022 RLLR 28,India,Political Opinion,2022/09/28,VC2-06395,https://refugeelab.ca/rllr/2022rllr28
4,2022 RLLR 27,Lebanon,Political Opinion,2022/08/30,VC2-05006,https://refugeelab.ca/rllr/2022rllr27
...,...,...,...,...,...,...
533,2019 RLLR 5,Haiti,PSG: SOGIE,2019/10/17,MB7-21566,https://refugeelab.ca/rllr/2019rllr5
534,2019 RLLR 4,Haiti,PSG: Gender Based Violence,2019/07/04,MB7-18975,https://refugeelab.ca/rllr/2019rllr4
535,2019 RLLR 3,Haiti,No Nexus: Criminality/Corruption,2019/09/10,MB7-18354,https://refugeelab.ca/rllr/2019rllr3
536,2019 RLLR 2,Nigeria,PSG: SOGIE,2019/12/23,TB9-01394,https://refugeelab.ca/rllr/2019rllr2


### Scrape html for each case in the table

In [3]:
# function to get the html from the link
def get_html(link):
    """Download one page and return its html.

    Args:
        link: URL of the page to fetch.

    Returns:
        The response body as text when the server answers with HTTP 200.
        None for any other status code, or when the request itself fails
        (connection error, timeout, ...), so that one bad page does not
        abort the scrape of all the others.
    """
    try:
        # timeout so a single stalled connection cannot hang the whole scrape
        r = requests.get(link, timeout=30)
    except requests.RequestException:
        # treat a failed request like a non-200 response
        return None
    finally:
        # short pause after every attempt (successful or not) to go easy on the server
        time.sleep(.25)

    if r.status_code == 200:
        return r.text
    return None

# download the page behind every link, with a progress bar (get_html returns None on failure)
df['html'] = df['link'].progress_apply(get_html)


100%|██████████| 538/538 [10:30<00:00,  1.17s/it]


In [4]:
# save the raw scrape as JSON Lines (one record per line)
df.to_json(path_or_buf='DATA/rllr_cases_raw.jsonl', lines=True, orient='records')

df

Unnamed: 0,citation,country,case_type,document_date,rpd_number,link,html
0,2022 RLLR 31,Sudan,Political Opinion,2022/02/03,VC1-06798,https://refugeelab.ca/rllr/2022rllr31,<!DOCTYPE html><html\nclass=no-js lang=en-CA><...
1,2022 RLLR 30,Barbados,PSG: Gender Based Violence,2022/11/28,VC2-07212,https://refugeelab.ca/rllr/2022rllr30,<!DOCTYPE html><html\nclass=no-js lang=en-CA><...
2,2022 RLLR 29,Ukraine,PSG: Gender Based Violence,2022/11/09,VC2-06405,https://refugeelab.ca/rllr/2022rllr29,<!DOCTYPE html><html\nclass=no-js lang=en-CA><...
3,2022 RLLR 28,India,Political Opinion,2022/09/28,VC2-06395,https://refugeelab.ca/rllr/2022rllr28,<!DOCTYPE html><html\nclass=no-js lang=en-CA><...
4,2022 RLLR 27,Lebanon,Political Opinion,2022/08/30,VC2-05006,https://refugeelab.ca/rllr/2022rllr27,<!DOCTYPE html><html\nclass=no-js lang=en-CA><...
...,...,...,...,...,...,...,...
533,2019 RLLR 5,Haiti,PSG: SOGIE,2019/10/17,MB7-21566,https://refugeelab.ca/rllr/2019rllr5,<!DOCTYPE html><html\nclass=no-js lang=en-CA><...
534,2019 RLLR 4,Haiti,PSG: Gender Based Violence,2019/07/04,MB7-18975,https://refugeelab.ca/rllr/2019rllr4,<!DOCTYPE html><html\nclass=no-js lang=en-CA><...
535,2019 RLLR 3,Haiti,No Nexus: Criminality/Corruption,2019/09/10,MB7-18354,https://refugeelab.ca/rllr/2019rllr3,<!DOCTYPE html><html\nclass=no-js lang=en-CA><...
536,2019 RLLR 2,Nigeria,PSG: SOGIE,2019/12/23,TB9-01394,https://refugeelab.ca/rllr/2019rllr2,<!DOCTYPE html><html\nclass=no-js lang=en-CA><...


### Parse html for each case

In [5]:
# read the raw scrape back from the JSON Lines file
df = pd.read_json(path_or_buf='DATA/rllr_cases_raw.jsonl', lines=True, orient='records')

In [6]:
# function to extract text from html
def get_text(html):
    """Extract the cleaned text of the 'entry-content' div from a page's html.

    Args:
        html: The page html as a string, or a missing value for pages that
            could not be downloaded.

    Returns:
        The text of the 'entry-content' div with non-breaking spaces replaced
        by regular spaces, runs of whitespace inside each line collapsed to a
        single space, and line breaks kept so paragraphs are preserved.
        None if `html` is not a string or the page has no 'entry-content' div.
    """
    # anything that is not a string (None, or NaN if the missing value was
    # turned into one by pandas) means there is no page to parse
    if not isinstance(html, str):
        return None

    soup = BeautifulSoup(html, 'html.parser')

    # convert <br> to new line to preserve paragraphs
    for br in soup.find_all('br'):
        br.replace_with('\n')

    # Insert newline characters after each <p> tag to preserve paragraphs
    for p in soup.find_all('p'):
        p.insert_after('\n')

    # pages without the expected container are reported as missing
    # instead of raising AttributeError on `.text`
    content = soup.find('div', {'class': 'entry-content'})
    if content is None:
        return None

    # extract text from class 'entry-content' (remove \xa0)
    text = content.text.replace('\xa0', ' ')

    # Remove multiple whitespaces and preserve paragraphs
    text = '\n'.join([re.sub(r'\s+', ' ', line.strip()) for line in text.split('\n')])
    return text

# turn every stored page into cleaned text, with a progress bar
df['text'] = df['html'].progress_apply(get_text)

# the html column is not kept once the text has been extracted
df = df.drop(columns='html')

100%|██████████| 538/538 [00:03<00:00, 135.64it/s]


In [7]:
# function to extract RPD_member from text
def get_member(text):
    """Return what follows the first 'Panel:' on its line.

    Args:
        text: The decision text, or a missing value.

    Returns:
        The rest of the line after the first 'Panel:' with surrounding
        whitespace stripped, or None if `text` is not a string or contains
        no 'Panel:'.
    """
    if not isinstance(text, str):
        return None

    # '.' never matches a newline, so the group stops at the end of the line;
    # not requiring the newline itself also handles a 'Panel:' on the last,
    # unterminated line (where the old pattern found nothing and crashed)
    match = re.search(r'Panel:(.*)', text)
    if match is None:
        return None
    return match.group(1).strip()
    
# pull the panel member out of every decision text, with a progress bar
df['member'] = df['text'].progress_apply(get_member)

100%|██████████| 538/538 [00:00<00:00, 537270.37it/s]


In [8]:
# put in standard format for RLL (except "other" field)

# year: the first 4 characters of the citation, stored as an integer
df['year'] = df['citation'].str[:4].astype(int)

# constant columns: language "en", dataset "RLLR" and an empty name
df['language'] = "en"
df['dataset'] = "RLLR"
df['name'] = ""

# scraped timestamp: today's date in YYYY-MM-DD format as a string
df['scraped_timestamp'] = pd.Timestamp.today().strftime('%Y-%m-%d')

# other: the "country", "case_type" and "member" values of each row,
# collected in a dictionary and dumped to a valid json string
df['other'] = [
    json.dumps(record)
    for record in df[['country', 'case_type', 'member']].to_dict(orient='records')
]

# rename rpd_number, link and text to their standard names
df = df.rename(columns={
    'rpd_number': 'citation2',
    'link': 'source_url',
    'text': 'unofficial_text',
})

# reorder columns (columns that are not listed are dropped)
list_cols = [
    'citation',
    'citation2',
    'dataset',
    'year',
    'name',
    'language',
    'document_date',
    'source_url',
    'scraped_timestamp',
    'unofficial_text',
    'other',
]

df = df[list_cols]


In [9]:
# write all cleaned cases to one JSON Lines file
df.to_json(path_or_buf='DATA/rllr_cases.jsonl', lines=True, orient='records')

df

Unnamed: 0,citation,citation2,dataset,year,name,language,document_date,source_url,scraped_timestamp,unofficial_text,other
0,2022 RLLR 31,VC1-06798,RLLR,2022,,en,2022/02/03,https://refugeelab.ca/rllr/2022rllr31,2023-11-30,Citation: 2022 RLLR 31\nTribunal: Refugee Prot...,"{""country"": ""Sudan"", ""case_type"": ""Political O..."
1,2022 RLLR 30,VC2-07212,RLLR,2022,,en,2022/11/28,https://refugeelab.ca/rllr/2022rllr30,2023-11-30,Citation: 2022 RLLR 30\nTribunal: Refugee Prot...,"{""country"": ""Barbados"", ""case_type"": ""PSG: Gen..."
2,2022 RLLR 29,VC2-06405,RLLR,2022,,en,2022/11/09,https://refugeelab.ca/rllr/2022rllr29,2023-11-30,Citation: 2022 RLLR 29\nTribunal: Refugee Prot...,"{""country"": ""Ukraine"", ""case_type"": ""PSG: Gend..."
3,2022 RLLR 28,VC2-06395,RLLR,2022,,en,2022/09/28,https://refugeelab.ca/rllr/2022rllr28,2023-11-30,Citation: 2022 RLLR 28\nTribunal: Refugee Prot...,"{""country"": ""India"", ""case_type"": ""Political O..."
4,2022 RLLR 27,VC2-05006,RLLR,2022,,en,2022/08/30,https://refugeelab.ca/rllr/2022rllr27,2023-11-30,Citation: 2022 RLLR 27\nTribunal: Refugee Prot...,"{""country"": ""Lebanon"", ""case_type"": ""Political..."
...,...,...,...,...,...,...,...,...,...,...,...
533,2019 RLLR 5,MB7-21566,RLLR,2019,,en,2019/10/17,https://refugeelab.ca/rllr/2019rllr5,2023-11-30,Citation: 2019 RLLR 5\nTribunal: Refugee Prote...,"{""country"": ""Haiti"", ""case_type"": ""PSG: SOGIE""..."
534,2019 RLLR 4,MB7-18975,RLLR,2019,,en,2019/07/04,https://refugeelab.ca/rllr/2019rllr4,2023-11-30,Citation: 2019 RLLR 4\nTribunal: Refugee Prote...,"{""country"": ""Haiti"", ""case_type"": ""PSG: Gender..."
535,2019 RLLR 3,MB7-18354,RLLR,2019,,en,2019/09/10,https://refugeelab.ca/rllr/2019rllr3,2023-11-30,Citation: 2019 RLLR 3\nTribunal: Refugee Prote...,"{""country"": ""Haiti"", ""case_type"": ""No Nexus: C..."
536,2019 RLLR 2,TB9-01394,RLLR,2019,,en,2019/12/23,https://refugeelab.ca/rllr/2019rllr2,2023-11-30,Citation: 2019 RLLR 2\nTribunal: Refugee Prote...,"{""country"": ""Nigeria"", ""case_type"": ""PSG: SOGI..."


In [10]:
# write the cleaned cases to a parquet file as well
df.to_parquet(path="DATA/rllr_cases.parquet")

In [11]:
# write one indented json file per year from start_year to end_year (inclusive)
for year in tqdm(range(start_year, end_year+1)):
    cases_of_year = df.loc[df['year'] == year]
    cases_of_year.to_json(out_path_yearly / f'{year}.json', indent=4, orient='records')


100%|██████████| 4/4 [00:00<00:00, 266.67it/s]


### Data verification   

In [12]:
# reload the cleaned export from the JSON Lines file
df = pd.read_json(path_or_buf='DATA/rllr_cases.jsonl', lines=True, orient='records')

# decode the json string in "other" and spread its items into separate columns
df['other'] = df['other'].apply(json.loads)
df[['country', 'case_type', 'member']] = df['other'].apply(pd.Series)
df

Unnamed: 0,citation,citation2,dataset,year,name,language,document_date,source_url,scraped_timestamp,unofficial_text,other,country,case_type,member
0,2022 RLLR 31,VC1-06798,RLLR,2022,,en,2022/02/03,https://refugeelab.ca/rllr/2022rllr31,2023-11-30,Citation: 2022 RLLR 31\nTribunal: Refugee Prot...,"{'country': 'Sudan', 'case_type': 'Political O...",Sudan,Political Opinion,Siobhan Yorgun
1,2022 RLLR 30,VC2-07212,RLLR,2022,,en,2022/11/28,https://refugeelab.ca/rllr/2022rllr30,2023-11-30,Citation: 2022 RLLR 30\nTribunal: Refugee Prot...,"{'country': 'Barbados', 'case_type': 'PSG: Gen...",Barbados,PSG: Gender Based Violence,Nick Bower
2,2022 RLLR 29,VC2-06405,RLLR,2022,,en,2022/11/09,https://refugeelab.ca/rllr/2022rllr29,2023-11-30,Citation: 2022 RLLR 29\nTribunal: Refugee Prot...,"{'country': 'Ukraine', 'case_type': 'PSG: Gend...",Ukraine,PSG: Gender Based Violence,Hannah Gray
3,2022 RLLR 28,VC2-06395,RLLR,2022,,en,2022/09/28,https://refugeelab.ca/rllr/2022rllr28,2023-11-30,Citation: 2022 RLLR 28\nTribunal: Refugee Prot...,"{'country': 'India', 'case_type': 'Political O...",India,Political Opinion,Kylee Carreno
4,2022 RLLR 27,VC2-05006,RLLR,2022,,en,2022/08/30,https://refugeelab.ca/rllr/2022rllr27,2023-11-30,Citation: 2022 RLLR 27\nTribunal: Refugee Prot...,"{'country': 'Lebanon', 'case_type': 'Political...",Lebanon,Political Opinion,Kay Scorer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
533,2019 RLLR 5,MB7-21566,RLLR,2019,,en,2019/10/17,https://refugeelab.ca/rllr/2019rllr5,2023-11-30,Citation: 2019 RLLR 5\nTribunal: Refugee Prote...,"{'country': 'Haiti', 'case_type': 'PSG: SOGIE'...",Haiti,PSG: SOGIE,Ethan McMonagle
534,2019 RLLR 4,MB7-18975,RLLR,2019,,en,2019/07/04,https://refugeelab.ca/rllr/2019rllr4,2023-11-30,Citation: 2019 RLLR 4\nTribunal: Refugee Prote...,"{'country': 'Haiti', 'case_type': 'PSG: Gender...",Haiti,PSG: Gender Based Violence,Nicole Ginsberg
535,2019 RLLR 3,MB7-18354,RLLR,2019,,en,2019/09/10,https://refugeelab.ca/rllr/2019rllr3,2023-11-30,Citation: 2019 RLLR 3\nTribunal: Refugee Prote...,"{'country': 'Haiti', 'case_type': 'No Nexus: C...",Haiti,No Nexus: Criminality/Corruption,Me Jean-Guy Jam
536,2019 RLLR 2,TB9-01394,RLLR,2019,,en,2019/12/23,https://refugeelab.ca/rllr/2019rllr2,2023-11-30,Citation: 2019 RLLR 2\nTribunal: Refugee Prote...,"{'country': 'Nigeria', 'case_type': 'PSG: SOGI...",Nigeria,PSG: SOGIE,Marcelle Bourassa


In [13]:
# function to extract rpd_number from text
def get_rpd_number(text):
    """Return what follows the first 'RPD Number:' on its line.

    Args:
        text: The decision text, or a missing value.

    Returns:
        The rest of the line after the first 'RPD Number:' with surrounding
        whitespace stripped, or None if `text` is not a string or contains
        no 'RPD Number:'.
    """
    if not isinstance(text, str):
        return None

    # '.' never matches a newline, so the group stops at the end of the line;
    # not requiring the newline itself also handles an 'RPD Number:' on the
    # last, unterminated line (where the old pattern found nothing and crashed)
    match = re.search(r'RPD Number:(.*)', text)
    if match is None:
        return None
    return match.group(1).strip()

# re-extract the RPD number from the decision text, with a progress bar
df['rpd_number'] = df['unofficial_text'].progress_apply(get_rpd_number)

# show the rows where the number found in the text differs from citation2
df.loc[df['rpd_number'] != df['citation2']]

100%|██████████| 538/538 [00:00<00:00, 536376.41it/s]


Unnamed: 0,citation,citation2,dataset,year,name,language,document_date,source_url,scraped_timestamp,unofficial_text,other,country,case_type,member,rpd_number


In [14]:
# preview the first five rows of the verified frame
df.head(n=5)


Unnamed: 0,citation,citation2,dataset,year,name,language,document_date,source_url,scraped_timestamp,unofficial_text,other,country,case_type,member,rpd_number
0,2022 RLLR 31,VC1-06798,RLLR,2022,,en,2022/02/03,https://refugeelab.ca/rllr/2022rllr31,2023-11-30,Citation: 2022 RLLR 31\nTribunal: Refugee Prot...,"{'country': 'Sudan', 'case_type': 'Political O...",Sudan,Political Opinion,Siobhan Yorgun,VC1-06798
1,2022 RLLR 30,VC2-07212,RLLR,2022,,en,2022/11/28,https://refugeelab.ca/rllr/2022rllr30,2023-11-30,Citation: 2022 RLLR 30\nTribunal: Refugee Prot...,"{'country': 'Barbados', 'case_type': 'PSG: Gen...",Barbados,PSG: Gender Based Violence,Nick Bower,VC2-07212
2,2022 RLLR 29,VC2-06405,RLLR,2022,,en,2022/11/09,https://refugeelab.ca/rllr/2022rllr29,2023-11-30,Citation: 2022 RLLR 29\nTribunal: Refugee Prot...,"{'country': 'Ukraine', 'case_type': 'PSG: Gend...",Ukraine,PSG: Gender Based Violence,Hannah Gray,VC2-06405
3,2022 RLLR 28,VC2-06395,RLLR,2022,,en,2022/09/28,https://refugeelab.ca/rllr/2022rllr28,2023-11-30,Citation: 2022 RLLR 28\nTribunal: Refugee Prot...,"{'country': 'India', 'case_type': 'Political O...",India,Political Opinion,Kylee Carreno,VC2-06395
4,2022 RLLR 27,VC2-05006,RLLR,2022,,en,2022/08/30,https://refugeelab.ca/rllr/2022rllr27,2023-11-30,Citation: 2022 RLLR 27\nTribunal: Refugee Prot...,"{'country': 'Lebanon', 'case_type': 'Political...",Lebanon,Political Opinion,Kay Scorer,VC2-05006
