In [36]:
import re
import time
from io import StringIO
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

# set up progress bar
from tqdm import tqdm
tqdm.pandas()

In [31]:
# get table from RLLR Website
url = 'https://refugeelab.ca/rllr/'
# a timeout keeps the cell from hanging forever if the site stops responding
r = requests.get(url, timeout=30)
print(r.status_code)
# fail here on an HTTP error instead of parsing an error page as if it were the index
r.raise_for_status()

# get first table
soup = BeautifulSoup(r.text, 'html.parser')
table = soup.find('table')
if table is None:
    raise ValueError(f'no <table> element found at {url}')

# put table into dataframe
# (wrapped in StringIO because passing literal HTML to read_html is deprecated)
df = pd.read_html(StringIO(str(table)))[0]

# add link to each row: the citation with spaces removed and lower-cased
# is appended to the base URL, e.g. '2022 RLLR 1' -> base_url + '2022rllr1'
base_url = url
df['Link'] = base_url + df['Citation & Link'].str.replace(' ', '', regex=False).str.lower()

# drop the hidden sort column and give the citation / date columns short names
df = (
    df
    .drop(columns='Citation Sort (Hidden)')
    .rename(columns={
        'Citation & Link': 'citation',
        'Date of Decision': 'decision_date',
    })
)

# for all column names, replace spaces with underscores and lower
# (so 'RPD Number' becomes 'rpd_number', 'Link' becomes 'link', ...)
df.columns = [col.lower().replace(' ', '_') for col in df.columns]

df


200


Unnamed: 0,Citation & Link,Country,Case Type,Date of Decision,RPD Number,Link
0,2022 RLLR 1,Nigeria,PSG: Gender Based Violence,2022/04/29,TB8-20107,https://refugeelab.ca/rllr/2022rllr1
1,2021 RLLR 76,Nigeria,PSG: SOGIE,2021/12/21,VC1-06497,https://refugeelab.ca/rllr/2021rllr76
2,2021 RLLR 75,South Korea,PSG: SOGIE,2021/11/19,VC1-05121,https://refugeelab.ca/rllr/2021rllr75
3,2021 RLLR 74,Malawi,PSG: SOGIE,2021/06/14,VC1-02500,https://refugeelab.ca/rllr/2021rllr74
4,2021 RLLR 73,Iran,PSG: SOGIE,2021/10/30,VC1-02405,https://refugeelab.ca/rllr/2021rllr73
...,...,...,...,...,...,...
475,2019 RLLR 5,Haiti,PSG: SOGIE,2019/10/17,MB7-21566,https://refugeelab.ca/rllr/2019rllr5
476,2019 RLLR 4,Haiti,PSG: Gender Based Violence,2019/07/04,MB7-18975,https://refugeelab.ca/rllr/2019rllr4
477,2019 RLLR 3,Haiti,No Nexus: Criminality/Corruption,2019/09/10,MB7-18354,https://refugeelab.ca/rllr/2019rllr3
478,2019 RLLR 2,Nigeria,PSG: SOGIE,2019/12/23,TB9-01394,https://refugeelab.ca/rllr/2019rllr2


In [34]:
# function to get the html from the link
def get_html(link, timeout=30, delay=.25):
    """Download one page and return its HTML, or None if it could not be fetched.

    Parameters
    ----------
    link : str
        URL of the page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up on this page.
    delay : float, optional
        Seconds to sleep after the request, to avoid hammering the server.

    Returns
    -------
    str or None
        The response body when the status code is 200; None for any other
        status code or when the request itself fails.
    """
    try:
        r = requests.get(link, timeout=timeout)
    except requests.RequestException:
        # one unreachable page must not abort the whole scrape; None is already
        # the "no page" marker used for non-200 responses
        html = None
    else:
        html = r.text if r.status_code == 200 else None

    # pause after every request, not only the successful ones
    time.sleep(delay)
    return html

# download every case page (one request per row) and keep the raw HTML
df['html'] = df['link'].progress_apply(get_html)


100%|██████████| 480/480 [20:11<00:00,  2.52s/it]


In [35]:
# optional export of the scraped frame (it still holds the raw `html` column
# at this point) to JSON; left disabled, uncomment the next line to write it
# df.to_json('DATA/rllr_cases_raw.json', orient='records', indent=4)

# display the frame
df

In [78]:
# function to extract text from html
def get_text(html):
    """Return the cleaned text of a page's 'entry-content' div.

    <br> tags become newlines and a newline is inserted after every <p>, so
    paragraphs stay on separate lines. Non-breaking spaces become ordinary
    spaces, and runs of whitespace inside a line collapse to a single space.

    Parameters
    ----------
    html : str or None
        Raw page markup, or None when the page could not be downloaded.

    Returns
    -------
    str or None
        The extracted text, or None when `html` is None or the page has no
        <div class="entry-content"> element.
    """
    # if html is None, return None
    if html is None:
        return None

    soup = BeautifulSoup(html, 'html.parser')

    # convert <br> to new line to preserve paragraphs
    for br in soup.find_all('br'):
        br.replace_with('\n')

    # insert newline characters after each <p> tag to preserve paragraphs
    for p in soup.find_all('p'):
        p.insert_after('\n')

    # a page without the expected container would otherwise raise
    # AttributeError on `.text`; treat it like a page that was not fetched
    content = soup.find('div', {'class': 'entry-content'})
    if content is None:
        return None

    # extract text from class 'entry-content' (remove \xa0)
    text = content.text.replace('\xa0', ' ')

    # collapse whitespace within each line while keeping the line breaks
    cleaned_lines = (re.sub(r'\s+', ' ', line.strip()) for line in text.split('\n'))
    return '\n'.join(cleaned_lines)

# turn each downloaded page into cleaned plain text
df['text'] = df['html'].progress_apply(get_text)

# the raw markup is not needed once the text has been extracted
df.drop(columns='html', inplace=True)

100%|██████████| 480/480 [00:08<00:00, 58.92it/s]


In [79]:
# function to extract RPD_member from text
def get_member(text):
    """Return whatever follows 'Panel:' on its line, stripped of whitespace.

    Parameters
    ----------
    text : str or None
        Decision text as produced by `get_text`.

    Returns
    -------
    str or None
        The rest of the first line containing 'Panel:', or None when `text`
        is not a string (None, NaN) or has no 'Panel:' label.
    """
    if not isinstance(text, str):
        return None
    # '.' never matches a newline, so the group stops at the end of the line;
    # not requiring a trailing '\n' lets a label on the final line match too
    match = re.search(r'Panel:(.*)', text)
    return match.group(1).strip() if match else None

# pull the panel member's name out of each decision text
df['member'] = df['text'].progress_apply(get_member)

# function to extract rpd_number from text
def get_rpd_number(text):
    """Return whatever follows 'RPD Number:' on its line, stripped of whitespace.

    Parameters
    ----------
    text : str or None
        Decision text as produced by `get_text`.

    Returns
    -------
    str or None
        The rest of the first line containing 'RPD Number:', or None when
        `text` is not a string (None, NaN) or has no 'RPD Number:' label.
        The value is returned as written on the page, including any trailing
        punctuation.
    """
    if not isinstance(text, str):
        return None
    # '.' never matches a newline, so the group stops at the end of the line;
    # not requiring a trailing '\n' lets a label on the final line match too
    match = re.search(r'RPD Number:(.*)', text)
    return match.group(1).strip() if match else None

# RPD number as written inside the decision text (kept next to the index-table value)
df['rpd_number2'] = df['text'].progress_apply(get_rpd_number)


100%|██████████| 480/480 [00:00<00:00, 467006.71it/s]
100%|██████████| 480/480 [00:00<00:00, 480722.52it/s]


In [80]:
# show rows where the RPD number parsed from the decision text (rpd_number2)
# differs from the one in the index table; the column names were lower-cased
# and underscored when the table was loaded, so that value is in 'rpd_number'
# ('RPD Number' no longer exists and raises KeyError on a fresh run)
df[df['rpd_number2'] != df['rpd_number']]


Unnamed: 0,Citation & Link,Country,Case Type,Date of Decision,RPD Number,Link,text,member,rpd_number2
60,2021 RLLR 17,Russia,PSG: SOGIE,2021/07/09,VC0-03444,https://refugeelab.ca/rllr/2021rllr17,,,
70,2021 RLLR 7,Bahamas,PSG: SOGIE,2021/06/10,VC1-03029,https://refugeelab.ca/rllr/2021rllr7,Citation: 2021 RLLR 7\nTribunal: Refugee Prote...,Kristy Sim,VC1-03028
126,2020 RLLR 140,Venezuela,Political Opinion,2020/01/14,VB9-03821,https://refugeelab.ca/rllr/2020rllr140,Citation: 2020 RLLR 140\nTribunal: Refugee Pro...,Kari Schroeder,"VB9-03821,"


In [81]:
# export to json
out_path = Path('DATA/rllr_parsed.json')
# create DATA/ if it is missing so the export cannot fail after the long scrape
out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_json(out_path, orient='records', indent=4)