## Notebook to load and analyze Refugee Law Lab Reporter data

Requirements

    pip install pandas

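If using parquet:

    pip install pyarrow

If loading remotely from GitHub (Options 3 & 4):

    pip install requests

If loading from Hugging Face (Option 5):

    pip install datasets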

(produced with Python 3.9.12)

## Load Data

#### Five options: Local (parquet & JSON), Remote (parquet & JSON) & Hugging Face

In [1]:
# OPTION 1: Load parquet data locally via cloned repo

# Prerequisite: clone the git repo, then run this cell from the repo root
# so that the relative path below resolves.

import pandas as pd

# Location of the combined parquet file inside the cloned repo
parquet_path = 'DATA/rllr_cases.parquet'

# Read the file into a dataframe and preview the first rows
df = pd.read_parquet(parquet_path)
df.head()

# (if code fails, add engine='pyarrow' to read_parquet() function)

Unnamed: 0,citation,citation2,dataset,year,name,language,document_date,source_url,scraped_timestamp,unofficial_text,other
0,2022 RLLR 32,VC1-05483,RLLR,2022,,en,2022/03/08,https://refugeelab.ca/rllr/2022rllr32,2024-04-11,Citation: 2022 RLLR 32\nTribunal: Refugee Prot...,"{""country"": ""Mexico"", ""case_type"": ""PSG: Gende..."
1,2022 RLLR 31,VC1-06798,RLLR,2022,,en,2022/02/03,https://refugeelab.ca/rllr/2022rllr31,2024-04-11,Citation: 2022 RLLR 31\nTribunal: Refugee Prot...,"{""country"": ""Sudan"", ""case_type"": ""Political O..."
2,2022 RLLR 30,VC2-07212,RLLR,2022,,en,2022/11/28,https://refugeelab.ca/rllr/2022rllr30,2024-04-11,Citation: 2022 RLLR 30\nTribunal: Refugee Prot...,"{""country"": ""Barbados"", ""case_type"": ""PSG: Gen..."
3,2022 RLLR 29,VC2-06405,RLLR,2022,,en,2022/11/09,https://refugeelab.ca/rllr/2022rllr29,2024-04-11,Citation: 2022 RLLR 29\nTribunal: Refugee Prot...,"{""country"": ""Ukraine"", ""case_type"": ""PSG: Gend..."
4,2022 RLLR 28,VC2-06395,RLLR,2022,,en,2022/09/28,https://refugeelab.ca/rllr/2022rllr28,2024-04-11,Citation: 2022 RLLR 28\nTribunal: Refugee Prot...,"{""country"": ""India"", ""case_type"": ""Political O..."


In [None]:
# OPTION 2: Load JSON data locally via cloned repo

# First, clone git repo
# Then run this code to load data

import pandas as pd
import json
import pathlib

# Set path to data (relative to the root of the cloned repo)
data_path = pathlib.Path('DATA/YEARLY/')

# Set variables
start_year = 2019  # First year of data sought (2019 +)
end_year = 2023  # Last year of data sought (2023 -)

# load data: one JSON file per year, each contributing its records to `results`
results = []
for year in range(start_year, end_year+1):
    # Decode explicitly as UTF-8: without `encoding`, open() falls back to the
    # locale default (e.g. cp1252 on Windows), which can fail or garble
    # non-ASCII characters in the decision text.
    with open(data_path / f'{year}.json', encoding='utf-8') as f:
        results.extend(json.load(f))

# convert to dataframe
df = pd.DataFrame(results)
df.head()

Unnamed: 0,citation,citation2,dataset,year,name,language,document_date,source_url,scraped_timestamp,unofficial_text,other
0,2019 RLLR 220,TB6-04576,RLLR,2019,,en,2019/05/14,https://refugeelab.ca/rllr/2019rllr220,2024-04-11,Citation: 2019 RLLR 220\nTribunal: Refugee Pro...,"{""country"": ""Somalia"", ""case_type"": ""Religion""..."
1,2019 RLLR 219,TB6-07934,RLLR,2019,,en,2019/01/15,https://refugeelab.ca/rllr/2019rllr219,2024-04-11,Citation: 2019 RLLR 219\nTribunal: Refugee Pro...,"{""country"": ""Iraq"", ""case_type"": ""Religion"", ""..."
2,2019 RLLR 218,TB8-15170,RLLR,2019,,en,2019/10/22,https://refugeelab.ca/rllr/2019rllr218,2024-04-11,Citation: 2019 RLLR 218\nTribunal: Refugee Pro...,"{""country"": ""Mexico"", ""case_type"": ""PSG: Other..."
3,2019 RLLR 217,TB8-07871,RLLR,2019,,en,2019/05/08,https://refugeelab.ca/rllr/2019rllr217,2024-04-11,Citation: 2019 RLLR 217\nTribunal: Refugee Pro...,"{""country"": ""Angola"", ""case_type"": ""Race/Ethni..."
4,2019 RLLR 216,TB9-14438,RLLR,2019,,en,2019/10/02,https://refugeelab.ca/rllr/2019rllr216,2024-04-11,Citation: 2019 RLLR 216\nTribunal: Refugee Pro...,"{""country"": ""China"", ""case_type"": ""Race/Ethnic..."


In [3]:
# OPTION 3: Load parquet data remotely from GitHub without cloning repo

import pandas as pd
import requests
from io import BytesIO

url = 'https://github.com/Refugee-Law-Lab/rllr_bulk_data/raw/master/DATA/rllr_cases.parquet'

# load data
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
results = requests.get(url, timeout=60)
# Fail here with a clear HTTP error (404, 500, ...) instead of handing an
# error page to read_parquet(), which would raise an obscure parsing error.
results.raise_for_status()

# convert to dataframe (the parquet bytes are read from memory, not from disk)
df = pd.read_parquet(BytesIO(results.content))
df.head()

# (if code fails, add engine='pyarrow' to read_parquet() function)

Unnamed: 0,citation,citation2,dataset,year,name,language,document_date,source_url,scraped_timestamp,unofficial_text,other
0,2022 RLLR 32,VC1-05483,RLLR,2022,,en,2022/03/08,https://refugeelab.ca/rllr/2022rllr32,2024-04-11,Citation: 2022 RLLR 32\nTribunal: Refugee Prot...,"{""country"": ""Mexico"", ""case_type"": ""PSG: Gende..."
1,2022 RLLR 31,VC1-06798,RLLR,2022,,en,2022/02/03,https://refugeelab.ca/rllr/2022rllr31,2024-04-11,Citation: 2022 RLLR 31\nTribunal: Refugee Prot...,"{""country"": ""Sudan"", ""case_type"": ""Political O..."
2,2022 RLLR 30,VC2-07212,RLLR,2022,,en,2022/11/28,https://refugeelab.ca/rllr/2022rllr30,2024-04-11,Citation: 2022 RLLR 30\nTribunal: Refugee Prot...,"{""country"": ""Barbados"", ""case_type"": ""PSG: Gen..."
3,2022 RLLR 29,VC2-06405,RLLR,2022,,en,2022/11/09,https://refugeelab.ca/rllr/2022rllr29,2024-04-11,Citation: 2022 RLLR 29\nTribunal: Refugee Prot...,"{""country"": ""Ukraine"", ""case_type"": ""PSG: Gend..."
4,2022 RLLR 28,VC2-06395,RLLR,2022,,en,2022/09/28,https://refugeelab.ca/rllr/2022rllr28,2024-04-11,Citation: 2022 RLLR 28\nTribunal: Refugee Prot...,"{""country"": ""India"", ""case_type"": ""Political O..."


In [None]:
# OPTION 4: Load json data remotely from GitHub without cloning repo

import pandas as pd
import requests

# Set variables
start_year = 2019  # First year of data sought (2019 +)
end_year = 2023  # Last year of data sought (2023 -)

# Folder holding one JSON file per year; the file name is appended below
base_url = 'https://raw.githubusercontent.com/Refugee-Law-Lab/rllr_bulk_data/master/DATA/YEARLY/'

# load data
results = []
for year in range(start_year, end_year+1):
    url = base_url + f'{year}.json'
    # A timeout keeps the cell from hanging indefinitely on a stalled connection.
    response = requests.get(url, timeout=60)
    # Stop with a clear HTTP error (e.g. 404 for a year with no file) rather
    # than a JSON decoding error raised on an error page.
    response.raise_for_status()
    results.extend(response.json())

# convert to dataframe
df = pd.DataFrame(results)
df.head()

Unnamed: 0,citation,citation2,dataset,year,name,language,document_date,source_url,scraped_timestamp,unofficial_text,other
0,2019 RLLR 220,TB6-04576,RLLR,2019,,en,2019/05/14,https://refugeelab.ca/rllr/2019rllr220,2024-04-11,Citation: 2019 RLLR 220\nTribunal: Refugee Pro...,"{""country"": ""Somalia"", ""case_type"": ""Religion""..."
1,2019 RLLR 219,TB6-07934,RLLR,2019,,en,2019/01/15,https://refugeelab.ca/rllr/2019rllr219,2024-04-11,Citation: 2019 RLLR 219\nTribunal: Refugee Pro...,"{""country"": ""Iraq"", ""case_type"": ""Religion"", ""..."
2,2019 RLLR 218,TB8-15170,RLLR,2019,,en,2019/10/22,https://refugeelab.ca/rllr/2019rllr218,2024-04-11,Citation: 2019 RLLR 218\nTribunal: Refugee Pro...,"{""country"": ""Mexico"", ""case_type"": ""PSG: Other..."
3,2019 RLLR 217,TB8-07871,RLLR,2019,,en,2019/05/08,https://refugeelab.ca/rllr/2019rllr217,2024-04-11,Citation: 2019 RLLR 217\nTribunal: Refugee Pro...,"{""country"": ""Angola"", ""case_type"": ""Race/Ethni..."
4,2019 RLLR 216,TB9-14438,RLLR,2019,,en,2019/10/02,https://refugeelab.ca/rllr/2019rllr216,2024-04-11,Citation: 2019 RLLR 216\nTribunal: Refugee Pro...,"{""country"": ""China"", ""case_type"": ""Race/Ethnic..."


In [5]:
# OPTION 5: Load Hugging Face dataset

from datasets import load_dataset
import pandas as pd

# Fetch the "train" split of the "RLLR" configuration of the dataset
dataset = load_dataset(
    path="refugee-law-lab/canadian-legal-data",
    name="RLLR",
    split="train",
)

# Turn the loaded dataset into a dataframe and preview the first rows
df = pd.DataFrame(dataset)
df.head()


Downloading data:   0%|          | 0.00/3.19M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/539 [00:00<?, ? examples/s]

Unnamed: 0,citation,citation2,dataset,year,name,language,document_date,source_url,scraped_timestamp,unofficial_text,other
0,2022 RLLR 32,VC1-05483,RLLR,2022,,en,2022/03/08,https://refugeelab.ca/rllr/2022rllr32,2024-04-11,Citation: 2022 RLLR 32\nTribunal: Refugee Prot...,"{""country"": ""Mexico"", ""case_type"": ""PSG: Gende..."
1,2022 RLLR 31,VC1-06798,RLLR,2022,,en,2022/02/03,https://refugeelab.ca/rllr/2022rllr31,2024-04-11,Citation: 2022 RLLR 31\nTribunal: Refugee Prot...,"{""country"": ""Sudan"", ""case_type"": ""Political O..."
2,2022 RLLR 30,VC2-07212,RLLR,2022,,en,2022/11/28,https://refugeelab.ca/rllr/2022rllr30,2024-04-11,Citation: 2022 RLLR 30\nTribunal: Refugee Prot...,"{""country"": ""Barbados"", ""case_type"": ""PSG: Gen..."
3,2022 RLLR 29,VC2-06405,RLLR,2022,,en,2022/11/09,https://refugeelab.ca/rllr/2022rllr29,2024-04-11,Citation: 2022 RLLR 29\nTribunal: Refugee Prot...,"{""country"": ""Ukraine"", ""case_type"": ""PSG: Gend..."
4,2022 RLLR 28,VC2-06395,RLLR,2022,,en,2022/09/28,https://refugeelab.ca/rllr/2022rllr28,2024-04-11,Citation: 2022 RLLR 28\nTribunal: Refugee Prot...,"{""country"": ""India"", ""case_type"": ""Political O..."


In [6]:
# Display the dataframe produced by whichever load option was run above
df

Unnamed: 0,citation,citation2,dataset,year,name,language,document_date,source_url,scraped_timestamp,unofficial_text,other
0,2022 RLLR 32,VC1-05483,RLLR,2022,,en,2022/03/08,https://refugeelab.ca/rllr/2022rllr32,2024-04-11,Citation: 2022 RLLR 32\nTribunal: Refugee Prot...,"{""country"": ""Mexico"", ""case_type"": ""PSG: Gende..."
1,2022 RLLR 31,VC1-06798,RLLR,2022,,en,2022/02/03,https://refugeelab.ca/rllr/2022rllr31,2024-04-11,Citation: 2022 RLLR 31\nTribunal: Refugee Prot...,"{""country"": ""Sudan"", ""case_type"": ""Political O..."
2,2022 RLLR 30,VC2-07212,RLLR,2022,,en,2022/11/28,https://refugeelab.ca/rllr/2022rllr30,2024-04-11,Citation: 2022 RLLR 30\nTribunal: Refugee Prot...,"{""country"": ""Barbados"", ""case_type"": ""PSG: Gen..."
3,2022 RLLR 29,VC2-06405,RLLR,2022,,en,2022/11/09,https://refugeelab.ca/rllr/2022rllr29,2024-04-11,Citation: 2022 RLLR 29\nTribunal: Refugee Prot...,"{""country"": ""Ukraine"", ""case_type"": ""PSG: Gend..."
4,2022 RLLR 28,VC2-06395,RLLR,2022,,en,2022/09/28,https://refugeelab.ca/rllr/2022rllr28,2024-04-11,Citation: 2022 RLLR 28\nTribunal: Refugee Prot...,"{""country"": ""India"", ""case_type"": ""Political O..."
...,...,...,...,...,...,...,...,...,...,...,...
534,2019 RLLR 5,MB7-21566,RLLR,2019,,en,2019/10/17,https://refugeelab.ca/rllr/2019rllr5,2024-04-11,Citation: 2019 RLLR 5\nTribunal: Refugee Prote...,"{""country"": ""Haiti"", ""case_type"": ""PSG: SOGIE""..."
535,2019 RLLR 4,MB7-18975,RLLR,2019,,en,2019/07/04,https://refugeelab.ca/rllr/2019rllr4,2024-04-11,Citation: 2019 RLLR 4\nTribunal: Refugee Prote...,"{""country"": ""Haiti"", ""case_type"": ""PSG: Gender..."
536,2019 RLLR 3,MB7-18354,RLLR,2019,,en,2019/09/10,https://refugeelab.ca/rllr/2019rllr3,2024-04-11,Citation: 2019 RLLR 3\nTribunal: Refugee Prote...,"{""country"": ""Haiti"", ""case_type"": ""No Nexus: C..."
537,2019 RLLR 2,TB9-01394,RLLR,2019,,en,2019/12/23,https://refugeelab.ca/rllr/2019rllr2,2024-04-11,Citation: 2019 RLLR 2\nTribunal: Refugee Prote...,"{""country"": ""Nigeria"", ""case_type"": ""PSG: SOGI..."


## Working with the data

In [7]:
import json
# to access the data in the "other" column directly, use this code:

# Keys of the JSON string held in "other" that become their own columns
other_keys = ['country', 'case_type', 'member']

# Guard so the cell can be re-run safely: after the first run the "other"
# column has already been expanded and dropped.
if 'other' in df.columns:
    # Parse each JSON string into a dict and build one column per key,
    # keeping the rows aligned with df through its index.
    other_expanded = pd.DataFrame(df['other'].map(json.loads).tolist(), index=df.index)
    # Pick columns by key name rather than by position, so the result does not
    # depend on the key order inside the JSON; a key absent from every row
    # yields an all-NaN column.
    other_expanded = other_expanded.reindex(columns=other_keys)
    # Build a new frame (no in-place mutation): drop "other" and append the
    # expanded columns at the end.
    df = df.drop(columns=['other']).assign(
        **{key: other_expanded[key] for key in other_keys}
    )
df

Unnamed: 0,citation,citation2,dataset,year,name,language,document_date,source_url,scraped_timestamp,unofficial_text,country,case_type,member
0,2022 RLLR 32,VC1-05483,RLLR,2022,,en,2022/03/08,https://refugeelab.ca/rllr/2022rllr32,2024-04-11,Citation: 2022 RLLR 32\nTribunal: Refugee Prot...,Mexico,PSG: Gender Based Violence,Kari Schroeder
1,2022 RLLR 31,VC1-06798,RLLR,2022,,en,2022/02/03,https://refugeelab.ca/rllr/2022rllr31,2024-04-11,Citation: 2022 RLLR 31\nTribunal: Refugee Prot...,Sudan,Political Opinion,Siobhan Yorgun
2,2022 RLLR 30,VC2-07212,RLLR,2022,,en,2022/11/28,https://refugeelab.ca/rllr/2022rllr30,2024-04-11,Citation: 2022 RLLR 30\nTribunal: Refugee Prot...,Barbados,PSG: Gender Based Violence,Nick Bower
3,2022 RLLR 29,VC2-06405,RLLR,2022,,en,2022/11/09,https://refugeelab.ca/rllr/2022rllr29,2024-04-11,Citation: 2022 RLLR 29\nTribunal: Refugee Prot...,Ukraine,PSG: Gender Based Violence,Hannah Gray
4,2022 RLLR 28,VC2-06395,RLLR,2022,,en,2022/09/28,https://refugeelab.ca/rllr/2022rllr28,2024-04-11,Citation: 2022 RLLR 28\nTribunal: Refugee Prot...,India,Political Opinion,Kylee Carreno
...,...,...,...,...,...,...,...,...,...,...,...,...,...
534,2019 RLLR 5,MB7-21566,RLLR,2019,,en,2019/10/17,https://refugeelab.ca/rllr/2019rllr5,2024-04-11,Citation: 2019 RLLR 5\nTribunal: Refugee Prote...,Haiti,PSG: SOGIE,Ethan McMonagle
535,2019 RLLR 4,MB7-18975,RLLR,2019,,en,2019/07/04,https://refugeelab.ca/rllr/2019rllr4,2024-04-11,Citation: 2019 RLLR 4\nTribunal: Refugee Prote...,Haiti,PSG: Gender Based Violence,Nicole Ginsberg
536,2019 RLLR 3,MB7-18354,RLLR,2019,,en,2019/09/10,https://refugeelab.ca/rllr/2019rllr3,2024-04-11,Citation: 2019 RLLR 3\nTribunal: Refugee Prote...,Haiti,No Nexus: Criminality/Corruption,Me Jean-Guy Jam
537,2019 RLLR 2,TB9-01394,RLLR,2019,,en,2019/12/23,https://refugeelab.ca/rllr/2019rllr2,2024-04-11,Citation: 2019 RLLR 2\nTribunal: Refugee Prote...,Nigeria,PSG: SOGIE,Marcelle Bourassa
