## Notebook to load and analyze Refugee Law Lab Reporter data

Requirements

    pip install pandas

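If using parquet:

    pip install pyarrow

If loading remotely from GitHub (Options 3 and 4):

    pip install requests

If loading from Hugging Face (Option 5):

    pip install datasets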

(produced with Python 3.9.12)

## Load Data

#### Five options: Local (parquet & JSON), Remote from GitHub (parquet & JSON) & Hugging Face

In [None]:
# OPTION 1: Read the parquet file from a local clone of the repo

# Prerequisite: clone the git repo first.
# The path below is relative to the current working directory.

import pandas as pd

parquet_file = 'DATA/rllr_cases.parquet'
df = pd.read_parquet(parquet_file)

# (if this fails, pass engine='pyarrow' to read_parquet())

In [None]:
# OPTION 2: Load JSON data locally via cloned repo

# First, clone git repo
# Then run this code to load data

import pandas as pd
import json
import pathlib

# Set path to data (relative to the current working directory)
data_path = pathlib.Path('DATA/YEARLY/')

# Set variables
start_year = 2019  # First year of data sought (2019 +)
end_year = 2022  # Last year of data sought (2022 -)

# load data: one file per year, each expected to hold a JSON array of records
results = []
for year in range(start_year, end_year+1):
    # Read as UTF-8 explicitly: without it, open() uses the platform default
    # encoding, which can fail on or garble non-ASCII characters in the text.
    with open(data_path / f'{year}.json', encoding='utf-8') as f:
        results.extend(json.load(f))

# convert to dataframe (one row per record)
df = pd.DataFrame(results)

In [None]:
# OPTION 3: Load parquet data remotely from GitHub without cloning repo

import pandas as pd
import requests
from io import BytesIO

url = 'https://github.com/Refugee-Law-Lab/rllr_bulk_data/raw/master/DATA/rllr_cases.parquet'

# load data
# A timeout stops the cell from hanging indefinitely on a stalled connection.
results = requests.get(url, timeout=60)
# Fail here with a clear HTTP error instead of handing an error page
# to read_parquet(), which would raise a misleading parse error.
results.raise_for_status()

# convert to dataframe (the downloaded bytes are wrapped in an in-memory file)
df = pd.read_parquet(BytesIO(results.content))

# (if code fails, add engine='pyarrow' to read_parquet() function)

In [3]:
# OPTION 4: Load json data remotely from GitHub without cloning repo

import pandas as pd
import requests

# Set variables
start_year = 2019  # First year of data sought (2019 +)
end_year = 2022  # Last year of data sought (2022 -)

base_url = 'https://raw.githubusercontent.com/Refugee-Law-Lab/rllr_bulk_data/master/DATA/YEARLY/'

# load data: one request per year, each expected to return a JSON array of records
results = []
for year in range(start_year, end_year+1):
    url = base_url + f'{year}.json'
    # A timeout stops the cell from hanging indefinitely on a stalled connection.
    response = requests.get(url, timeout=60)
    # Surface HTTP errors (e.g. a year with no file) directly, rather than
    # as a JSON decoding error on the error page.
    response.raise_for_status()
    results.extend(response.json())

# convert to dataframe (one row per record)
df = pd.DataFrame(results)

In [None]:
# OPTION 5: Load Hugging Face dataset

from datasets import load_dataset
import pandas as pd

# data_dir selects the collection within the Hugging Face dataset; "RLLR" is
# the Refugee Law Lab Reporter subset that the rest of this notebook works with.
# NOTE(review): confirm the subset name against the dataset card.
dataset = load_dataset("refugee-law-lab/canadian-legal-data", split="train", data_dir="RLLR")

# convert to dataframe
# (use the dataset loaded above, not `results` from another option's cell)
df = dataset.to_pandas()


In [4]:
# Display the DataFrame loaded by one of the option cells above
df

Unnamed: 0,citation,citation2,dataset,year,name,language,document_date,source_url,scraped_timestamp,unofficial_text,other
0,2019 RLLR 220,TB6-04576,RLLR,2019,,en,2019/05/14,https://refugeelab.ca/rllr/2019rllr220,2023-07-19,Citation: 2019 RLLR 220\nTribunal: Refugee Pro...,"{""country"": ""Somalia"", ""case_type"": ""Religion""..."
1,2019 RLLR 219,TB6-07934,RLLR,2019,,en,2019/01/15,https://refugeelab.ca/rllr/2019rllr219,2023-07-19,Citation: 2019 RLLR 219\nTribunal: Refugee Pro...,"{""country"": ""Iraq"", ""case_type"": ""Religion"", ""..."
2,2019 RLLR 218,TB8-15170,RLLR,2019,,en,2019/10/22,https://refugeelab.ca/rllr/2019rllr218,2023-07-19,Citation: 2019 RLLR 218\nTribunal: Refugee Pro...,"{""country"": ""Mexico"", ""case_type"": ""PSG: Other..."
3,2019 RLLR 217,TB8-07871,RLLR,2019,,en,2019/05/08,https://refugeelab.ca/rllr/2019rllr217,2023-07-19,Citation: 2019 RLLR 217\nTribunal: Refugee Pro...,"{""country"": ""Angola"", ""case_type"": ""Race/Ethni..."
4,2019 RLLR 216,TB9-14438,RLLR,2019,,en,2019/10/02,https://refugeelab.ca/rllr/2019rllr216,2023-07-19,Citation: 2019 RLLR 216\nTribunal: Refugee Pro...,"{""country"": ""China"", ""case_type"": ""Race/Ethnic..."
...,...,...,...,...,...,...,...,...,...,...,...
475,2021 RLLR 4,TB9-15167,RLLR,2021,,en,2021/05/05,https://refugeelab.ca/rllr/2021rllr4,2023-07-19,Citation: 2021 RLLR 4\nTribunal: Refugee Prote...,"{""country"": ""Zimbabwe"", ""case_type"": ""Politica..."
476,2021 RLLR 3,MB8-07585,RLLR,2021,,en,2021/01/18,https://refugeelab.ca/rllr/2021rllr3,2023-07-19,Citation: 2021 RLLR 3\nTribunal: Refugee Prote...,"{""country"": ""South Africa"", ""case_type"": ""PSG:..."
477,2021 RLLR 2,TB9-27084,RLLR,2021,,en,2021/05/12,https://refugeelab.ca/rllr/2021rllr2,2023-07-19,Citation: 2021 RLLR 2\nTribunal: Refugee Prote...,"{""country"": ""Uganda"", ""case_type"": ""Religion"",..."
478,2021 RLLR 1,TB9-17419,RLLR,2021,,en,2021/06/15,https://refugeelab.ca/rllr/2021rllr1,2023-07-19,Citation: 2021 RLLR 1\nTribunal: Refugee Prote...,"{""country"": ""Pakistan"", ""case_type"": ""Religion..."


## Working with the data

In [5]:
import json
# to access the data in the "other" column directly, use this code:

# Parse each JSON string in "other" and expand its keys into columns.
# The guard makes the cell safe to re-run: once "other" has been dropped,
# running it again just redisplays the frame instead of raising KeyError.
if 'other' in df.columns:
    other_fields = ['country', 'case_type', 'member']
    other_parsed = df['other'].apply(json.loads)
    # Build the new columns in one step and select them by key name, so the
    # result does not depend on the order of keys inside each JSON object.
    other_expanded = pd.DataFrame(other_parsed.tolist(), index=df.index)
    df[other_fields] = other_expanded.reindex(columns=other_fields)
    df = df.drop(columns=['other'])
df

Unnamed: 0,citation,citation2,dataset,year,name,language,document_date,source_url,scraped_timestamp,unofficial_text,country,case_type,member
0,2019 RLLR 220,TB6-04576,RLLR,2019,,en,2019/05/14,https://refugeelab.ca/rllr/2019rllr220,2023-07-19,Citation: 2019 RLLR 220\nTribunal: Refugee Pro...,Somalia,Religion,A. da Silva
1,2019 RLLR 219,TB6-07934,RLLR,2019,,en,2019/01/15,https://refugeelab.ca/rllr/2019rllr219,2023-07-19,Citation: 2019 RLLR 219\nTribunal: Refugee Pro...,Iraq,Religion,M. Lalonde
2,2019 RLLR 218,TB8-15170,RLLR,2019,,en,2019/10/22,https://refugeelab.ca/rllr/2019rllr218,2023-07-19,Citation: 2019 RLLR 218\nTribunal: Refugee Pro...,Mexico,PSG: Other,David D’Intino
3,2019 RLLR 217,TB8-07871,RLLR,2019,,en,2019/05/08,https://refugeelab.ca/rllr/2019rllr217,2023-07-19,Citation: 2019 RLLR 217\nTribunal: Refugee Pro...,Angola,Race/Ethnicity/Nationality,R. Jackson
4,2019 RLLR 216,TB9-14438,RLLR,2019,,en,2019/10/02,https://refugeelab.ca/rllr/2019rllr216,2023-07-19,Citation: 2019 RLLR 216\nTribunal: Refugee Pro...,China,Race/Ethnicity/Nationality,S. Morgan
...,...,...,...,...,...,...,...,...,...,...,...,...,...
475,2021 RLLR 4,TB9-15167,RLLR,2021,,en,2021/05/05,https://refugeelab.ca/rllr/2021rllr4,2023-07-19,Citation: 2021 RLLR 4\nTribunal: Refugee Prote...,Zimbabwe,Political Opinion,Meredith Rose
476,2021 RLLR 3,MB8-07585,RLLR,2021,,en,2021/01/18,https://refugeelab.ca/rllr/2021rllr3,2023-07-19,Citation: 2021 RLLR 3\nTribunal: Refugee Prote...,South Africa,PSG: Gender Based Violence,Nalong Manivong
477,2021 RLLR 2,TB9-27084,RLLR,2021,,en,2021/05/12,https://refugeelab.ca/rllr/2021rllr2,2023-07-19,Citation: 2021 RLLR 2\nTribunal: Refugee Prote...,Uganda,Religion,Suraj Balakrishnan
478,2021 RLLR 1,TB9-17419,RLLR,2021,,en,2021/06/15,https://refugeelab.ca/rllr/2021rllr1,2023-07-19,Citation: 2021 RLLR 1\nTribunal: Refugee Prote...,Pakistan,Religion,Rodrick Flynn
