## Notebook to load and analyze Refugee Law Lab Reporter data

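Requirements

    pip install pandas

If using parquet (Options 1 and 3):

    pip install pyarrow

If loading remotely from GitHub (Options 3 and 4):

    pip install requests

If loading from Hugging Face (Option 5):

    pip install datasets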

(produced with Python 3.9.12)

## Load Data

#### Five options: Local (json & parquet), Remote (json & parquet) & Hugging Face

In [None]:
# OPTION 1: Read the parquet file from a local clone of the repo
#
# Prerequisite: clone the git repo first. The path below is relative, so it
# is resolved against the current working directory.

import pandas as pd

parquet_path = 'DATA/rllr_cases.parquet'
df = pd.read_parquet(parquet_path)

# Troubleshooting: if the read fails, pass engine='pyarrow' to read_parquet()

# Preview the first rows as the cell output
df.head()

In [None]:
# OPTION 2: Load JSON data locally via cloned repo

# First, clone git repo
# Then run this code to load data

import pandas as pd
import json
import pathlib

# Set path to data (relative, so it is resolved against the current working directory)
data_path = pathlib.Path('DATA/YEARLY/')

# Set variables
start_year = 2019  # First year of data sought (2019 +)
end_year = 2023  # Last year of data sought (2023 -)

# load data: one file per year, all items collected into a single list
results = []
for year in range(start_year, end_year+1):
    # Read explicitly as UTF-8. Without an encoding, open() falls back to the
    # platform's locale encoding, which can fail on or garble non-ASCII text
    # (for example on Windows).
    with open(data_path / f'{year}.json', encoding='utf-8') as f:
        # presumably each yearly file holds a JSON array of case records
        results.extend(json.load(f))

# convert to dataframe
df = pd.DataFrame(results)
df.head()

In [None]:
# OPTION 3: Load parquet data remotely from GitHub without cloning repo

import pandas as pd
import requests
from io import BytesIO

url = 'https://github.com/Refugee-Law-Lab/rllr_bulk_data/raw/master/DATA/rllr_cases.parquet'

# load data
# timeout: fail instead of hanging indefinitely if the server stops responding
results = requests.get(url, timeout=30)
# Stop here on an HTTP error (404, 5xx, ...). Otherwise the error page would be
# handed to read_parquet() and surface as a confusing parse failure.
results.raise_for_status()

# convert to dataframe (the downloaded bytes are wrapped in an in-memory file object)
df = pd.read_parquet(BytesIO(results.content))
df.head()

# (if code fails, add engine='pyarrow' to read_parquet() function)

In [None]:
# OPTION 4: Load json data remotely from GitHub without cloning repo

import pandas as pd
import requests

# Set variables
start_year = 2019  # First year of data sought (2019 +)
end_year = 2023  # Last year of data sought (2023 -)

base_url = 'https://raw.githubusercontent.com/Refugee-Law-Lab/rllr_bulk_data/master/DATA/YEARLY/'

# load data: one request per year, all items collected into a single list
results = []
for year in range(start_year, end_year+1):
    url = base_url + f'{year}.json'
    # timeout: fail instead of hanging indefinitely if the server stops responding
    response = requests.get(url, timeout=30)
    # Stop here on an HTTP error (404, 5xx, ...) so the failing URL is reported
    # rather than a JSON decode error on the error page.
    response.raise_for_status()
    # presumably each yearly file holds a JSON array of case records
    results.extend(response.json())

# convert to dataframe
df = pd.DataFrame(results)
df.head()

In [2]:
# OPTION 5: Pull the data from the Hugging Face hub

from datasets import load_dataset
import pandas as pd

# Fetch the "train" split of the "RLLR" configuration
dataset = load_dataset(
    "refugee-law-lab/canadian-legal-data",
    "RLLR",
    split="train",
)

# Build a pandas DataFrame from the loaded dataset and preview the first rows
df = pd.DataFrame(dataset)
df.head()


Unnamed: 0,citation,citation2,dataset,year,name,language,document_date,source_url,scraped_timestamp,unofficial_text,other
0,2023 RLLR 120,VC3-09637,RLLR,2023,,en,12/20/2023,https://refugeelab.ca/rllr/2023rllr120,2024-11-02,2023 RLLR 120\n\nCitation: 2023 RLLR 120\nTrib...,"{""country"": ""Cameroon"", ""case_type"": ""Politica..."
1,2023 RLLR 119,VC3-07126,RLLR,2023,,en,12/19/2023,https://refugeelab.ca/rllr/2023rllr119,2024-11-02,2023 RLLR 119\n\nCitation: 2023 RLLR 119\nTrib...,"{""country"": ""Russia"", ""case_type"": ""Political ..."
2,2023 RLLR 118,VC3-06815,RLLR,2023,,en,10/31/2023,https://refugeelab.ca/rllr/2023rllr118,2024-11-02,2023 RLLR 118\n\nCitation: 2023 RLLR 118\nTrib...,"{""country"": ""Lebanon"", ""case_type"": ""PSG: SOGI..."
3,2023 RLLR 117,VC3-06662,RLLR,2023,,en,11/2/2023,https://refugeelab.ca/rllr/2023rllr117,2024-11-02,2023 RLLR 117\n\nCitation: 2023 RLLR 117\nTrib...,"{""country"": ""USA"", ""case_type"": ""PSG: SOGIE"", ..."
4,2023 RLLR 116,VC3-04003,RLLR,2023,,en,6/26/2023,https://refugeelab.ca/rllr/2023rllr116,2024-11-02,2023 RLLR 116\n\nCitation: 2023 RLLR 116\nTrib...,"{""country"": ""Ukraine"", ""case_type"": ""PSG: SOGI..."


In [None]:
# Show the DataFrame loaded by one of the options above as this cell's output
df

# Working with the data

In [None]:
import json
# to access the data in the "other" column directly, use this code:

# Keys pulled out of each "other" entry into their own columns
other_keys = ['country', 'case_type', 'member']


def _parse_other(value):
    """Return one "other" entry as a dict.

    Strings are decoded with json.loads, values that are already dicts are
    passed through unchanged, and anything else (e.g. a missing value or a
    JSON null) becomes an empty dict so that the row gets NaN in the new
    columns instead of raising.
    """
    if isinstance(value, str):
        value = json.loads(value)
    return value if isinstance(value, dict) else {}


# Guard so the cell can be re-run: after the first run "other" has been
# dropped and there is nothing left to expand.
if 'other' in df.columns:
    # One row of parsed keys per case, sharing df's index so the assignment
    # below lines up row for row.
    other_df = pd.DataFrame(df['other'].apply(_parse_other).tolist(), index=df.index)
    # Select by key NAME before assigning. Assigning a DataFrame to a list of
    # columns pairs them by position, so relying on the JSON key order could
    # mislabel columns; reindex also tolerates extra or missing keys.
    df[other_keys] = other_df.reindex(columns=other_keys)
    df = df.drop(columns=['other'])
df