### Notebook to verify data parsing using GPT4

In [1]:
# Importing Libraries
from langchain.schema.messages import HumanMessage, SystemMessage
from langchain.chat_models import ChatOpenAI
import pandas as pd
import json
import pathlib

### Setup LLM Models

In [2]:
# Get API Key
with open('d:/AI-Projects/secrets/SECRETS-OPENAI.txt', 'r') as in_file:
    OPENAI_API_KEY = in_file.readlines()[0]

# Set up LLM
gpt = ChatOpenAI(model="gpt-4-1106-preview", openai_api_key=OPENAI_API_KEY, temperature=0, max_tokens=1024)

def get_response(prompt, system_message):
    messages = [
    SystemMessage(content=system_message),
    HumanMessage(content=prompt),
    ]
    return gpt.invoke(messages).content

In [3]:
def get_decision_date (case_excerpt):
    system_message = """You are a helpful research assistant for a Canadian law professor.
    You are given a case exerpt and you return only the date of the decision in YYYY-MM-DD format. 
    If you are not sure about the date, you return 'other'. 
    """
    prompt = f"CASE EXCERPT: {case_excerpt} /n/n/n DATE OF DECISION in YYYY-MM-DD format (other if unclear): "

    return get_response(prompt, system_message)

def get_rad_decision (case_excerpt):
    system_message = """You are a helpful research assistant for a Canadian law professor.
    You are given a case exerpt and you return only 'RPD' if the case excerpt is a decision
    of the Refugee Protection Division of the Immigration and Refugee Board, 'other' if it is 
    a different IRB division (e.g. Immigration Division, Immigration Appeal Division, Refugee 
    Appeal Division, Refugee Division) or if you are not sure. Keep in mind that 'Refugee Protection Division'
    and 'Refugee Division' are not the same.
    """
    prompt = f"CASE EXCERPT: {case_excerpt} /n/n/n RPD or other: "

    return get_response(prompt, system_message)


### Load the data

In [4]:
# get parsed data

# Set variables
start_year = 2001  # First year of data sought (2003 +)
end_year = 2020  # Last year of data sought (2021 -)
languages_sought = ['en', 'fr']  # languages in list e.g. ['en', 'fr'] or ['en']

# Set path to data
data_path = pathlib.Path('DATA/YEARLY/')

# load data
results = []
for year in range(start_year, end_year+1):
    for language in languages_sought:
        with open(data_path / f'{year}_{language}.json') as f:
            results.extend(json.load(f))

# convert to dataframe
df = pd.DataFrame(results)

df

Unnamed: 0,citation,citation2,dataset,name,source_url,scraped_timestamp,document_date,year,unofficial_text,language,other
0,MA1-12669,,RPD,,486359.txt,2023-11-13,2002-08-22,2002,Immigration and Refugee Board\nRefugee Protect...,en,"{""decision-maker_name"": null}"
1,AA0-01604,,RPD,,638019.txt,2023-11-13,2002-11-08,2002,Immigration and Refugee Board\nRefugee Protect...,en,"{""decision-maker_name"": null}"
2,AA1-00073,,RPD,,638027.txt,2023-11-13,2002-11-06,2002,Immigration and Refugee Board\nRefugee Protect...,en,"{""decision-maker_name"": null}"
3,AA1-00454,,RPD,,638041.txt,2023-11-13,2002-11-01,2002,Immigration and Refugee Board\nRefugee Protect...,en,"{""decision-maker_name"": null}"
4,AA1-01289,,RPD,,638047.txt,2023-11-13,2002-10-17,2002,Immigration and Refugee Board\nRefugee Protect...,en,"{""decision-maker_name"": null}"
...,...,...,...,...,...,...,...,...,...,...,...
12463,VB9-04335,,RPD,,3568407.txt,2023-11-13,2020-01-28,2020,\nDossier de la SPR / RPD File: VB9-04335/0434...,fr,"{""decision-maker_name"": ""Megan Kammerer""}"
12464,TB0-04269,,RPD,,3580919.txt,2023-11-13,2020-02-04,2020,\nNo de dossier de la SPR / RPD File No.: TB0-...,fr,"{""decision-maker_name"": ""H. ROSS""}"
12465,TB8-03337,,RPD,,3580931.txt,2023-11-13,2020-02-24,2020,\nNo de dossier de la SPR / RPD File No.: TB8-...,fr,"{""decision-maker_name"": ""A. Casimiro""}"
12466,MB9-01194,,RPD,,3581415.txt,2023-11-13,2020-07-27,2020,Dossier de la SPR / RPD File: MB9-01194\nHuis ...,fr,"{""decision-maker_name"": ""Michel Colin""}"


In [5]:
# get raw data
DATA_DIRS = ["d:/IRB Decisions - Initial Request - TEXT/"]

# get list of files in each directory, including full path
path_file_list = []
for data_dir in DATA_DIRS:
    for file in pathlib.Path(data_dir).glob('**/*'):
        path_file_list.append(str(file))

# put the file_list into a dataframe
df_raw = pd.DataFrame(path_file_list, columns=['path_file_name'])

# add a column with just the file name (using pathlib
df_raw['file_name'] = df_raw['path_file_name'].apply(lambda x: pathlib.Path(x).name)

df_raw

    

Unnamed: 0,path_file_name,file_name
0,d:\IRB Decisions - Initial Request - TEXT\1000...,1000064.txt
1,d:\IRB Decisions - Initial Request - TEXT\1000...,1000599.txt
2,d:\IRB Decisions - Initial Request - TEXT\1000...,1000602.txt
3,d:\IRB Decisions - Initial Request - TEXT\1000...,1000606.txt
4,d:\IRB Decisions - Initial Request - TEXT\1000...,1000607.txt
...,...,...
116683,d:\IRB Decisions - Initial Request - TEXT\9996...,999668.txt
116684,d:\IRB Decisions - Initial Request - TEXT\9996...,999669.txt
116685,d:\IRB Decisions - Initial Request - TEXT\9996...,999670.txt
116686,d:\IRB Decisions - Initial Request - TEXT\~$03...,~$036980.txt


### Select a random sample of the data

In [6]:
# set random seed
import random
random.seed(2023)

# get random sample of df
df_sample = df.sample(n=500)

# get random sample of df_raw
df_sample_raw = df_raw.sample(n=500)

# open the text files in df_sample_raw, using file_name colum (and pathlib), and save content to unofficial_text column, being sure to close the file, with unicode encoding
df_sample_raw['unofficial_text'] = df_sample_raw['path_file_name'].apply(lambda x: pathlib.Path(x).open(encoding='utf-8', errors='ignore').read())
df_sample_raw



Unnamed: 0,path_file_name,file_name,unofficial_text
63994,d:\IRB Decisions - Initial Request - TEXT\2896...,2896929.txt,Commission de l'immigration et\ndu statut de r...
40588,d:\IRB Decisions - Initial Request - TEXT\1886...,1886080.txt,\nImmigration and\nRefugee Board of Canada\n\n...
41902,d:\IRB Decisions - Initial Request - TEXT\1901...,1901809.txt,\nImmigration and\nRefugee Board of Canada\n\n...
42642,d:\IRB Decisions - Initial Request - TEXT\1918...,1918882.txt,Commission de l'immigration\net du statut de r...
27601,d:\IRB Decisions - Initial Request - TEXT\1754...,1754112.txt,\n\nMotifs et dcision - Reasons and Decision\n...
...,...,...,...
113764,d:\IRB Decisions - Initial Request - TEXT\9406...,940613.txt,Immigration and\nRefugee Board\nImmigration Ap...
373,d:\IRB Decisions - Initial Request - TEXT\1009...,1009107.txt,\n\nIAD File No. / No de dossier de la SAI: TA...
18460,d:\IRB Decisions - Initial Request - TEXT\1237...,1237426.txt,\n\nRPD File No. / N de dossier de la SPR : TB...
77075,d:\IRB Decisions - Initial Request - TEXT\3207...,3207812.txt,\tRPD File No. / N de dossier de la SPR : TB7...


### Apply the models to the data

In [7]:
# iterate through the sample, pick first 1000 characters of unofficial_text and get response, using df.apply
df_sample['genAI_decision_date'] = df_sample['unofficial_text'].apply(lambda x: get_decision_date(x[:1000]))
df_sample


Unnamed: 0,citation,citation2,dataset,name,source_url,scraped_timestamp,document_date,year,unofficial_text,language,other,genAI_decision_date
12447,TB9-28520,,RPD,,3329366.txt,2023-11-13,2020-01-02,2020,RPD File /Dossier de la SPR :\nTB9-28520\nUCI ...,en,"{""decision-maker_name"": ""L. Figg""}",2020-01-02
10693,MB1-01279,,RPD,,1820804.txt,2023-11-12,2014-06-02,2014,Immigration and Refugee Board\nRefugee Protect...,fr,"{""decision-maker_name"": null}",2014-06-02
8415,TA9-23060,,RPD,,1393204.txt,2023-11-12,2012-06-05,2012,\nN° de dossier de la SPR / RPD File No.: TA9-...,fr,"{""decision-maker_name"": ""Pasquale Fiorino""}",2012-06-05
10277,MB1-00905,,RPD,,1830138.txt,2023-11-12,2014-07-24,2014,Immigration and\nRefugee Board\nRefugee Protec...,en,"{""decision-maker_name"": null}",2014-07-24
5251,MA8-06880,,RPD,,1126114.txt,2023-11-12,2010-08-09,2010,Commission de l'immigration\net du statut de r...,fr,"{""decision-maker_name"": """"}",2010-08-09
...,...,...,...,...,...,...,...,...,...,...,...,...
3341,MA6-04696,,RPD,,774135.txt,2023-11-13,2008-03-27,2008,Commission de l'immigration et du statut de ré...,fr,"{""decision-maker_name"": null}",2008-03-27
2785,MA6-08613,,RPD,,1013235.txt,2023-11-12,2008-10-07,2008,Immigration and\nRefugee Board\nRefugee Protec...,en,"{""decision-maker_name"": null}",2008-10-07
1512,CA5-00488,,RPD,,1096783.txt,2023-11-12,2006-08-04,2006,Commission de l'immigration et du statut de ré...,fr,"{""decision-maker_name"": null}",2006-08-04
8227,TB0-06040,,RPD,,1301568.txt,2023-11-12,2012-06-18,2012,\nNo de dossier de la SPR / RPD File No.: TB0-...,fr,"{""decision-maker_name"": ""Walter Kawun""}",2012-07-23


In [8]:
# list any rows of the df where genAI_decision_date is not equal to the decision_date
df_sample[df_sample['genAI_decision_date'] != df_sample['document_date']]

Unnamed: 0,citation,citation2,dataset,name,source_url,scraped_timestamp,document_date,year,unofficial_text,language,other,genAI_decision_date
3801,TA6-03866,,RPD,,1119046.txt,2023-11-12,2009-04-08,2009,\nRPD File No. / N° de dossier de la SPR : TA6...,en,"{""decision-maker_name"": ""Robert H. Rushowy""}",2009-06-11
9003,VB2-01681,,RPD,,1559871.txt,2023-11-12,2013-05-22,2013,\nRPD File No. / No de dossier de SPR : VB2-01...,en,"{""decision-maker_name"": null}",2013-06-06
5679,VA9-00853,,RPD,,1299095.txt,2023-11-12,2010-12-13,2010,\nNo de dossier de la SPR / RPD File No.: VA9-...,fr,"{""decision-maker_name"": null}",2011-01-19
5363,VA9-02032,,RPD,,1149053.txt,2023-11-12,2010-11-24,2010,\nNo de dossier de la SPR / RPD File No.: VA9-...,fr,"{""decision-maker_name"": null}",2010-12-13
9153,TB0-15885,,RPD,,1740390.txt,2023-11-12,2013-07-03,2013,\nRPD File No. / N° de dossier de la SPR : TB0...,en,"{""decision-maker_name"": ""Luis F. Agostinho""}",2013-07-31
8389,TB1-06383,,RPD,,1391410.txt,2023-11-12,2012-05-17,2012,\nNo de dossier de la SPR / RPD File No.: TB1-...,fr,"{""decision-maker_name"": ""Barry D. Barnes""}",2012-07-27
4709,VA9-02032,,RPD,,1148996.txt,2023-11-12,2010-11-24,2010,\nRPD File No. / No de dossier de SPR : VA9-02...,en,"{""decision-maker_name"": null}",2010-12-13
8227,TB0-06040,,RPD,,1301568.txt,2023-11-12,2012-06-18,2012,\nNo de dossier de la SPR / RPD File No.: TB0-...,fr,"{""decision-maker_name"": ""Walter Kawun""}",2012-07-23


In [9]:
# iterate through the sample, pick first 1000 characters of unofficial_text and get response, using df.apply
df_sample['genAI_dataset'] = df_sample['unofficial_text'].apply(lambda x: get_rad_decision(x[:1000]))
df_sample


Unnamed: 0,citation,citation2,dataset,name,source_url,scraped_timestamp,document_date,year,unofficial_text,language,other,genAI_decision_date,genAI_dataset
12447,TB9-28520,,RPD,,3329366.txt,2023-11-13,2020-01-02,2020,RPD File /Dossier de la SPR :\nTB9-28520\nUCI ...,en,"{""decision-maker_name"": ""L. Figg""}",2020-01-02,RPD
10693,MB1-01279,,RPD,,1820804.txt,2023-11-12,2014-06-02,2014,Immigration and Refugee Board\nRefugee Protect...,fr,"{""decision-maker_name"": null}",2014-06-02,RPD
8415,TA9-23060,,RPD,,1393204.txt,2023-11-12,2012-06-05,2012,\nN° de dossier de la SPR / RPD File No.: TA9-...,fr,"{""decision-maker_name"": ""Pasquale Fiorino""}",2012-06-05,RPD
10277,MB1-00905,,RPD,,1830138.txt,2023-11-12,2014-07-24,2014,Immigration and\nRefugee Board\nRefugee Protec...,en,"{""decision-maker_name"": null}",2014-07-24,RPD
5251,MA8-06880,,RPD,,1126114.txt,2023-11-12,2010-08-09,2010,Commission de l'immigration\net du statut de r...,fr,"{""decision-maker_name"": """"}",2010-08-09,RPD
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3341,MA6-04696,,RPD,,774135.txt,2023-11-13,2008-03-27,2008,Commission de l'immigration et du statut de ré...,fr,"{""decision-maker_name"": null}",2008-03-27,RPD
2785,MA6-08613,,RPD,,1013235.txt,2023-11-12,2008-10-07,2008,Immigration and\nRefugee Board\nRefugee Protec...,en,"{""decision-maker_name"": null}",2008-10-07,RPD
1512,CA5-00488,,RPD,,1096783.txt,2023-11-12,2006-08-04,2006,Commission de l'immigration et du statut de ré...,fr,"{""decision-maker_name"": null}",2006-08-04,RPD
8227,TB0-06040,,RPD,,1301568.txt,2023-11-12,2012-06-18,2012,\nNo de dossier de la SPR / RPD File No.: TB0-...,fr,"{""decision-maker_name"": ""Walter Kawun""}",2012-07-23,RPD


In [10]:
# list any rows of the df where genAI_dataset is not equal to the dataset
df_sample[df_sample['genAI_dataset'] != df_sample['dataset']]

Unnamed: 0,citation,citation2,dataset,name,source_url,scraped_timestamp,document_date,year,unofficial_text,language,other,genAI_decision_date,genAI_dataset
2166,TA5-11822,,RPD,,636351.txt,2023-11-13,2007-12-17,2007,\nReasons and Decision ? Motifs et décision\nJ...,en,"{""decision-maker_name"": ""Lois D. Figg""}",2007-12-17,other
11083,MB4-05069,,RPD,,1849494.txt,2023-11-12,2015-06-18,2015,Immigration and\nRefugee Board of Canada\nRefu...,en,"{""decision-maker_name"": ""Me Maria De Andrade""}",2015-06-18,other


In [11]:
# iterate through the sample, pick first 1000 characters of unofficial_text and get response, using df.apply
df_sample_raw['genAI_dataset'] = df_sample_raw['unofficial_text'].apply(lambda x: get_rad_decision(x[:1000]))
df_sample_raw

Unnamed: 0,path_file_name,file_name,unofficial_text,genAI_dataset
63994,d:\IRB Decisions - Initial Request - TEXT\2896...,2896929.txt,Commission de l'immigration et\ndu statut de r...,other
40588,d:\IRB Decisions - Initial Request - TEXT\1886...,1886080.txt,\nImmigration and\nRefugee Board of Canada\n\n...,other
41902,d:\IRB Decisions - Initial Request - TEXT\1901...,1901809.txt,\nImmigration and\nRefugee Board of Canada\n\n...,other
42642,d:\IRB Decisions - Initial Request - TEXT\1918...,1918882.txt,Commission de l'immigration\net du statut de r...,other
27601,d:\IRB Decisions - Initial Request - TEXT\1754...,1754112.txt,\n\nMotifs et dcision - Reasons and Decision\n...,other
...,...,...,...,...
113764,d:\IRB Decisions - Initial Request - TEXT\9406...,940613.txt,Immigration and\nRefugee Board\nImmigration Ap...,other
373,d:\IRB Decisions - Initial Request - TEXT\1009...,1009107.txt,\n\nIAD File No. / No de dossier de la SAI: TA...,other
18460,d:\IRB Decisions - Initial Request - TEXT\1237...,1237426.txt,\n\nRPD File No. / N de dossier de la SPR : TB...,RPD
77075,d:\IRB Decisions - Initial Request - TEXT\3207...,3207812.txt,\tRPD File No. / N de dossier de la SPR : TB7...,RPD


In [12]:
# list any rows of df_sample_raw where genAI_dataset is 'RPD' but the file_name is not in df.source_url
df_sample_raw[(df_sample_raw['genAI_dataset'] == 'RPD') & (~df_sample_raw['file_name'].isin(df['source_url']))]


Unnamed: 0,path_file_name,file_name,unofficial_text,genAI_dataset
88432,d:\IRB Decisions - Initial Request - TEXT\3596...,3596229.txt,\nDossier de la SPR / RPD File: TC0-11107\nIUC...,RPD
78475,d:\IRB Decisions - Initial Request - TEXT\3263...,3263509.txt,\nRAD File / Dossier de la SAR : TB9-09177\nTB...,RPD
46146,d:\IRB Decisions - Initial Request - TEXT\2040...,2040865.txt,RPD File No. / N de dossier de la SPR : MB5-02...,RPD
57223,d:\IRB Decisions - Initial Request - TEXT\2731...,2731087.txt,Commission de l'immigration\net du statut de r...,RPD
30909,d:\IRB Decisions - Initial Request - TEXT\1783...,1783566.txt,\n\n\nNo de dossier de la SPR / RPD File No.: ...,RPD
15241,d:\IRB Decisions - Initial Request - TEXT\1175...,1175564.txt,\n\nN de dossier de la SPR / RPD File No.: TA9...,RPD
87857,d:\IRB Decisions - Initial Request - TEXT\3590...,3590920.txt,RPD File / Dossier de la SPR : TB2-10060\nTB2-...,RPD
41578,d:\IRB Decisions - Initial Request - TEXT\1899...,1899035.txt,RPD File No. / N de dossier de la SPR : MB4-04...,RPD
71050,d:\IRB Decisions - Initial Request - TEXT\3064...,3064606.txt,\nDossier de la SAR / RAD File : MB8-12204\n\n...,RPD
41380,d:\IRB Decisions - Initial Request - TEXT\1895...,1895130.txt,\nNo de dossier de la SI / ID File No.: 0018-B...,RPD


In [13]:
# list any rows of df_sample_raw where genAI_dataset is 'other' and the file_name is in df.source_url
df_sample_raw[(df_sample_raw['genAI_dataset'] != 'RPD') & (df_sample_raw['file_name'].isin(df['source_url']))]

Unnamed: 0,path_file_name,file_name,unofficial_text,genAI_dataset


In [14]:
# loop through these df_sample_raw[(df_sample_raw['genAI_dataset'] == 'RPD') & (~df_sample_raw['file_name'].isin(df['source_url']))]

cases_to_examine = df_sample_raw[(df_sample_raw['genAI_dataset'] == 'RPD') & (~df_sample_raw['file_name'].isin(df['source_url']))]

# iterate through cases_to_examine, print the filename, open the file, print the content

for index, row in cases_to_examine.iterrows():
    print(row['file_name'])

    with open(row['path_file_name'], encoding='utf-8', errors='ignore') as f:
        print(f.read())

    print("@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@")




3596229.txt

Dossier de la SPR / RPD File: TC0-11107
IUC / UCI: XXXX
Huis clos / Private Proceeding 
Motifs et dcision ? Reasons and Decision 
Demandeur(e)(s) d'asile 
XXXX XXXX XXXX
Claimant(s)



Date(s) de l'audience 
18 juin 2021

Date(s) of hearing



Lieu de l'audience 
Tenue virtuellement 
Place of hearing



Date de la dcision
et des motifs 
29 juin 2021

Date of decision
and reasons



Tribunal 
Yonatan Rozenszajn

Panel



Conseil(s) du (de la/des)
demandeur(e)(s) d'asile 
John Rokakis
Counsel for the claimant(s)



Reprsentant(e) dsign(e)
S.O.
Designated representative



Conseil du (de la) ministre 
Numa Qadri
Counsel for the Minister

INTRODUCTION
[1] Voici la dcision concernant la demande1 prsente par le ministre de la Citoyennet et de l'Immigration (le ministre) au titre de l'article 108 de la Loi sur l'immigration et la protection des rfugis (LIPR). Le ministre prsente une demande de constat de perte de l'asile accord par la Section de la protection des rfugis (SPR)  l'

### Excellent! 

On 500 included files 98.4% accuracy on decision date, with the 8 inaccuracies sometimes involving 
confusing decision date and hearing date, and in those cases the dates were mostly very
close together.

On 500 included files, 99.8% correct inclusion, with the one error being a weird edge case
that arguably isn't even an error (it was decided by RPD, but it was not a refugee claim
but instead an issue about unauthorized counsel).

On 500 raw files 100% accuracy in terms of all those included being correctly included.

On 500 raw files, 99% accuracy in terms of properly including those that should be included. The 
five missed RPD cases mostly involved typos in the RPD number or in the date, or unusual ways of recording the date.

Overall, this is good enough. I manually removed the case that was not a refugee claim. We could work
further on the dates, which would improve both the decision date accuracy and the handful of 
missing RPD cases, but we're in diminishing returns.