### Notebook to verify data parsing using GPT4

In [1]:
# Importing Libraries
from langchain.schema.messages import HumanMessage, SystemMessage
from langchain.chat_models import ChatOpenAI
import pandas as pd
import json
import pathlib

### Setup LLM Models

In [2]:
# Get API Key
# The key is the first line of a local secrets file. It is stripped because a
# trailing newline (which `readlines()[0]` would keep) would otherwise be passed
# to ChatOpenAI as part of the key.
with open('d:/AI-Projects/secrets/SECRETS-OPENAI.txt', 'r') as in_file:
    OPENAI_API_KEY = in_file.readline().strip()

# Fail early with a clear message instead of sending an empty key to the API.
if not OPENAI_API_KEY:
    raise ValueError("The first line of the OpenAI secrets file is empty")

# Set up LLM
# Single shared chat model used by get_response(); replies are capped at 1024 tokens.
gpt = ChatOpenAI(model="gpt-4-1106-preview", openai_api_key=OPENAI_API_KEY, temperature=0, max_tokens=1024)

def get_response(prompt, system_message):
    """Send one system message and one human message to `gpt` and return the reply text.

    Args:
        prompt: Text sent as the human message.
        system_message: Text sent as the system message.

    Returns:
        The `content` attribute of the object returned by `gpt.invoke`.
    """
    system_turn = SystemMessage(content=system_message)
    human_turn = HumanMessage(content=prompt)
    reply = gpt.invoke([system_turn, human_turn])
    return reply.content

In [3]:
def get_decision_date (case_excerpt):
    """Ask the model for the decision date of a case excerpt.

    Args:
        case_excerpt: Text of (part of) a decision; it is inserted verbatim
            into the prompt.

    Returns:
        Whatever `get_response` returns for the prompt. The system message
        instructs the model to answer with a YYYY-MM-DD date, or 'other'
        when it is unsure.
    """
    system_message = """You are a helpful research assistant for a Canadian law professor.
    You are given a case exerpt and you return only the date of the decision in YYYY-MM-DD format. 
    If you are not sure about the date, you return 'other'. 
    """
    # Real newlines separate the excerpt from the question (the original sent the literal text "/n/n/n").
    prompt = f"CASE EXCERPT: {case_excerpt} \n\n\n DATE OF DECISION in YYYY-MM-DD format (other if unclear): "

    return get_response(prompt, system_message)

def get_rad_decision (case_excerpt):
    """Ask the model whether a case excerpt is a Refugee Appeal Division decision.

    Args:
        case_excerpt: Text of (part of) a decision; it is inserted verbatim
            into the prompt.

    Returns:
        Whatever `get_response` returns for the prompt. The system message
        instructs the model to answer 'RAD' for Refugee Appeal Division
        decisions and 'other' for any other IRB division or when unsure.
    """
    system_message = """You are a helpful research assistant for a Canadian law professor.
    You are given a case exerpt and you return only 'RAD' if the case excerpt is a decision
    of the Refugee Appeal Division of the Immigration and Refugee Board, 'other' if it is 
    a different IRB division (e.g. Immigration Division, Immigration Appeal Division, Refugee 
    Protection Division) or if you are not sure. 
    """
    # Real newlines separate the excerpt from the question (the original sent the literal text "/n/n/n").
    prompt = f"CASE EXCERPT: {case_excerpt} \n\n\n RAD or other: "

    return get_response(prompt, system_message)


### Load the data

In [4]:
# Load the parsed decisions: one JSON file per (year, language) pair

# Range of years and the languages to load
start_year = 2010  # First year of data sought (1877 +)
end_year = 2023  # Last year of data sought (2023 -)
languages_sought = ['en', 'fr']  # languages in list e.g. ['en', 'fr'] or ['en']

# Directory holding the yearly JSON files
data_path = pathlib.Path('DATA/YEARLY/')

# Expected file for every year/language combination, year-major order
yearly_files = [
    data_path / f'{year}_{language}.json'
    for year in range(start_year, end_year + 1)
    for language in languages_sought
]

# Pool the records of all files into a single list
results = []
for yearly_file in yearly_files:
    with open(yearly_file) as handle:
        results += json.load(handle)

# One row per record
df = pd.DataFrame(results)

df

Unnamed: 0,citation1,citation2,dataset,name,source_url,scraped_timestamp,document_date,year,unofficial_text,language,other
0,TB9-07773,,RAD,,3229926.txt,2023-11-13,2010-01-24,2010,\nRAD File / Dossier de la SAR : TB9-07773\nTB...,en,"{""decision-maker_name"": ""Kim Polowek""}"
1,TB9-07773,,RAD,,3229927.txt,2023-11-13,2010-01-24,2010,\nDossier de la SAR / RAD File: TB9-07773\nTB9...,fr,"{""decision-maker_name"": ""Kim Polowek""}"
2,MB9-27420,,RAD,,3546053.txt,2023-11-13,2011-01-12,2011,\nRAD File No. / No de dossier de la SAR : MB9...,en,"{""decision-maker_name"": ""Marie-Lyne Thibault""}"
3,TB3-03406,,RAD,,1355235.txt,2023-11-12,2013-07-31,2013,\n\n\n\n\n\n\nRAD File No. / N° de dossier de ...,en,"{""decision-maker_name"": ""Edward Bosveld""}"
4,VB3-01099,,RAD,,1360268.txt,2023-11-12,2013-08-30,2013,\n\n\n\tRAD File No. / N° de dossier de la SAR...,en,"{""decision-maker_name"": null}"
...,...,...,...,...,...,...,...,...,...,...,...
27549,VC1-04668,,RAD,,3583931.txt,2023-11-13,2022-04-29,2022,\nDossier de la SAR / RAD File : VC1-04668\nVC...,fr,"{""decision-maker_name"": ""Me Kristine Plouffe-M..."
27550,MC1-07673,,RAD,,3596070.txt,2023-11-13,2022-02-25,2022,\nDossier de la SAR / RAD File: MC1-07673\n\nH...,fr,"{""decision-maker_name"": ""Daphnée Ouellet""}"
27551,MC1-06135,,RAD,,3599962.txt,2023-11-13,2022-01-28,2022,\nDossier de la SAR / RAD File : MC1-06135\n\n...,fr,"{""decision-maker_name"": ""Daphnée Ouellet""}"
27552,MC2-03068,,RAD,,MC2-03068 - Final February 2023.txt,2023-11-12,2023-02-02,2023,\nRAD File / Dossier de la SAR : MC2-03068\nMC...,en,"{""decision-maker_name"": ""Me Martine Durocher""}"


In [5]:
# get raw data
DATA_DIRS = ["d:/RAD Decisions TEXT/", "d:/IRB Decisions - Initial Request - TEXT/"]

# Recursively collect the full path of every regular file under each directory.
# '**/*' also matches sub-directories; they are skipped because they are not
# documents and cannot be opened as text later on.
# NOTE(review): names starting with '~$' look like editor lock files rather than
# decisions and are still listed here - confirm whether they should be excluded.
path_file_list = [
    str(file)
    for data_dir in DATA_DIRS
    for file in pathlib.Path(data_dir).glob('**/*')
    if file.is_file()
]

# put the file list into a dataframe
df_raw = pd.DataFrame(path_file_list, columns=['path_file_name'])

# add a column with just the file name (using pathlib)
df_raw['file_name'] = df_raw['path_file_name'].apply(lambda x: pathlib.Path(x).name)

df_raw

    

Unnamed: 0,path_file_name,file_name
0,d:\RAD Decisions TEXT\AppToReopenTB8-14702a.txt,AppToReopenTB8-14702a.txt
1,d:\RAD Decisions TEXT\AppToReopenTB8-14702tf.txt,AppToReopenTB8-14702tf.txt
2,d:\RAD Decisions TEXT\Decision concerning G. B...,Decision concerning G. Bazin (finale version) ...
3,d:\RAD Decisions TEXT\Decision G. Bazin_ franc...,Decision G. Bazin_ francais_modifiee (003) - s...
4,d:\RAD Decisions TEXT\MB7-00112a.txt,MB7-00112a.txt
...,...,...
120572,d:\IRB Decisions - Initial Request - TEXT\9996...,999668.txt
120573,d:\IRB Decisions - Initial Request - TEXT\9996...,999669.txt
120574,d:\IRB Decisions - Initial Request - TEXT\9996...,999670.txt
120575,d:\IRB Decisions - Initial Request - TEXT\~$03...,~$036980.txt


### Select a random sample of the data

In [6]:
# set random seed
import random
SAMPLE_SEED = 4242
random.seed(SAMPLE_SEED)

# pandas draws samples with NumPy's random generator, which `random.seed` does
# not affect, so the seed is passed to `sample` explicitly to make the
# selection reproducible across runs.

# get random sample of df
df_sample = df.sample(n=500, random_state=SAMPLE_SEED)

# get random sample of df_raw
df_sample_raw = df_raw.sample(n=500, random_state=SAMPLE_SEED)

# Read each sampled file (full path in path_file_name) into the unofficial_text column.
# read_text opens and closes the file itself; bytes that are not valid UTF-8 are dropped.
df_sample_raw['unofficial_text'] = df_sample_raw['path_file_name'].apply(
    lambda x: pathlib.Path(x).read_text(encoding='utf-8', errors='ignore')
)
df_sample_raw



Unnamed: 0,path_file_name,file_name,unofficial_text
55504,d:\IRB Decisions - Initial Request - TEXT\2243...,2243549.txt,\tRPD File No. / N de dossier de la SPR : TB3-...
69742,d:\IRB Decisions - Initial Request - TEXT\2931...,2931645.txt,Immigration and \nRefugee Board of Canada\n\nR...
110397,d:\IRB Decisions - Initial Request - TEXT\7601...,760159.txt,\n\nIAD File No. / No de dossier de la SAI: TA...
47327,d:\IRB Decisions - Initial Request - TEXT\1924...,1924401.txt,Immigration and\nRefugee Board of Canada\n\nIm...
96646,d:\IRB Decisions - Initial Request - TEXT\5222...,522262.txt,Commission de l'immigration et du statut de rf...
...,...,...,...
92020,d:\IRB Decisions - Initial Request - TEXT\3594...,3594370.txt,\nRAD File / Dossier de la SAR : TB8-17488\n\n...
52859,d:\IRB Decisions - Initial Request - TEXT\2103...,2103272.txt,\nImmigration and\nRefugee Board of Canada\n\n...
57930,d:\IRB Decisions - Initial Request - TEXT\2289...,2289335.txt,Commission de l'immigration\net du statut de r...
72355,d:\IRB Decisions - Initial Request - TEXT\3001...,3001258.txt,Immigration and Refugee\nBoard of Canada\nRefu...


### Apply the models to the data

In [7]:
# Ask the model for the decision date of each sampled decision, sending only
# the first 1000 characters of its text
df_sample['genAI_decision_date'] = df_sample['unofficial_text'].map(
    lambda text: get_decision_date(text[:1000])
)
df_sample


Unnamed: 0,citation1,citation2,dataset,name,source_url,scraped_timestamp,document_date,year,unofficial_text,language,other,genAI_decision_date
22878,TB9-06678,,RAD,,3457781.txt,2023-11-13,2020-02-13,2020,\nDossier de la SAR / RAD File: TB9-06678\nTB9...,fr,"{""decision-maker_name"": ""S. Thompson""}",2020-02-13
13625,TB8-05170,,RAD,,3074151.txt,2023-11-13,2019-03-19,2019,\nRAD File / Dossier de la SAR : TB8-05170\n\n...,en,"{""decision-maker_name"": ""D. Lewis""}",2019-03-19
13006,VB7-06676,,RAD,,3010009.txt,2023-11-13,2018-11-13,2018,Commission de l'immigration\net du statut de r...,fr,"{""decision-maker_name"": ""Paula Faber""}",2018-11-13
11,TB3-04223,,RAD,,1386648.txt,2023-11-12,2013-09-02,2013,\n\n\n\n\n\n\nRAD File No. / N° de dossier de ...,en,"{""decision-maker_name"": ""Edward Bosveld""}",2013-09-02
12829,TB7-23482,,RAD,,2990447.txt,2023-11-13,2018-10-25,2018,Commission de l'immigration\net du statut de r...,fr,"{""decision-maker_name"": ""G. Erauw""}",2018-10-25
...,...,...,...,...,...,...,...,...,...,...,...,...
10607,MB6-01554,,RAD,,2791401.txt,2023-11-13,2017-02-09,2017,Commission de l'immigration\net du statut de r...,fr,"{""decision-maker_name"": ""Me Alain Bissonnette""}",2017-02-09
17425,TB9-03795,,RAD,,3201186.txt,2023-11-13,2019-11-18,2019,\n\nDossier de la SAR / RAD File: TB9-03795\nT...,fr,"{""decision-maker_name"": ""S. Thompson""}",2019-11-18
13558,VB8-04280,,RAD,,3064726.txt,2023-11-13,2019-04-30,2019,\nRAD File / Dossier de la SAR : VB8-04280\n\n...,en,"{""decision-maker_name"": ""Christina Harrison Ba...",2019-04-30
25739,MB9-14740,,RAD,,MB9-14740tf.txt,2023-11-12,2021-03-31,2021,\nDossier de la SAR / RAD File: MB9-14740\n\nH...,fr,"{""decision-maker_name"": ""Nicole Ginsberg""}",2021-03-31


In [8]:
# Rows of the sample where the model's date differs from the parsed document_date
df_sample.loc[df_sample['genAI_decision_date'].ne(df_sample['document_date'])]

Unnamed: 0,citation1,citation2,dataset,name,source_url,scraped_timestamp,document_date,year,unofficial_text,language,other,genAI_decision_date


In [9]:
# Ask the model to label each sampled decision as 'RAD' or 'other', sending
# only the first 1000 characters of its text
df_sample['genAI_dataset'] = df_sample['unofficial_text'].map(
    lambda text: get_rad_decision(text[:1000])
)
df_sample


Unnamed: 0,citation1,citation2,dataset,name,source_url,scraped_timestamp,document_date,year,unofficial_text,language,other,genAI_decision_date,genAI_dataset
22878,TB9-06678,,RAD,,3457781.txt,2023-11-13,2020-02-13,2020,\nDossier de la SAR / RAD File: TB9-06678\nTB9...,fr,"{""decision-maker_name"": ""S. Thompson""}",2020-02-13,RAD
13625,TB8-05170,,RAD,,3074151.txt,2023-11-13,2019-03-19,2019,\nRAD File / Dossier de la SAR : TB8-05170\n\n...,en,"{""decision-maker_name"": ""D. Lewis""}",2019-03-19,RAD
13006,VB7-06676,,RAD,,3010009.txt,2023-11-13,2018-11-13,2018,Commission de l'immigration\net du statut de r...,fr,"{""decision-maker_name"": ""Paula Faber""}",2018-11-13,RAD
11,TB3-04223,,RAD,,1386648.txt,2023-11-12,2013-09-02,2013,\n\n\n\n\n\n\nRAD File No. / N° de dossier de ...,en,"{""decision-maker_name"": ""Edward Bosveld""}",2013-09-02,RAD
12829,TB7-23482,,RAD,,2990447.txt,2023-11-13,2018-10-25,2018,Commission de l'immigration\net du statut de r...,fr,"{""decision-maker_name"": ""G. Erauw""}",2018-10-25,RAD
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10607,MB6-01554,,RAD,,2791401.txt,2023-11-13,2017-02-09,2017,Commission de l'immigration\net du statut de r...,fr,"{""decision-maker_name"": ""Me Alain Bissonnette""}",2017-02-09,RAD
17425,TB9-03795,,RAD,,3201186.txt,2023-11-13,2019-11-18,2019,\n\nDossier de la SAR / RAD File: TB9-03795\nT...,fr,"{""decision-maker_name"": ""S. Thompson""}",2019-11-18,RAD
13558,VB8-04280,,RAD,,3064726.txt,2023-11-13,2019-04-30,2019,\nRAD File / Dossier de la SAR : VB8-04280\n\n...,en,"{""decision-maker_name"": ""Christina Harrison Ba...",2019-04-30,RAD
25739,MB9-14740,,RAD,,MB9-14740tf.txt,2023-11-12,2021-03-31,2021,\nDossier de la SAR / RAD File: MB9-14740\n\nH...,fr,"{""decision-maker_name"": ""Nicole Ginsberg""}",2021-03-31,RAD


In [10]:
# Rows of the sample where the model's label differs from the parsed dataset value
df_sample.loc[df_sample['genAI_dataset'].ne(df_sample['dataset'])]

Unnamed: 0,citation1,citation2,dataset,name,source_url,scraped_timestamp,document_date,year,unofficial_text,language,other,genAI_decision_date,genAI_dataset


In [11]:
# Ask the model to label each sampled raw file as 'RAD' or 'other', sending
# only the first 1000 characters of its text
df_sample_raw['genAI_dataset'] = df_sample_raw['unofficial_text'].map(
    lambda text: get_rad_decision(text[:1000])
)
df_sample_raw

Unnamed: 0,path_file_name,file_name,unofficial_text,genAI_dataset
55504,d:\IRB Decisions - Initial Request - TEXT\2243...,2243549.txt,\tRPD File No. / N de dossier de la SPR : TB3-...,other
69742,d:\IRB Decisions - Initial Request - TEXT\2931...,2931645.txt,Immigration and \nRefugee Board of Canada\n\nR...,RAD
110397,d:\IRB Decisions - Initial Request - TEXT\7601...,760159.txt,\n\nIAD File No. / No de dossier de la SAI: TA...,other
47327,d:\IRB Decisions - Initial Request - TEXT\1924...,1924401.txt,Immigration and\nRefugee Board of Canada\n\nIm...,other
96646,d:\IRB Decisions - Initial Request - TEXT\5222...,522262.txt,Commission de l'immigration et du statut de rf...,other
...,...,...,...,...
92020,d:\IRB Decisions - Initial Request - TEXT\3594...,3594370.txt,\nRAD File / Dossier de la SAR : TB8-17488\n\n...,RAD
52859,d:\IRB Decisions - Initial Request - TEXT\2103...,2103272.txt,\nImmigration and\nRefugee Board of Canada\n\n...,other
57930,d:\IRB Decisions - Initial Request - TEXT\2289...,2289335.txt,Commission de l'immigration\net du statut de r...,RAD
72355,d:\IRB Decisions - Initial Request - TEXT\3001...,3001258.txt,Immigration and Refugee\nBoard of Canada\nRefu...,RAD


In [13]:
# Raw files the model labels 'RAD' whose file name does not appear in df.source_url
df_sample_raw.loc[
    df_sample_raw['genAI_dataset'].eq('RAD')
    & ~df_sample_raw['file_name'].isin(df['source_url'])
]


Unnamed: 0,path_file_name,file_name,unofficial_text,genAI_dataset
1092,d:\RAD Decisions TEXT\MC0-08482 f.txt,MC0-08482 f.txt,\nDossier de la SAR / RAD File : MC0-08482\n\n...,RAD


In [15]:
# Raw files the model labels 'other' whose file name nevertheless appears in df.source_url
df_sample_raw.loc[
    df_sample_raw['genAI_dataset'].eq('other')
    & df_sample_raw['file_name'].isin(df['source_url'])
]

Unnamed: 0,path_file_name,file_name,unofficial_text,genAI_dataset


In [16]:
# Look up the parsed rows whose citation1 is MC0-08482
df.loc[df['citation1'].eq('MC0-08482')]



Unnamed: 0,citation1,citation2,dataset,name,source_url,scraped_timestamp,document_date,year,unofficial_text,language,other
25501,MC0-08482,,RAD,,3549792.txt,2023-11-13,2021-08-24,2021,\nRAD File No. / No de dossier de la SAR : MC0...,en,"{""decision-maker_name"": ""Me Philippe Rabot""}"
27199,MC0-08482,,RAD,,3549791.txt,2023-11-13,2021-08-24,2021,\nDossier de la SAR / RAD File : MC0-08482\n\n...,fr,"{""decision-maker_name"": ""Me Philippe Rabot""}"


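### Perfect! On 500 included files, 100% accuracy. On 500 raw files, 100% accuracy. (The one raw file labelled 'RAD' but not matched by file name, `MC0-08482 f.txt`, is a decision already included in the parsed data under a different file name.)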