### Notebook to verify data parsing using GPT4

In [1]:
# Importing Libraries
from langchain.schema.messages import HumanMessage, SystemMessage
from langchain.chat_models import ChatOpenAI
import pandas as pd
import json
import pathlib

### Setup LLM Models

In [2]:
# Get API Key
with open('d:/AI-Projects/secrets/SECRETS-OPENAI.txt', 'r') as in_file:
    OPENAI_API_KEY = in_file.readlines()[0]

# Set up LLM
gpt = ChatOpenAI(model="gpt-4-1106-preview", openai_api_key=OPENAI_API_KEY, temperature=0, max_tokens=1024)

def get_response(prompt, system_message):
    messages = [
    SystemMessage(content=system_message),
    HumanMessage(content=prompt),
    ]
    return gpt.invoke(messages).content

In [3]:
def get_decision_date (case_excerpt):
    system_message = """You are a helpful research assistant for a Canadian law professor.
    You are given a case exerpt and you return only the date of the decision in YYYY-MM-DD format. 
    If you are not sure about the date, you return 'other'. 
    """
    prompt = f"CASE EXCERPT: {case_excerpt} /n/n/n DATE OF DECISION in YYYY-MM-DD format (other if unclear): "

    return get_response(prompt, system_message)

def get_rad_decision (case_excerpt):
    system_message = """You are a helpful research assistant for a Canadian law professor.
    You are given a case exerpt and you return only 'RAD' if the case excerpt is a decision
    of the Refugee Appeal Division of the Immigration and Refugee Board, 'other' if it is 
    a different IRB division (e.g. Immigration Division, Immigration Appeal Division, Refugee 
    Protection Division) or if you are not sure. 
    """
    prompt = f"CASE EXCERPT: {case_excerpt} /n/n/n RAD or other: "

    return get_response(prompt, system_message)


### Load the data

In [4]:
# get parsed data

# Set variables
start_year = 2010  # First year of data sought (1877 +)
end_year = 2023  # Last year of data sought (2023 -)
languages_sought = ['en', 'fr']  # languages in list e.g. ['en', 'fr'] or ['en']

# Set path to data
data_path = pathlib.Path('DATA/YEARLY/')

# load data
results = []
for year in range(start_year, end_year+1):
    for language in languages_sought:
        with open(data_path / f'{year}_{language}.json') as f:
            results.extend(json.load(f))

# convert to dataframe
df = pd.DataFrame(results)

df

Unnamed: 0,citation1,citation2,dataset,name,source_url,scraped_timestamp,document_date,year,unofficial_text,language,other
0,TB9-07773,,RAD,,3229926.txt,2023-11-17,2010-01-24,2010,\nRAD File / Dossier de la SAR : TB9-07773\nTB...,en,"{""decision-maker_name"": ""Kim Polowek""}"
1,TB9-07773,,RAD,,3229927.txt,2023-11-17,2010-01-24,2010,\nDossier de la SAR / RAD File: TB9-07773\nTB9...,fr,"{""decision-maker_name"": ""Kim Polowek""}"
2,MB9-27420,,RAD,,3546053.txt,2023-11-17,2011-01-12,2011,\nRAD File No. / No de dossier de la SAR : MB9...,en,"{""decision-maker_name"": ""Marie-Lyne Thibault""}"
3,TB3-03406,,RAD,,1355235.txt,2023-11-17,2013-07-31,2013,\n\n\n\n\n\n\nRAD File No. / N° de dossier de ...,en,"{""decision-maker_name"": ""Edward Bosveld""}"
4,VB3-01099,,RAD,,1360268.txt,2023-11-17,2013-08-30,2013,\n\n\n\tRAD File No. / N° de dossier de la SAR...,en,"{""decision-maker_name"": null}"
...,...,...,...,...,...,...,...,...,...,...,...
27241,VC1-04668,,RAD,,3583931.txt,2023-11-17,2022-04-29,2022,\nDossier de la SAR / RAD File : VC1-04668\nVC...,fr,"{""decision-maker_name"": ""Me Kristine Plouffe-M..."
27242,MC1-07673,,RAD,,3596070.txt,2023-11-17,2022-02-25,2022,\nDossier de la SAR / RAD File: MC1-07673\n\nH...,fr,"{""decision-maker_name"": ""Daphnée Ouellet""}"
27243,MC1-06135,,RAD,,3599962.txt,2023-11-17,2022-01-28,2022,\nDossier de la SAR / RAD File : MC1-06135\n\n...,fr,"{""decision-maker_name"": ""Daphnée Ouellet""}"
27244,MC2-03068,,RAD,,MC2-03068 - Final February 2023.txt,2023-11-17,2023-02-02,2023,\nRAD File / Dossier de la SAR : MC2-03068\nMC...,en,"{""decision-maker_name"": ""Me Martine Durocher""}"


In [5]:
# get raw data
DATA_DIRS = ["d:/RAD Decisions TEXT/", "d:/IRB Decisions - Initial Request - TEXT/"]

# get list of files in each directory, including full path
path_file_list = []
for data_dir in DATA_DIRS:
    for file in pathlib.Path(data_dir).glob('**/*'):
        path_file_list.append(str(file))

# put the file_list into a dataframe
df_raw = pd.DataFrame(path_file_list, columns=['path_file_name'])

# add a column with just the file name (using pathlib
df_raw['file_name'] = df_raw['path_file_name'].apply(lambda x: pathlib.Path(x).name)

df_raw

    

Unnamed: 0,path_file_name,file_name
0,d:\RAD Decisions TEXT\AppToReopenTB8-14702a.txt,AppToReopenTB8-14702a.txt
1,d:\RAD Decisions TEXT\AppToReopenTB8-14702tf.txt,AppToReopenTB8-14702tf.txt
2,d:\RAD Decisions TEXT\Decision concerning G. B...,Decision concerning G. Bazin (finale version) ...
3,d:\RAD Decisions TEXT\Decision G. Bazin_ franc...,Decision G. Bazin_ francais_modifiee (003) - s...
4,d:\RAD Decisions TEXT\MB7-00112a.txt,MB7-00112a.txt
...,...,...
120572,d:\IRB Decisions - Initial Request - TEXT\9996...,999668.txt
120573,d:\IRB Decisions - Initial Request - TEXT\9996...,999669.txt
120574,d:\IRB Decisions - Initial Request - TEXT\9996...,999670.txt
120575,d:\IRB Decisions - Initial Request - TEXT\~$03...,~$036980.txt


### Select a random sample of the data

In [6]:
# set random seed
import random
random.seed(99)

# get random sample of df
df_sample = df.sample(n=500)

# get random sample of df_raw
df_sample_raw = df_raw.sample(n=500)

# open the text files in df_sample_raw, using file_name colum (and pathlib), and save content to unofficial_text column, being sure to close the file, with unicode encoding
df_sample_raw['unofficial_text'] = df_sample_raw['path_file_name'].apply(lambda x: pathlib.Path(x).open(encoding='utf-8', errors='ignore').read())
df_sample_raw



Unnamed: 0,path_file_name,file_name,unofficial_text
116545,d:\IRB Decisions - Initial Request - TEXT\9078...,907834.txt,\n\n\nRPD File No. / N de dossier de la SPR : ...
108620,d:\IRB Decisions - Initial Request - TEXT\7376...,737605.txt,\n\n\nIAD File No. / No de dossier de la SAI :...
101169,d:\IRB Decisions - Initial Request - TEXT\6463...,646314.txt,COMMISSION DE L'IMMIGRATION\nET DU STATUT DE R...
112966,d:\IRB Decisions - Initial Request - TEXT\8464...,846410.txt,IMMIGRATION AND REFUGEE BOARD (IMMIGRATION APP...
29056,d:\IRB Decisions - Initial Request - TEXT\1698...,1698413.txt,\nCommission de l'immigration\net du statut de...
...,...,...,...
80621,d:\IRB Decisions - Initial Request - TEXT\3201...,3201172.txt,\nDossier de la SAR / RAD File: TB8-32421\n\nH...
79338,d:\IRB Decisions - Initial Request - TEXT\3173...,3173786.txt,\nDossier de la SAR / RAD File : MB7-04810\n\n...
62489,d:\IRB Decisions - Initial Request - TEXT\2769...,2769779.txt,Immigration and \nRefugee Board of Canada\n\nR...
51642,d:\IRB Decisions - Initial Request - TEXT\2086...,2086241.txt,IMMIGRATION AND REFUGEE BOARD (IMMIGRATION APP...


### Apply the models to the data

In [7]:
# iterate through the sample, pick first 1000 characters of unofficial_text and get response, using df.apply
df_sample['genAI_decision_date'] = df_sample['unofficial_text'].apply(lambda x: get_decision_date(x[:1000]))
df_sample


Unnamed: 0,citation1,citation2,dataset,name,source_url,scraped_timestamp,document_date,year,unofficial_text,language,other,genAI_decision_date
6815,TB5-10735,,RAD,,2746227.txt,2023-11-17,2015-12-18,2015,\nCommission de l'immigration \net du statut d...,fr,"{""decision-maker_name"": ""Roslyn Ahara""}",2015-12-18
23072,MB9-21948,,RAD,,3509237.txt,2023-11-17,2020-12-12,2020,\nDossier de la SAR / RAD File : MB9-21948\n\n...,fr,"{""decision-maker_name"": ""Pascale Aubin""}",2020-12-12
9941,TB7-19022,,RAD,,2868254.txt,2023-11-17,2017-12-28,2017,Immigration and \nRefugee Board of Canada\n\nR...,en,"{""decision-maker_name"": ""Lauren Gamble""}",2017-12-28
1994,TB3-08255,,RAD,,1833707.txt,2023-11-17,2014-01-11,2014,\n\n\n\n\n\n\nRAD File No. / N° de dossier de ...,en,"{""decision-maker_name"": ""Daniel McSweeney""}",2014-01-11
18888,TB9-02351,,RAD,,3411746.txt,2023-11-17,2020-06-22,2020,\nRAD File / Dossier de la SAR : TB9-02351\n\n...,en,"{""decision-maker_name"": ""David Morris""}",2020-06-22
...,...,...,...,...,...,...,...,...,...,...,...,...
22452,TC0-02800,,RAD,,3444171.txt,2023-11-17,2020-12-10,2020,\n\nDossier de la SAR / RAD File: TC0-02800\n\...,fr,"{""decision-maker_name"": ""J. Pollock""}",2020-12-10
11018,MB7-24803,,RAD,,2876789.txt,2023-11-17,2018-04-05,2018,Immigration and\nRefugee Board of Canada.\n\nR...,en,"{""decision-maker_name"": ""Me Maria De Andrade""}",2018-04-05
18966,VB9-07690,,RAD,,3419441.txt,2023-11-17,2020-07-17,2020,\nRAD File / Dossier de la SAR : VB9-07690\nVB...,en,"{""decision-maker_name"": ""Rita Aggarwala""}",2020-07-17
16494,MB8-24791,,RAD,,3147017.txt,2023-11-17,2019-09-19,2019,\nDossier de la SAR / RAD File : MB8-24791\n\n...,fr,"{""decision-maker_name"": ""Me Toni Jedid""}",2019-09-19


In [8]:
# list any rows of the df where genAI_decision_date is not equal to the decision_date
df_sample[df_sample['genAI_decision_date'] != df_sample['document_date']]

Unnamed: 0,citation1,citation2,dataset,name,source_url,scraped_timestamp,document_date,year,unofficial_text,language,other,genAI_decision_date


In [9]:
# iterate through the sample, pick first 1000 characters of unofficial_text and get response, using df.apply
df_sample['genAI_dataset'] = df_sample['unofficial_text'].apply(lambda x: get_rad_decision(x[:1000]))
df_sample


Unnamed: 0,citation1,citation2,dataset,name,source_url,scraped_timestamp,document_date,year,unofficial_text,language,other,genAI_decision_date,genAI_dataset
6815,TB5-10735,,RAD,,2746227.txt,2023-11-17,2015-12-18,2015,\nCommission de l'immigration \net du statut d...,fr,"{""decision-maker_name"": ""Roslyn Ahara""}",2015-12-18,RAD
23072,MB9-21948,,RAD,,3509237.txt,2023-11-17,2020-12-12,2020,\nDossier de la SAR / RAD File : MB9-21948\n\n...,fr,"{""decision-maker_name"": ""Pascale Aubin""}",2020-12-12,RAD
9941,TB7-19022,,RAD,,2868254.txt,2023-11-17,2017-12-28,2017,Immigration and \nRefugee Board of Canada\n\nR...,en,"{""decision-maker_name"": ""Lauren Gamble""}",2017-12-28,RAD
1994,TB3-08255,,RAD,,1833707.txt,2023-11-17,2014-01-11,2014,\n\n\n\n\n\n\nRAD File No. / N° de dossier de ...,en,"{""decision-maker_name"": ""Daniel McSweeney""}",2014-01-11,RAD
18888,TB9-02351,,RAD,,3411746.txt,2023-11-17,2020-06-22,2020,\nRAD File / Dossier de la SAR : TB9-02351\n\n...,en,"{""decision-maker_name"": ""David Morris""}",2020-06-22,RAD
...,...,...,...,...,...,...,...,...,...,...,...,...,...
22452,TC0-02800,,RAD,,3444171.txt,2023-11-17,2020-12-10,2020,\n\nDossier de la SAR / RAD File: TC0-02800\n\...,fr,"{""decision-maker_name"": ""J. Pollock""}",2020-12-10,RAD
11018,MB7-24803,,RAD,,2876789.txt,2023-11-17,2018-04-05,2018,Immigration and\nRefugee Board of Canada.\n\nR...,en,"{""decision-maker_name"": ""Me Maria De Andrade""}",2018-04-05,RAD
18966,VB9-07690,,RAD,,3419441.txt,2023-11-17,2020-07-17,2020,\nRAD File / Dossier de la SAR : VB9-07690\nVB...,en,"{""decision-maker_name"": ""Rita Aggarwala""}",2020-07-17,RAD
16494,MB8-24791,,RAD,,3147017.txt,2023-11-17,2019-09-19,2019,\nDossier de la SAR / RAD File : MB8-24791\n\n...,fr,"{""decision-maker_name"": ""Me Toni Jedid""}",2019-09-19,RAD


In [10]:
# list any rows of the df where genAI_dataset is not equal to the dataset
df_sample[df_sample['genAI_dataset'] != df_sample['dataset']]

Unnamed: 0,citation1,citation2,dataset,name,source_url,scraped_timestamp,document_date,year,unofficial_text,language,other,genAI_decision_date,genAI_dataset


In [11]:
# iterate through the sample, pick first 1000 characters of unofficial_text and get response, using df.apply
df_sample_raw['genAI_dataset'] = df_sample_raw['unofficial_text'].apply(lambda x: get_rad_decision(x[:1000]))
df_sample_raw

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised APIError: Request failed due to server shutdown {
  "error": {
    "message": "Request failed due to server shutdown",
    "type": "server_error",
    "param": null,
    "code": null
  }
}
 500 {'error': {'message': 'Request failed due to server shutdown', 'type': 'server_error', 'param': None, 'code': None}} {'Date': 'Fri, 24 Nov 2023 20:08:13 GMT', 'Content-Type': 'application/json', 'Content-Length': '141', 'Connection': 'keep-alive', 'access-control-allow-origin': '*', 'openai-model': 'gpt-4-1106-preview', 'openai-organization': 'refugee-law-lab', 'openai-processing-ms': '5631', 'openai-version': '2020-10-01', 'strict-transport-security': 'max-age=15724800; includeSubDomains', 'x-ratelimit-limit-requests': '5000', 'x-ratelimit-limit-tokens': '300000', 'x-ratelimit-limit-tokens_usage_based': '300000', 'x-ratelimit-remaining-requests': '4999', 'x-ratelimi

Unnamed: 0,path_file_name,file_name,unofficial_text,genAI_dataset
116545,d:\IRB Decisions - Initial Request - TEXT\9078...,907834.txt,\n\n\nRPD File No. / N de dossier de la SPR : ...,other
108620,d:\IRB Decisions - Initial Request - TEXT\7376...,737605.txt,\n\n\nIAD File No. / No de dossier de la SAI :...,other
101169,d:\IRB Decisions - Initial Request - TEXT\6463...,646314.txt,COMMISSION DE L'IMMIGRATION\nET DU STATUT DE R...,other
112966,d:\IRB Decisions - Initial Request - TEXT\8464...,846410.txt,IMMIGRATION AND REFUGEE BOARD (IMMIGRATION APP...,other
29056,d:\IRB Decisions - Initial Request - TEXT\1698...,1698413.txt,\nCommission de l'immigration\net du statut de...,RAD
...,...,...,...,...
80621,d:\IRB Decisions - Initial Request - TEXT\3201...,3201172.txt,\nDossier de la SAR / RAD File: TB8-32421\n\nH...,RAD
79338,d:\IRB Decisions - Initial Request - TEXT\3173...,3173786.txt,\nDossier de la SAR / RAD File : MB7-04810\n\n...,RAD
62489,d:\IRB Decisions - Initial Request - TEXT\2769...,2769779.txt,Immigration and \nRefugee Board of Canada\n\nR...,RAD
51642,d:\IRB Decisions - Initial Request - TEXT\2086...,2086241.txt,IMMIGRATION AND REFUGEE BOARD (IMMIGRATION APP...,other


In [12]:
# list any rows of df_sample_raw where genAI_dataset is 'RAD' but the file_name is not in df.source_url
df_sample_raw[(df_sample_raw['genAI_dataset'] == 'RAD') & (~df_sample_raw['file_name'].isin(df['source_url']))]


Unnamed: 0,path_file_name,file_name,unofficial_text,genAI_dataset
26636,d:\IRB Decisions - Initial Request - TEXT\1511...,1511335.txt,\n�\n�\n�\n�\n�\n�\n�\n�R�A�D� �F�i�l�e� �N�o�...,RAD
35737,d:\IRB Decisions - Initial Request - TEXT\1791...,1791818.txt,\n�\n�\n�\n�M�o�t�i�f�s� �e�t� �d��c�i�s�i�o�n...,RAD
40353,d:\IRB Decisions - Initial Request - TEXT\1828...,1828374.txt,C�o�m�m�i�s�s�i�o�n� �d�e� �l�'�i�m�m�i�g�r�a�...,RAD
26507,d:\IRB Decisions - Initial Request - TEXT\1509...,1509487.txt,\n�\n�\n�\n�\n�\n�\n�\n�N�� �d�e� �d�o�s�s�i�e...,RAD
1284,d:\RAD Decisions TEXT\MC1-00329 a.txt,MC1-00329 a.txt,\nRAD File / Dossier de la SAR : MC1-00329\n\n...,RAD
29873,d:\IRB Decisions - Initial Request - TEXT\1729...,1729380.txt,\n�\n�\n�\n�M�o�t�i�f�s� �e�t� �d��c�i�s�i�o�n...,RAD


In [13]:
# list any rows of df_sample_raw where genAI_dataset is 'other' and the file_name is in df.source_url
df_sample_raw[(df_sample_raw['genAI_dataset'] == 'other') & (df_sample_raw['file_name'].isin(df['source_url']))]

Unnamed: 0,path_file_name,file_name,unofficial_text,genAI_dataset


In [25]:
df[df['citation1'] == 'MB3-03369']



Unnamed: 0,citation1,citation2,dataset,name,source_url,scraped_timestamp,document_date,year,unofficial_text,language,other
