### Notebook to verify data parsing using GPT-4

In [1]:
# Importing Libraries
from langchain.schema.messages import HumanMessage, SystemMessage
from langchain.chat_models import ChatOpenAI

### Set up the LLM

In [6]:
# Get API Key
# The key is expected on the first line of the secrets file. readline() keeps the
# trailing newline, so strip surrounding whitespace before handing the key to the client.
with open('d:/AI-Projects/secrets/SECRETS-OPENAI.txt', 'r') as in_file:
    OPENAI_API_KEY = in_file.readline().strip()

# Set up LLM
# temperature=0 is used for all calls made through this client; replies are capped at 1024 tokens.
gpt = ChatOpenAI(model="gpt-4-1106-preview", openai_api_key=OPENAI_API_KEY, temperature=0, max_tokens=1024)

def get_response(prompt, system_message):
    """Send one system message and one human message to `gpt` and return the reply text.

    Args:
        prompt: Text placed in the HumanMessage.
        system_message: Text placed in the SystemMessage.

    Returns:
        The `.content` attribute of the object returned by `gpt.invoke`.
    """
    system_turn = SystemMessage(content=system_message)
    human_turn = HumanMessage(content=prompt)
    reply = gpt.invoke([system_turn, human_turn])
    return reply.content

In [23]:
def get_decision_date(case_excerpt):
    """Ask the model for the decision date found in a case excerpt.

    Args:
        case_excerpt: Text of (part of) a decision; it is interpolated into the prompt as-is.

    Returns:
        Whatever `get_response` returns for the prompt. The system message instructs the
        model to answer with a date in YYYY-MM-DD format, or 'other' when unsure.
    """
    system_message = """You are a helpful research assistant for a Canadian law professor.
    You are given a case exerpt and you return only the date of the decision in YYYY-MM-DD format. 
    If you are not sure about the date, you return 'other'. 
    """
    # Use real newlines ("\n") to separate the excerpt from the question; the previous
    # "/n/n/n" was sent to the model as literal slash characters.
    prompt = f"CASE EXCERPT: {case_excerpt} \n\n\n DATE OF DECISION in YYYY-MM-DD format (other if unclear): "

    return get_response(prompt, system_message)

def get_rad_decision(case_excerpt):
    """Ask the model whether a case excerpt is a Refugee Appeal Division decision.

    Args:
        case_excerpt: Text of (part of) a decision; it is interpolated into the prompt as-is.

    Returns:
        Whatever `get_response` returns for the prompt. The system message instructs the
        model to answer 'RAD' for Refugee Appeal Division decisions and 'other' for any
        other IRB division or when unsure.
    """
    system_message = """You are a helpful research assistant for a Canadian law professor.
    You are given a case exerpt and you return only 'RAD' if the case excerpt is a decision
    of the Refugee Appeal Division of the Immigration and Refugee Board, 'other' if it is 
    a different IRB division (e.g. Immigration Division, Immigration Appeal Division, Refugee 
    Protection Division) or if you are not sure. 
    """
    # Use real newlines ("\n") to separate the excerpt from the question; the previous
    # "/n/n/n" was sent to the model as literal slash characters.
    prompt = f"CASE EXCERPT: {case_excerpt} \n\n\n RAD or other: "

    return get_response(prompt, system_message)


### Load the data

In [9]:
# get data
import pandas as pd
import json
import pathlib

# Set variables
start_year = 2010  # First year of data sought (1877 +)
end_year = 2023  # Last year of data sought (2023 -)
languages_sought = ['en', 'fr']  # languages in list e.g. ['en', 'fr'] or ['en']

# Set path to data
data_path = pathlib.Path('DATA/YEARLY/')

# load data
# One JSON file per year and language, named "<year>_<language>.json"; each file is
# expected to hold a list of records, which are concatenated into a single list.
results = []
for year in range(start_year, end_year+1):
    for language in languages_sought:
        # Read explicitly as UTF-8: without it, open() falls back to the platform's
        # default encoding, which can garble accented characters in the decisions.
        with open(data_path / f'{year}_{language}.json', encoding='utf-8') as f:
            results.extend(json.load(f))

# convert to dataframe
df = pd.DataFrame(results)

df

Unnamed: 0,citation1,citation2,dataset,name,source_url,scraped_timestamp,document_date,year,unofficial_text,language,other
0,TB9-07773,,RAD,,3229926.txt,2023-11-13,2010-01-24,2010,\nRAD File / Dossier de la SAR : TB9-07773\nTB...,en,"{""decision-maker_name"": ""Kim Polowek""}"
1,TB9-07773,,RAD,,3229927.txt,2023-11-13,2010-01-24,2010,\nDossier de la SAR / RAD File: TB9-07773\nTB9...,fr,"{""decision-maker_name"": ""Kim Polowek""}"
2,MB9-27420,,RAD,,3546053.txt,2023-11-13,2011-01-12,2011,\nRAD File No. / No de dossier de la SAR : MB9...,en,"{""decision-maker_name"": ""Marie-Lyne Thibault""}"
3,TB3-03406,,RAD,,1355235.txt,2023-11-12,2013-07-31,2013,\n\n\n\n\n\n\nRAD File No. / N° de dossier de ...,en,"{""decision-maker_name"": ""Edward Bosveld""}"
4,VB3-01099,,RAD,,1360268.txt,2023-11-12,2013-08-30,2013,\n\n\n\tRAD File No. / N° de dossier de la SAR...,en,"{""decision-maker_name"": null}"
...,...,...,...,...,...,...,...,...,...,...,...
26379,VC1-04668,,RAD,,3583931.txt,2023-11-13,2022-04-29,2022,\nDossier de la SAR / RAD File : VC1-04668\nVC...,fr,"{""decision-maker_name"": ""Me Kristine Plouffe-M..."
26380,MC1-07673,,RAD,,3596070.txt,2023-11-13,2022-02-25,2022,\nDossier de la SAR / RAD File: MC1-07673\n\nH...,fr,"{""decision-maker_name"": ""Daphnée Ouellet""}"
26381,MC1-06135,,RAD,,3599962.txt,2023-11-13,2022-01-28,2022,\nDossier de la SAR / RAD File : MC1-06135\n\n...,fr,"{""decision-maker_name"": ""Daphnée Ouellet""}"
26382,MC2-03068,,RAD,,MC2-03068 - Final February 2023.txt,2023-11-12,2023-02-02,2023,\nRAD File / Dossier de la SAR : MC2-03068\nMC...,en,"{""decision-maker_name"": ""Me Martine Durocher""}"


### Select a random sample of the data

In [24]:
# set random seed
import random
random.seed(888)

# get random sample of df
# DataFrame.sample does not draw from Python's `random` module, so random.seed() alone
# does not make the sample repeatable; pass the seed through random_state as well.
df_sample = df.sample(n=500, random_state=888)
df_sample

Unnamed: 0,citation1,citation2,dataset,name,source_url,scraped_timestamp,document_date,year,unofficial_text,language,other
17578,MB9-11591,,RAD,,3223168.txt,2023-11-13,2020-01-07,2020,\nRAD File No. / Nº de dossier de la SAR : MB9...,en,"{""decision-maker_name"": ""Me Murielle Henri""}"
20740,TB9-17400,,RAD,,3223208.txt,2023-11-13,2020-01-08,2020,Dossier de la SAR / RAD File: TB9-17400\nTB9-1...,fr,"{""decision-maker_name"": ""Rita Aggarwala""}"
3526,MB3-03879,,RAD,,1785882.txt,2023-11-12,2015-03-05,2015,\n\n\nImmigration and \nRefugee Board of Canad...,en,"{""decision-maker_name"": ""Anna Brychcy""}"
18580,TB9-23217,,RAD,,3457954.txt,2023-11-13,2020-02-10,2020,\nRAD File / Dossier de la SAR : TB9-23217\nTB...,en,"{""decision-maker_name"": ""Elana Rose""}"
15166,MB8-07552,,RAD,,3112067.txt,2023-11-13,2019-09-03,2019,\nDossier de la SAR / RAD File: MB8-07552\n\nH...,fr,"{""decision-maker_name"": ""Max Wolpert""}"
...,...,...,...,...,...,...,...,...,...,...,...
3338,MB4-03241,,RAD,,1734787.txt,2023-11-12,2015-01-19,2015,\n\n\n\nRAD File No. / N° d...,en,"{""decision-maker_name"": ""Normand Leduc""}"
15418,TB6-15915,,RAD,,3127166.txt,2023-11-13,2019-01-09,2019,Commission de l'immigration\net du statut de r...,fr,"{""decision-maker_name"": ""T. Card""}"
6051,TB5-03152,,RAD,,2040755.txt,2023-11-12,2015-06-08,2015,Commission de l'immigration\net du statut de r...,fr,"{""decision-maker_name"": ""Milton Israel""}"
13631,MB9-01506,,RAD,,3176376.txt,2023-11-13,2019-10-31,2019,\nRAD File No. / No de dossier de la SAR : MB9...,en,"{""decision-maker_name"": ""Me Philippe Rabot""}"


### Apply the models to the data

In [25]:
# For every sampled case, send the first 1000 characters of unofficial_text to
# get_decision_date and store the reply in a new genAI_decision_date column
df_sample['genAI_decision_date'] = df_sample['unofficial_text'].map(
    lambda text: get_decision_date(text[:1000])
)
df_sample


Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised APIError: Request failed due to server shutdown {
  "error": {
    "message": "Request failed due to server shutdown",
    "type": "server_error",
    "param": null,
    "code": null
  }
}
 500 {'error': {'message': 'Request failed due to server shutdown', 'type': 'server_error', 'param': None, 'code': None}} {'Date': 'Wed, 22 Nov 2023 02:10:16 GMT', 'Content-Type': 'application/json', 'Content-Length': '141', 'Connection': 'keep-alive', 'access-control-allow-origin': '*', 'openai-model': 'gpt-4-1106-preview', 'openai-organization': 'refugee-law-lab', 'openai-processing-ms': '4949', 'openai-version': '2020-10-01', 'strict-transport-security': 'max-age=15724800; includeSubDomains', 'x-ratelimit-limit-requests': '5000', 'x-ratelimit-limit-tokens': '300000', 'x-ratelimit-limit-tokens_usage_based': '300000', 'x-ratelimit-remaining-requests': '4999', 'x-ratelimi

Unnamed: 0,citation1,citation2,dataset,name,source_url,scraped_timestamp,document_date,year,unofficial_text,language,other,genAI_decision_date
17578,MB9-11591,,RAD,,3223168.txt,2023-11-13,2020-01-07,2020,\nRAD File No. / Nº de dossier de la SAR : MB9...,en,"{""decision-maker_name"": ""Me Murielle Henri""}",2020-01-07
20740,TB9-17400,,RAD,,3223208.txt,2023-11-13,2020-01-08,2020,Dossier de la SAR / RAD File: TB9-17400\nTB9-1...,fr,"{""decision-maker_name"": ""Rita Aggarwala""}",2020-01-08
3526,MB3-03879,,RAD,,1785882.txt,2023-11-12,2015-03-05,2015,\n\n\nImmigration and \nRefugee Board of Canad...,en,"{""decision-maker_name"": ""Anna Brychcy""}",2015-03-05
18580,TB9-23217,,RAD,,3457954.txt,2023-11-13,2020-02-10,2020,\nRAD File / Dossier de la SAR : TB9-23217\nTB...,en,"{""decision-maker_name"": ""Elana Rose""}",2020-02-10
15166,MB8-07552,,RAD,,3112067.txt,2023-11-13,2019-09-03,2019,\nDossier de la SAR / RAD File: MB8-07552\n\nH...,fr,"{""decision-maker_name"": ""Max Wolpert""}",2019-09-03
...,...,...,...,...,...,...,...,...,...,...,...,...
3338,MB4-03241,,RAD,,1734787.txt,2023-11-12,2015-01-19,2015,\n\n\n\nRAD File No. / N° d...,en,"{""decision-maker_name"": ""Normand Leduc""}",2015-01-19
15418,TB6-15915,,RAD,,3127166.txt,2023-11-13,2019-01-09,2019,Commission de l'immigration\net du statut de r...,fr,"{""decision-maker_name"": ""T. Card""}",2019-01-09
6051,TB5-03152,,RAD,,2040755.txt,2023-11-12,2015-06-08,2015,Commission de l'immigration\net du statut de r...,fr,"{""decision-maker_name"": ""Milton Israel""}",2015-06-08
13631,MB9-01506,,RAD,,3176376.txt,2023-11-13,2019-10-31,2019,\nRAD File No. / No de dossier de la SAR : MB9...,en,"{""decision-maker_name"": ""Me Philippe Rabot""}",2019-10-31


In [26]:
# Show the sampled rows whose genAI_decision_date differs from the document_date column
df_sample.loc[df_sample['genAI_decision_date'].ne(df_sample['document_date'])]

Unnamed: 0,citation1,citation2,dataset,name,source_url,scraped_timestamp,document_date,year,unofficial_text,language,other,genAI_decision_date


In [27]:
# For every sampled case, send the first 1000 characters of unofficial_text to
# get_rad_decision and store the reply in a new genAI_dataset column
df_sample['genAI_dataset'] = df_sample['unofficial_text'].map(
    lambda text: get_rad_decision(text[:1000])
)
df_sample


Unnamed: 0,citation1,citation2,dataset,name,source_url,scraped_timestamp,document_date,year,unofficial_text,language,other,genAI_decision_date,genAI_dataset
17578,MB9-11591,,RAD,,3223168.txt,2023-11-13,2020-01-07,2020,\nRAD File No. / Nº de dossier de la SAR : MB9...,en,"{""decision-maker_name"": ""Me Murielle Henri""}",2020-01-07,RAD
20740,TB9-17400,,RAD,,3223208.txt,2023-11-13,2020-01-08,2020,Dossier de la SAR / RAD File: TB9-17400\nTB9-1...,fr,"{""decision-maker_name"": ""Rita Aggarwala""}",2020-01-08,RAD
3526,MB3-03879,,RAD,,1785882.txt,2023-11-12,2015-03-05,2015,\n\n\nImmigration and \nRefugee Board of Canad...,en,"{""decision-maker_name"": ""Anna Brychcy""}",2015-03-05,RAD
18580,TB9-23217,,RAD,,3457954.txt,2023-11-13,2020-02-10,2020,\nRAD File / Dossier de la SAR : TB9-23217\nTB...,en,"{""decision-maker_name"": ""Elana Rose""}",2020-02-10,RAD
15166,MB8-07552,,RAD,,3112067.txt,2023-11-13,2019-09-03,2019,\nDossier de la SAR / RAD File: MB8-07552\n\nH...,fr,"{""decision-maker_name"": ""Max Wolpert""}",2019-09-03,RAD
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3338,MB4-03241,,RAD,,1734787.txt,2023-11-12,2015-01-19,2015,\n\n\n\nRAD File No. / N° d...,en,"{""decision-maker_name"": ""Normand Leduc""}",2015-01-19,RAD
15418,TB6-15915,,RAD,,3127166.txt,2023-11-13,2019-01-09,2019,Commission de l'immigration\net du statut de r...,fr,"{""decision-maker_name"": ""T. Card""}",2019-01-09,RAD
6051,TB5-03152,,RAD,,2040755.txt,2023-11-12,2015-06-08,2015,Commission de l'immigration\net du statut de r...,fr,"{""decision-maker_name"": ""Milton Israel""}",2015-06-08,RAD
13631,MB9-01506,,RAD,,3176376.txt,2023-11-13,2019-10-31,2019,\nRAD File No. / No de dossier de la SAR : MB9...,en,"{""decision-maker_name"": ""Me Philippe Rabot""}",2019-10-31,RAD


In [28]:
# Show the sampled rows whose genAI_dataset differs from the dataset column
df_sample.loc[df_sample['genAI_dataset'].ne(df_sample['dataset'])]

Unnamed: 0,citation1,citation2,dataset,name,source_url,scraped_timestamp,document_date,year,unofficial_text,language,other,genAI_decision_date,genAI_dataset


In [29]:
# Negative control for get_rad_decision: the excerpt below is headed
# "Immigration Division" rather than Refugee Appeal Division, so the
# system prompt's instructions call for the answer 'other', not 'RAD'.

case_excerpt = """Commission de I'immigration
et du statut de réfugié
	
Section
de l'immigration

Immigration and
Refugee Board


Immigration
Division 

0003-A4-01309


MOTIFS ET DÉCISION/REASONS AND DECISION
Between
Entre

Ministre de la Citoyenneté et de l'Immigration \ Minister of Citizenship and Immigration
- et \ and -
XXXXXXXXX


Date(s) et lieu de l'audience 
1er octobre 2004
Date(s) and place of hearing

2 décembre 2004


10 février 2005


Toronto


     


     


     


     

Date de la décision 
14 avril 2005
Date of decision






Commissaire 
Dennis Paxton
Member






Pour l'intéressé(e)
J. S. Mangat
For the person concerned

     




Conseil du ministre 
C. Poulis
Minister's counsel

     




La Direction des services de révision et de traduction de la CISR peut vous procurer les présents motifs de décision dans l'autre langue officielle. Vous n'avez qu'à en faire la demande par écrit à l'adresse suivante : 344, rue Slater, Ottawa (Ontario) K1A 0K1, par courriel à l'adresse traduction.translation@cisr-irb.gc.ca ou par télécopie au (613) 947-3213.
You can obtain the translation of these reasons for decision in the other official language by writing to the Editing and Translation Services Directorate of the IRB, 344 Slater Street, Ottawa, Ontario, K1A 0K1 or by sending a request to the following e-mail address: translation.traduction@irb-cisr.gc.ca or to facsimile number (613) 947-3213.
"""

# Same 1000-character truncation as applied to the sampled cases
get_rad_decision(case_excerpt[:1000])

'other'

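### Everything looks good: neither check found a mismatch in the 500-case sample, and the Immigration Division control excerpt was classified as 'other'.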