### Notebook to verify data parsing using GPT4

In [1]:
# Importing Libraries
from langchain.schema.messages import HumanMessage, SystemMessage
from langchain.chat_models import ChatOpenAI
import pandas as pd
import json
import pathlib

### Setup LLM Models

In [2]:
# Get API Key
with open('d:/AI-Projects/secrets/SECRETS-OPENAI.txt', 'r') as in_file:
    OPENAI_API_KEY = in_file.readlines()[0]

# Set up LLM
gpt = ChatOpenAI(model="gpt-4-1106-preview", openai_api_key=OPENAI_API_KEY, temperature=0, max_tokens=1024)

def get_response(prompt, system_message):
    messages = [
    SystemMessage(content=system_message),
    HumanMessage(content=prompt),
    ]
    return gpt.invoke(messages).content

In [3]:
def get_decision_date (case_excerpt):
    system_message = """You are a helpful research assistant for a Canadian law professor.
    You are given a case exerpt and you return only the date of the decision in YYYY-MM-DD format. 
    If you are not sure about the date, you return 'other'. 
    """
    prompt = f"CASE EXCERPT: {case_excerpt} /n/n/n DATE OF DECISION in YYYY-MM-DD format (other if unclear): "

    return get_response(prompt, system_message)

def get_rad_decision (case_excerpt):
    system_message = """You are a helpful research assistant for a Canadian law professor.
    You are given a case exerpt and you return only 'RAD' if the case excerpt is a decision
    of the Refugee Appeal Division of the Immigration and Refugee Board, 'other' if it is 
    a different IRB division (e.g. Immigration Division, Immigration Appeal Division, Refugee 
    Protection Division) or if you are not sure. 
    """
    prompt = f"CASE EXCERPT: {case_excerpt} /n/n/n RAD or other: "

    return get_response(prompt, system_message)


### Load the data

In [4]:
# get parsed data

# Set variables
start_year = 2010  # First year of data sought (1877 +)
end_year = 2023  # Last year of data sought (2023 -)
languages_sought = ['en', 'fr']  # languages in list e.g. ['en', 'fr'] or ['en']

# Set path to data
data_path = pathlib.Path('DATA/YEARLY/')

# load data
results = []
for year in range(start_year, end_year+1):
    for language in languages_sought:
        with open(data_path / f'{year}_{language}.json') as f:
            results.extend(json.load(f))

# convert to dataframe
df = pd.DataFrame(results)

df

Unnamed: 0,citation1,citation2,dataset,name,source_url,scraped_timestamp,document_date,year,unofficial_text,language,other
0,TB9-07773,,RAD,,3229926.txt,2023-11-13,2010-01-24,2010,\nRAD File / Dossier de la SAR : TB9-07773\nTB...,en,"{""decision-maker_name"": ""Kim Polowek""}"
1,TB9-07773,,RAD,,3229927.txt,2023-11-13,2010-01-24,2010,\nDossier de la SAR / RAD File: TB9-07773\nTB9...,fr,"{""decision-maker_name"": ""Kim Polowek""}"
2,MB9-27420,,RAD,,3546053.txt,2023-11-13,2011-01-12,2011,\nRAD File No. / No de dossier de la SAR : MB9...,en,"{""decision-maker_name"": ""Marie-Lyne Thibault""}"
3,TB3-03406,,RAD,,1355235.txt,2023-11-12,2013-07-31,2013,\n\n\n\n\n\n\nRAD File No. / N° de dossier de ...,en,"{""decision-maker_name"": ""Edward Bosveld""}"
4,VB3-01099,,RAD,,1360268.txt,2023-11-12,2013-08-30,2013,\n\n\n\tRAD File No. / N° de dossier de la SAR...,en,"{""decision-maker_name"": null}"
...,...,...,...,...,...,...,...,...,...,...,...
26379,VC1-04668,,RAD,,3583931.txt,2023-11-13,2022-04-29,2022,\nDossier de la SAR / RAD File : VC1-04668\nVC...,fr,"{""decision-maker_name"": ""Me Kristine Plouffe-M..."
26380,MC1-07673,,RAD,,3596070.txt,2023-11-13,2022-02-25,2022,\nDossier de la SAR / RAD File: MC1-07673\n\nH...,fr,"{""decision-maker_name"": ""Daphnée Ouellet""}"
26381,MC1-06135,,RAD,,3599962.txt,2023-11-13,2022-01-28,2022,\nDossier de la SAR / RAD File : MC1-06135\n\n...,fr,"{""decision-maker_name"": ""Daphnée Ouellet""}"
26382,MC2-03068,,RAD,,MC2-03068 - Final February 2023.txt,2023-11-12,2023-02-02,2023,\nRAD File / Dossier de la SAR : MC2-03068\nMC...,en,"{""decision-maker_name"": ""Me Martine Durocher""}"


In [5]:
# get raw data
DATA_DIRS = ["d:/RAD Decisions TEXT/", "d:/IRB Decisions - Initial Request - TEXT/"]

# get list of files in each directory, including full path
path_file_list = []
for data_dir in DATA_DIRS:
    for file in pathlib.Path(data_dir).glob('**/*'):
        path_file_list.append(str(file))

# put the file_list into a dataframe
df_raw = pd.DataFrame(path_file_list, columns=['path_file_name'])

# add a column with just the file name (using pathlib
df_raw['file_name'] = df_raw['path_file_name'].apply(lambda x: pathlib.Path(x).name)

df_raw

    

Unnamed: 0,path_file_name,file_name
0,d:\RAD Decisions TEXT\AppToReopenTB8-14702a.txt,AppToReopenTB8-14702a.txt
1,d:\RAD Decisions TEXT\AppToReopenTB8-14702tf.txt,AppToReopenTB8-14702tf.txt
2,d:\RAD Decisions TEXT\Decision concerning G. B...,Decision concerning G. Bazin (finale version) ...
3,d:\RAD Decisions TEXT\Decision G. Bazin_ franc...,Decision G. Bazin_ francais_modifiee (003) - s...
4,d:\RAD Decisions TEXT\MB7-00112a.txt,MB7-00112a.txt
...,...,...
120572,d:\IRB Decisions - Initial Request - TEXT\9996...,999668.txt
120573,d:\IRB Decisions - Initial Request - TEXT\9996...,999669.txt
120574,d:\IRB Decisions - Initial Request - TEXT\9996...,999670.txt
120575,d:\IRB Decisions - Initial Request - TEXT\~$03...,~$036980.txt


### Select a random sample of the data

In [6]:
# set random seed
import random
random.seed(111)

# get random sample of df
df_sample = df.sample(n=500)

# get random sample of df_raw
df_sample_raw = df_raw.sample(n=100)

# open the text files in df_sample_raw, using file_name colum (and pathlib), and save content to unofficial_text column, being sure to close the file, with unicode encoding
df_sample_raw['unofficial_text'] = df_sample_raw['path_file_name'].apply(lambda x: pathlib.Path(x).open(encoding='utf-8', errors='ignore').read())
df_sample_raw



Unnamed: 0,path_file_name,file_name,unofficial_text
5973,d:\IRB Decisions - Initial Request - TEXT\1026...,1026114.txt,COMMISSION DE L'IMMIGRATION\nET DU STATUT DE R...
64998,d:\IRB Decisions - Initial Request - TEXT\2816...,2816890.txt,Immigration and \nRefugee Board of Canada\n\nR...
28850,d:\IRB Decisions - Initial Request - TEXT\1690...,1690578.txt,\nImmigration and Refugee Board\n\nRefugee Pro...
59591,d:\IRB Decisions - Initial Request - TEXT\2683...,2683392.txt,Commission de l'immigration\net du statut de r...
115654,d:\IRB Decisions - Initial Request - TEXT\8971...,897196.txt,\nIMMIGRATION AND REFUGEE BOARD (IMMIGRATION A...
...,...,...,...
89723,d:\IRB Decisions - Initial Request - TEXT\3532...,3532715.txt,\nRAD File No. / No de dossier de la SAR : MB9...
51449,d:\IRB Decisions - Initial Request - TEXT\2072...,2072084.txt,\tRPD File No. / N de dossier de la SPR : TB4-...
76588,d:\IRB Decisions - Initial Request - TEXT\3097...,3097733.txt,Immigration and\nRefugee Board of Canada\n\nIm...
31731,d:\IRB Decisions - Initial Request - TEXT\1761...,1761337.txt,\n\n\n\nReasons and Decision ? Motifs et dcisi...


### Apply the models to the data

In [7]:
# iterate through the sample, pick first 1000 characters of unofficial_text and get response, using df.apply
df_sample['genAI_decision_date'] = df_sample['unofficial_text'].apply(lambda x: get_decision_date(x[:1000]))
df_sample


KeyboardInterrupt: 

In [None]:
# list any rows of the df where genAI_decision_date is not equal to the decision_date
df_sample[df_sample['genAI_decision_date'] != df_sample['document_date']]

In [None]:
# iterate through the sample, pick first 1000 characters of unofficial_text and get response, using df.apply
df_sample['genAI_dataset'] = df_sample['unofficial_text'].apply(lambda x: get_rad_decision(x[:1000]))
df_sample


In [None]:
# list any rows of the df where genAI_dataset is not equal to the dataset
df_sample[df_sample['genAI_dataset'] != df_sample['dataset']]

In [7]:
# iterate through the sample, pick first 1000 characters of unofficial_text and get response, using df.apply
df_sample_raw['genAI_dataset'] = df_sample_raw['unofficial_text'].apply(lambda x: get_rad_decision(x[:1000]))
df_sample_raw

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised APIError: Request failed due to server shutdown {
  "error": {
    "message": "Request failed due to server shutdown",
    "type": "server_error",
    "param": null,
    "code": null
  }
}
 500 {'error': {'message': 'Request failed due to server shutdown', 'type': 'server_error', 'param': None, 'code': None}} {'Date': 'Wed, 22 Nov 2023 14:03:21 GMT', 'Content-Type': 'application/json', 'Content-Length': '141', 'Connection': 'keep-alive', 'access-control-allow-origin': '*', 'openai-model': 'gpt-4-1106-preview', 'openai-organization': 'refugee-law-lab', 'openai-processing-ms': '13934', 'openai-version': '2020-10-01', 'strict-transport-security': 'max-age=15724800; includeSubDomains', 'x-ratelimit-limit-requests': '5000', 'x-ratelimit-limit-tokens': '300000', 'x-ratelimit-limit-tokens_usage_based': '300000', 'x-ratelimit-remaining-requests': '4999', 'x-ratelim

Unnamed: 0,path_file_name,file_name,unofficial_text,genAI_dataset
5973,d:\IRB Decisions - Initial Request - TEXT\1026...,1026114.txt,COMMISSION DE L'IMMIGRATION\nET DU STATUT DE R...,other
64998,d:\IRB Decisions - Initial Request - TEXT\2816...,2816890.txt,Immigration and \nRefugee Board of Canada\n\nR...,RAD
28850,d:\IRB Decisions - Initial Request - TEXT\1690...,1690578.txt,\nImmigration and Refugee Board\n\nRefugee Pro...,other
59591,d:\IRB Decisions - Initial Request - TEXT\2683...,2683392.txt,Commission de l'immigration\net du statut de r...,RAD
115654,d:\IRB Decisions - Initial Request - TEXT\8971...,897196.txt,\nIMMIGRATION AND REFUGEE BOARD (IMMIGRATION A...,other
...,...,...,...,...
89723,d:\IRB Decisions - Initial Request - TEXT\3532...,3532715.txt,\nRAD File No. / No de dossier de la SAR : MB9...,RAD
51449,d:\IRB Decisions - Initial Request - TEXT\2072...,2072084.txt,\tRPD File No. / N de dossier de la SPR : TB4-...,other
76588,d:\IRB Decisions - Initial Request - TEXT\3097...,3097733.txt,Immigration and\nRefugee Board of Canada\n\nIm...,other
31731,d:\IRB Decisions - Initial Request - TEXT\1761...,1761337.txt,\n\n\n\nReasons and Decision ? Motifs et dcisi...,other


In [8]:
# list any rows of df_sample_raw where genAI_dataset is 'RAD' but the file_name is not in df.source_url
df_sample_raw[(df_sample_raw['genAI_dataset'] == 'RAD') & (~df_sample_raw['file_name'].isin(df['source_url']))]


Unnamed: 0,path_file_name,file_name,unofficial_text,genAI_dataset
59591,d:\IRB Decisions - Initial Request - TEXT\2683...,2683392.txt,Commission de l'immigration\net du statut de r...,RAD
58038,d:\IRB Decisions - Initial Request - TEXT\2294...,2294281.txt,Commission de l'immigration\net du statut de r...,RAD
1248,d:\RAD Decisions TEXT\MC0-10414 a.txt,MC0-10414 a.txt,\nRAD File / Dossier de la SAR : MC0-10414\nMC...,RAD


In [14]:
# list any rows of df_sample_raw where genAI_dataset is 'other' and the file_name is in df.source_url
df_sample_raw[(df_sample_raw['genAI_dataset'] == 'other') & (df_sample_raw['file_name'].isin(df['source_url']))]

Unnamed: 0,path_file_name,file_name,unofficial_text,genAI_dataset


In [9]:
df[df['citation1'] == 'TB5-02870']

Unnamed: 0,citation1,citation2,dataset,name,source_url,scraped_timestamp,document_date,year,unofficial_text,language,other
4324,TB5-02870,,RAD,,2040748.txt,2023-11-12,2015-06-08,2015,Immigration and \nRefugee Board of Canada\n\nR...,en,"{""decision-maker_name"": ""Edward Bosveld""}"
6048,TB5-02870,,RAD,,2040749.txt,2023-11-12,2015-06-08,2015,Commission de l'immigration\net du statut de r...,fr,"{""decision-maker_name"": ""Edward Bosveld""}"


In [10]:
df[df['citation1'] == 'MB6-02395']

Unnamed: 0,citation1,citation2,dataset,name,source_url,scraped_timestamp,document_date,year,unofficial_text,language,other


In [12]:

df[df['citation1'] == 'TB6-10859']

Unnamed: 0,citation1,citation2,dataset,name,source_url,scraped_timestamp,document_date,year,unofficial_text,language,other


In [13]:
df[df['citation1'] == 'TB5-04528']


Unnamed: 0,citation1,citation2,dataset,name,source_url,scraped_timestamp,document_date,year,unofficial_text,language,other
4585,TB5-04528,,RAD,,2261464.txt,2023-11-13,2015-07-10,2015,Immigration and \nRefugee Board of Canada\n\nR...,en,"{""decision-maker_name"": ""Lesley Mason""}"
6307,TB5-04528,,RAD,,2261465.txt,2023-11-13,2015-07-10,2015,Commission de l'immigration\net du statut de r...,fr,"{""decision-maker_name"": ""Lesley Mason""}"


In [15]:
df[df['citation1'] == 'MC0-10414']

Unnamed: 0,citation1,citation2,dataset,name,source_url,scraped_timestamp,document_date,year,unofficial_text,language,other
24333,MC0-10414,,RAD,,3549797.txt,2023-11-13,2021-05-21,2021,\nRAD File / Dossier de la SAR : MC0-10414\nMC...,en,"{""decision-maker_name"": ""Cheryl Braden""}"
26031,MC0-10414,,RAD,,3549798.txt,2023-11-13,2021-05-21,2021,\nDossier de la SAR / RAD File: MC0-10414\nMC0...,fr,"{""decision-maker_name"": ""Cheryl Braden""}"
