In [1]:
import os
import pandas as pd

import os
os.chdir('..')
notebook_dir = os.getcwd()
print(notebook_dir)
os.environ['KMP_DUPLICATE_LIB_OK']='True'


/Users/apple/Documents/Swansea/Projects/Odyssey-Terms-Extraction-Journal


In [None]:
from openAIHandler import law_application_matcher
from openAIHandler import ModelSelector
from typing import List, Dict, Any
import pandas as pd
import os


def analyze_caselaw_df(
    caselaw_df,
    paragraph_column='paragraphs',
    section_id_column='section_id',
    section_text_column='section_text',
    id_column=None,
    llm=None
):
    """
    Analyzes case law paragraphs from a pandas DataFrame and matches them with their associated legislation sections.
    Each paragraph is paired with all its related sections (section_id: section_text), and only unique paragraphs are processed.
    Args:
        caselaw_df (DataFrame): Pandas DataFrame containing case law paragraphs and legislation sections
        paragraph_column (str): Name of the column containing paragraph text (default 'paragraphs')
        section_id_column (str): Name of the column containing section IDs (default 'section_id')
        section_text_column (str): Name of the column containing section text (default 'section_text')
        id_column (str, optional): Name of the column containing paragraph IDs
        llm: Language model to use (defaults to ModelSelector's current model)
    Returns:
        list: Results of the analysis with matches between paragraphs and their grouped legislation sections
    """

    # Validate columns exist
    for col in [paragraph_column, section_id_column, section_text_column]:
        if col not in caselaw_df.columns:
            raise ValueError(f"Column '{col}' not found in DataFrame")
    if id_column and id_column not in caselaw_df.columns:
        raise ValueError(f"ID column '{id_column}' not found in DataFrame")

    # Use the ModelSelector to get the LLM if not provided
    if llm is None:
        llm = ModelSelector.get_llm(temperature=0)

    # Group by paragraph and collect all (section_id, section_text) pairs
    def collect_sections(group):
        return {row[section_id_column]: row[section_text_column] for _, row in group.iterrows() if pd.notna(row[section_id_column]) and pd.notna(row[section_text_column]) and str(row[section_text_column]).strip()}

    grouped = caselaw_df.groupby(paragraph_column).apply(collect_sections).reset_index()
    grouped.columns = [paragraph_column, 'sections']

    # If id_column is provided, get the first id for each paragraph
    if id_column:
        id_map = caselaw_df.drop_duplicates(paragraph_column)[[paragraph_column, id_column]].set_index(paragraph_column)[id_column].to_dict()

    all_results = []

    for idx, row in grouped.iterrows():
        para_text = row[paragraph_column]
        para_id = id_map[para_text] if id_column else idx
        sections_dict = row['sections']

        # Skip empty paragraphs
        if pd.isna(para_text) or not str(para_text).strip():
            continue
        if not sections_dict:
            continue

        # Call the new matcher with all sections at once
        match_results = law_application_matcher(para_id, para_text, sections_dict, llm)
        print("===================result from llm is ============")
        print(match_results)
        all_results.append(match_results)
    
    return all_results

In [1]:
from openAIHandler import law_application_matcher
from openAIHandler import ModelSelector
from typing import List, Dict, Any
import pandas as pd
import os


def analyze_caselaw_df(
    caselaw_df,
    paragraph_column='paragraphs',
    section_id_column='section_id',
    section_text_column='section_text',
    id_column=None,
    llm=None
):
    """
    Analyzes case law paragraphs from a pandas DataFrame and matches them with their associated legislation sections.
    Each paragraph is paired with all its related sections (section_id: section_text), and only unique paragraphs are processed.
    Args:
        caselaw_df (DataFrame): Pandas DataFrame containing case law paragraphs and legislation sections
        paragraph_column (str): Name of the column containing paragraph text (default 'paragraphs')
        section_id_column (str): Name of the column containing section IDs (default 'section_id')
        section_text_column (str): Name of the column containing section text (default 'section_text')
        id_column (str, optional): Name of the column containing paragraph IDs
        llm: Language model to use (defaults to ModelSelector's current model)
    Returns:
        list: Results of the analysis with matches between paragraphs and their grouped legislation sections
    """

    # Validate columns exist
    for col in [paragraph_column, section_id_column, section_text_column]:
        if col not in caselaw_df.columns:
            raise ValueError(f"Column '{col}' not found in DataFrame")
    if id_column and id_column not in caselaw_df.columns:
        raise ValueError(f"ID column '{id_column}' not found in DataFrame")

    # Use the ModelSelector to get the LLM if not provided
    if llm is None:
        llm = ModelSelector.get_llm(temperature=0)

    # Group by paragraph and collect all (section_id, section_text) pairs
    def collect_sections(group):
        return {row[section_id_column]: row[section_text_column] for _, row in group.iterrows() if pd.notna(row[section_id_column]) and pd.notna(row[section_text_column]) and str(row[section_text_column]).strip()}

    grouped = caselaw_df.groupby(paragraph_column).apply(collect_sections).reset_index()
    grouped.columns = [paragraph_column, 'sections']

    # If id_column is provided, get the first id for each paragraph
    if id_column:
        id_map = caselaw_df.drop_duplicates(paragraph_column)[[paragraph_column, id_column]].set_index(paragraph_column)[id_column].to_dict()

    all_results = []

    for idx, row in grouped.iterrows():
        para_text = row[paragraph_column]
        para_id = id_map[para_text] if id_column else idx
        sections_dict = row['sections']

        # Skip empty paragraphs
        if pd.isna(para_text) or not str(para_text).strip():
            continue
        if not sections_dict:
            continue

        # Call the new matcher with all sections at once
        match_results = law_application_matcher(para_id,para_text, sections_dict, llm)
        print("===================result from llm is ============")
        print(match_results)
        all_results.append(match_results)
    
    return all_results
df = pd.read_csv("../data/test2/csv_cases/csv_with_legislation/eat_2025_29.csv")
print(df.head())
results = analyze_caselaw_df(
    caselaw_df=df,
    paragraph_column="paragraphs",
    section_id_column="section_id",
    section_text_column="section_text",
    id_column="para_id"
)
results_df = pd.DataFrame(results)

# Join the original DataFrame with the results DataFrame on 'para_id'
joined_df = pd.merge(df, results_df, on='para_id', how='inner')
print(joined_df.head())

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
import pandas as pd

# Load your DataFrame
df = pd.read_csv("../data/test2/csv_cases/csv_with_legislation/eat_2025_29.csv")
print(df.head())

def update_para_id(file_name,para_id):
    return file_name+'#'+para_id
file_name = "eat_2025_29"
df['para_id'] = df['para_id'].apply(lambda x: update_para_id(file_name, x))




                                            case_uri para_id  \
0  https://caselaw.nationalarchives.gov.uk/eat/20...  para_1   
1  https://caselaw.nationalarchives.gov.uk/eat/20...  para_1   
2  https://caselaw.nationalarchives.gov.uk/eat/20...  para_1   
3  https://caselaw.nationalarchives.gov.uk/eat/20...  para_2   
4  https://caselaw.nationalarchives.gov.uk/eat/20...  para_2   

                                          paragraphs references  \
0  \n\t   \n\t 1. \n\t   \n\t     \n\t \n\t   The...         []   
1  \n\t   \n\t 1. \n\t   \n\t     \n\t \n\t   The...         []   
2  \n\t   \n\t 1. \n\t   \n\t     \n\t \n\t   The...         []   
3  \n\t   \n\t 2. \n\t   \n\t     \n\t \n\t   For...         []   
4  \n\t   \n\t 2. \n\t   \n\t     \n\t \n\t   For...         []   

  application_of_law_phrases_actual if_law_applied_actual  reason(optional)  \
0                                []                    no               NaN   
1                                []                   

In [18]:
print(df.head())

                                            case_uri             para_id  \
0  https://caselaw.nationalarchives.gov.uk/eat/20...  eat_2025_29#para_1   
1  https://caselaw.nationalarchives.gov.uk/eat/20...  eat_2025_29#para_1   
2  https://caselaw.nationalarchives.gov.uk/eat/20...  eat_2025_29#para_1   
3  https://caselaw.nationalarchives.gov.uk/eat/20...  eat_2025_29#para_2   
4  https://caselaw.nationalarchives.gov.uk/eat/20...  eat_2025_29#para_2   

                                          paragraphs references  \
0  \n\t   \n\t 1. \n\t   \n\t     \n\t \n\t   The...         []   
1  \n\t   \n\t 1. \n\t   \n\t     \n\t \n\t   The...         []   
2  \n\t   \n\t 1. \n\t   \n\t     \n\t \n\t   The...         []   
3  \n\t   \n\t 2. \n\t   \n\t     \n\t \n\t   For...         []   
4  \n\t   \n\t 2. \n\t   \n\t     \n\t \n\t   For...         []   

  application_of_law_phrases_actual if_law_applied_actual  reason(optional)  \
0                                []                    no    

In [48]:
import json
#make a jsonl file now
def make_batch_jsonl_from_csv(csv_path, oatchutput_dir="."):
    df = pd.read_csv(csv_path)

    required_columns = ['para_id', 'paragraphs', 'section_id', 'section_text']
    section_id_column = 'section_id'
    section_text_column = 'section_text'
    paragraph_column = 'paragraphs'
    id_column = 'para_id'

    for col in required_columns:   
        if col not in df.columns:
            raise ValueError(f"Missing required column: {col}")
    def update_para_id(file_name,para_id):
        return file_name+'#'+para_id
    def collect_sections(group):
        return {row[section_id_column]: row[section_text_column] for _, row in group.iterrows() if pd.notna(row[section_id_column]) and pd.notna(row[section_text_column]) and str(row[section_text_column]).strip()}
    file_name = os.path.splitext(os.path.basename(csv_path))[0]
    df['para_id'] = df['para_id'].apply(lambda x: update_para_id(file_name, x))
    print(file_name)

    grouped = df.groupby(paragraph_column).apply(collect_sections).reset_index()
    grouped.columns = [paragraph_column, 'sections']

    # If id_column is provided, get the first id for each paragraph
    if id_column:
        id_map = df.drop_duplicates(paragraph_column)[[paragraph_column, id_column]].set_index(paragraph_column)[id_column].to_dict()

    jsonl_lines =[]
    rid=1
    for idx, row in grouped.iterrows():
        para_text = row[paragraph_column]
        para_id = id_map[para_text] if id_column else idx
        sections_dict = row['sections']

        # Skip empty paragraphs
        if pd.isna(para_text) or not str(para_text).strip():
            continue
        if not sections_dict:
            continue
        sections_str = "\n".join([
        f"Section ID: {sid}\nSection Text: {stext}" for sid, stext in sections_dict.items()
        ])
        system_prompt = """
        You are a specialized legal analyst evaluating case law paragraphs and matching them with legislation.
        Follow this chain-of-thought process to analyze legal text:
        
        STEP 1: DETERMINE IF PARAGRAPH CONTAINS APPLICATION OF LAW
        First, analyze if the paragraph applies law to specific facts using these criteria:
        
        APPLICATION OF LAW DEFINITION:
        An application of law is where legal principles, statutes, or precedents are directly applied to 
        specific facts of the case. This goes beyond merely citing or discussing law in the abstract.
        
        INDICATORS OF APPLICATION OF LAW:
        - Judge connects specific legal rules/principles to the case's factual circumstances
        - Text shows reasoning that explains how the law resolves the specific facts
        - Contains judicial analysis leading to a conclusion based on legal principles
        - Legal tests or criteria are applied to case facts
        
        NOT APPLICATIONS OF LAW:
        - Mere citations without application to facts
        - Background information or case history
        - Statements about jurisdiction or general legal explanations
        - Summaries of arguments without judicial analysisACHX
        - Restatements of previous cases without connecting to current facts                                                                                                                                                                                                    
        
        STEP 2: IF APPLICATION EXISTS, MATCH WITH LEGISLATION
        Only if the paragraph contains application of law, analyze whether the legislation text corresponds:
        
        MATCHING CRITERIA:
        - Which law/clause is applied in the paragraph
        - Clear interpretative relationship (case law explains/applies the legislation)
        - Substantive connection (not merely tangential mentions)                            
        Your response must follow this exact format with chain-of-thought reasoning:
        1. First explicitly state your STEP 1 reasoning about whether application exists
        2. Give a clear YES/NO decision on whether application of law exists
        3. Only if YES, continue with STEP 2 reasoning about legislation matching
        4. End with a properly formatted JSON that includes all analysis results
        
        YOUR OUTPUT MUST BE FORMATTED AS A VALID JSON OBJECT. Do not include any explanations, notes, or text outside of the JSON object.

        The final JSON should follow this structure:
        {{
         "para_id": paragraph identifier(para_id) sent as an idntifier of record,
          "contains_application": true/false,
          "application_reasoning": "explanation of why paragraph does/doesn't contain application",
          "matches": [
              {{
                "caselaw_excerpt": "phrase/excerpt from case law",
                "section_id":"section_id from which the law is applied",
                "legislation_excerpt: "corresponding phrase/excerpt from legislation",
                "key_concept": "core legal concept being applied(from legislation text) should be an excerpt/verbatim ",
                "confidence"`: "High", "Medium", or "Low" based on how confident you are that it is actually where law is applied.
            ]
          }}
        }}
        
        
        If there's no application of law, the "legislation_match" field should be null.
        Always include proper chain-of-thought reasoning before providing the final JSON.
        """


        user_prompt = f"""
        CASELAW PARA ID:
        {para_id}

        CASELAW PARAGRAPH:
        {para_text}
        
        LEGISLATION TEXT:
        {sections_str}
        """

        jsonl_lines.append({
            "custom_id":f"request_{rid}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": { "model": "gpt-4o-mini",
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt}
            ]}
        })
        rid = rid + 1
    output_path = os.path.join(output_dir, f"{file_name}_openai_batch_input.jsonl")
    with open(output_path, "w", encoding="utf-8") as f:
        for obj in jsonl_lines:
            f.write(json.dumps(obj) + "\n")

    print(f"Batch input saved to: {output_path}")
    return output_path
make_batch_jsonl_from_csv("../data/test2/csv_cases/csv_with_legislation/eat_2025_29.csv")
    


eat_2025_29
Batch input saved to: ./eat_2025_29_openai_batch_input.jsonl


  grouped = df.groupby(paragraph_column).apply(collect_sections).reset_index()


'./eat_2025_29_openai_batch_input.jsonl'

In [9]:


import json
import pandas as pd

# Read and parse each line in the output file
parsed_responses = []

with open("eat_2025_29_openai_batch_output.jsonl", "r") as f:
    for line in f.readlines():
        data = json.loads(line)
        try:
            raw_content = data["response"]["body"]["choices"][0]["message"]["content"]
            response_json = json.loads(raw_content)  # Parse the stringified JSON
            para_id = response_json.get("para_id")
            if_law_applied_4o_mini_skipped_phase1 = response_json.get("contains_application")
            application_of_law_excepts = response_json.get("matches")
            parsed_responses.append({
                "para_id": para_id,
                "if_law_applied_4o_mini_skipped_phase1":if_law_applied_4o_mini_skipped_phase1,
                "application_of_law_excepts":application_of_law_excepts,
                "response": response_json
            })
        except Exception as e:
            print(f"Error parsing line: {e}")


In [33]:
import openai
import time
# Assuming you have your OpenAI API key set up

# 1. Prepare the input file (e.g., batch_input.jsonl)

# 2. Upload the file
client = openai.OpenAI()
batch_input_file = client.files.create(file=open("./eat_2025_29_openai_batch_input.jsonl", "rb"), purpose="batch")


batch_input_file_id = batch_input_file.id
batch_job = client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
   
)

# 4. Monitor the job (example using a simple loop)
while batch_job.status != "completed":
    batch_job = client.batches.retrieve(batch_job.id)
    print(f"Job status: {batch_job.status}")
    # Add a delay to avoid excessive API calls
    
    time.sleep(10)

# 5. Download the results
output_file = client.files.retrieve(batch_job.output_file_id)
output_file_url = output_file.url
# Download the file from the URL (using a library like requests)

Job status: validating
Job status: in_progress
Job status: in_progress
Job status: in_progress
Job status: in_progress
Job status: in_progress
Job status: in_progress
Job status: in_progress
Job status: in_progress
Job status: in_progress
Job status: in_progress
Job status: in_progress
Job status: in_progress
Job status: in_progress
Job status: in_progress
Job status: in_progress
Job status: in_progress
Job status: in_progress
Job status: in_progress
Job status: in_progress
Job status: in_progress
Job status: in_progress
Job status: in_progress
Job status: in_progress
Job status: in_progress
Job status: in_progress


KeyboardInterrupt: 

In [38]:
client.batches.list()

SyncCursorPage[Batch](data=[Batch(id='batch_68266d92253c8190aeb7376380bd0fd0', completion_window='24h', created_at=1747348882, endpoint='/v1/chat/completions', input_file_id='file-VftBYNSVsnv8KQTcJ26SDc', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1747352130, error_file_id=None, errors=None, expired_at=None, expires_at=1747435282, failed_at=None, finalizing_at=1747352123, in_progress_at=1747348883, metadata=None, output_file_id='file-BPT7ypSADWNkAZvEKAqvYL', request_counts=BatchRequestCounts(completed=57, failed=0, total=57)), Batch(id='batch_68266aed0c688190aa94530e77958a55', completion_window='24h', created_at=1747348205, endpoint='/v1/chat/completions', input_file_id='file-VLwcgvbukHp4iBViPb9U7q', object='batch', status='failed', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=Errors(data=[BatchError(code='unknown_parameter', line=1, message="Unknown parameter: 'metadata'.", param='metadata'), BatchErr

In [51]:
file_id = "file-BPT7ypSADWNkAZvEKAqvYL"
file_response = client.files.content(file_id)
with open("eat_2025_29_openai_batch_output.jsonl", "wb") as f:
    f.write(file_response.read())

In [59]:


import json
import pandas as pd

# Read and parse each line in the output file
parsed_responses = []

with open("eat_2025_29_openai_batch_output.jsonl", "r") as f:
    for line in f.readlines():
        data = json.loads(line)
        try:
            raw_content = data["response"]["body"]["choices"][0]["message"]["content"]
            response_json = json.loads(raw_content)  # Parse the stringified JSON
            para_id = response_json.get("para_id")
            if_law_applied_4o_mini_skipped_phase1 = response_json.get("contains_application")
            application_of_law_excepts = response_json.get("matches")
            parsed_responses.append({
                "para_id": para_id,
                "if_law_applied_4o_mini_skipped_phase1":if_law_applied_4o_mini_skipped_phase1,
                "application_of_law_excepts":application_of_law_excepts,
                "response": response_json
            })
        except Exception as e:
            print(f"Error parsing line: {e}")


Error parsing line: Expecting value: line 1 column 1 (char 0)


In [None]:
import os
import pandas as pd
def load_csv_files(folder_path):
    """Load all CSV files from a folder into a dictionary of DataFrames."""
    csv_files = {}
    for file in os.listdir(folder_path):
        if file.endswith('.csv'):
            file_path = os.path.join(folder_path, file)
            case_name = file.replace('.csv', '')
            try:
                csv_files[case_name] = pd.read_csv(file_path)
                print(f"Loaded {case_name} with {len(csv_files[case_name])} rows")
            except Exception as e:
                print(f"Error loading {file}: {str(e)}")
    return csv_files
input_folder_path =  '../data/test2/csv_cases/Experiment1-byPara'

case_files = load_csv_files(input_folder_path)
print(f"Loaded {len(case_files)} case files")

Loaded ewca_civ_2025_215 with 46 rows
Loaded ewhc_scco_2025_374 with 16 rows
Loaded ukftt_grc_2025_287 with 45 rows
Loaded ukftt_grc_2025_251 with 57 rows
Loaded ukftt_grc_2025_284 with 45 rows
Loaded ukftt_grc_2025_282 with 14 rows
Loaded ukftt_grc_2025_283 with 39 rows
Loaded eat_2025_29 with 57 rows
Loaded 8 case files


In [None]:
import os
import pandas as pd
def load_csv_files(folder_path):
    """Load all CSV files from a folder into a dictionary of DataFrames."""
    csv_files = {}
    for file in os.listdir(folder_path):
        if file.endswith('.csv'):
            file_path = os.path.join(folder_path, file)
            case_name = file.replace('.csv', '')
            try:
                csv_files[case_name] = pd.read_csv(file_path)
                print(f"Loaded {case_name} with {len(csv_files[case_name])} rows")
            except Exception as e:
                print(f"Error loading {file}: {str(e)}")
    return csv_files
input_folder_path =  '../data/test2/csv_cases/Experiment1-byPara'

case_files = load_csv_files(input_folder_path)
print(f"Loaded {len(case_files)} case files")

Loaded ewca_civ_2025_215 with 46 rows
Loaded ewhc_scco_2025_374 with 16 rows
Loaded ukftt_grc_2025_287 with 45 rows
Loaded ukftt_grc_2025_251 with 57 rows
Loaded ukftt_grc_2025_284 with 45 rows
Loaded ukftt_grc_2025_282 with 14 rows
Loaded ukftt_grc_2025_283 with 39 rows
Loaded eat_2025_29 with 57 rows
Loaded 8 case files


In [None]:
import os
import pandas as pd
def load_csv_files(folder_path):
    """Load all CSV files from a folder into a dictionary of DataFrames."""
    csv_files = {}
    for file in os.listdir(folder_path):
        if file.endswith('.csv'):
            file_path = os.path.join(folder_path, file)
            case_name = file.replace('.csv', '')
            try:
                csv_files[case_name] = pd.read_csv(file_path)
                print(f"Loaded {case_name} with {len(csv_files[case_name])} rows")
            except Exception as e:
                print(f"Error loading {file}: {str(e)}")
    return csv_files
input_folder_path =  '../data/test2/csv_cases/Experiment1-byPara'

case_files = load_csv_files(input_folder_path)
print(f"Loaded {len(case_files)} case files")

Loaded ewca_civ_2025_215 with 46 rows
Loaded ewhc_scco_2025_374 with 16 rows
Loaded ukftt_grc_2025_287 with 45 rows
Loaded ukftt_grc_2025_251 with 57 rows
Loaded ukftt_grc_2025_284 with 45 rows
Loaded ukftt_grc_2025_282 with 14 rows
Loaded ukftt_grc_2025_283 with 39 rows
Loaded eat_2025_29 with 57 rows
Loaded 8 case files


In [None]:
import os
import pandas as pd
def load_csv_files(folder_path):
    """Load all CSV files from a folder into a dictionary of DataFrames."""
    csv_files = {}
    for file in os.listdir(folder_path):
        if file.endswith('.csv'):
            file_path = os.path.join(folder_path, file)
            case_name = file.replace('.csv', '')
            try:
                csv_files[case_name] = pd.read_csv(file_path)
                print(f"Loaded {case_name} with {len(csv_files[case_name])} rows")
            except Exception as e:
                print(f"Error loading {file}: {str(e)}")
    return csv_files
input_folder_path =  '../data/test2/csv_cases/Experiment1-byPara'

case_files = load_csv_files(input_folder_path)
print(f"Loaded {len(case_files)} case files")

Loaded ewca_civ_2025_215 with 46 rows
Loaded ewhc_scco_2025_374 with 16 rows
Loaded ukftt_grc_2025_287 with 45 rows
Loaded ukftt_grc_2025_251 with 57 rows
Loaded ukftt_grc_2025_284 with 45 rows
Loaded ukftt_grc_2025_282 with 14 rows
Loaded ukftt_grc_2025_283 with 39 rows
Loaded eat_2025_29 with 57 rows
Loaded 8 case files


In [None]:
import os
import pandas as pd
def load_csv_files(folder_path):
    """Load all CSV files from a folder into a dictionary of DataFrames."""
    csv_files = {}
    for file in os.listdir(folder_path):
        if file.endswith('.csv'):
            file_path = os.path.join(folder_path, file)
            case_name = file.replace('.csv', '')
            try:
                csv_files[case_name] = pd.read_csv(file_path)
                print(f"Loaded {case_name} with {len(csv_files[case_name])} rows")
            except Exception as e:
                print(f"Error loading {file}: {str(e)}")
    return csv_files
input_folder_path =  '../data/test2/csv_cases/Experiment1-byPara'

case_files = load_csv_files(input_folder_path)
print(f"Loaded {len(case_files)} case files")

Loaded ewca_civ_2025_215 with 46 rows
Loaded ewhc_scco_2025_374 with 16 rows
Loaded ukftt_grc_2025_287 with 45 rows
Loaded ukftt_grc_2025_251 with 57 rows
Loaded ukftt_grc_2025_284 with 45 rows
Loaded ukftt_grc_2025_282 with 14 rows
Loaded ukftt_grc_2025_283 with 39 rows
Loaded eat_2025_29 with 57 rows
Loaded 8 case files


In [10]:
response_df = pd.DataFrame(parsed_responses)


In [11]:
response_df.head()

Unnamed: 0,para_id,if_law_applied_4o_mini_skipped_phase1,application_of_law_excepts,response
0,eat_2025_29#para_1,False,,"{'para_id': 'eat_2025_29#para_1', 'contains_ap..."
1,eat_2025_29#para_10,True,[{'caselaw_excerpt': 'the cause of the appella...,"{'para_id': 'eat_2025_29#para_10', 'contains_a..."
2,eat_2025_29#para_11,False,[],"{'para_id': 'eat_2025_29#para_11', 'contains_a..."
3,eat_2025_29#para_12,True,[{'caselaw_excerpt': 'the failure to make reas...,"{'para_id': 'eat_2025_29#para_12', 'contains_a..."
4,eat_2025_29#para_13,True,"[{'caselaw_excerpt': 'Mr Gourley's condition, ...","{'para_id': 'eat_2025_29#para_13', 'contains_a..."


In [12]:
response_df.if_law_applied_4o_mini_skipped_phase1.value_counts()

if_law_applied_4o_mini_skipped_phase1
True     42
False    15
Name: count, dtype: int64

In [15]:
response_df.head(5)

Unnamed: 0,para_id,if_law_applied_4o_mini_skipped_phase1,application_of_law_excepts,response
0,eat_2025_29#para_1,False,,"{'para_id': 'eat_2025_29#para_1', 'contains_ap..."
1,eat_2025_29#para_10,True,[{'caselaw_excerpt': 'the cause of the appella...,"{'para_id': 'eat_2025_29#para_10', 'contains_a..."
2,eat_2025_29#para_11,False,[],"{'para_id': 'eat_2025_29#para_11', 'contains_a..."
3,eat_2025_29#para_12,True,[{'caselaw_excerpt': 'the failure to make reas...,"{'para_id': 'eat_2025_29#para_12', 'contains_a..."
4,eat_2025_29#para_13,True,"[{'caselaw_excerpt': 'Mr Gourley's condition, ...","{'para_id': 'eat_2025_29#para_13', 'contains_a..."


In [66]:
import pandas as pd

# Build aggregation functions dynamically
agg_funcs = {
    'section_id': list,
    'section_text': list,
    **{col: 'first' for col in df.columns if col not in ['para_id', 'section_id', 'section_text']}
}

# Group df by para_id and aggregate
combined_df = df.groupby('para_id').agg(agg_funcs).reset_index()

# Join with response_df on para_id
joined_df = pd.merge(response_df, combined_df, on='para_id', how='inner')


In [72]:
import os
import pandas as pd

# Define the folder path
folder_path = 'path_to_your_folder'

# Initialize an empty list to store the dataframes
dfs = []

# Loop through all the csv files in the folder
for file in os.listdir(folder_path):
    if file.endswith(".csv"):
        # Read the csv file
        df = pd.read_csv(os.path.join(folder_path, file))
        # Embed the file name before para_id
        df['para_id'] = file.split('.')[0] + '#' + df['para_id']
        # Make a dictionary for section_id and section text
        section_dict = dict(zip(df['section_id'].iloc[0], df['section_text'].iloc[0]))
        # Append the dataframe to the list
        dfs.append(df)

# Concatenate all the dataframes into one
final_df = pd.concat(dfs, ignore_index=True)

Unnamed: 0,para_id,section_id,section_text,case_uri,paragraphs,references,application_of_law_phrases_actual,if_law_applied_actual,reason(optional),application_of_law_phrases.1,applied provision,act,legislative term,if_law_applied_gpt-4o-mini,application_of_law_phrases_gpt-4o-mini,reason_gpt-4o-mini,if_law_applied_gpt-4o,application_of_law_phrases_gpt-4o,reason_gpt-4o
0,eat_2025_29#para_1,"[ukpga/2010/15_section_133, ukpga/2010/15_sect...",[133 Remedies in pensions cases (1) This secti...,https://caselaw.nationalarchives.gov.uk/eat/20...,\n\t \n\t 1. \n\t \n\t \n\t \n\t The...,[],[],no,,,,,,0,[],The paragraph provides procedural background i...,0,[],The paragraph provides procedural information ...
1,eat_2025_29#para_10,"[ukpga/2010/15_section_133, ukpga/2010/15_sect...",[133 Remedies in pensions cases (1) This secti...,https://caselaw.nationalarchives.gov.uk/eat/20...,\n\t \n\t 10. \n\t \n\t \n\t \n\t Th...,[],[],no,,,,,,1,['the appellant’s position was that his psychi...,The paragraph discusses the application of leg...,0,[],The paragraph discusses the disputed issues at...
2,eat_2025_29#para_11,"[ukpga/2010/15_section_136, ukpga/2010/15_sect...",[136 Burden of proof (1) This section applies ...,https://caselaw.nationalarchives.gov.uk/eat/20...,\n\t \n\t 11. \n\t \n\t \n\t \n\t On...,[],[],no,,,,,,0,[],The paragraph discusses the evidence presented...,0,[],The paragraph provides procedural information ...
3,eat_2025_29#para_12,"[ukpga/2010/15_section_60, ukpga/2010/15_secti...",[60 Enquiries about disability and health (1) ...,https://caselaw.nationalarchives.gov.uk/eat/20...,\n\t \n\t 12. \n\t \n\t \n\t \n\t ...,[],[],no,,,,,,1,"[""the failure to make reasonable adjustments a...",The paragraph discusses how the failure to mak...,1,"[""the failure to make reasonable adjustments a...",The paragraph contains an application of law a...
4,eat_2025_29#para_13,"[ukpga/2010/15_section_1, ukpga/2010/15_sectio...",[Part 1 Determination of disability Impairment...,https://caselaw.nationalarchives.gov.uk/eat/20...,\n\t \n\t 13. \n\t \n\t \n\t \n\t ...,[],[],no,,,,,,0,[],This paragraph discusses the findings of a med...,0,[],The paragraph provides a factual account of Mr...
5,eat_2025_29#para_14,"[ukpga/2010/15_section_137, ukpga/2010/15_sect...",[137 Previous findings (1) A finding in releva...,https://caselaw.nationalarchives.gov.uk/eat/20...,\n\t \n\t 14. \n\t \n\t \n\t \n\t ...,[],[],no,,,,,,1,['We were particularly influenced by Dr Kinnib...,The paragraph demonstrates an application of l...,1,['We were particularly influenced by Dr Kinnib...,The paragraph demonstrates an application of l...
6,eat_2025_29#para_15,"[ukpga/2010/15_section_124, ukpga/2010/15_sect...",[124 Remedies: general (1) This section applie...,https://caselaw.nationalarchives.gov.uk/eat/20...,\n\t \n\t 15. \n\t \n\t \n\t \n\t ...,[],[],no,,,,,,1,['the Tribunal is satisfied that the claimant ...,The paragraph demonstrates an application of l...,1,['the tribunal is satisfied that the claimant ...,The paragraph demonstrates an application of l...
7,eat_2025_29#para_16,"[ukpga/2010/15_section_133, ukpga/2010/15_sect...",[133 Remedies in pensions cases (1) This secti...,https://caselaw.nationalarchives.gov.uk/eat/20...,\n\t \n\t 16. \n\t \n\t \n\t \n\t Wh...,[],[],no,,,,,,1,['the failure to make adjustments and the vict...,The paragraph discusses the unlawful acts of d...,1,['the initial onset of the illness was the res...,The paragraph applies legal principles to the ...
8,eat_2025_29#para_17,"[ukpga/2010/15_section_136, ukpga/2010/15_sect...",[136 Burden of proof (1) This section applies ...,https://caselaw.nationalarchives.gov.uk/eat/20...,\n\t \n\t 17. \n\t \n\t \n\t \n\t ...,[],[],no,,,,,,1,"[""there was no counter expert evidence present...",The paragraph discusses the tribunal's evaluat...,1,['We considered whether there were factors (ot...,The paragraph demonstrates an application of l...
9,eat_2025_29#para_18,"[ukpga/2010/15_section_133, ukpga/2010/15_sect...",[133 Remedies in pensions cases (1) This secti...,https://caselaw.nationalarchives.gov.uk/eat/20...,\n\t \n\t 18. \n\t \n\t \n\t \n\t ...,[],[],no,,,,,,1,['the tribunal considered the relevant heads o...,The paragraph demonstrates an application of l...,1,['the tribunal considered that the impact of t...,The paragraph contains multiple instances wher...


In [None]:


import json
import pandas as pd

# Read and parse each line in the output file
parsed_responses = []

with open("eat_2025_29_openai_batch_output.jsonl", "r") as f:
    for line in f.readlines():
        data = json.loads(line)
        try:
            raw_content = data["response"]["body"]["choices"][0]["message"]["content"]
            response_json = json.loads(raw_content)  # Parse the stringified JSON
            para_id = response_json.get("para_id")
            if_law_applied_4o_mini_skipped_phase1 = response_json.get("contains_application")
            application_of_law_excepts = response_json.get("matches")
            parsed_responses.append({
                "para_id": para_id,
                "if_law_applied_4o_mini_skipped_phase1":if_law_applied_4o_mini_skipped_phase1,
                "application_of_law_excepts":application_of_law_excepts,
                "response": response_json
            })
        except Exception as e:
            print(f"Error parsing line: {e}")


Error parsing line: Expecting value: line 1 column 1 (char 0)


In [None]:


import json
import pandas as pd

# Read and parse each line in the output file
parsed_responses = []

with open("eat_2025_29_openai_batch_output.jsonl", "r") as f:
    for line in f.readlines():
        data = json.loads(line)
        try:
            raw_content = data["response"]["body"]["choices"][0]["message"]["content"]
            response_json = json.loads(raw_content)  # Parse the stringified JSON
            para_id = response_json.get("para_id")
            if_law_applied_4o_mini_skipped_phase1 = response_json.get("contains_application")
            application_of_law_excepts = response_json.get("matches")
            parsed_responses.append({
                "para_id": para_id,
                "if_law_applied_4o_mini_skipped_phase1":if_law_applied_4o_mini_skipped_phase1,
                "application_of_law_excepts":application_of_law_excepts,
                "response": response_json
            })
        except Exception as e:
            print(f"Error parsing line: {e}")


Error parsing line: Expecting value: line 1 column 1 (char 0)


In [None]:


import json
import pandas as pd

# Read and parse each line in the output file
parsed_responses = []

with open("eat_2025_29_openai_batch_output.jsonl", "r") as f:
    for line in f.readlines():
        data = json.loads(line)
        try:
            raw_content = data["response"]["body"]["choices"][0]["message"]["content"]
            response_json = json.loads(raw_content)  # Parse the stringified JSON
            para_id = response_json.get("para_id")
            if_law_applied_4o_mini_skipped_phase1 = response_json.get("contains_application")
            application_of_law_excepts = response_json.get("matches")
            parsed_responses.append({
                "para_id": para_id,
                "if_law_applied_4o_mini_skipped_phase1":if_law_applied_4o_mini_skipped_phase1,
                "application_of_law_excepts":application_of_law_excepts,
                "response": response_json
            })
        except Exception as e:
            print(f"Error parsing line: {e}")


Error parsing line: Expecting value: line 1 column 1 (char 0)


In [None]:


import json
import pandas as pd

# Read and parse each line in the output file
parsed_responses = []

with open("eat_2025_29_openai_batch_output.jsonl", "r") as f:
    for line in f.readlines():
        data = json.loads(line)
        try:
            raw_content = data["response"]["body"]["choices"][0]["message"]["content"]
            response_json = json.loads(raw_content)  # Parse the stringified JSON
            para_id = response_json.get("para_id")
            if_law_applied_4o_mini_skipped_phase1 = response_json.get("contains_application")
            application_of_law_excepts = response_json.get("matches")
            parsed_responses.append({
                "para_id": para_id,
                "if_law_applied_4o_mini_skipped_phase1":if_law_applied_4o_mini_skipped_phase1,
                "application_of_law_excepts":application_of_law_excepts,
                "response": response_json
            })
        except Exception as e:
            print(f"Error parsing line: {e}")


Error parsing line: Expecting value: line 1 column 1 (char 0)


In [None]:


import json
import pandas as pd

# Read and parse each line in the output file
parsed_responses = []

with open("eat_2025_29_openai_batch_output.jsonl", "r") as f:
    for line in f.readlines():
        data = json.loads(line)
        try:
            raw_content = data["response"]["body"]["choices"][0]["message"]["content"]
            response_json = json.loads(raw_content)  # Parse the stringified JSON
            para_id = response_json.get("para_id")
            if_law_applied_4o_mini_skipped_phase1 = response_json.get("contains_application")
            application_of_law_excepts = response_json.get("matches")
            parsed_responses.append({
                "para_id": para_id,
                "if_law_applied_4o_mini_skipped_phase1":if_law_applied_4o_mini_skipped_phase1,
                "application_of_law_excepts":application_of_law_excepts,
                "response": response_json
            })
        except Exception as e:
            print(f"Error parsing line: {e}")


Error parsing line: Expecting value: line 1 column 1 (char 0)


In [None]:


import json
import pandas as pd

# Read and parse each line in the output file
parsed_responses = []

with open("eat_2025_29_openai_batch_output.jsonl", "r") as f:
    for line in f.readlines():
        data = json.loads(line)
        try:
            raw_content = data["response"]["body"]["choices"][0]["message"]["content"]
            response_json = json.loads(raw_content)  # Parse the stringified JSON
            para_id = response_json.get("para_id")
            if_law_applied_4o_mini_skipped_phase1 = response_json.get("contains_application")
            application_of_law_excepts = response_json.get("matches")
            parsed_responses.append({
                "para_id": para_id,
                "if_law_applied_4o_mini_skipped_phase1":if_law_applied_4o_mini_skipped_phase1,
                "application_of_law_excepts":application_of_law_excepts,
                "response": response_json
            })
        except Exception as e:
            print(f"Error parsing line: {e}")


Error parsing line: Expecting value: line 1 column 1 (char 0)


In [None]:


import json
import pandas as pd

# Read and parse each line in the output file
parsed_responses = []

with open("eat_2025_29_openai_batch_output.jsonl", "r") as f:
    for line in f.readlines():
        data = json.loads(line)
        try:
            raw_content = data["response"]["body"]["choices"][0]["message"]["content"]
            response_json = json.loads(raw_content)  # Parse the stringified JSON
            para_id = response_json.get("para_id")
            if_law_applied_4o_mini_skipped_phase1 = response_json.get("contains_application")
            application_of_law_excepts = response_json.get("matches")
            parsed_responses.append({
                "para_id": para_id,
                "if_law_applied_4o_mini_skipped_phase1":if_law_applied_4o_mini_skipped_phase1,
                "application_of_law_excepts":application_of_law_excepts,
                "response": response_json
            })
        except Exception as e:
            print(f"Error parsing line: {e}")


Error parsing line: Expecting value: line 1 column 1 (char 0)


In [68]:
joined_df.if_law_applied_actual.value_counts()

if_law_applied_actual
no     47
yes     9
Name: count, dtype: int64

In [70]:
joined_df.head(1)

Unnamed: 0,para_id,if_law_applied_4o_mini_skipped_phase1,application_of_law_excepts,response,section_id,section_text,case_uri,paragraphs,references,application_of_law_phrases_actual,...,application_of_law_phrases.1,applied provision,act,legislative term,if_law_applied_gpt-4o-mini,application_of_law_phrases_gpt-4o-mini,reason_gpt-4o-mini,if_law_applied_gpt-4o,application_of_law_phrases_gpt-4o,reason_gpt-4o
0,eat_2025_29#para_1,False,,"{'para_id': 'eat_2025_29#para_1', 'contains_ap...","[ukpga/2010/15_section_133, ukpga/2010/15_sect...",[133 Remedies in pensions cases (1) This secti...,https://caselaw.nationalarchives.gov.uk/eat/20...,\n\t \n\t 1. \n\t \n\t \n\t \n\t The...,[],[],...,,,,,0,[],The paragraph provides procedural background i...,0,[],The paragraph provides procedural information ...


In [71]:
joined_df.to_csv('../data/test2/csvs_for_skip_phase_1/eat_2025_29.csv')

In [None]:
import os
import glob
import pandas as pd

def update_para_id(file_name, para_id):
    try:
        return file_name + '#' + para_id
    except Exception as e:
        print(f"Error updating para_id: {para_id}")
        return file_name + '#para_' + str(para_id)

def collect_sections(group):
    # Create a dictionary of section_id: section_text pairs
    sections_dict = {}
    for _, row in group.iterrows():
        # Include all rows, even if they don't have section_id or section_text
        if pd.notna(row['section_id']) and pd.notna(row['section_text']) and str(row['section_text']).strip():
            sections_dict[row['section_id']] = row['section_text']
    return sections_dict

csv_files = glob.glob('../data/newData/*.csv')
csv_files2 = glob.glob('../data/test2/csv_cases/*.csv')
print(csv_files)
dfs = []

# First process files with sections to create a mapping of paragraphs to sections
sections_map = {}
for file in csv_files:
    df = pd.read_csv(file)
    file_name = os.path.splitext(os.path.basename(file))[0]
    print(file_name)
    df['para_id'] = df['para_id'].apply(lambda x: update_para_id(file_name, x))
    dfs.append(df)
    # Group by paragraphs and aggregate sections into a dictionary
    # sections_series = df.groupby('paragraphs').apply(collect_sections)
    # for para, sections in sections_series.items():
    #     sections_map[para] = sections

# # Then process all files to get complete dataset
# for file in csv_files2:
#     df = pd.read_csv(file)
#     file_name = os.path.splitext(os.path.basename(file))[0]
#     df['para_id'] = df['para_id'].apply(lambda x: update_para_id(file_name, x))
    
#     # Add sections from mapping, defaulting to empty dict if not found
#     df['sections'] = df['paragraphs'].map(lambda x: sections_map.get(x, {}))
    
    # dfs.append(df)

['../data/newData/ewhc_fam_2020_2339.csv', '../data/newData/ewcop_2020_45.csv', '../data/newData/ewhc_qb_2009_2362.csv', '../data/newData/ewhc_qb_2012_3151.csv', '../data/newData/ukpc_2012_27.csv', '../data/newData/ewca_civ_2007_268.csv', '../data/newData/ewca_civ_2023_219.csv', '../data/newData/ewhc_admin_2011_3684.csv', '../data/newData/ewca_crim_2010_547.csv', '../data/newData/ewca_crim_2003_3641.csv', '../data/newData/ewca_crim_2010_1474.csv', '../data/newData/ewca_crim_2011_578.csv', '../data/newData/ewhc_admin_2012_2280.csv', '../data/newData/ewcop_2022_14.csv', '../data/newData/ewhc_fam_2020_881.csv', '../data/newData/ewhc_admin_2006_493.csv', '../data/newData/ewhc_admin_2005_579.csv', '../data/newData/ewhc_ch_2011_2226.csv', '../data/newData/ewfc_2020_13.csv', '../data/newData/ewhc_admin_2012_1554.csv', '../data/newData/ewca_crim_2010_206.csv', '../data/newData/ewhc_scco_2023_1429.csv', '../data/newData/ewhc_admin_2013_1518.csv', '../data/newData/ewhc_comm_2022_894.csv', '../da

In [2]:
len(dfs)

31

In [3]:
df_combine = pd.concat(dfs)

In [4]:
df_combine.head(10)

Unnamed: 0,case_uri,para_id,paragraphs,references,application_of_law_phrases_actual,if_law_applied_actual,reason(optional),application_of_law_phrases.1,applied provision,act,legislative term,if_law_applied_gpt-4o-mini,application_of_law_phrases_gpt-4o-mini,reason_gpt-4o-mini,if_law_applied_gpt-4o,application_of_law_phrases_gpt-4o,reason_gpt-4o,sections
0,https://caselaw.nationalarchives.gov.uk/ewhc/c...,ewhc_ch_2021_324#para_1,\n\t \n\t 1. \n\t \n\t \n\t These proc...,"[{'text': 'Charities Act 2011', 'href': 'http:...",[],no,,,,,,0,[],The paragraph provides background information ...,0,[],The paragraph provides background information ...,{}
1,https://caselaw.nationalarchives.gov.uk/ewhc/c...,ewhc_ch_2021_324#para_2,\n\t \n\t 2. \n\t \n\t \n\t In summary...,[],[],no,,,,,,0,[],The paragraph discusses a dispute regarding a ...,0,[],The paragraph describes a factual dispute rega...,{'id/ukpga/2011/25_section_344': '344 Other mi...
2,https://caselaw.nationalarchives.gov.uk/ewhc/c...,ewhc_ch_2021_324#para_3,\n\t \n\t 3. \n\t \n\t \n\t The practi...,[],[],no,,,,,,0,[],The paragraph discusses the practical signific...,0,[],The paragraph discusses the practical implicat...,{'id/ukpga/2011/25_section_181': '181 Power to...
3,https://caselaw.nationalarchives.gov.uk/ewhc/c...,ewhc_ch_2021_324#para_4,\n\t \n\t 4. \n\t \n\t \n\t It is a ma...,[],[],no,,,,,,0,[],The paragraph discusses procedural delays and ...,0,[],The paragraph discusses procedural delays and ...,{'id/ukpga/2011/25_section_45B': '45B Power to...
4,https://caselaw.nationalarchives.gov.uk/ewhc/c...,ewhc_ch_2021_324#para_5,\n\t \n\t 5. \n\t \n\t \n\t The case...,[],[],no,,,,,,0,[],The paragraph provides procedural background a...,0,[],The paragraph provides procedural information ...,{'id/ukpga/2011/25_section_317': '317 Appeal f...
5,https://caselaw.nationalarchives.gov.uk/ewhc/c...,ewhc_ch_2021_324#para_6,"\n\t \n\t 6. \n\t \n\t \n\t In short, ...",[],[],no,,,,,,0,[],The paragraph does not contain any application...,0,[],The paragraph only states the outcome of the c...,{'id/ukpga/2011/25_section_217': '217 Constitu...
6,https://caselaw.nationalarchives.gov.uk/ewhc/c...,ewhc_ch_2021_324#para_7,\n\t \n\t 7. \n\t \n\t \n\t It is comm...,[],[],no,,,,,,1,['the constitution of a charitable unincorpora...,The paragraph applies legal principles regardi...,0,[],The paragraph discusses the general legal fram...,{'id/ukpga/2011/25_section_218': '218 Third pa...
7,https://caselaw.nationalarchives.gov.uk/ewhc/c...,ewhc_ch_2021_324#para_8,\n\t \n\t 8. \n\t \n\t \n\t As recorde...,[],[],no,,,,,,0,[],The paragraph provides historical context abou...,0,[],The paragraph provides historical and backgrou...,{'id/ukpga/2011/25_section_206': '206 Constitu...
8,https://caselaw.nationalarchives.gov.uk/ewhc/c...,ewhc_ch_2021_324#para_9,\n\t \n\t 9. \n\t \n\t \n\t The four a...,[],[],no,,,,,,0,[],The paragraph discusses the aims and objective...,0,[],The paragraph describes the aims and objective...,{'id/ukpga/2011/25_section_3': '3 Descriptions...
9,https://caselaw.nationalarchives.gov.uk/ewhc/c...,ewhc_ch_2021_324#para_10,\n\t \n\t 10. \n\t \n\t \n\t Clause ...,[],[],no,,,,,,0,[],The paragraph discusses the clauses of a membe...,0,[],The paragraph merely outlines clauses from a m...,{'id/ukpga/2011/25_section_217': '217 Constitu...


In [5]:
#df_combine = df_combine.drop_duplicates()
len(df_combine)

1152

In [10]:
import csv

# Clean the data first
df_clean = df_combine.copy()
for col in df_clean.select_dtypes(include=['object']).columns:
    df_clean[col] = df_clean[col].astype(str).str.replace('\n', ' ').str.replace('\r', ' ')

df_clean.to_csv('../data/test2/csvs_for_skip_phase_1/combined.csv', 
                index=False, 
                quoting=csv.QUOTE_ALL)

In [45]:
df_combine.columns

Index(['Unnamed: 0', 'case_uri', 'para_id', 'paragraphs', 'references',
       'application_of_law_phrases_actual', 'if_law_applied_actual',
       'reason(optional)', 'application_of_law_phrases.1', 'applied provision',
       'act', 'legislative term', 'if_law_applied_gpt-4o-mini',
       'application_of_law_phrases_gpt-4o-mini', 'reason_gpt-4o-mini',
       'if_law_applied_gpt-4o', 'application_of_law_phrases_gpt-4o',
       'reason_gpt-4o', 'sections'],
      dtype='object')

In [46]:
df_combine['if_law_applied_actual'].value_counts()

if_law_applied_actual
0    922
1    230
Name: count, dtype: int64

In [48]:
df_combine

Unnamed: 0.1,Unnamed: 0,case_uri,para_id,paragraphs,references,application_of_law_phrases_actual,if_law_applied_actual,reason(optional),application_of_law_phrases.1,applied provision,act,legislative term,if_law_applied_gpt-4o-mini,application_of_law_phrases_gpt-4o-mini,reason_gpt-4o-mini,if_law_applied_gpt-4o,application_of_law_phrases_gpt-4o,reason_gpt-4o,sections
0,0,https://caselaw.nationalarchives.gov.uk/ewhc/c...,ewhc_ch_2021_324#para_1,\n\t \n\t 1. \n\t \n\t \n\t These proc...,"[{'text': 'Charities Act 2011', 'href': 'http:...",[],0,,,,,,0,[],The paragraph provides background information ...,0,[],The paragraph provides background information ...,{}
1,1,https://caselaw.nationalarchives.gov.uk/ewhc/c...,ewhc_ch_2021_324#para_2,\n\t \n\t 2. \n\t \n\t \n\t In summary...,[],[],0,,,,,,0,[],The paragraph discusses a dispute regarding a ...,0,[],The paragraph describes a factual dispute rega...,{'id/ukpga/2011/25_section_344': '344 Other mi...
2,2,https://caselaw.nationalarchives.gov.uk/ewhc/c...,ewhc_ch_2021_324#para_3,\n\t \n\t 3. \n\t \n\t \n\t The practi...,[],[],0,,,,,,0,[],The paragraph discusses the practical signific...,0,[],The paragraph discusses the practical implicat...,"{'id/ukpga/2011/25_section_181': ""181 Power to..."
3,3,https://caselaw.nationalarchives.gov.uk/ewhc/c...,ewhc_ch_2021_324#para_4,\n\t \n\t 4. \n\t \n\t \n\t It is a ma...,[],[],0,,,,,,0,[],The paragraph discusses procedural delays and ...,0,[],The paragraph discusses procedural delays and ...,{'id/ukpga/2011/25_section_45B': '45B Power to...
4,4,https://caselaw.nationalarchives.gov.uk/ewhc/c...,ewhc_ch_2021_324#para_5,\n\t \n\t 5. \n\t \n\t \n\t The case...,[],[],0,,,,,,0,[],The paragraph provides procedural background a...,0,[],The paragraph provides procedural information ...,{'id/ukpga/2011/25_section_317': '317 Appeal f...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1147,47,https://caselaw.nationalarchives.gov.uk/ewfc/2...,ewfc_2025_41#para_48,\n\t \n\t 48. \n\t \n\t \n\t The route...,"[{'text': 'section 28(3)', 'href': 'http://www...",[],0,,,,,,1,['the route to the declaration sought by Mr J ...,This paragraph applies legal principles from s...,1,['The route to the declaration sought by Mr J ...,The paragraph applies section 28(3) of the sta...,{}
1148,48,https://caselaw.nationalarchives.gov.uk/ewfc/2...,ewfc_2025_41#para_49,\n\t \n\t 49. \n\t \n\t \n\t For compl...,[],[],0,,,,,,0,[],The paragraph states there is no public policy...,0,[],The paragraph merely states that there is no p...,{'id/ukpga/1986/55_section_63': '63 . . . . . ...
1149,49,https://caselaw.nationalarchives.gov.uk/ewfc/2...,ewfc_2025_41#para_50,\n\t \n\t 50. \n\t \n\t \n\t By my ord...,"[{'text': 'section 14', 'href': 'http://www.le...",[],0,,,,,,1,['I shall direct a court officer to send a cop...,The paragraph applies section 14 A of the Birt...,1,"['By my order, I shall direct a court officer ...",The paragraph demonstrates an application of l...,{'id_ukpga_Eliz2_1-2_20_section_14': '14 Re–re...
1150,50,https://caselaw.nationalarchives.gov.uk/ewfc/2...,ewfc_2025_41#para_51,\n\t \n\t 51. \n\t \n\t \n\t I have wr...,[],[],0,,,,,,0,[],The paragraph discusses a letter written for t...,0,[],The paragraph does not demonstrate an applicat...,{'id/ukpga/1984/42_section_35': '35 Considerat...


In [44]:
# Convert 'if_law_applied_actual' to 0,1
#df_combine['if_law_applied_actual'] = df_combine['if_law_applied_actual'].map({'no': 0, 'yes': 1})

# Measure precision, recall, and f-measure
from sklearn.metrics import precision_score, recall_score, f1_score


y_pred = df_combine['if_law_applied_gpt-4o']
y_true = df_combine['if_law_applied_actual']

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f"Precision: {precision}, Recall: {recall}, F1-score: {f1}")

y_pred = df_combine['if_law_applied_gpt-4o-mini']

precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f"Precision: {precision}, Recall: {recall}, F1-score: {f1}")


Precision: 0.49140893470790376, Recall: 0.6217391304347826, F1-score: 0.5489443378119002
Precision: 0.3159922928709056, Recall: 0.7130434782608696, F1-score: 0.4379172229639519


In [None]:
df_combine['populated'] = #if_law_applied_gpt-4o

In [43]:
import pandas as pd
df_combine =pd.read_csv('../data/test2/csvs_for_skip_phase_1/combined.csv',index_col=False)
df_combine['case_uri'].nunique()

31

In [9]:
from sklearn.model_selection import train_test_split

# Splitting the dataframe into training and testing sets based on case_uri
training_case_uris = df_combine['case_uri'].unique()[:22]
testing_case_uris = df_combine['case_uri'].unique()[22:31]

df_training = df_combine[df_combine['case_uri'].isin(training_case_uris)]
df_testing = df_combine[df_combine['case_uri'].isin(testing_case_uris)]

In [14]:
df_training.to_csv('../data/test2/csvs_for_skip_phase_1/training.csv',index=False)

In [15]:
df_testing.to_csv('../data/test2/csvs_for_skip_phase_1/testing.csv',index=False)

In [12]:
import pandas as pd
df_training=pd.read_csv('../data/test2/csvs_for_skip_phase_1/training.csv',index_col=False)
system_prompt2 = """
    You are analyzing paragraphs from UK case law to determine if they contain an application of law to specific facts.

    APPLICATION OF LAW DEFINITION:
    An application of law is where statutory legal provisions are applied to the specific facts of the case at hand. This goes beyond merely citing or discussing law in the abstract and without specific reference to the facts of the case at hand.


    INDICATORS OF APPLICATION OF LAW:
    1. The judge connects specific statutory legal provisions to the specific factual circumstances.  
    2. The text shows reasoning that explains how the law resolves or addresses the unique facts
    3. The paragraph contains the judge's analysis leading to a conclusion based on legal principles
    4. Legal tests or criteria are being applied to the case facts

    NOT APPLICATIONS OF LAW:
    1. Mere citations of statutes, cases, or legal principles without application to facts
    2. Background procedural information or case history
    3. Statements about jurisdiction or general legal explanations
    4. Summaries of arguments made by parties without judicial analysis
    5. Restatements of previous cases without connecting them to current facts

    For each paragraph, determine if it contains an application of law, identify the specific phrases showing application, and provide a brief explanation for your decision.

    return a valid json, no prefix like this '''json before or end
    """
# Prepare training data in JSONL format
import json

# Create a list to store the training examples
training_examples = []

# Iterate through each row in the training dataframe
for _, row in df_training.iterrows():
    # Create the messages structure for each example
    messages = [
        {"role": "system", "content": system_prompt2},
        {"role": "user", "content": f"Paragraph ID: {row['para_id']}\n\nText: {row['paragraphs']}"},
        {"role": "assistant", "content": json.dumps({
            "para_id": row['para_id'],
            "contains_application": bool(row['if_law_applied_actual']),
            "application_phrases": row['application_of_law_phrases_actual'] if isinstance(row['application_of_law_phrases_actual'], list) else [],
            "reason": row['reason(optional)'] if pd.notna(row['reason(optional)']) else ""
        })}
    ]
    
    # Add the example to our list
    training_examples.append({"messages": messages})

# Write the training examples to a JSONL file
with open('../data/test2/csvs_for_skip_phase_1/training_data.jsonl', 'w') as f:
    for example in training_examples:
        f.write(json.dumps(example) + '\n')


In [4]:
import pandas as pd
df_training=pd.read_csv('../data/test2/csvs_for_skip_phase_1/testing.csv',index_col=False)
system_prompt2 = """
    You are analyzing paragraphs from UK case law to determine if they contain an application of law to specific facts.

    APPLICATION OF LAW DEFINITION:
    An application of law is where statutory legal provisions are applied to the specific facts of the case at hand. This goes beyond merely citing or discussing law in the abstract and without specific reference to the facts of the case at hand.


    INDICATORS OF APPLICATION OF LAW:
    1. The judge connects specific statutory legal provisions to the specific factual circumstances.  
    2. The text shows reasoning that explains how the law resolves or addresses the unique facts
    3. The paragraph contains the judge's analysis leading to a conclusion based on legal principles
    4. Legal tests or criteria are being applied to the case facts

    NOT APPLICATIONS OF LAW:
    1. Mere citations of statutes, cases, or legal principles without application to facts
    2. Background procedural information or case history
    3. Statements about jurisdiction or general legal explanations
    4. Summaries of arguments made by parties without judicial analysis
    5. Restatements of previous cases without connecting them to current facts

    For each paragraph, determine if it contains an application of law, identify the specific phrases showing application, and provide a brief explanation for your decision.

    return a valid json, no prefix like this '''json before or end
    """
# Prepare training data in JSONL format
import json

# Create a list to store the training examples
training_examples = []

# Iterate through each row in the training dataframe
for _, row in df_training.iterrows():
    # Create the messages structure for each example
    messages = [
        {"role": "system", "content": system_prompt2},
        {"role": "user", "content": f"Paragraph ID: {row['para_id']}\n\nText: {row['paragraphs']}"},
        {"role": "assistant", "content": json.dumps({
            "para_id": row['para_id'],
            "contains_application": bool(row['if_law_applied_actual']),
            "application_phrases": row['application_of_law_phrases_actual'] if isinstance(row['application_of_law_phrases_actual'], list) else [],
            "reason": row['reason(optional)'] if pd.notna(row['reason(optional)']) else ""
        })}
    ]
    
    # Add the example to our list
    training_examples.append({"messages": messages})

# Write the training examples to a JSONL file
with open('../data/test2/csvs_for_skip_phase_1/validation_data.jsonl', 'w') as f:
    for example in training_examples:
        f.write(json.dumps(example) + '\n')

In [4]:
#FineTuning The OPenAI Model
from IPython.display import Markdown, display
from openai import OpenAI
import os
from dotenv import load_dotenv

load_dotenv('.env')


OPENAI_API_KEY= os.getenv("OPENAI_API_KEY")

openai_api_key = OPENAI_API_KEY

print(openai_api_key)

client = OpenAI(api_key=openai_api_key)

response = client.chat.completions.create(
  model="gpt-4o-mini",
  messages=[
    {"role": "system", "content": "You are a great philosopher."},
    {"role": "user", "content": "What is the meaning of life?"}
  ]
)
display(Markdown(response.choices[0].message.content))
# train_file = client.files.create(
#   file=open('../data/test2/csvs_for_skip_phase_1/training_data.jsonl', "rb"),
#   purpose="fine-tune"
# )

# valid_file = client.files.create(
#   file=open('../data/test2/csvs_for_skip_phase_1/validation_data.jsonl', "rb"),
#   purpose="fine-tune"
# )

# print(f"Training file Info: {train_file}")
# print(f"Validation file Info: {valid_file}")

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


The meaning of life has been a profound question that has sparked philosophical, spiritual, and scientific inquiry throughout history. Answers can vary greatly depending on cultural, religious, and personal beliefs. 

From a philosophical perspective, some argue that life’s meaning is subjective and each individual must create their own purpose. This can be achieved through relationships, creativity, personal growth, and contributions to society. Existentialists, for instance, propose that life inherently has no predetermined meaning, and it's up to individuals to find their own.

Religious or spiritual traditions often offer different interpretations. For example, many religious teachings suggest that life's meaning is connected to serving a higher power, living a virtuous life, or preparing for an afterlife.

In modern existential and scientific thought, some might argue that life's meaning is rooted in the pursuit of knowledge, happiness, and understanding our place in the universe. 

Ultimately, the meaning of life may not have a singular answer, but rather, it invites each person to explore, question, and define it for themselves. What are your thoughts on it?

In [7]:
model = client.fine_tuning.jobs.create(
  training_file=train_file.id, 
  validation_file=valid_file.id,
  model="gpt-4o-mini-2024-07-18", 
  hyperparameters={
    "n_epochs": 3,
	"learning_rate_multiplier": 0.05
  }
)
job_id = model.id
status = model.status

print(f'Fine-tuning model with jobID: {job_id}.')
print(f"Training Response: {model}")
print(f"Training Status: {status}")

Fine-tuning model with jobID: ftjob-NbqqLZQ6tuWXTcbB4lnPxeV4.
Training Response: FineTuningJob(id='ftjob-NbqqLZQ6tuWXTcbB4lnPxeV4', created_at=1747517066, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size='auto', learning_rate_multiplier=0.05, n_epochs=3), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-OsZ85QSzhdEd1hPN5XXItCTn', result_files=[], seed=445825256, status='validating_files', trained_tokens=None, training_file='file-UKVsUQatZF9d3tSs9VfUfj', validation_file='file-SSPuXui7JGPskRJ7cownjb', estimated_finish=None, integrations=[], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size='auto', learning_rate_multiplier=0.05, n_epochs=3)), type='supervised'), user_provided_suffix=None, metadata=None, usage_metrics=None, shared_with_openai=False, eval_id=None)
Training Status: validating_files


In [5]:
result = client.fine_tuning.jobs.list()


In [6]:
result

SyncCursorPage[FineTuningJob](data=[FineTuningJob(id='ftjob-NbqqLZQ6tuWXTcbB4lnPxeV4', created_at=1747517066, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-4o-mini-2024-07-18:swansea-university::BYK2HlnR', finished_at=1747519866, hyperparameters=Hyperparameters(batch_size=1, learning_rate_multiplier=0.05, n_epochs=3), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-OsZ85QSzhdEd1hPN5XXItCTn', result_files=['file-4GUie34C5QHwHcfH8E2aUM'], seed=445825256, status='succeeded', trained_tokens=1023672, training_file='file-UKVsUQatZF9d3tSs9VfUfj', validation_file='file-SSPuXui7JGPskRJ7cownjb', estimated_finish=None, integrations=[], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size=1, learning_rate_multiplier=0.05, n_epochs=3)), type='supervised'), user_provided_suffix=None, metadata=None, usage_metrics=None, shared_with_openai=False, eval_id=None), FineTuningJob(id='ftjob-7

In [7]:
fine_tuned_model = result.data[0].fine_tuned_model
print(fine_tuned_model)

ft:gpt-4o-mini-2024-07-18:swansea-university::BYK2HlnR


In [None]:
model = client.fine_tuning.jobs.create(
  training_file=train_file.id, 
  validation_file=valid_file.id,
  model="gpt-4o-mini-2024-07-18", 
  hyperparameters={
    "n_epochs": 3,
	"learning_rate_multiplier": 0.05
  }
)
job_id = model.id
status = model.status

print(f'Fine-tuning model with jobID: {job_id}.')
print(f"Training Response: {model}")
print(f"Training Status: {status}")

Fine-tuning model with jobID: ftjob-NbqqLZQ6tuWXTcbB4lnPxeV4.
Training Response: FineTuningJob(id='ftjob-NbqqLZQ6tuWXTcbB4lnPxeV4', created_at=1747517066, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size='auto', learning_rate_multiplier=0.05, n_epochs=3), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-OsZ85QSzhdEd1hPN5XXItCTn', result_files=[], seed=445825256, status='validating_files', trained_tokens=None, training_file='file-UKVsUQatZF9d3tSs9VfUfj', validation_file='file-SSPuXui7JGPskRJ7cownjb', estimated_finish=None, integrations=[], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size='auto', learning_rate_multiplier=0.05, n_epochs=3)), type='supervised'), user_provided_suffix=None, metadata=None, usage_metrics=None, shared_with_openai=False, eval_id=None)
Training Status: validating_files


In [None]:
model = client.fine_tuning.jobs.create(
  training_file=train_file.id, 
  validation_file=valid_file.id,
  model="gpt-4o-mini-2024-07-18", 
  hyperparameters={
    "n_epochs": 3,
	"learning_rate_multiplier": 0.05
  }
)
job_id = model.id
status = model.status

print(f'Fine-tuning model with jobID: {job_id}.')
print(f"Training Response: {model}")
print(f"Training Status: {status}")

Fine-tuning model with jobID: ftjob-NbqqLZQ6tuWXTcbB4lnPxeV4.
Training Response: FineTuningJob(id='ftjob-NbqqLZQ6tuWXTcbB4lnPxeV4', created_at=1747517066, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size='auto', learning_rate_multiplier=0.05, n_epochs=3), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-OsZ85QSzhdEd1hPN5XXItCTn', result_files=[], seed=445825256, status='validating_files', trained_tokens=None, training_file='file-UKVsUQatZF9d3tSs9VfUfj', validation_file='file-SSPuXui7JGPskRJ7cownjb', estimated_finish=None, integrations=[], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size='auto', learning_rate_multiplier=0.05, n_epochs=3)), type='supervised'), user_provided_suffix=None, metadata=None, usage_metrics=None, shared_with_openai=False, eval_id=None)
Training Status: validating_files


In [9]:
import json


# Read the fine-tuning validation file
with open('../data/test2/csvs_for_skip_phase_1/validation_data.jsonl', 'r') as file:
  validation_data = file.readlines()


role_system_messages = []
role_user_messages = []
role_assisstant_messages = []

for data in validation_data:
  # Convert the string to JSON
  validation_json = json.loads(data)

  # Extract the value of "messages"
  messages = validation_json["messages"]

  # Extract the messages
  role_system = messages[0]
  role_user = messages[1]
  role_assisstant = messages[2]

  role_system_messages.append(role_system)
  role_user_messages.append(role_user)
  role_assisstant_messages.append(role_assisstant)
  
# Print the original content from the validation file
print("Original content from validation file:")
print(validation_data[0])  # Assuming we print the first line for demonstration
print(type(validation_data[0]))
# Convert the string to JSON
validation_json = json.loads(validation_data[0])

# Extract the value of "messages"
messages = validation_json["messages"]

# Print the extracted messages
print("Extracted messages:")
role_system = messages[0]
print(role_system)
role_user = messages[1]
print(role_user)
role_assisstant = messages[2]
print(role_assisstant)


Original content from validation file:
{"messages": [{"role": "system", "content": "\n    You are analyzing paragraphs from UK case law to determine if they contain an application of law to specific facts.\n\n    APPLICATION OF LAW DEFINITION:\n    An application of law is where statutory legal provisions are applied to the specific facts of the case at hand. This goes beyond merely citing or discussing law in the abstract and without specific reference to the facts of the case at hand.\n\n\n    INDICATORS OF APPLICATION OF LAW:\n    1. The judge connects specific statutory legal provisions to the specific factual circumstances.  \n    2. The text shows reasoning that explains how the law resolves or addresses the unique facts\n    3. The paragraph contains the judge's analysis leading to a conclusion based on legal principles\n    4. Legal tests or criteria are being applied to the case facts\n\n    NOT APPLICATIONS OF LAW:\n    1. Mere citations of statutes, cases, or legal principle

In [15]:
len(all_paragraph_results)

1

In [14]:
all_paragraph_results[1]

IndexError: list index out of range

In [11]:
# Convert list of dictionaries to DataFrame
# Filter out None values and ensure all items are dictionaries before creating DataFrame
valid_results = [x for x in all_paragraph_results if x is not None and isinstance(x, dict)]
results_df = pd.DataFrame(valid_results)

# Handle the list in application_of_law_phrases column by joining elements with '|'
results_df['application_of_law_phrases'] = results_df['application_of_law_phrases'].apply(lambda x: '|'.join(x) if isinstance(x, list) else x)

# Display basic info about the DataFrame
print(f"DataFrame shape: {results_df.shape}")
print("\nColumns:", results_df.columns.tolist())
print("\nFirst few rows:")
print(results_df.head())

DataFrame shape: (1, 4)

Columns: ['para_id', 'if_law_applied', 'application_of_law_phrases', 'reason']

First few rows:
                    para_id  if_law_applied  \
0  ewca_civ_2015_414#para_1               1   

                          application_of_law_phrases  \
0  is the applicable time limit for service out o...   

                                              reason  
0  The paragraph discusses the application of spe...  


In [12]:
len(results_df)

1

In [13]:
results_df

Unnamed: 0,para_id,if_law_applied,application_of_law_phrases,reason
0,ewca_civ_2015_414#para_1,1,is the applicable time limit for service out o...,The paragraph discusses the application of spe...


In [27]:
# %%
import json
import pandas as pd
import os
from openAIHandler import getLegalClassifierChain, ModelSelector # Assuming ModelSelector is in openAIHandler

# Define file paths
notebook_dir = os.getcwd() # Assumes the notebook is run from the project root/src
input_csv_path = os.path.join("../data/test2/csvs_for_skip_phase_1/testing.csv")
# Define the fine-tuned model name
fine_tuned_model_name = "ft:gpt-4o-mini-2024-07-18:swansea-university::B3pbF9HD"

# Load the DataFrame
try:
    df_test = pd.read_csv(input_csv_path)
    print(f"Loaded data from {input_csv_path}")
    print(f"DataFrame shape: {df_test.shape}")
    print("DataFrame columns:", df_test.columns.tolist())
    print("First 5 rows:")
    print(df_test.head())
except FileNotFoundError:
    print(f"Error: Input file not found at {input_csv_path}")
    # Exit or handle error appropriately
    exit()
except Exception as e:
    print(f"Error loading CSV: {e}")
    # Exit or handle error appropriately
    exit()

# Prepare data for getLegalClassifierChain
# The function expects a list of dicts like [{'para_id': '...', 'para_content': '...'}]
# Ensure 'para_id' and 'paragraphs' columns exist and handle potential NaNs
required_cols = ['para_id', 'paragraphs', 'if_law_applied_actual']
if not all(col in df_test.columns for col in required_cols):
    print(f"Error: Input CSV must contain columns {required_cols}")
    exit()

case_law_data_list = []
# Group by case to maintain structure if needed by the chain, or just process paragraphs
# For getLegalClassifierChain which takes case_law_json_list directly:
for index, row in df_test.iterrows():
    if pd.notna(row['para_id']) and pd.notna(row['paragraphs']):
         case_law_data_list.append({
             'para_id': str(row['para_id']),
             'para_content': str(row['paragraphs'])
         })

if not case_law_data_list:
    print("Error: No valid paragraphs found in the input CSV.")
    exit()

print(f"\nPrepared {len(case_law_data_list)} paragraphs for analysis.")

# Define placeholder examples for few-shot learning (REPLACE WITH REAL EXAMPLES)
# The getLegalClassifierChain uses a FewShotChatMessagePromptTemplate
# Example structure based on the prompt in openAIHandler.py:
# ("human", "para_id: {para_id}\npara_content: {para_content}")
# ("ai", "para_id: {para_id}\nif_law_applied: {if_law_applied}\napplication_of_law_phrases: {application_of_law_phrases}")
# Read examples from JSON file
examples_path = os.path.join("../data/test2/examples.json")
try:
    with open(examples_path, 'r') as f:
        examples = json.load(f)
    print(f"Loaded {len(examples)} examples from {examples_path}")
except FileNotFoundError:
    print(f"Error: Examples file not found at {examples_path}")
    exit()
except json.JSONDecodeError:
    print(f"Error: Invalid JSON in examples file at {examples_path}")
    exit()
except Exception as e:
    print(f"Error loading examples: {e}")
    exit()
print(f"\nUsing {len(examples)} placeholder examples for few-shot learning. **REPLACE THESE!**")

examples = []
# Set the model type and get the chain
ModelSelector.MODEL_TYPE = "openai" # Ensure OpenAI is selected to use fine-tuned model
print(f"\nSet ModelSelector.MODEL_TYPE to '{ModelSelector.MODEL_TYPE}'")

# Get the classifier chain using the fine-tuned model name
try:
    # The getLegalClassifierChain takes examples and llm_type
    parser, classifier_chain = getLegalClassifierChain(examples=examples, llm_type=fine_tuned_model_name, temperature=0)
    print(f"Successfully created classifier chain using model '{fine_tuned_model_name}'")
except Exception as e:
    print(f"Error creating classifier chain: {e}")
    # Check if the error is due to the fine-tuned model not being found/accessible
    if "authentication" in str(e).lower() or "access" in str(e).lower() or "not found" in str(e).lower():
         print("Please ensure the fine-tuned model name is correct and your API key has access.")
    exit()


# Run the analysis
print("\nRunning analysis with the classifier chain...")
# The chain expects the input variable 'case_law' according to the getLegalClassifierUsingJson prompt structure,
# but the getLegalClassifierChain prompt structure uses 'para_id', 'para_content', and 'format_instructions'.
# Let's inspect getLegalClassifierChain again to be sure.
# Re-reading `getLegalClassifierChain` in openAIHandler.py:
# The final_prompt uses `("human", "para_id: {para_id}\npara_content: {para_content}\n{format_instructions}")`
# and the chain is `final_prompt | llm | parser`.
# This means the chain expects `para_id`, `para_content`, and `format_instructions` as input variables.
# It does NOT process the `case_law_json_list` as the `getLegalClassifierUsingJson` function does.
# This indicates getLegalClassifierChain is for processing one paragraph at a time.

# Let's adjust the plan to loop through paragraphs and call the chain for each one.
# We also need the format_instructions from the parser.

all_paragraph_results = []
format_instructions = parser.get_format_instructions()
print(f"\nAnalyzing {len(case_law_data_list)} paragraphs individually...")
# Create a JSON file to store results
results_file = "paragraph_analysis_results_without_examples.json"
results_list = []

for para_data in case_law_data_list:
    try:
        # Call the chain for each paragraph
        paragraph_analysis = classifier_chain.invoke({
            "para_id": para_data['para_id'],
            "para_content": para_data['para_content'],
            "format_instructions": format_instructions # Pass the required format instructions
        })
        all_paragraph_results.append(paragraph_analysis)
        
        # Append to results list and write to JSON file
        results_list.append(paragraph_analysis)
        with open(results_file, 'w') as f:
            json.dump(results_list, f, indent=2)
            
    except Exception as e:
        print(f"Error analyzing paragraph {para_data['para_id']}: {e}")
        # Append an error result or skip the paragraph
        error_result = {"para_id": para_data['para_id'], "error": str(e)}
        all_paragraph_results.append(error_result)
        results_list.append(error_result)
        with open(results_file, 'w') as f:
            json.dump(results_list, f, indent=2)
        
print("\nAnalysis complete. Results saved to", results_file)


Loaded data from ../data/test2/csvs_for_skip_phase_1/testing.csv
DataFrame shape: (287, 19)
DataFrame columns: ['Unnamed: 0', 'case_uri', 'para_id', 'paragraphs', 'references', 'application_of_law_phrases_actual', 'if_law_applied_actual', 'reason(optional)', 'application_of_law_phrases.1', 'applied provision', 'act', 'legislative term', 'if_law_applied_gpt-4o-mini', 'application_of_law_phrases_gpt-4o-mini', 'reason_gpt-4o-mini', 'if_law_applied_gpt-4o', 'application_of_law_phrases_gpt-4o', 'reason_gpt-4o', 'sections']
First 5 rows:
   Unnamed: 0                                           case_uri  \
0           0  https://caselaw.nationalarchives.gov.uk/ewca/c...   
1           2  https://caselaw.nationalarchives.gov.uk/ewca/c...   
2           4  https://caselaw.nationalarchives.gov.uk/ewca/c...   
3           8  https://caselaw.nationalarchives.gov.uk/ewca/c...   
4          11  https://caselaw.nationalarchives.gov.uk/ewca/c...   

                    para_id  \
0  ewca_civ_2015_414#p

In [32]:
results_df

Unnamed: 0,para_id,if_law_applied,application_of_law_phrases,reason
0,ewca_civ_2015_414#para_1,True,[applicable time limit for service out of the ...,The paragraph discusses the specific applicati...
1,ewca_civ_2015_414#para_2,False,[],The paragraph discusses the respondent's regis...
2,ewca_civ_2015_414#para_3,True,[how long is the period within which the claim...,The paragraph discusses the application of sec...
3,ewca_civ_2015_414#para_4,False,[],The paragraph contains citations of the Civil ...
4,ewca_civ_2015_414#para_5,True,[interpretation and application of the Civil P...,The paragraph discusses the interpretation and...
...,...,...,...,...
282,ewfc_2025_41#para_47,False,[],The paragraph does not provide any specific ap...
283,ewfc_2025_41#para_49,False,[],The paragraph does not connect any statutory l...
284,ewfc_2025_41#para_50,True,[section 14 A of the Births and Deaths Regist...,The paragraph applies the statutory provision ...
285,ewfc_2025_41#para_51,False,[],The paragraph does not contain any application...


In [31]:
len(results_df)

287

In [30]:
# Process results and merge with original DataFrame
# Convert analysis results to a DataFrame for easier merging
# Filter out None values and ensure all items are dictionaries before creating DataFrame
valid_results = [result for result in all_paragraph_results if result is not None and isinstance(result, dict)]
results_df = pd.DataFrame(valid_results)

# Rename 'para_id' in results_df to match original df for merging
results_df.rename(columns={'para_id': 'para_id'}, inplace=True) # Already same name, but good practice

# Merge results back to the original DataFrame using 'para_id'
# Using a left merge to keep all original rows, even if analysis failed or found no match
merged_df = df_test.merge(results_df, on='para_id', how='left', suffixes=('_actual', '_predicted'))


In [33]:
merged_df['if_law_applied_actual'] = merged_df['if_law_applied_actual'].map({'no': 0, 'yes': 1})

merged_df.head(5)

Unnamed: 0.1,Unnamed: 0,case_uri,para_id,paragraphs,references,application_of_law_phrases_actual,if_law_applied_actual,reason(optional),application_of_law_phrases.1,applied provision,...,if_law_applied_gpt-4o-mini,application_of_law_phrases_gpt-4o-mini,reason_gpt-4o-mini,if_law_applied_gpt-4o,application_of_law_phrases_gpt-4o,reason_gpt-4o,sections,if_law_applied,application_of_law_phrases,reason
0,0,https://caselaw.nationalarchives.gov.uk/ewca/c...,ewca_civ_2015_414#para_1,\n\t \n\t 1. \n\t \n\t \n\t This appea...,"[{'text': 'section 1139', 'href': 'http://www....",[],0,,,,...,0,[],The paragraph discusses the applicable time li...,0,[],The paragraph discusses a procedural question ...,,True,[applicable time limit for service out of the ...,The paragraph discusses the specific applicati...
1,2,https://caselaw.nationalarchives.gov.uk/ewca/c...,ewca_civ_2015_414#para_2,\n\t \n\t 2. \n\t \n\t \n\t In this ca...,"[{'text': 'section (1', 'href': 'http://www.le...",[],0,,,,...,0,[],The paragraph discusses the registration of a ...,0,[],The paragraph discusses the jurisdictional asp...,,False,[],The paragraph discusses the respondent's regis...
2,4,https://caselaw.nationalarchives.gov.uk/ewca/c...,ewca_civ_2015_414#para_3,\n\t \n\t 3. \n\t \n\t \n\t In \n\t ...,"[{'text': 'section 1139', 'href': 'http://www....",[],0,,,,...,1,['this court had to consider whether the prede...,The paragraph discusses the application of sec...,0,[],The paragraph discusses the interpretation of ...,,True,[how long is the period within which the claim...,The paragraph discusses the application of sec...
3,8,https://caselaw.nationalarchives.gov.uk/ewca/c...,ewca_civ_2015_414#para_4,\n\t \n\t 4. \n\t \n\t \n\t I now turn...,[],[],0,,,,...,0,[],The paragraph contains a citation of procedura...,0,[],The paragraph merely cites procedural rules fr...,,False,[],The paragraph contains citations of the Civil ...
4,11,https://caselaw.nationalarchives.gov.uk/ewca/c...,ewca_civ_2015_414#para_5,\n\t \n\t 5. \n\t \n\t \n\t The learne...,"[{'text': 'section 1139', 'href': 'http://www....",[],0,,,,...,0,[],The paragraph discusses the facts and procedur...,0,[],The paragraph discusses the interpretation and...,,True,[interpretation and application of the Civil P...,The paragraph discusses the interpretation and...


In [34]:
# print("Sample of raw results:", all_paragraph_results[:5]) # Optional: print sample raw results



# Clean up merged DataFrame columns (optional, depends on how you want the output)
# Remove redundant predicted columns if not needed, or select specific columns
# For evaluation, we need 'if_law_applied_actual' and 'if_law_applied_predicted'
# We might also compare 'application_of_law_phrases_actual' and 'application_of_law_phrases_predicted'
# Let's focus on 'if_law_applied' for precision/recall for now.

# Convert boolean predictions to consistent type (e.g., int 0/1 or string 'yes'/'no')
# Assuming 'if_law_applied_actual' is boolean or similar (check df.head())
# Assuming 'if_law_applied_predicted' from LLM is boolean
#merged_df['if_law_applied_actual'] = merged_df['if_law_applied_actual'].astype(bool)
# Fill potential NaN predictions (e.g., if LLM call failed) with False (or a specific indicator if needed)
merged_df['if_law_applied_predicted'] = merged_df['if_law_applied'].fillna(False).astype(bool)


# Calculate Precision, Recall, F1-Measure
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

# Drop rows where actual ground truth is missing if necessary for evaluation
# For 'if_law_applied', we'll assume all rows in the test set have a ground truth label.

y_true = merged_df['if_law_applied_actual']
y_pred = merged_df['if_law_applied_predicted']

# Handle cases where there might be no positive actuals or no positive predictions
# precision_score, recall_score, f1_score might raise errors in such cases.
# Add zero_division handling or check for edge cases.
# zero_division=1 means score is 1 if there are no predicted positives (and thus no true positives).
# zero_division=0 means score is 0 if there are no predicted positives but there are actual positives.
# Let's use zero_division=1 as it's often more appropriate for balanced evaluation when no positives are found.

try:
    precision = precision_score(y_true, y_pred, zero_division=1)
    recall = recall_score(y_true, y_pred, zero_division=1)
    f1 = f1_score(y_true, y_pred, zero_division=1)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    print("\nEvaluation Results:")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Measure: {f1:.4f}")
    print(f"Confusion Matrix:")
    print(f"  True Negative: {tn}")
    print(f"  False Positive: {fp}")
    print(f"  False Negative: {fn}")
    print(f"  True Positive: {tp}")

except Exception as e:
    print(f"\nError during evaluation: {e}")
    print("Could not calculate metrics. Please check the data and results.")

# Optional: Save the merged DataFrame with predictions for inspection
output_results_path = os.path.join(notebook_dir, "../data/test2/results/testing_analysis_with_predictions_no_example.csv")
os.makedirs(os.path.dirname(output_results_path), exist_ok=True)
merged_df.to_csv(output_results_path, index=False)
print(f"\nResults with predictions saved to {output_results_path}")

# You would need to do a separate, more complex evaluation to compare
# 'application_of_law_phrases_actual' and 'application_of_law_phrases_predicted'.
# This typically involves comparing lists of strings, which is not straightforward with standard metrics.


Evaluation Results:
Precision: 0.1921
Recall: 0.6905
F1-Measure: 0.3005
Confusion Matrix:
  True Negative: 143
  False Positive: 122
  False Negative: 13
  True Positive: 29

Results with predictions saved to /Users/apple/Documents/Swansea/Projects/Odyssey-Terms-Extraction-Journal/src/../data/test2/results/testing_analysis_with_predictions_no_example.csv


In [35]:
merged_df.columns

Index(['Unnamed: 0', 'case_uri', 'para_id', 'paragraphs', 'references',
       'application_of_law_phrases_actual', 'if_law_applied_actual',
       'reason(optional)', 'application_of_law_phrases.1', 'applied provision',
       'act', 'legislative term', 'if_law_applied_gpt-4o-mini',
       'application_of_law_phrases_gpt-4o-mini', 'reason_gpt-4o-mini',
       'if_law_applied_gpt-4o', 'application_of_law_phrases_gpt-4o',
       'reason_gpt-4o', 'sections', 'if_law_applied',
       'application_of_law_phrases', 'reason', 'if_law_applied_predicted'],
      dtype='object')

In [38]:
y_true = merged_df['if_law_applied_actual']
y_pred = merged_df['if_law_applied_gpt-4o']

# Handle cases where there might be no positive actuals or no positive predictions
# precision_score, recall_score, f1_score might raise errors in such cases.
# Add zero_division handling or check for edge cases.
# zero_division=1 means score is 1 if there are no predicted positives (and thus no true positives).
# zero_division=0 means score is 0 if there are no predicted positives but there are actual positives.
# Let's use zero_division=1 as it's often more appropriate for balanced evaluation when no positives are found.

try:
    precision = precision_score(y_true, y_pred, zero_division=1)
    recall = recall_score(y_true, y_pred, zero_division=1)
    f1 = f1_score(y_true, y_pred, zero_division=1)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    print("\nEvaluation Results:")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1-Measure: {f1:.4f}")
    print(f"Confusion Matrix:")
    print(f"  True Negative: {tn}")
    print(f"  False Positive: {fp}")
    print(f"  False Negative: {fn}")
    print(f"  True Positive: {tp}")

except Exception as e:
    print(f"\nError during evaluation: {e}")
    print("Could not calculate metrics. Please check the data and results.")

# Optional: Save the merged DataFrame with predictions for inspection
output_results_path = os.path.join(notebook_dir, "../data/test2/results/testing_analysis_with_predictions_no_example.csv")
os.makedirs(os.path.dirname(output_results_path), exist_ok=True)
merged_df.to_csv(output_results_path, index=False)
print(f"\nResults with predictions saved to {output_results_path}")



Evaluation Results:
Precision: 0.3200
Recall: 0.5714
F1-Measure: 0.4103
Confusion Matrix:
  True Negative: 214
  False Positive: 51
  False Negative: 18
  True Positive: 24

Results with predictions saved to /Users/apple/Documents/Swansea/Projects/Odyssey-Terms-Extraction-Journal/src/../data/test2/results/testing_analysis_with_predictions_no_example.csv


In [None]:
model = client.fine_tuning.jobs.create(
  training_file=train_file.id, 
  validation_file=valid_file.id,
  model="gpt-4o-mini-2024-07-18", 
  hyperparameters={
    "n_epochs": 3,
	"learning_rate_multiplier": 0.05
  }
)
job_id = model.id
status = model.status

print(f'Fine-tuning model with jobID: {job_id}.')
print(f"Training Response: {model}")
print(f"Training Status: {status}")

Fine-tuning model with jobID: ftjob-NbqqLZQ6tuWXTcbB4lnPxeV4.
Training Response: FineTuningJob(id='ftjob-NbqqLZQ6tuWXTcbB4lnPxeV4', created_at=1747517066, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size='auto', learning_rate_multiplier=0.05, n_epochs=3), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-OsZ85QSzhdEd1hPN5XXItCTn', result_files=[], seed=445825256, status='validating_files', trained_tokens=None, training_file='file-UKVsUQatZF9d3tSs9VfUfj', validation_file='file-SSPuXui7JGPskRJ7cownjb', estimated_finish=None, integrations=[], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size='auto', learning_rate_multiplier=0.05, n_epochs=3)), type='supervised'), user_provided_suffix=None, metadata=None, usage_metrics=None, shared_with_openai=False, eval_id=None)
Training Status: validating_files


In [None]:
model = client.fine_tuning.jobs.create(
  training_file=train_file.id, 
  validation_file=valid_file.id,
  model="gpt-4o-mini-2024-07-18", 
  hyperparameters={
    "n_epochs": 3,
	"learning_rate_multiplier": 0.05
  }
)
job_id = model.id
status = model.status

print(f'Fine-tuning model with jobID: {job_id}.')
print(f"Training Response: {model}")
print(f"Training Status: {status}")

Fine-tuning model with jobID: ftjob-NbqqLZQ6tuWXTcbB4lnPxeV4.
Training Response: FineTuningJob(id='ftjob-NbqqLZQ6tuWXTcbB4lnPxeV4', created_at=1747517066, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size='auto', learning_rate_multiplier=0.05, n_epochs=3), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-OsZ85QSzhdEd1hPN5XXItCTn', result_files=[], seed=445825256, status='validating_files', trained_tokens=None, training_file='file-UKVsUQatZF9d3tSs9VfUfj', validation_file='file-SSPuXui7JGPskRJ7cownjb', estimated_finish=None, integrations=[], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size='auto', learning_rate_multiplier=0.05, n_epochs=3)), type='supervised'), user_provided_suffix=None, metadata=None, usage_metrics=None, shared_with_openai=False, eval_id=None)
Training Status: validating_files


In [None]:
model = client.fine_tuning.jobs.create(
  training_file=train_file.id, 
  validation_file=valid_file.id,
  model="gpt-4o-mini-2024-07-18", 
  hyperparameters={
    "n_epochs": 3,
	"learning_rate_multiplier": 0.05
  }
)
job_id = model.id
status = model.status

print(f'Fine-tuning model with jobID: {job_id}.')
print(f"Training Response: {model}")
print(f"Training Status: {status}")

Fine-tuning model with jobID: ftjob-NbqqLZQ6tuWXTcbB4lnPxeV4.
Training Response: FineTuningJob(id='ftjob-NbqqLZQ6tuWXTcbB4lnPxeV4', created_at=1747517066, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size='auto', learning_rate_multiplier=0.05, n_epochs=3), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-OsZ85QSzhdEd1hPN5XXItCTn', result_files=[], seed=445825256, status='validating_files', trained_tokens=None, training_file='file-UKVsUQatZF9d3tSs9VfUfj', validation_file='file-SSPuXui7JGPskRJ7cownjb', estimated_finish=None, integrations=[], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size='auto', learning_rate_multiplier=0.05, n_epochs=3)), type='supervised'), user_provided_suffix=None, metadata=None, usage_metrics=None, shared_with_openai=False, eval_id=None)
Training Status: validating_files


In [None]:
model = client.fine_tuning.jobs.create(
  training_file=train_file.id, 
  validation_file=valid_file.id,
  model="gpt-4o-mini-2024-07-18", 
  hyperparameters={
    "n_epochs": 3,
	"learning_rate_multiplier": 0.05
  }
)
job_id = model.id
status = model.status

print(f'Fine-tuning model with jobID: {job_id}.')
print(f"Training Response: {model}")
print(f"Training Status: {status}")

Fine-tuning model with jobID: ftjob-NbqqLZQ6tuWXTcbB4lnPxeV4.
Training Response: FineTuningJob(id='ftjob-NbqqLZQ6tuWXTcbB4lnPxeV4', created_at=1747517066, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size='auto', learning_rate_multiplier=0.05, n_epochs=3), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-OsZ85QSzhdEd1hPN5XXItCTn', result_files=[], seed=445825256, status='validating_files', trained_tokens=None, training_file='file-UKVsUQatZF9d3tSs9VfUfj', validation_file='file-SSPuXui7JGPskRJ7cownjb', estimated_finish=None, integrations=[], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size='auto', learning_rate_multiplier=0.05, n_epochs=3)), type='supervised'), user_provided_suffix=None, metadata=None, usage_metrics=None, shared_with_openai=False, eval_id=None)
Training Status: validating_files


In [8]:
import os
import pandas as pd
def load_csv_files(folder_path):
    """Load all CSV files from a folder into a dictionary of DataFrames."""
    csv_files = {}
    for file in os.listdir(folder_path):
        if file.endswith('.csv'):
            file_path = os.path.join(folder_path, file)
            case_name = file.replace('.csv', '')
            try:
                csv_files[case_name] = pd.read_csv(file_path)
                print(f"Loaded {case_name} with {len(csv_files[case_name])} rows")
            except Exception as e:
                print(f"Error loading {file}: {str(e)}")
    return csv_files
input_folder_path =  '../data/test2/csv_cases/Experiment1-byPara'

case_files = load_csv_files(input_folder_path)
print(f"Loaded {len(case_files)} case files")

Loaded ewca_civ_2025_215 with 46 rows
Loaded ewhc_scco_2025_374 with 16 rows
Loaded ukftt_grc_2025_287 with 45 rows
Loaded ukftt_grc_2025_251 with 57 rows
Loaded ukftt_grc_2025_284 with 45 rows
Loaded ukftt_grc_2025_282 with 14 rows
Loaded ukftt_grc_2025_283 with 39 rows
Loaded eat_2025_29 with 57 rows
Loaded 8 case files


In [5]:
#read the csv files from data/test4/Full_case_experiments
#Attach the csvfile with para_id
from sklearn.metrics import precision_score, recall_score, f1_score

# Read the csv file with actual values
f1_list = []
base_input_folder = '../data/test2/csv_cases/Experiment1-byPara'
experiment_folder_path = '../data/test4/Full_case_experiments'

cases = list(case_files.keys())
for case_name in cases:
    base_input_file = os.path.join(base_input_folder, case_name)
    experiment_file = os.path.join(experiment_folder_path, case_name)

    base_df = pd.read_csv(base_input_file+'.csv',index_col=False)
    experiment_df = pd.read_csv(experiment_file+'.csv',index_col=False)
    
    # Merge the dataframes on 'para_id'
    merged_df = pd.merge(base_df[['para_id', 'if_law_applied_actual', 'if_law_applied_gpt-4o']],
                        experiment_df[['para_id', 'if_law_applied']],
                        on='para_id')
    
    # Create combined prediction where both models must predict 1
    merged_df['combined_prediction'] = ((merged_df['if_law_applied_gpt-4o'] == 1) & 
                                      (merged_df['if_law_applied'] == True)).astype(int)

    # Find false negatives (actual=1, predicted=0)
    false_negatives = merged_df[
        (merged_df['if_law_applied_actual'] == 1) & 
        (merged_df['combined_prediction'] == 0)
    ]

    # Find false positives (actual=0, predicted=1)
    false_positives = merged_df[
        (merged_df['if_law_applied_actual'] == 0) & 
        (merged_df['combined_prediction'] == 1)
    ]

    print("False Negatives (missed applications of law):")
    print(false_negatives[['para_id']])
    
    print("\ncase_name:", case_name)
    #measure precision, recall and f1 score using scikit-learn
    y_true = merged_df['if_law_applied_actual']
    y_pred = merged_df['combined_prediction']
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
    f1_list.append(f1)

NameError: name 'case_files' is not defined

In [22]:
def change_csv_file_name(csv_file_path):
    """
    Changes the name of a CSV file to a new name.

    Parameters:
    csv_file_path (str): The path to the original CSV file.
    new_name (str): The new name for the CSV file (without extension).
    """

    #read the csv file in pandas dataframe
    df = pd.read_csv(csv_file_path)
    # Extract the case_uri and para_id from the first row
    case_uri = df['case_uri'].iloc[0]
    
    #from     case_uri remove 'https://caselaw.nationalarchives.gov.uk/' and replace '/' with '_'
    filr_name = case_uri.replace('https://caselaw.nationalarchives.gov.uk/', '').replace('/', '_')
    # Save the csv file with the new name
    directory = os.path.dirname(csv_file_path)
    new_file_path = os.path.join(directory, filr_name + '.csv')
    df.to_csv(new_file_path, index=False)
    os.remove(csv_file_path)  # Remove the old file
    # Ensure the new file name is valid
for file in os.listdir('../data/newData'):
    filename = '../data/newData/'+file
    print(filename)
    change_csv_file_name(filename)

../data/newData/updated_new_case_20.csv
../data/newData/updated_new_case_34.csv
../data/newData/updated_new_case_35.csv
../data/newData/updated_new_case_21.csv
../data/newData/updated_new_case_46.csv
../data/newData/updated_new_case_47.csv
../data/newData/ewhc_fam_2016_2860.csv
../data/newData/updated_new_case_45.csv
../data/newData/updated_new_case_44.csv
../data/newData/updated_new_case_50.csv
../data/newData/ewhc_admin_2023_2088.csv
../data/newData/updated_new_case_40.csv
../data/newData/updated_new_case_41.csv
../data/newData/updated_new_case_43.csv
../data/newData/updated_new_case_42.csv
../data/newData/ewca_civ_2013_1096.csv
../data/newData/ewcop_2015_43.csv
../data/newData/updated_new_case_49.csv
../data/newData/updated_new_case_48.csv
../data/newData/ewhc_admin_2009_2940.csv
../data/newData/updated_new_case_38.csv
../data/newData/ewhc_admin_2017_2794.csv
../data/newData/updated_new_case_10.csv
../data/newData/ewhc_qb_2012_3162.csv
../data/newData/updated_new_case_39.csv
../data