In [44]:
import json

def combine_jsonl_files(input_files, output_file):
    """
    Combine multiple JSONL files into a single JSONL file.

    Args:
        input_files (list of str): List of input JSONL file paths.
        output_file (str): Output JSONL file path.
    """
    combined_records = []
    for file_path in input_files:
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if line:
                    try:
                        record = json.loads(line)
                        combined_records.append(record)
                    except json.JSONDecodeError as e:
                        print(f"Skipping invalid JSON in {file_path}: {e}")

    with open(output_file, 'w', encoding='utf-8') as out_f:
        for record in combined_records:
            out_f.write(json.dumps(record, ensure_ascii=False) + '\n')
    print(f"Combined {len(combined_records)} records into '{output_file}'.")

files_to_combine = [
    "../data/final_test/final/batch_2_requests_results.jsonl",
    "../data/final_test/final/remaining_dataset_results_part_01.jsonl",
    "../data/final_test/final/remaining_dataset_results_part_02.jsonl"
]
output_path = "../data/final_test/final/combined_dataset_output.jsonl"
combine_jsonl_files(files_to_combine, output_path)


FileNotFoundError: [Errno 2] No such file or directory: '../data/final_test/final/batch_2_requests_results.jsonl'

In [None]:
import json
import sys
import csv
import re
from typing import Dict, List, Tuple

def load_jsonl_file(filepath: str) -> List[dict]:
    """Load JSONL file and return list of JSON objects."""
    try:
        with open(filepath, 'r', encoding='utf-8') as file:
            return [json.loads(line.strip()) for line in file if line.strip()]
    except FileNotFoundError:
        print(f"Error: File '{filepath}' not found.")
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"Error: Invalid JSON in file '{filepath}': {e}")
        sys.exit(1)

def extract_input_data(input_records: List[dict]) -> Dict[str, Tuple[str, str, str, str]]:
    """Extract para_id, section_id, para_content, and section_text from input records."""
    input_data = {}
    
    for record in input_records:
        try:
            custom_id = record.get('custom_id')
            if not custom_id:
                continue
                
            # Extract para_id and section_id from the user message content
            messages = record.get('body', {}).get('messages', [])
            user_message = None
            
            for msg in messages:
                if msg.get('role') == 'user':
                    user_message = msg.get('content', '')
                    break
            
            if not user_message:
                continue
                
            # Parse para_id, section_id, para_content, and section_text from the content
            lines = user_message.split('\n')
            para_id = None
            section_id = None
            para_content = None
            section_text = None
            
            for line in lines:
                line = line.strip()
                if line.startswith('para_id:'):
                    para_id = line.split('para_id:')[1].strip()
                elif line.startswith('section_id:'):
                    section_id = line.split('section_id:')[1].strip()
                elif line.startswith('para_content:'):
                    para_content = line.split('para_content:')[1].strip()
                elif line.startswith('section_text:'):
                    section_text = line.split('section_text:')[1].strip()
            
            if para_id and section_id and para_content and section_text:
                input_data[custom_id] = (para_id, section_id, para_content, section_text)
                
        except Exception as e:
            print(f"Warning: Error processing input record with custom_id '{custom_id}': {e}")
            continue
    
    return input_data

def extract_output_data(output_records: List[dict]) -> Dict[str, Tuple[str, str, List[dict]]]:
    """Extract para_id, section_id, and extracted_phrases from output records."""
    output_data = {}
    
    for record in output_records:
        try:
            custom_id = record.get('custom_id')
            if not custom_id:
                continue
                
            # Extract the assistant's response content
            response_content = record.get('response', {}).get('body', {}).get('choices', [{}])[0].get('message', {}).get('content', '')
            
            if not response_content:
                continue
                
            # Parse JSON from the response content
            try:
                response_json = json.loads(response_content)
                para_id = response_json.get('para_id')
                section_id = response_json.get('section_id')
                extracted_phrases = response_json.get('extracted_phrases', [])
                
                if para_id and section_id:
                    output_data[custom_id] = (para_id, section_id, extracted_phrases)
                    
            except json.JSONDecodeError:
                print(f"Warning: Invalid JSON in response for custom_id '{custom_id}'")
                continue
                
        except Exception as e:
            print(f"Warning: Error processing output record with custom_id '{custom_id}': {e}")
            continue
    
    return output_data

def parse_legislation_id(section_id: str) -> str:
     # Default to returning the original section_id if parsing fails
    """
    Parse section_id to extract standardized legislation ID.
    Examples:
    - http://www.legislation.gov.uk/id/ukpga/1990/8/section/73_ -> 1990/8_73
    - http://www.legislation.gov.uk/id/ukpga/1990/8/section/73 -> 1990/8_73
    - id/ukpga/1990/8_section-191 -> 1990/8_191
    - id/ukpga/1989/41/section/31 -> 1989/41_31
    """
    try:
        # Remove trailing underscore if present
        section_id = section_id.rstrip('_')
        
        # Remove URL prefix if present
        if section_id.startswith('http://www.legislation.gov.uk/'):
            section_id = section_id.replace('http://www.legislation.gov.uk/', '')
        
        # Handle different formats
        if '/section/' in section_id:
            # Format: id/ukpga/1990/8/section/73
            parts = section_id.split('/')
            

    
            if len(parts) >= 5 and parts[0] == 'id' and parts[1] == 'ukpga':
                year = parts[2]
                chapter = parts[3]
                section = parts[5] if len(parts) > 5 else parts[4]
                # Remove any trailing underscore from section number
                section = section.rstrip('_')
                return f"{year}/{chapter}_{section}"
        
        elif '_section-' in section_id:
            # Format: id/ukpga/1990/8_section-191
            parts = section_id.split('/')
            if len(parts) >= 3 and parts[0] == 'id' and parts[1] == 'ukpga':
                year = parts[2]
                chapter_section = parts[3] if len(parts) > 3 else parts[2]
                if '_section-' in chapter_section:
                    return section_id
                    chapter, section = chapter_section.split('_section-')
                    # Remove any trailing underscore from section number
                    section = section.rstrip('_')
                    return f"{year}/{chapter}_{section}"
        
        # Fallback: try to extract year/chapter_section pattern
        match = re.search(r'(\d{4})/(\d+).*?(\d+)', section_id.rstrip('_'))
        if match:
            year, chapter, section = match.groups()
            return f"{year}/{chapter}_{section}"
            
        return section_id  # Return original if parsing fails
        
    except Exception:
        return section_id  # Return original if any error occurs

def create_standardized_act_id(section_id: str, section_number: str = "") -> str:
    """
    Create standardized act ID in format: year/chapter_section_subsection
    Example: 1989/41_section_91A
    """
    try:
        legislation_id = parse_legislation_id(section_id)
        if '_' in legislation_id:
            base_part = legislation_id.split('_')[0]  # e.g., "1989/41"
            section_part = legislation_id.split('_')[1] if '_' in legislation_id else ""
            return f"{base_part}_section_{section_part}"
        return f"{legislation_id}_section_{section_number}"
    except Exception:
        return section_id

def extract_para_id_from_full_id(para_id: str) -> str:
    """Extract just the paragraph ID from full identifier."""
    if '#' in para_id:
        return para_id.split('#')[-1]
    return para_id

def generate_caselaw_url(para_id: str) -> str:
    """
    Generate caselaw URL from para_id.
    Example: ewfc_b_2024_40#para_77 -> https://caselaw.nationalarchives.gov.uk/ewfc/b/2024/40
    """
    try:
        if '#' in para_id:
            case_id = para_id.split('#')[0]
        else:
            case_id = para_id
        
        # Convert underscores to forward slashes for URL
        url_path = case_id.replace('_', '/')
        return f"https://caselaw.nationalarchives.gov.uk/{url_path}"
    except Exception:
        return ""

def create_csv_from_valid_records(input_data: Dict, output_data: Dict, content_valid_custom_ids: List[str], output_csv: str):
    """Create CSV file with detailed information for content valid records."""
    
    csv_data = []
    
    for custom_id in content_valid_custom_ids:
        if custom_id not in input_data or custom_id not in output_data:
            continue
            
        # Get input data
        input_para_id, input_section_id, para_content, section_text = input_data[custom_id]
        
        # Get output data
        output_para_id, output_section_id, extracted_phrases = output_data[custom_id]
        
        # Generate URL and extract para_id
        url = generate_caselaw_url(input_para_id)
        para_id_only = extract_para_id_from_full_id(input_para_id)
        
        # Convert extracted_phrases to string for the case_term_phrases column
        case_term_phrases = json.dumps(extracted_phrases) if extracted_phrases else ""
        
        # Create a row for each extracted phrase
        if extracted_phrases:
            for phrase in extracted_phrases:
                legislation_id = parse_legislation_id(input_section_id)
                standardized_act_id = create_standardized_act_id(input_section_id)
                
                row = {
                    'url': url,
                    'para_id': para_id_only,
                    'paragraphs': para_content,
                    'case_term_phrases': case_term_phrases,
                    'section_id': legislation_id,
                    'section_text': section_text,
                    'case_term': phrase.get('case_law_excerpt', ''),
                    'legislation_term': phrase.get('legislation_excerpt', ''),
                    'confidence': phrase.get('confidence', ''),
                    'reasoning': phrase.get('reasoning', ''),
                    'key_phrases': phrase.get('legislation_excerpt', ''),  # Same as legislation_term
                    'standardized_act_id': standardized_act_id
                }
                csv_data.append(row)
        else:
            pass
            # If no extracted phrases, create one row with empty phrase data
            legislation_id = parse_legislation_id(input_section_id)
            standardized_act_id = create_standardized_act_id(input_section_id)
            
            row = {
                'url': url,
                'para_id': para_id_only,
                'paragraphs': para_content,
                'case_term_phrases': case_term_phrases,
                'legislation_id': legislation_id,
                'section_text': section_text,
                'case_term': '',
                'legislation_term': '',
                'confidence': '',
                'reasoning': '',
                'key_phrases': '',
                'standardized_act_id': standardized_act_id
            }
            csv_data.append(row)
    
    # Write CSV file
    if csv_data:
        fieldnames = [
            'url', 'para_id', 'paragraphs', 'case_term_phrases', 'legislation_id',
            'section_text', 'case_term', 'legislation_term', 'confidence', 
            'reasoning', 'key_phrases', 'standardized_act_id'
        ]
        
        with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(csv_data)
        
        print(f"\nCSV file created: {output_csv}")
        print(f"Total rows written: {len(csv_data)}")
        print(f"Records processed: {len(content_valid_custom_ids)}")
    else:
        print("\nNo valid data to write to CSV")

def validate_extracted_phrases(custom_id: str, extracted_phrases: List[dict], para_content: str, section_text: str) -> Tuple[bool, List[str]]:
    """
    Validate that extracted phrases actually exist in the source texts.
    Returns (is_valid, list_of_errors)
    """
    errors = []
    is_valid = True
    
    if not extracted_phrases:
        return True, []  # No phrases to validate
    
    for i, phrase in enumerate(extracted_phrases):
        case_law_excerpt = (phrase.get('case_law_excerpt') or 
                                  phrase.get('case_law_term') or 
                                  phrase.get('caselaw_term') or
                                  phrase.get('case_law_phrase') or
                                  phrase.get('case_law'))
                
        legislation_excerpt = (phrase.get('legislation_excerpt') or 
                                    phrase.get('legislation_term') or 
                                    phrase.get('legislation_phrase') or
                                    phrase.get('legislation'))
        
        # Check if case_law_excerpt exists in para_content
        if case_law_excerpt and case_law_excerpt not in para_content:
            errors.append(f"Phrase {i+1}: case_law_excerpt '{case_law_excerpt}' not found in paragraph content")
            is_valid = False
        
        # Check if legislation_excerpt exists in section_text
        if legislation_excerpt and legislation_excerpt not in section_text:
            errors.append(f"Phrase {i+1}: legislation_excerpt '{legislation_excerpt}' not found in section text")
            is_valid = False
    
    return is_valid, errors

def validate_records(input_file: str, output_file: str) -> Tuple[List[str], List[str], List[str], List[str]]:
    """Validate that para_id and section_id match between input and output files, and validate extracted phrases."""
    
    print("Loading input file...")
    input_records = load_jsonl_file(input_file)
    print(f"Loaded {len(input_records)} input records")
    
    print("Loading output file...")
    output_records = load_jsonl_file(output_file)
    print(f"Loaded {len(output_records)} output records")
    
    print("Extracting data from input records...")
    input_data = extract_input_data(input_records)
    print(f"Extracted data from {len(input_data)} input records")
    
    print("Extracting data from output records...")
    output_data = extract_output_data(output_records)
    print(f"Extracted data from {len(output_data)} output records")
    
    valid_custom_ids = []
    invalid_custom_ids = []
    content_valid_custom_ids = []
    content_invalid_custom_ids = []
    
    # Check each custom_id that appears in both input and output
    all_custom_ids = set(input_data.keys()) | set(output_data.keys())
    
    for custom_id in all_custom_ids:
        if custom_id not in input_data:
            invalid_custom_ids.append(f"{custom_id} (missing in input)")
            continue
            
        if custom_id not in output_data:
            invalid_custom_ids.append(f"{custom_id} (missing in output)")
            continue
            
        input_para_id, input_section_id, para_content, section_text = input_data[custom_id]
        output_para_id, output_section_id, extracted_phrases = output_data[custom_id]
        
        # First check if IDs match
        if input_para_id == output_para_id and input_section_id == output_section_id:
            valid_custom_ids.append(custom_id)
            
            # For valid IDs, also check if extracted phrases are actually from the source texts
            phrases_valid, phrase_errors = validate_extracted_phrases(custom_id, extracted_phrases, para_content, section_text)
            
            if phrases_valid:
                content_valid_custom_ids.append(custom_id)
            else:
                error_details = "; ".join(phrase_errors)
                content_invalid_custom_ids.append(f"{custom_id} (Content errors: {error_details})")
        else:
            invalid_custom_ids.append(f"{custom_id} (para_id: '{input_para_id}' -> '{output_para_id}', section_id: '{input_section_id}' -> '{output_section_id}')")
    
    return valid_custom_ids, invalid_custom_ids, content_valid_custom_ids, content_invalid_custom_ids

def main():
    """Main function to run the validation."""
    input_file = '../data/final_test/final/extraction-gpt4o-mini-swansea/input_batches/combined_dataset_input.jsonl'
    output_file = '../data/final_test/final/extraction-gpt4o-mini-swansea/output_batches/combined_dataset_output.jsonl'

    output_csv = '../data/final_test/final/extraction-gpt4o-mini-swansea/combined_dataset_output_FeINAL.csv'
    
    
    print("=" * 60)
    print("JSONL VALIDATION SCRIPT")
    print("=" * 60)
    
    print("=" * 60)
    print("JSONL VALIDATION SCRIPT")
    print("=" * 60)
    
    valid_custom_ids, invalid_custom_ids, content_valid_custom_ids, content_invalid_custom_ids = validate_records(input_file, output_file)
    
    print("\n" + "=" * 60)
    print("VALIDATION RESULTS")
    print("=" * 60)
    
    print(f"\n1. ID Validation Results:")
    print("=" * 30)
    
    print(f"\nValid custom_ids (matching IDs) ({len(valid_custom_ids)}):")
    print("-" * 40)
    if valid_custom_ids:
        for custom_id in sorted(valid_custom_ids):
            print(f"✓ {custom_id}")
    else:
        print("No valid custom_ids found")
    
    print(f"\nInvalid custom_ids (ID mismatch/missing) ({len(invalid_custom_ids)}):")
    print("-" * 40)
    if invalid_custom_ids:
        for custom_id in sorted(invalid_custom_ids):
            print(f"✗ {custom_id}")
    else:
        print("No invalid custom_ids found")
    
    print(f"\n2. Content Validation Results (for records with valid IDs):")
    print("=" * 30)
    
    print(f"\nContent valid custom_ids (extracted phrases match source) ({len(content_valid_custom_ids)}):")
    print("-" * 40)
    if content_valid_custom_ids:
        for custom_id in sorted(content_valid_custom_ids):
            print(f"✓ {custom_id}")
    else:
        print("No content valid custom_ids found")
    
    print(f"\nContent invalid custom_ids (extracted phrases don't match source) ({len(content_invalid_custom_ids)}):")
    print("-" * 40)
    if content_invalid_custom_ids:
        for custom_id in sorted(content_invalid_custom_ids):
            print(f"✗ {custom_id}")
    else:
        print("No content invalid custom_ids found")
    
    print(f"\n3. Summary:")
    print("=" * 30)
    print(f"Total records processed: {len(valid_custom_ids) + len(invalid_custom_ids)}")
    print(f"ID validation - Valid: {len(valid_custom_ids)}, Invalid: {len(invalid_custom_ids)}")
    print(f"Content validation - Valid: {len(content_valid_custom_ids)}, Invalid: {len(content_invalid_custom_ids)}")
    print(f"Overall success rate: {len(content_valid_custom_ids)}/{len(valid_custom_ids) + len(invalid_custom_ids)} ({(len(content_valid_custom_ids)/(len(valid_custom_ids) + len(invalid_custom_ids))*100):.1f}%)" if (len(valid_custom_ids) + len(invalid_custom_ids)) > 0 else "No records to process")
    
    # Generate CSV for content valid records
    if content_valid_custom_ids:
        print(f"\n4. Generating CSV for valid records...")
        print("=" * 30)
        
        # Re-extract data for CSV generation (we need the full data)
        input_records = load_jsonl_file(input_file)
        output_records = load_jsonl_file(output_file)
        input_data = extract_input_data(input_records)
        output_data = extract_output_data(output_records)
        
        create_csv_from_valid_records(input_data, output_data, content_valid_custom_ids, output_csv)
    else:
        print(f"\n4. No valid records to export to CSV")
        print("=" * 30)

if __name__ == "__main__":
    main()

JSONL VALIDATION SCRIPT
JSONL VALIDATION SCRIPT
Loading input file...
Loaded 17470 input records
Loading output file...
Loaded 17470 output records
Extracting data from input records...
Extracted data from 17470 input records
Extracting data from output records...
Extracted data from 17463 output records

VALIDATION RESULTS

1. ID Validation Results:

Valid custom_ids (matching IDs) (17461):
----------------------------------------
✓ request_1
✓ request_10
✓ request_100
✓ request_1000
✓ request_10000
✓ request_10001
✓ request_10002
✓ request_10003
✓ request_10004
✓ request_10005
✓ request_10006
✓ request_10007
✓ request_10008
✓ request_10009
✓ request_1001
✓ request_10010
✓ request_10011
✓ request_10012
✓ request_10013
✓ request_10014
✓ request_10015
✓ request_10016
✓ request_10017
✓ request_10018
✓ request_10019
✓ request_1002
✓ request_10020
✓ request_10021
✓ request_10022
✓ request_10023
✓ request_10024
✓ request_10025
✓ request_10026
✓ request_10027
✓ request_10028
✓ request_10029


ValueError: dict contains fields not in fieldnames: 'section_id'

In [5]:
import pandas as pd

df_DEEP_SEEK = pd.read_csv('../data/final_test/final/extraction-deepseek/combined_all_output_deepseek_FINAL.csv')

df_DEEP_SEEK.columns

df_DEEP_SEEK.head(2)

Unnamed: 0,custom_id,para_id,paragraph_text,section_text,section_id,thinking,extracted_phrases,reason,llm_para_id,llm_section_id
0,request_5,eat_2022_192#para_21,21. The claimant continued to be signed off wo...,16 Gender reassignment discrimination: cases o...,id/ukpga/2010/15_section-16,"Okay, let's tackle this problem step by step. ...",[],The case law paragraph discusses depressive sy...,eat_2022_192#para_21,id/ukpga/2010/15_section-16
1,request_4,eat_2022_192#para_18,"18. Subsequently, the claimant was referred to...",Part 1 Introductory Preliminary 1 This Schedul...,id/ukpga/2010/15_schedule-8-part-1,"Okay, so I need to figure out which part of th...","[{""case_law_excerpt"": ""he did not consider any...",The paragraph applies the legislative provisio...,eat_2022_192#para_18,id/ukpga/2010/15_schedule-8-part-1


In [6]:
rows_matching_para_id = df_DEEP_SEEK[df_DEEP_SEEK['para_id'] == df_DEEP_SEEK['llm_para_id']]
rows_matching_para_id.head()


Unnamed: 0,custom_id,para_id,paragraph_text,section_text,section_id,thinking,extracted_phrases,reason,llm_para_id,llm_section_id
0,request_5,eat_2022_192#para_21,21. The claimant continued to be signed off wo...,16 Gender reassignment discrimination: cases o...,id/ukpga/2010/15_section-16,"Okay, let's tackle this problem step by step. ...",[],The case law paragraph discusses depressive sy...,eat_2022_192#para_21,id/ukpga/2010/15_section-16
1,request_4,eat_2022_192#para_18,"18. Subsequently, the claimant was referred to...",Part 1 Introductory Preliminary 1 This Schedul...,id/ukpga/2010/15_schedule-8-part-1,"Okay, so I need to figure out which part of th...","[{""case_law_excerpt"": ""he did not consider any...",The paragraph applies the legislative provisio...,eat_2022_192#para_18,id/ukpga/2010/15_schedule-8-part-1
2,request_16,eat_2022_192#para_39,"39. Dealing with the section 15 claim, the ET ...",130 Section 129: supplementary (1) This sectio...,id/ukpga/2010/15_section-130,"Alright, I need to figure out how to apply the...","[{""case_law_excerpt"": ""sickness absence"", ""leg...",The paragraph applies the concept of incapacit...,eat_2022_192#para_39,id/ukpga/2010/15_section-130
3,request_2,eat_2022_192#para_12,"12. In April 2016, the claimant was moved into...",Part 3 Limitations on the duty Lack of knowled...,id/ukpga/2010/15_schedule-8-part-3,"Alright, let me try to work through this probl...","[{""case_law_excerpt"": ""he never completed or r...",The case applies the legislative provision by ...,eat_2022_192#para_12,id/ukpga/2010/15_schedule-8-part-3
4,request_15,eat_2022_192#para_39,"39. Dealing with the section 15 claim, the ET ...",16 Gender reassignment discrimination: cases o...,id/ukpga/2010/15_section-16,"Okay, so I need to figure out how to apply the...","[{""case_law_excerpt"": ""stress and depressive s...",The case applies the legislative framework by ...,eat_2022_192#para_39,id/ukpga/2010/15_section-16


In [7]:
rows_matching_para_id.para_id.nunique()

9002

In [8]:
import ast

# Count how many extracted_phrases are empty lists
empty_count = rows_matching_para_id['extracted_phrases'].apply(lambda x: isinstance(ast.literal_eval(x), list) and len(ast.literal_eval(x)) == 0).sum()
print(f"Number of empty lists in extracted_phrases: {empty_count}")
#get the df with extrc

#get the df with extracted_phrases that are not empty lists
df_not_empty = rows_matching_para_id[rows_matching_para_id['extracted_phrases'].apply(lambda x: isinstance(ast.literal_eval(x), list) and len(ast.literal_eval(x)) != 0)]
import ast

# Parse and unpack the extracted_phrases list into a DataFrame
def unpack_extracted_phrases(row):
    phrases = ast.literal_eval(row['extracted_phrases'])
    if isinstance(phrases, list) and len(phrases) > 0:
        # Attach the parent row's info to each phrase
        return [
            {**phrase, **{col: row[col] for col in df_not_empty.columns if col != 'extracted_phrases'}}
            for phrase in phrases
        ]
    else:
        return []

# Flatten all extracted phrases into a new DataFrame
unpacked = []
for idx, row in df_not_empty.iterrows():
    unpacked.extend(unpack_extracted_phrases(row))

df_phrases = pd.DataFrame(unpacked)
df_phrases.head(2)

#get the df with extracted_phrases that are not empty lists

Number of empty lists in extracted_phrases: 4320


Unnamed: 0,case_law_excerpt,legislation_excerpt,confidence,reasoning,custom_id,para_id,paragraph_text,section_text,section_id,thinking,reason,llm_para_id,llm_section_id
0,he did not consider any adjustments were neces...,"A must comply with the first, second and third...",High,The case directly applies the legislative requ...,request_4,eat_2022_192#para_18,"18. Subsequently, the claimant was referred to...",Part 1 Introductory Preliminary 1 This Schedul...,id/ukpga/2010/15_schedule-8-part-1,"Okay, so I need to figure out which part of th...",The paragraph applies the legislative provisio...,eat_2022_192#para_18,id/ukpga/2010/15_schedule-8-part-1
1,sickness absence,incapacity case,High,"The case law discusses sickness absence, which...",request_16,eat_2022_192#para_39,"39. Dealing with the section 15 claim, the ET ...",130 Section 129: supplementary (1) This sectio...,id/ukpga/2010/15_section-130,"Alright, I need to figure out how to apply the...",The paragraph applies the concept of incapacit...,eat_2022_192#para_39,id/ukpga/2010/15_section-130


In [12]:
# Create a column 'is_valid' where:
# - case_law_excerpt is in paragraph_text
# - and legislation_excerpt is in section_text
# If both are True, then is_valid is True, else False

def check_valid(row):
    case_ok = str(row.get('case_law_excerpt', '')) in str(row.get('paragraph_text', ''))
    legis_ok = str(row.get('legislation_excerpt', '')) in str(row.get('section_text', ''))
    return case_ok and legis_ok

df_phrases['is_valid'] = df_phrases.apply(check_valid, axis=1)


In [13]:
df_phrases.is_valid.value_counts()

is_valid
True     12180
False     7182
Name: count, dtype: int64

In [14]:
df_phrases_valid = df_phrases[df_phrases['is_valid'] == True]
df_phrases_valid.head(2)


Unnamed: 0,case_law_excerpt,legislation_excerpt,confidence,reasoning,custom_id,para_id,paragraph_text,section_text,section_id,thinking,reason,llm_para_id,llm_section_id,is_valid
0,he did not consider any adjustments were neces...,"A must comply with the first, second and third...",High,The case directly applies the legislative requ...,request_4,eat_2022_192#para_18,"18. Subsequently, the claimant was referred to...",Part 1 Introductory Preliminary 1 This Schedul...,id/ukpga/2010/15_schedule-8-part-1,"Okay, so I need to figure out which part of th...",The paragraph applies the legislative provisio...,eat_2022_192#para_18,id/ukpga/2010/15_schedule-8-part-1,True
1,sickness absence,incapacity case,High,"The case law discusses sickness absence, which...",request_16,eat_2022_192#para_39,"39. Dealing with the section 15 claim, the ET ...",130 Section 129: supplementary (1) This sectio...,id/ukpga/2010/15_section-130,"Alright, I need to figure out how to apply the...",The paragraph applies the concept of incapacit...,eat_2022_192#para_39,id/ukpga/2010/15_section-130,True


In [15]:
df_phrases_valid = df_phrases_valid.drop_duplicates(subset=['para_id', 'section_id', 'case_law_excerpt', 'legislation_excerpt'])

In [18]:
df_phrases_valid.confidence.value_counts(normalize=True) * 100

confidence
High           60.049342
Medium         32.467105
Low             7.442434
None            0.032895
Medium-High     0.008224
Name: proportion, dtype: float64

In [59]:
import pandas as pd
output_csv_openai = '../data/final_test/final/extraction-gpt4o-mini-swansea/combined_dataset_output_gpt4o_mini_FINAL2.csv'
# Load the CSV
df_open_Ai = pd.read_csv(output_csv_openai)

In [60]:
df_open_Ai.columns

Index(['custom_id', 'para_id', 'section_id', 'paragraph_text', 'section_text',
       'extracted_phrases', 'reason'],
      dtype='object')

In [61]:
import ast

# Keep only rows where case_term_phrases is not an empty list after parsing with ast.literal_eval
df_open_Ai = df_open_Ai[df_open_Ai['extracted_phrases'].apply(lambda x: len(ast.literal_eval(x)) > 0 if pd.notnull(x) else False)]
len(df_open_Ai)

13742

In [62]:
df_not_empty_df_open_Ai = df_open_Ai[df_open_Ai['extracted_phrases'].apply(lambda x: isinstance(ast.literal_eval(x), list) and len(ast.literal_eval(x)) != 0)]
# Flatten all extracted phrases into a new DataFrame
unpacked_openai = []
for idx, row in df_not_empty_df_open_Ai.iterrows():
    unpacked_openai.extend(unpack_phrases(row))


df_open_Ai = pd.DataFrame(unpacked_openai)
df_open_Ai.head(2)

Unnamed: 0,case_law_excerpt,legislation_excerpt,confidence,reasoning,custom_id,para_id,section_text,section_id,paragraph_text,reason
0,pre-employment health questionnaire was sent t...,A person (A) to whom an application for work i...,High,The pre-employment health questionnaire direct...,request_1,eat_2022_192#para_12,60 Enquiries about disability and health (1) A...,id/ukpga/2010/15_section-60,"12. In April 2016, the claimant was moved into...",The case law reflects on disability inquiries ...
1,claimant ticked the relevant box to state he h...,whether or not a person has a disability is to...,High,The claimant's indication of disability is con...,request_1,eat_2022_192#para_12,60 Enquiries about disability and health (1) A...,id/ukpga/2010/15_section-60,"12. In April 2016, the claimant was moved into...",The case law reflects on disability inquiries ...


In [63]:
df_open_Ai.dropna(subset=['case_law_excerpt'], inplace=True)

In [64]:
df_open_Ai['is_valid'] = df_open_Ai.apply(lambda row: str(row.get('case_law_excerpt', '')) in str(row.get('paragraph_text', '')) and str(row.get('legislation_excerpt', '')) in str(row.get('section_text', '')), axis=1)

In [65]:
df_open_Ai = df_open_Ai[df_open_Ai['is_valid'] == True]

In [66]:
len(df_open_Ai)

15321

In [67]:
df_open_Ai.para_id.nunique()

7500

In [68]:
df_open_Ai.drop_duplicates(subset=['para_id','case_law_excerpt', 'legislation_excerpt'], inplace=True)

In [69]:
len(df_open_Ai)

15207

In [71]:
df_open_Ai.confidence.value_counts(normalize=True) * 100

confidence
High      73.763483
Medium    25.743225
Low        0.493291
Name: proportion, dtype: float64

In [1]:
# Create a boolean mask for duplicates based on the specified columns
df_open_Ai_mask = df_open_Ai.duplicated(subset=['para_id', 'case_law_excerpt', 'legislation_excerpt'], keep=False)

# DataFrame with duplicate values
df_duplicates_openai = df_open_Ai[df_open_Ai_mask].copy()

# DataFrame without duplicate values
df_no_duplicates_openai = df_open_Ai[~df_open_Ai_mask].copy()

print(f"Number of rows with duplicates: {len(df_duplicates_openai)}")
print(f"Number of rows without duplicates: {len(df_no_duplicates_openai)}")


NameError: name 'df_open_Ai' is not defined

In [35]:
df_open_Ai.columns

Index(['url', 'para_id', 'paragraphs', 'case_term_phrases', 'legislation_id',
       'section_text', 'case_term', 'legislation_term', 'confidence',
       'reasoning', 'key_phrases', 'standardized_act_id', 'standard_para_id'],
      dtype='object')

In [36]:
df_open_Ai.drop_duplicates(subset=['standard_para_id','legislation_id'], inplace=True)

In [88]:
len(df_open_Ai)

15207

In [76]:
import pandas as pd
df_llama = pd.read_csv('../data/final_test/final/extraction-llama/combined_all_output_llama_FINAL.csv')
df_llama.head(2)

Unnamed: 0,custom_id,para_id,section_text,section_id,paragraph_text,extracted_phrases,reason
0,request_1,eat_2022_192#para_12,60 Enquiries about disability and health (1) A...,id/ukpga/2010/15_section-60,"12. In April 2016, the claimant was moved into...","[{""case_law_excerpt"": ""the claimant ticked the...",The paragraph discusses the claimant's disclos...
1,request_2,eat_2022_192#para_12,Part 3 Limitations on the duty Lack of knowled...,id/ukpga/2010/15_schedule-8-part-3,"12. In April 2016, the claimant was moved into...","[{""case_law_excerpt"": ""did not suffer any subs...",The paragraph indirectly applies the concept o...


In [77]:
import ast

# Check which rows in df have non-empty extracted_phrases
df_non_empty_llama = df_llama[df_llama['extracted_phrases'].apply(lambda x: isinstance(ast.literal_eval(x), list) and len(ast.literal_eval(x)) > 0)]
print(f"Number of rows with non-empty extracted_phrases: {len(df_non_empty_llama)}")
df_non_empty_llama.head(2)
#Unpack extracted_phrases into a flat DataFrame
def unpack_phrases(row):
    phrases = ast.literal_eval(row['extracted_phrases'])
    if isinstance(phrases, list) and len(phrases) > 0:
        # Map equivalent keys to standardized ones
        standardized_phrases = []
        for phrase in phrases:
            # Map legistration_excerpt and legislation_exempt to legislation_excerpt
            if 'legistration_excerpt' in phrase:
                phrase['legislation_excerpt'] = phrase.pop('legistration_excerpt')
            if 'legislation_exempt' in phrase:
                phrase['legislation_excerpt'] = phrase.pop('legislation_exempt')
            # Map case_lawexcerpt to case_law_excerpt
            if 'case_lawexcerpt' in phrase:
                phrase['case_law_excerpt'] = phrase.pop('case_lawexcerpt')
            standardized_phrases.append({**phrase, **{col: row[col] for col in df_non_empty_llama.columns if col != 'extracted_phrases'}})
        return standardized_phrases
    else:
        return []

unpacked = []
for idx, row in df_non_empty_llama.iterrows():
    unpacked.extend(unpack_phrases(row))

df_unpacked_phrases_llama = pd.DataFrame(unpacked)
df_unpacked_phrases_llama.head(2)

Number of rows with non-empty extracted_phrases: 6370


Unnamed: 0,case_law_excerpt,legislation_excerpt,confidence,reasoning,custom_id,para_id,section_text,section_id,paragraph_text,reason
0,the claimant ticked the relevant box to state ...,a question about the health of the applicant,Medium,The case law excerpt relates to the claimant's...,request_1,eat_2022_192#para_12,60 Enquiries about disability and health (1) A...,id/ukpga/2010/15_section-60,"12. In April 2016, the claimant was moved into...",The paragraph discusses the claimant's disclos...
1,a pre-employment health questionnaire was sent...,a question about the health of the applicant,Medium,The pre-employment health questionnaire sent t...,request_1,eat_2022_192#para_12,60 Enquiries about disability and health (1) A...,id/ukpga/2010/15_section-60,"12. In April 2016, the claimant was moved into...",The paragraph discusses the claimant's disclos...


In [86]:
df_unpacked_phrases_llama.columns

Index(['case_law_excerpt', 'legislation_excerpt', 'confidence', 'reasoning',
       'custom_id', 'para_id', 'section_text', 'section_id', 'paragraph_text',
       'reason', 'is_valid'],
      dtype='object')

In [79]:
df_unpacked_phrases_llama['is_valid'] = df_unpacked_phrases_llama.apply(check_valid, axis=1)

In [80]:
df_unpacked_phrases_llama.drop_duplicates(subset=['para_id', 'section_id', 'case_law_excerpt', 'legislation_excerpt'], inplace=True)

In [81]:
df_unpacked_phrases_llama.para_id.nunique()

4637

In [82]:
len(df_unpacked_phrases_llama)

9890

In [83]:
df_unpacked_phrases_llama.confidence.value_counts(normalize=True) * 100

confidence
High      47.037412
Medium    40.333670
Low       12.628918
Name: proportion, dtype: float64

In [85]:
# Find the number of unique standard_para_id in df_open_Ai
nunique_openai = df_open_Ai['para_id'].nunique()
# Find the number of unique para_id in df_phrases_valid
nunique_phrases = df_phrases_valid['para_id'].nunique()
# Find the number of unique para_id in df_unpacked_phrases_llama
nunique_llama = df_unpacked_phrases_llama['para_id'].nunique()

# Find the intersection (common unique ids) across all three
common_ids = set(df_open_Ai['para_id']).intersection(
    set(df_phrases_valid['para_id']),
    set(df_unpacked_phrases_llama['para_id'])
)
n_common = len(common_ids)

# Find the union (total unique ids across all three)
union_ids = set(df_open_Ai['para_id']).union(
    set(df_phrases_valid['para_id']),
    set(df_unpacked_phrases_llama['para_id'])
)
n_union = len(union_ids)

print(f"Unique in df_open_Ai: {nunique_openai}")
print(f"Unique in df_phrases_valid: {nunique_phrases}")
print(f"Unique in df_unpacked_phrases_llama: {nunique_llama}")
print(f"Number of common unique ids (intersection of all): {n_common}")
print(f"Total unique ids in union (all): {n_union}")

Unique in df_open_Ai: 7500
Unique in df_phrases_valid: 6715
Unique in df_unpacked_phrases_llama: 4637
Number of common unique ids (intersection of all): 3842
Total unique ids in union (all): 8267


In [43]:
df_open_Ai.legislation_id

0                              2001/12_3
2                             1984/60_78
4                              1976/74_2
5                              1970/9_35
6                              1996/31_8
                      ...               
17096                        1974/47_44B
17097                          1998/42_6
17098                         2000/36_26
17099    id/ukpga/Eliz2/5-6/31_section-8
17100                       1973/18_22ZB
Name: legislation_id, Length: 9610, dtype: object

In [89]:
# Combine all unique para_id/section_id pairs from all three models
all_para_section_pairs = set()

# Use standard_para_id for OpenAI, para_id for DeepSeek and Llama
openai_pairs = set(zip(df_open_Ai['para_id'], df_open_Ai['section_id']))
deepseek_pairs = set(zip(df_phrases_valid['para_id'], df_phrases_valid['section_id']))
llama_pairs = set(zip(df_unpacked_phrases_llama['para_id'], df_unpacked_phrases_llama['section_id']))

all_para_section_pairs = openai_pairs | deepseek_pairs | llama_pairs

In [91]:
len(all_para_section_pairs)

13908

In [95]:
df_source = '../data/final_test/final/withsectionpositvefinal_cleaned.csv'
df_source = pd.read_csv(df_source)
df_source.head(2)

Unnamed: 0.1,Unnamed: 0,case_uri,para_id,paragraphs,references,if_law_applied,application_of_law_phrases,reason,if_law_applied_llama,application_of_law_phrases_llama,...,if_law_applied_claude,application_of_law_phrases_claude,reason_claude,confidence,agreement_with,final_annotation,case_name,section_id,section_text,section_id_standardized
0,0,https://caselaw.nationalarchives.gov.uk/eat/20...,eat_2022_192#para_12,"12. In April 2016, the claimant was moved into...",[],True,['this the ET considered was indicative of the...,The Employment Tribunal applies legal principl...,True,"['This, the ET considered, was indicative of t...",...,,,,,,True,eat_2022_192,id/ukpga/2010/15_section-60,60 Enquiries about disability and health (1) A...,id/ukpga/2010/15_section-60
1,1,https://caselaw.nationalarchives.gov.uk/eat/20...,eat_2022_192#para_12,"12. In April 2016, the claimant was moved into...",[],True,['this the ET considered was indicative of the...,The Employment Tribunal applies legal principl...,True,"['This, the ET considered, was indicative of t...",...,,,,,,True,eat_2022_192,id/ukpga/2010/15_schedule-8-part-3,Part 3 Limitations on the duty Lack of knowled...,id/ukpga/2010/15_schedule-8-part-3


In [101]:
source_pairs = set(zip(df_source['para_id'], df_source['section_id_standardized']))

In [104]:
remaining_pairs = source_pairs - all_para_section_pairs


In [105]:
df_not_processed_prepare_For_regeneration = pd.DataFrame(list(remaining_pairs), columns=['para_id', 'section_id'])

In [106]:
len(df_not_processed_prepare_For_regeneration)

4604

In [None]:
# Helper: get phrase for a model by para_id and section_id
def get_model_phrase(df, para_id, section_id, case_term_col, legis_term_col, conf_col, reason_col):
    rows = df[(df['para_id'] == para_id) & (df['section_id'] == section_id)]
    if not rows.empty:
        return (
            list(rows[case_term_col]),
            list(rows[legis_term_col]),
            list(rows[conf_col]),
            list(rows[reason_col])
        )
    return [], [], [], []

combined_rows = []
for para_id, section_id in all_para_section_pairs:
    # Get para_text and section_text from any available model (OpenAI preferred, then DeepSeek, then Llama)
    para_text = None
    section_text = None

    # OpenAI
    openai_row = df_open_Ai[(df_open_Ai['para_id'] == para_id) & (df_open_Ai['section_id'] == section_id)]
    if not openai_row.empty:
        para_text = openai_row.iloc[0]['paragraph_text']
        section_text = openai_row.iloc[0]['section_text']
    else:
        # DeepSeek
        deepseek_row = df_phrases_valid[(df_phrases_valid['para_id'] == para_id) & (df_phrases_valid['section_id'] == section_id)]
        if not deepseek_row.empty:
            para_text = deepseek_row.iloc[0]['paragraph_text']
            section_text = deepseek_row.iloc[0]['section_text']
        else:
            # Llama
            llama_row = df_unpacked_phrases_llama[(df_unpacked_phrases_llama['para_id'] == para_id) & (df_unpacked_phrases_llama['section_id'] == section_id)]
            if not llama_row.empty:
                para_text = llama_row.iloc[0]['paragraph_text']
                section_text = llama_row.iloc[0]['section_text']

    # Get phrases for each model
    case_term_llama, legislation_term_llama, confidence_llama, reason_llama = get_model_phrase(
        df_unpacked_phrases_llama, para_id, section_id, 'case_law_excerpt', 'legislation_excerpt', 'confidence', 'reasoning'
    )
    case_term_openai, legislation_term_openai, confidence_openai, reason_openai = get_model_phrase(
        df_open_Ai, para_id, section_id, 'case_law_excerpt', 'legislation_excerpt', 'confidence', 'reasoning'
    )
    case_term_deepseek, legislation_term_deepseek, confidence_deepseek, reason_deepseek = get_model_phrase(
        df_phrases_valid, para_id, section_id, 'case_law_excerpt', 'legislation_excerpt', 'confidence', 'reasoning'
    )
    if any([
        (case_term_llama and legislation_term_llama and confidence_llama and reason_llama) or
        (case_term_openai and legislation_term_openai and confidence_openai and reason_openai) or
        (case_term_deepseek and legislation_term_deepseek and confidence_deepseek and reason_deepseek)
    ]):
        combined_rows.append({
            'para_id': para_id,
            'section_id': section_id,
            'para_text': para_text,
            'section_text': section_text,
            'case_term_llama': case_term_llama,
            'legislation_term_llama': legislation_term_llama,
            'confidence_llama': confidence_llama,
            'reason_llama': reason_llama,
            'case_term_openai': case_term_openai,
            'legislation_term_openai': legislation_term_openai,
            'confidence_openai': confidence_openai,
            'reason_openai': reason_openai,
            'case_term_deepseek': case_term_deepseek,
            'legislation_term_deepseek': legislation_term_deepseek,
            'confidence_deepseek': confidence_deepseek,
            'reason_deepseek': reason_deepseek
        })

df_combined = pd.DataFrame(combined_rows)
df_combined.drop_duplicates(subset=['para_id', 'section_id'], inplace=True)
df_combined.head(2)

Unnamed: 0,para_id,section_id,para_text,section_text,case_term_llama,legislation_term_llama,confidence_llama,reason_llama,case_term_openai,legislation_term_openai,confidence_openai,reason_openai,case_term_deepseek,legislation_term_deepseek,confidence_deepseek,reason_deepseek
0,ewca_crim_2007_2548#para_9,id/ukpga/2003/44_section-244A,"9. So far as the imprisonment is concerned, Mr...",244A Release on licence of prisoners serving s...,,,,,,,,,custodial sentence was in principle justified ...,appropriate custodial term,Medium,The case discusses the justification for a cus...
1,ewhc_admin_2008_470#para_25,id/ukpga/1989/33_section-6,"25. The other way in which the point is put, a...",6 General restrictions on return. (1) A person...,the gravity of an offence is relevant to wheth...,the offence of which that person is accused or...,Medium,Relevance of offence gravity to changes in cir...,return him to Poland for the sake of a trivial...,"might, if returned, be prejudiced at his trial...",High,Case law discusses the oppressive nature of re...,,,,


In [123]:
len(df_combined)

13893

In [124]:
df_for_regeneration = df_combined[
    ~df_combined[['confidence_llama', 'confidence_openai', 'confidence_deepseek']].apply(
        lambda x: any(str(v).lower() == 'high' for v in x), axis=1
    )
]

In [128]:
len(source_pairs)

17251

In [125]:
df_combined_regeneration = pd.concat([df_for_regeneration, df_not_processed_prepare_For_regeneration], ignore_index=True)

In [130]:
filtered_df_combined_regeneration = df_combined_regeneration[
    df_combined_regeneration.apply(lambda row: (row['para_id'], row['section_id']) in source_pairs, axis=1)
]

In [131]:
len(filtered_df_combined_regeneration)

8258

In [137]:
df_source.head(1)

Unnamed: 0.1,Unnamed: 0,case_uri,para_id,paragraphs,references,if_law_applied,application_of_law_phrases,reason,if_law_applied_llama,application_of_law_phrases_llama,...,if_law_applied_claude,application_of_law_phrases_claude,reason_claude,confidence,agreement_with,final_annotation,case_name,section_id,section_text,section_id_standardized
0,0,https://caselaw.nationalarchives.gov.uk/eat/20...,eat_2022_192#para_12,"12. In April 2016, the claimant was moved into...",[],True,['this the ET considered was indicative of the...,The Employment Tribunal applies legal principl...,True,"['This, the ET considered, was indicative of t...",...,,,,,,True,eat_2022_192,id/ukpga/2010/15_section-60,60 Enquiries about disability and health (1) A...,id/ukpga/2010/15_section-60


In [3]:
df_combined.head(1)

NameError: name 'df_combined' is not defined

In [138]:
# I need the pairs rows that where para_id and section_id are in source_pairs and df_combined but not in filtered_df_combined_regeneration

# Get all pairs in df_combined
combined_pairs = set(zip(df_combined['para_id'], df_combined['section_id']))

# Get all pairs in filtered_df_combined_regeneration
regeneration_pairs = set(zip(filtered_df_combined_regeneration['para_id'], filtered_df_combined_regeneration['section_id']))

# Find pairs in both source_pairs and df_combined, but not in filtered_df_combined_regeneration
target_pairs = (source_pairs & combined_pairs) - regeneration_pairs

# Get the corresponding rows from df_combined
df_pairs_in_combined_not_in_regeneration = df_combined[
    df_combined.apply(lambda row: (row['para_id'], row['section_id']) in target_pairs, axis=1)
]

df_pairs_in_combined_not_in_regeneration.head()

Unnamed: 0,para_id,section_id,para_text,section_text,case_term_llama,legislation_term_llama,confidence_llama,reason_llama,case_term_openai,legislation_term_openai,confidence_openai,reason_openai,case_term_deepseek,legislation_term_deepseek,confidence_deepseek,reason_deepseek
1,ewhc_admin_2008_470#para_25,id/ukpga/1989/33_section-6,"25. The other way in which the point is put, a...",6 General restrictions on return. (1) A person...,the gravity of an offence is relevant to wheth...,the offence of which that person is accused or...,Medium,Relevance of offence gravity to changes in cir...,return him to Poland for the sake of a trivial...,"might, if returned, be prejudiced at his trial...",High,Case law discusses the oppressive nature of re...,,,,
2,ewhc_qb_2016_2355#para_21,http://www.legislation.gov.uk/id/ukpga/Eliz2/5...,21. The judge expressly referred to the provis...,2 Extent of occupier’s ordinary duty (1) An oc...,the judge expressly addressed the question of ...,a duty to take such care as in all the circums...,High,Direct application of the common duty of care ...,the Claimant would be reasonably safe in using...,the visitor will be reasonably safe in using t...,High,Direct application of the common duty of care ...,,,,
3,ewhc_admin_2012_1033#para_10,id/ukpga/Geo5/15-16/20_section-52,10. The appellant submitted first that the wor...,52 Conveyances to be by deed. (1) All conveyan...,,,,,statutory periodic tenancy arose following the...,assured tenancies of dwelling-houses in Englan...,High,Reference to assured tenancies relates directl...,statutory periodic tenancy arose following the...,assured tenancies of dwelling-houses in Englan...,High,The case discusses the implications of statuto...
4,ewhc_admin_2007_1304#para_50,id/ukpga/2002/29_section-330,50. I also have to take some account of the ap...,330 Failure to disclose: regulated sector (1) ...,engaged in fraud on the scale of the Narushima...,engaged in money laundering,Medium,The case law excerpt implies a suspicion of mo...,,,,,engaged in money laundering,engaged in money laundering,High,Direct reference to the legislative term for m...
6,ewca_crim_2005_3377#para_21,id/ukpga/2003/44_section-58,21. We cannot accept Mr Chambers' submissions....,58 General right of appeal in respect of rulin...,,,,,judge did not direct the jury,judge starts his summing-up to the jury,High,The case law discusses the judge's duty to dir...,,,,


In [144]:
len(filtered_df_combined_regeneration)

8258

In [142]:
df_pairs_in_combined_not_in_regeneration

Unnamed: 0,para_id,section_id,para_text,section_text,case_term_llama,legislation_term_llama,confidence_llama,reason_llama,case_term_openai,legislation_term_openai,confidence_openai,reason_openai,case_term_deepseek,legislation_term_deepseek,confidence_deepseek,reason_deepseek
1,ewhc_admin_2008_470#para_25,id/ukpga/1989/33_section-6,"25. The other way in which the point is put, a...",6 General restrictions on return. (1) A person...,the gravity of an offence is relevant to wheth...,the offence of which that person is accused or...,Medium,Relevance of offence gravity to changes in cir...,return him to Poland for the sake of a trivial...,"might, if returned, be prejudiced at his trial...",High,Case law discusses the oppressive nature of re...,,,,
2,ewhc_qb_2016_2355#para_21,http://www.legislation.gov.uk/id/ukpga/Eliz2/5...,21. The judge expressly referred to the provis...,2 Extent of occupier’s ordinary duty (1) An oc...,the judge expressly addressed the question of ...,a duty to take such care as in all the circums...,High,Direct application of the common duty of care ...,the Claimant would be reasonably safe in using...,the visitor will be reasonably safe in using t...,High,Direct application of the common duty of care ...,,,,
3,ewhc_admin_2012_1033#para_10,id/ukpga/Geo5/15-16/20_section-52,10. The appellant submitted first that the wor...,52 Conveyances to be by deed. (1) All conveyan...,,,,,statutory periodic tenancy arose following the...,assured tenancies of dwelling-houses in Englan...,High,Reference to assured tenancies relates directl...,statutory periodic tenancy arose following the...,assured tenancies of dwelling-houses in Englan...,High,The case discusses the implications of statuto...
4,ewhc_admin_2007_1304#para_50,id/ukpga/2002/29_section-330,50. I also have to take some account of the ap...,330 Failure to disclose: regulated sector (1) ...,engaged in fraud on the scale of the Narushima...,engaged in money laundering,Medium,The case law excerpt implies a suspicion of mo...,,,,,engaged in money laundering,engaged in money laundering,High,Direct reference to the legislative term for m...
6,ewca_crim_2005_3377#para_21,id/ukpga/2003/44_section-58,21. We cannot accept Mr Chambers' submissions....,58 General right of appeal in respect of rulin...,,,,,judge did not direct the jury,judge starts his summing-up to the jury,High,The case law discusses the judge's duty to dir...,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13882,ewhc_kb_2023_1256#para_71,id/ukpga/2013/26_section-1,71. The case on serious harm which is pursued ...,1 Serious harm (1) A statement is not defamato...,serious harm,serious harm to the reputation of the claimant,High,Direct application of the serious harm require...,serious harm was a “threshold” issue,A statement is not defamatory unless its publi...,High,Direct reference to the legislative requiremen...,serious harm,serious harm,High,Direct application of the legislative requirem...
13887,ewhc_ch_2020_3295#para_14,id/ukpga/2011/25_section-280A,14. The first question arises under clause 8(a...,280A Amendment of the trusts of an unincorpora...,benevolent fund,benevolent fund,Medium,Reference to a type of charity,no such institution with that name,power under subsection (2) is not exercisable ...,Medium,The case discusses a benevolent fund and its s...,well-known unincorporated association,charity which is not a company or other body c...,High,Direct reference to the legal status of the en...
13888,ewhc_comm_2005_2115#para_64,id/ukpga/1996/23_section-34,64. Having reached these clear conclusions wit...,34 Procedural and evidential matters. (1) It s...,,,,,at least a real or serious issue to be tried a...,whether to apply strict rules of evidence (or ...,High,The case law discusses issues to be tried whic...,I have considered carefully the various denial...,when these should be supplied and the extent t...,High,The case law discusses the consideration of wi...
13889,ewhc_fam_2023_1096#para_19,id/ukpga/1984/42_section-31F,19. In the absence of any application to adduc...,31F Proceedings and decisions (1) The family c...,the family court may adjourn a hearing,The family court may adjourn a hearing,High,Direct reference to the court's power to adjou...,appellant was the subject of a debarring order...,Every judgment and order of the family court i...,Medium,The reference to the debarring order connects ...,,,,


In [143]:
len(df_pairs_in_combined_not_in_regeneration)

8979

In [None]:
#Now I have to combine the three datafrmaes with columns para_id=standard_para_id, section_id, para_text,section_text,case_term_llama, legislation_term_llama,confidence_llama,reason_llama, case_term_openai, legislation_term_openai, confidence_openai, reason_openai, case_term_deepseek, legislation_term_deepseek,confidence_deepseek, reason_deepseek with all the valied extracted phrases nonduplicated

# After that I have to check for the each (para_id,section_id) pair where I don't have any result `I need to ask claude to regenearte it or give a reason-- for time being make a dataframe
# For the ones (para_id,section_id) where I have the results but not any of the them have high confidence I have to append those in to the dataframe for the regenrations
#For the ones where I have the results and at least one of them has high confidence I have to append those in to the dataframe for the validation
#If only one record is high confidence then keep it
#If more than one record is high confidence then send them to claude for LLM as a judge 
#Ad=fter this script I have three dataframes 
# df_for_regeneration: for the ones where I don't have any results or all of them are low/Medium confidence
# df_for_validation: for the ones where I have results and at least two of them has high confidence
# df_keep: for the ones where I have results and only one of them has high confidence

KeyError: 'standard_para_id'

In [7]:
df = df.dropna(subset=['case_term_phrases'])
print(len(df))
print(df[['url', 'para_id']].drop_duplicates().shape[0])

13381
6861


In [53]:
len(df)

13381

In [19]:
acts = df['legislation_id'].unique()
for act in acts:
    print(act)
   

1984/60_76
1999/23_23
1989/41_26ZA
1984/60_54A
1990/8_20
id/ukpga/Geo6/12-13-14/88_section-7
2013/26_3
1998/29_45
1978/47_4
1980/58_32A
1996/23_41
1989/41_11H
id/ukpga/Geo6and1Eliz2/15-16/66_section-9
2002/41_101
2002/29_20
2007/3_416
1983/20_64J
1973/18_23
2009/25_121
1970/9_59DA
1990/8_61Y
1988/33_25
id/ukpga/Eliz2/5-6/11_section-1
1992/5_117
1972/30_3
1995/50_7B-n.i.
1998/37_66BA
2006/46_901
2005/9_24
1970/9_57
1988/33_36
2008/9_116
1985/68_69
2003/44_54
1990/8_78
1996/18_11
1996/52_184
1996/25_33
id/ukpga/Geo5/1-2/6_section-1A
id/ukpga/Vict/24-25/100_section-20
1970/9_47B
1970/9_42
2008/14_117
2004/12_165
1988/48_288
1983/20_12A
1984/60_78
1988/33_45
2006/46_582
2003/44_104
1981/54_87
2011/20_25
1999/23_34
2013/18_18
1998/42_9
1990/43_59
1995/26_33
1990/8_2C
2003/39_39
1996/40_3
1999/33_3
2006/46_169A
2002/41_115
2009/25_52
1984/60_63KA
id/ukpga/Eliz2/2-3/56_section-20
2003/41_85
1990/8_94
1968/19_37
1980/58_37
1968/19_23
2003/44_109
1982/53_19
1990/8_61N
2003/1_691
1978/47_7
2015/

In [13]:


# Filter for 1989/41
filtered_df = df[df['legislation_id'].str.contains('2002/38', na=False)]
filtered_df = filtered_df.dropna(subset=['case_term_phrases'])
filtered_df = filtered_df.dropna(subset=['case_term_phrases'])
filtered_df = filtered_df[filtered_df['confidence'] == "High"]
# Save to new CSV
filtered_df.to_csv('../data/final_test/final/sample_2002_38.csv', index=False)

print(f"Original rows: {len(df)}")
print(f"Filtered rows: {len(filtered_df)}")

Original rows: 13381
Filtered rows: 58


In [17]:
len(df)

13381

In [18]:
df.confidence.value_counts(normalize=True) * 100

confidence
High      73.830169
Medium    25.661534
Low        0.508297
Name: proportion, dtype: float64

In [33]:
# Check for duplicate rows based on para_id, url, case_term, and legislation_term
# Drop duplicate rows based on para_id, url, case_term, and legislation_term
# Count how many cases (unique url + para_id) have duplication in the original df
dup_counts = df.duplicated(subset=['para_id', 'url', 'case_term', 'legislation_term'], keep=False)
duplicated_cases = df[dup_counts].drop_duplicates(subset=['url', 'para_id','case_term', 'legislation_term'])
print(f"Number of cases with duplication: {len(duplicated_cases)}")

KeyError: Index(['case_term', 'url', 'legislation_term'], dtype='object')

In [None]:
# Create a boolean mask for duplicates based on the specified columns
dup_mask = df.duplicated(subset=['para_id', 'url', 'case_term', 'legislation_term'], keep=False)

# DataFrame with duplicate values
df_duplicates = df[dup_mask].copy()

# DataFrame without duplicate values
df_no_duplicates = df[~dup_mask].copy()

print(f"Number of rows with duplicates: {len(df_duplicates)}")
print(f"Number of rows without duplicates: {len(df_no_duplicates)}")
df_no_duplicates = df_no_duplicates.drop_duplicates(subset=['url', 'para_id'])

len(df_no_duplicates)

In [36]:
df_no_duplicates[
    (df['url'] == 'https://caselaw.nationalarchives.gov.uk/ewca/crim/2006/3335') &
    (df['para_id'] == 'para_22') &
    (df['case_term'] == 'deliberately inflicted serious injury') &
    (df['legislation_term'] == 'unlawfully and maliciously wound or inflict any grievous bodily harm')
]

Unnamed: 0,url,para_id,paragraphs,case_term_phrases,legislation_id,section_text,case_term,legislation_term,confidence,reasoning,key_phrases,standardized_act_id
47,https://caselaw.nationalarchives.gov.uk/ewca/c...,para_22,"22. In the present case the offender had, in e...","[{""case_law_excerpt"": ""deliberately inflicted ...",id/ukpga/Vict/24-25/100_section-20,"20 Inflicting bodily injury, with or without w...",deliberately inflicted serious injury,unlawfully and maliciously wound or inflict an...,High,Direct application of inflicting grievous bodi...,unlawfully and maliciously wound or inflict an...,id/ukpga/Vict/24-25/100_section_section-20
7637,https://caselaw.nationalarchives.gov.uk/ewca/c...,para_22,"22. In the present case the offender had, in e...","[{""case_law_excerpt"": ""deliberately inflicted ...",id/ukpga/Vict/24-25/100_section-20-n.i.,"20 Inflicting bodily injury, with or without w...",deliberately inflicted serious injury,unlawfully and maliciously wound or inflict an...,High,Direct interpretation of the legal concept of ...,unlawfully and maliciously wound or inflict an...,id/ukpga/Vict/24-25/100_section_section-20-n.i.


In [139]:
df_original = pd.read_csv('../data/final_test/final/withsectionpositvefinal_cleaned.csv')
df_original.head(1)

Unnamed: 0.2,Unnamed: 0.1,case_uri,para_id,paragraphs,references,if_law_applied,application_of_law_phrases,reason,if_law_applied_llama,application_of_law_phrases_llama,...,application_of_law_phrases_claude,reason_claude,confidence,agreement_with,final_annotation,case_name,para_text_length,section_id,section_text,Unnamed: 0
0,16266,https://caselaw.nationalarchives.gov.uk/eat/20...,eat_2022_192#para_12,"12. In April 2016, the claimant was moved into...",[],True,['this the ET considered was indicative of the...,The Employment Tribunal applies legal principl...,True,"['This, the ET considered, was indicative of t...",...,,,,,True,eat_2022_192,,id/ukpga/2010/15_section-60,60 Enquiries about disability and health (1) A...,26827.0


In [141]:
df_original['section_id'].value_counts()

section_id
id/ukpga/1998/42_section-7                                    109
id/ukpga/1989/41_section-1                                     96
id/ukpga/1984/60_section-78                                    78
id/ukpga/1998/42_section-2                                     76
id/ukpga/1980/58_section-32A                                   66
                                                             ... 
id/ukpga/1984/24_section-30                                     1
http://www.legislation.gov.uk/id/ukpga/1984/24/section/41_      1
http://www.legislation.gov.uk/id/ukpga/1983/54/section/41_      1
id/ukpga/1984/24_section-33                                     1
id/ukpga/1968/73_section-166                                    1
Name: count, Length: 5130, dtype: int64

In [143]:
# Get section_id values containing 'http' or 'https'
import re

def standardize_section_id(section_id):
    """
    Convert section_id containing 'http' or 'https' to the standard format:
    'id/ukpga/{year}/{chapter}_section-{section}'
    """
    if pd.isna(section_id):
        return section_id
    if isinstance(section_id, str) and section_id.startswith('http'):
        # Example: http://www.legislation.gov.uk/id/ukpga/1990/8/section/187_
        match = re.search(r'/ukpga/(\d{4})/(\d+)/section/([\w\-]+)_?', section_id)
        if match:
            year, chapter, section = match.groups()
            return f'id/ukpga/{year}/{chapter}_section-{section}'
    return section_id

# Apply the function to all section_ids in df_original
df_original['section_id_standardized'] = df_original['section_id'].apply(standardize_section_id)

# Show the changed values for those that originally contained 'http'
section_ids_with_http = df_original[df_original['section_id'].str.contains('http', na=False)][['section_id', 'section_id_standardized']]
section_ids_with_http.value_counts()

print(df_original[df_original['section_id'].str.contains('Geo', na=False)])

       Unnamed: 0.1                                           case_uri  \
2274          15042  https://caselaw.nationalarchives.gov.uk/ewca/c...   
2278           2648  https://caselaw.nationalarchives.gov.uk/ewca/c...   
2279           2649  https://caselaw.nationalarchives.gov.uk/ewca/c...   
2280           2650  https://caselaw.nationalarchives.gov.uk/ewca/c...   
2281           2651  https://caselaw.nationalarchives.gov.uk/ewca/c...   
...             ...                                                ...   
17357         16112  https://caselaw.nationalarchives.gov.uk/ukut/l...   
17358         16113  https://caselaw.nationalarchives.gov.uk/ukut/l...   
17359         16114  https://caselaw.nationalarchives.gov.uk/ukut/l...   
17360          2868  https://caselaw.nationalarchives.gov.uk/ukut/l...   
17361          2867  https://caselaw.nationalarchives.gov.uk/ukut/l...   

                         para_id  \
2274   ewca_civ_2015_718#para_22   
2278   ewca_civ_2015_718#para_39   
227

In [144]:
df_original.columns

Index(['Unnamed: 0.1', 'case_uri', 'para_id', 'paragraphs', 'references',
       'if_law_applied', 'application_of_law_phrases', 'reason',
       'if_law_applied_llama', 'application_of_law_phrases_llama',
       'reason_llama', 'if_law_applied_claude',
       'application_of_law_phrases_claude', 'reason_claude', 'confidence',
       'agreement_with', 'final_annotation', 'case_name', 'para_text_length',
       'section_id', 'section_text', 'Unnamed: 0', 'section_id_standardized'],
      dtype='object')

In [146]:
df_original = df_original.drop(columns=['Unnamed: 0.1', 'para_text_length', 'Unnamed: 0'], errors='ignore')

In [149]:
len(df_original)

17470

In [153]:
df_original.columns

Index(['case_uri', 'para_id', 'paragraphs', 'references', 'if_law_applied',
       'application_of_law_phrases', 'reason', 'if_law_applied_llama',
       'application_of_law_phrases_llama', 'reason_llama',
       'if_law_applied_claude', 'application_of_law_phrases_claude',
       'reason_claude', 'confidence', 'agreement_with', 'final_annotation',
       'case_name', 'section_id_original', 'section_text', 'section_id'],
      dtype='object')

In [152]:
df_original.rename(columns={'section_id': 'section_id_original','section_id_standardized': 'section_id'}, inplace=True)

In [154]:
len(df_original)

17470

In [151]:
df_original.to_csv('../data/final_test/final/withsectionpositvefinal_cleaned.csv')

In [123]:
examples = df[(df['url'] == 'https://caselaw.nationalarchives.gov.uk/ewfc/b/2024/40') & (df['para_id'] == 'para_6')]
example = examples.iloc[0]
example

url                    https://caselaw.nationalarchives.gov.uk/ewfc/b...
para_id                                                           para_6
paragraphs             6. The father of the second oldest child holds...
case_term_phrases      [{"case_law_excerpt": "holds Parental Responsi...
legislation_id                                                 1989/41_2
section_text           2 Parental responsibility for children. (1) Wh...
case_term              holds Parental Responsibility for her by virtu...
legislation_term       the father shall have parental responsibility ...
confidence                                                          High
reasoning              Direct reference to parental responsibility ac...
key_phrases            the father shall have parental responsibility ...
standardized_act_id                                    1989/41_section_2
Name: 12474, dtype: object

In [124]:
from pprint import pprint

pprint(example['case_term'])
pprint(example['legislation_term'])
pprint(example['reasoning'])
pprint(example['standardized_act_id'])

('holds Parental Responsibility for her by virtue of being named on her birth '
 'certificate')
('the father shall have parental responsibility for the child if he has '
 'acquired it (and has not ceased to have it) in accordance with the '
 'provisions of this Act')
'Direct reference to parental responsibility acquisition under the law'
'1989/41_section_2'


In [88]:
# Find rows where if_law_applied_llama is False, if_law_applied is True, and if_law_applied_claude is True
filtered_rows = df_original[
    (df_original.if_law_applied_llama == True) &
    (df_original.if_law_applied == False) &
    (df_original.if_law_applied_claude == True)
]
filtered_rows.head(1)



Unnamed: 0.2,Unnamed: 0.1,case_uri,para_id,paragraphs,references,if_law_applied,application_of_law_phrases,reason,if_law_applied_llama,application_of_law_phrases_llama,...,application_of_law_phrases_claude,reason_claude,confidence,agreement_with,final_annotation,case_name,para_text_length,section_id,section_text,Unnamed: 0
34,2891,https://caselaw.nationalarchives.gov.uk/eat/20...,eat_2022_192#para_61,"61. In considering the ET’s reasoning, the res...",[],False,[],This paragraph discusses general legal princip...,True,['decisions are not to be scrutinised closely ...,...,['decisions are not to be scrutinised closely ...,Model B's analysis is more accurate. While thi...,High,Llama,True,eat_2022_192,,id/ukpga/2010/15_schedule-1-part-2,Part 2 Guidance Preliminary 10 This Part of th...,4340.0


In [89]:
filtered_rows.columns

Index(['Unnamed: 0.1', 'case_uri', 'para_id', 'paragraphs', 'references',
       'if_law_applied', 'application_of_law_phrases', 'reason',
       'if_law_applied_llama', 'application_of_law_phrases_llama',
       'reason_llama', 'if_law_applied_claude',
       'application_of_law_phrases_claude', 'reason_claude', 'confidence',
       'agreement_with', 'final_annotation', 'case_name', 'para_text_length',
       'section_id', 'section_text', 'Unnamed: 0'],
      dtype='object')

In [110]:
list(filtered_rows.case_uri.unique())[60:120]

['https://caselaw.nationalarchives.gov.uk/ewca/crim/2003/190',
 'https://caselaw.nationalarchives.gov.uk/ewca/crim/2005/1722',
 'https://caselaw.nationalarchives.gov.uk/ewca/crim/2006/2136',
 'https://caselaw.nationalarchives.gov.uk/ewca/crim/2006/3301',
 'https://caselaw.nationalarchives.gov.uk/ewca/crim/2006/3335',
 'https://caselaw.nationalarchives.gov.uk/ewca/crim/2006/646',
 'https://caselaw.nationalarchives.gov.uk/ewca/crim/2007/1165',
 'https://caselaw.nationalarchives.gov.uk/ewca/crim/2007/2548',
 'https://caselaw.nationalarchives.gov.uk/ewca/crim/2007/3223',
 'https://caselaw.nationalarchives.gov.uk/ewca/crim/2007/3432',
 'https://caselaw.nationalarchives.gov.uk/ewca/crim/2007/36',
 'https://caselaw.nationalarchives.gov.uk/ewca/crim/2008/468',
 'https://caselaw.nationalarchives.gov.uk/ewca/crim/2008/854',
 'https://caselaw.nationalarchives.gov.uk/ewca/crim/2008/894',
 'https://caselaw.nationalarchives.gov.uk/ewca/crim/2009/1942',
 'https://caselaw.nationalarchives.gov.uk/ewca/

In [119]:
filtered_rows[(filtered_rows['case_uri'] == 'https://caselaw.nationalarchives.gov.uk/ewfc/b/2024/40')]

Unnamed: 0.2,Unnamed: 0.1,case_uri,para_id,paragraphs,references,if_law_applied,application_of_law_phrases,reason,if_law_applied_llama,application_of_law_phrases_llama,...,application_of_law_phrases_claude,reason_claude,confidence,agreement_with,final_annotation,case_name,para_text_length,section_id,section_text,Unnamed: 0
5874,2773,https://caselaw.nationalarchives.gov.uk/ewfc/b...,ewfc_b_2024_40#para_6,6. The father of the second oldest child holds...,[],False,[],This paragraph only states that the father hol...,True,['holds Parental Responsibility for her by vir...,...,['holds Parental Responsibility for her by vir...,Model B's analysis is more accurate. This para...,High,Llama,True,ewfc_b_2024_40,,id/ukpga/1989/41_section-4,4 Acquisition of parental responsibility by fa...,4154.0
5875,2772,https://caselaw.nationalarchives.gov.uk/ewfc/b...,ewfc_b_2024_40#para_6,6. The father of the second oldest child holds...,[],False,[],This paragraph only states that the father hol...,True,['holds Parental Responsibility for her by vir...,...,['holds Parental Responsibility for her by vir...,Model B's analysis is more accurate. This para...,High,Llama,True,ewfc_b_2024_40,,id/ukpga/1989/41_section-2,2 Parental responsibility for children. (1) Wh...,4153.0


In [130]:
for section_id in filtered_rows[(filtered_rows['case_uri'] == 'https://caselaw.nationalarchives.gov.uk/ewfc/b/2024/40')]['para_id']:
    print(section_id)

ewfc_b_2024_40#para_6
ewfc_b_2024_40#para_6


In [131]:
main_pointExampl = filtered_rows[(filtered_rows['case_uri'] == 'https://caselaw.nationalarchives.gov.uk/ewfc/b/2024/40')&(filtered_rows['para_id'] == 'ewfc_b_2024_40#para_6') & (filtered_rows['section_id'] == 'id/ukpga/1989/41_section-4') ]


In [132]:
main_pointExampl

Unnamed: 0.2,Unnamed: 0.1,case_uri,para_id,paragraphs,references,if_law_applied,application_of_law_phrases,reason,if_law_applied_llama,application_of_law_phrases_llama,...,application_of_law_phrases_claude,reason_claude,confidence,agreement_with,final_annotation,case_name,para_text_length,section_id,section_text,Unnamed: 0
5874,2773,https://caselaw.nationalarchives.gov.uk/ewfc/b...,ewfc_b_2024_40#para_6,6. The father of the second oldest child holds...,[],False,[],This paragraph only states that the father hol...,True,['holds Parental Responsibility for her by vir...,...,['holds Parental Responsibility for her by vir...,Model B's analysis is more accurate. This para...,High,Llama,True,ewfc_b_2024_40,,id/ukpga/1989/41_section-4,4 Acquisition of parental responsibility by fa...,4154.0


In [133]:
main_pointExampl = main_pointExampl.iloc[0]
pprint(main_pointExampl['reason'])
pprint(main_pointExampl['reason_llama'])
pprint(main_pointExampl['reason_claude'])

('This paragraph only states that the father holds Parental Responsibility due '
 'to being named on the birth certificate without applying any legal '
 'principles to the specific facts of the case.')
('The court applies the legal principle that a father acquires Parental '
 "Responsibility by being named on the child's birth certificate, as per the "
 'Children Act 1989, to the specific facts of this case.')
("Model B's analysis is more accurate. This paragraph contains an application "
 'of law to specific facts. The court is applying the legal principle that '
 "parental responsibility is acquired when a father is named on a child's "
 'birth certificate to the specific factual circumstance of this case - '
 'namely, that the father of the second oldest child was named on her birth '
 "certificate. The phrase 'by virtue of being named on her birth certificate' "
 'demonstrates the causal legal connection between the statutory provision '
 '(implicit reference to Children Act 1989) 