# Test Phrase Extraction with Fine-tuned Model

This notebook tests the new phrase-extraction function, which includes the section text and tracks `section_id`, on a 5-row sample using the fine-tuned model.

In [1]:
import sys
sys.path.append('../src')
import pandas as pd
from make_batch_jsonl_law_application import create_batch_jsonl_for_phrase_extraction
from phrase_validator import PhraseValidator
import json

In [6]:
import os
import sys

from dotenv import load_dotenv

# Resolve "<parent of the current working directory>/src" once and reuse it below.
src_dir = os.path.join(os.path.dirname(os.getcwd()), 'src')

# Make the modules that live in src/ importable from this notebook.
sys.path.append(src_dir)

# Read src/.env before the openAIHandler import that follows.
load_dotenv(os.path.join(src_dir, '.env'))

import openAIHandler

In [None]:
# 1. Load the full dataset, then keep a small slice for testing.
# The CSV is read in full (so `df` still exposes every row to later cells);
# the 5-row test sample is taken from it explicitly below.
csv_path = '../data/final_test/final/withsectionpositvefinal_cleaned.csv'
df = pd.read_csv(csv_path)

print(len(df))

# The batch-creation cells pass `test_df`, which no other cell defines, so
# create it here to keep the notebook runnable on a fresh kernel.
# NOTE(review): taking the first rows is an assumption — confirm this is the
# intended 5-row sample.
N_TEST_ROWS = 5
test_df = df.head(N_TEST_ROWS).copy()






17470


In [15]:
# Pull the four fields of the row labelled 1; they are the inputs for the
# single-example extraction run.
para_text, section_text, para_id, section_id = (
    df.loc[1, column_name]
    for column_name in ('paragraphs', 'section_text', 'para_id', 'section_id')
)

In [None]:
# Run validation on the results
# NOTE(review): `merged_df` is not created by any cell in this notebook as
# saved, so it must already exist in the kernel — define it before running
# this cell on a fresh kernel.
validation_results = PhraseValidator.validate_extractions_df(merged_df)

print("üîç Validation Results:")
# len(validation_results) counts validation records, not paragraphs: the
# recorded output shows 7 here against 2 paragraphs in the summary below,
# so the label must not say "paragraphs".
print(f"Total validation records: {len(validation_results)}")

# Show validation summary
# The summary is read through the keys 'total_paragraphs', 'valid_paragraphs',
# 'invalid_paragraphs', 'success_rate' and 'failure_reasons'.
summary = PhraseValidator.get_validation_summary(validation_results)
print(f"\nüìà Validation Summary:")
print(f"Total paragraphs: {summary['total_paragraphs']}")
print(f"Valid paragraphs: {summary['valid_paragraphs']}")
print(f"Invalid paragraphs: {summary['invalid_paragraphs']}")
print(f"Success rate: {summary['success_rate']:.1f}%")

# One line per failure reason with the number of times it occurred.
print(f"\n‚ùå Failure reasons:")
for reason, count in summary['failure_reasons'].items():
    print(f"  {reason}: {count}")

üîç Validation Results:
Total paragraphs: 7

üìà Validation Summary:
Total paragraphs: 2
Valid paragraphs: 0
Invalid paragraphs: 2
Success rate: 0.0%

‚ùå Failure reasons:
  No extracted phrases found: 4
  No case law excerpt extracted: 3


In [10]:
import openAIHandler

# Build the extraction chain first, then hand it to getInterPretations
# together with the example section/paragraph texts and their identifiers.
extraction_chain = openAIHandler.getPhraseExtractionChain()

interpretations = openAIHandler.getInterPretations(
    section_text,
    para_text,
    para_id,
    section_id,
    extraction_chain,
)

In [12]:
# Display the raw value returned by getInterPretations; the recorded output
# below is a JSON-formatted string, not a parsed list.
interpretations

'[\n    {\n        "caselaw_term": "claimant ticked the relevant box to state he had a disability",\n        "legislation_term": "interested disabled person has a disability and is likely to be placed at the disadvantage referred to in the first, second or third requirement",\n        "key_phrases": ["interested disabled person has a disability"],\n        "reasoning": "Case law identifies the claimant as an interested disabled person, aligning with the legislative definition of disability.",\n        "confidence": "High"\n    },\n    {\n        "caselaw_term": "no apparent disadvantage or need for adjustments",\n        "legislation_term": "A is not subject to a duty to make reasonable adjustments if A does not know, and could not reasonably be expected to know",\n        "key_phrases": ["duty to make reasonable adjustments"],\n        "reasoning": "Case law indicates that the claimant did not experience a disadvantage, which relates to the lack of duty for reasonable adjustments unde

In [14]:
import ast
import json

# The value displayed for `interpretations` is JSON text, so parse it as JSON
# first: ast.literal_eval rejects the JSON literals true/false/null.
# literal_eval is kept as a fallback for output that is a Python literal
# rather than strict JSON (e.g. single-quoted strings), which is what the
# previous version of this cell accepted.
try:
    interpretations_list = json.loads(interpretations)
except json.JSONDecodeError:
    interpretations_list = ast.literal_eval(interpretations)

for interpretation in interpretations_list:
    case_law_term = interpretation['caselaw_term']
    legislation_term = interpretation['legislation_term']
    confidence = interpretation['confidence']
    # Exact, case-sensitive substring checks: is each extracted term a verbatim
    # excerpt of the paragraph / section text it is supposed to come from?
    print(case_law_term in para_text)
    print(legislation_term in section_text)




True
True
False
True


In [8]:
# Show the contents of the prompt file so the exact instructions are visible.
# Any failure while reading or printing is reported instead of raised.
try:
    with open(prompt_file, encoding='utf-8') as prompt_fh:
        prompt_content = prompt_fh.read()
    # Header, blank line, then the prompt text.
    print("Prompt from prompt_file:\n", prompt_content, sep="\n")
except Exception as e:
    print(f"‚ùå Could not read prompt file '{prompt_file}': {e}")


Prompt from prompt_file:

You are a legal expert tasked with extracting exact phrases from case law paragraphs that correspond to specific legislation sections.

Your task is to:
1. Analyze the case law paragraph
2. Compare it with the provided legislation section text
3. Extract exact phrases that show how the case law applies or interprets the legislation
4. Provide confidence levels for each extraction
5. Include the section_id for tracking purposes

Return your response in this JSON format:
{
    "para_id": "paragraph_identifier",
    "section_id": "legislation_section_identifier",
    "extracted_phrases": [
        {
            "case_law_term": "exact phrase from case law",
            "legislation_term": "corresponding phrase from legislation",
            "confidence": "High/Medium/Low",
            "reasoning": "explanation of the extraction"
        }
    ],
    "reason": "overall reasoning for the extractions"
}

Focus on finding exact or near-exact matches between the case 

In [9]:
# 6. Build the batch JSONL request file(s) for phrase extraction.
# Everything stays inside the try block so that any failure (including a
# missing input variable) is reported by the handler below.
try:
    jsonl_options = {
        'model_name': model_name,
        'prompt_file': prompt_file,
        'examples_file': examples_file,
        'df': test_df,
        'output_path': output_path,
    }
    created_files = create_batch_jsonl_for_phrase_extraction(**jsonl_options)

    # List every file the helper reports having written.
    print("‚úÖ Successfully created JSONL files:")
    for file_path in created_files:
        print(f"   - {file_path}")

except Exception as e:
    print(f"‚ùå Error creating JSONL files: {e}")
    print("\nThis might be because the prompt or examples files don't exist yet.")

Detected provider: OpenAI
Limits: 50,000 requests, 100MB per file
Building phrase extraction JSONL requests...

Analysis:
  Total requests: 5
  Estimated size: 0.0 MB
  Splits needed: 1 (file within limits)

‚úÖ Creating single file: ../data/final_test/test_phrase_extraction.jsonl
   Actual size: 0.0 MB
   Requests: 5

üìã Summary:
   Provider: OpenAI
   Model: ft:gpt-4o-mini-2024-07-18:swansea-university::B3pbF9HD
   Files created: 1
   Total requests: 5
   Total size: 0.0 MB

üìÅ Output files:
   1. ../data/final_test/test_phrase_extraction.jsonl
‚úÖ Successfully created JSONL files:
   - ../data/final_test/test_phrase_extraction.jsonl


In [21]:
import sys
sys.path.append('../src')
import openAIHandler

# Submit the request JSONL through openAIHandler.get_batch_job.
input_file = '../data/final_test/test_phrase_extraction.jsonl'
batch_job = openAIHandler.get_batch_job(input_file)

# Report the fields of the returned job object, one per line.
print("‚úÖ Batch job submitted successfully!")
print(
    f"Batch ID: {batch_job.id}",
    f"Status: {batch_job.status}",
    f"Input file ID: {batch_job.input_file_id}",
    f"Completion window: {batch_job.completion_window}",
    sep="\n",
)



‚úÖ Batch job submitted successfully!
Batch ID: batch_687f851578108190a8fb55deaa561831
Status: validating
Input file ID: file-Qa5pJtPZPkKYAhazM9XGyq
Completion window: 24h


In [32]:
# You can check the status later with:
# NOTE(review): relies on `batch_job` from the batch-submission cell still
# being defined in the kernel.
import openai
# No api_key is passed here; presumably the client reads it from the
# environment — confirm.
client = openai.OpenAI()
# Look the batch up again by the submitted job's ID and print its status.
batch_status = client.batches.retrieve(batch_job.id)
print(f"Current status: {batch_status.status}")

Current status: completed


In [33]:
# Echo the ID of the submitted batch (requires `batch_job` to be defined).
print(batch_job.id)

batch_687f851578108190a8fb55deaa561831


In [34]:
import openai
import os
from dotenv import load_dotenv
import json
import pandas as pd

# Load environment variables (a project-relative path first, then the
# default lookup).
load_dotenv('src/.env')
load_dotenv()

# Set up client
client = openai.OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

# Destination for the downloaded results. It deliberately has its own name:
# `output_path` is what the JSONL-creation cells pass as the destination of
# the *request* file, and rebinding it here would make a later re-run of those
# cells write over the downloaded results.
batch_results_path = '../data/final_test/batch_output_results.jsonl'

# List recent batches and download the first completed one that has output.
# NOTE(review): this takes the first completed batch in the listing, which is
# not necessarily the batch submitted from this notebook — compare the printed
# ID with the submitted batch ID.
batches = client.batches.list(limit=5)
print('Recent batches:')
for batch in batches.data:
    print(f'ID: {batch.id}, Status: {batch.status}')
    if batch.status != 'completed':
        continue

    print(f'  ‚úÖ Completed batch found: {batch.id}')
    print(f'  Output file ID: {batch.output_file_id}')

    # Without an output file ID there is nothing to download; keep looking
    # instead of passing None to the files API.
    if not batch.output_file_id:
        print('  No output file for this batch; skipping it.')
        continue

    # Download the results
    result = client.files.content(batch.output_file_id)
    with open(batch_results_path, 'wb') as f:
        f.write(result.read())

    print(f'  üì• Results downloaded to: {batch_results_path}')
    break
else:
    # Reached only when the loop finished without downloading anything.
    print('No completed batch with an output file was found among the recent batches.')

Recent batches:
ID: batch_687f851578108190a8fb55deaa561831, Status: completed
  ‚úÖ Completed batch found: batch_687f851578108190a8fb55deaa561831
  Output file ID: file-7fCcDKU37Zi5xDmNhcPayY
  üì• Results downloaded to: ../data/final_test/batch_output_results.jsonl


In [5]:
import sys
sys.path.append('../src')
from phrase_validator import PhraseValidator
import json
import pandas as pd

# Parse the JSONL results: one JSON object per line, with the model's reply
# (itself a JSON document) stored under
# response -> body -> choices[0] -> message -> content.
results = []
# Explicit UTF-8 so decoding does not depend on the platform default encoding.
with open('../data/final_test/batch_output_results.jsonl', 'r', encoding='utf-8') as f:
    for line_number, line in enumerate(f, start=1):
        # Skip blank lines (e.g. a trailing newline) instead of failing on them.
        if not line.strip():
            continue

        try:
            data = json.loads(line)
            # Extract the response
            response_content = data['response']['body']['choices'][0]['message']['content']
            # Parse the JSON response
            extracted_data = json.loads(response_content)
        except (json.JSONDecodeError, KeyError, IndexError, TypeError) as e:
            # A malformed line, a missing/None field, or a non-JSON reply:
            # report which line it was and carry on with the rest.
            print(f"Error parsing response on line {line_number}: {e}")
            continue

        results.append(extracted_data)

print(f"‚úÖ Parsed {len(results)} results")

# Show sample result
if results:
    print("\nüìã Sample result:")
    print(json.dumps(results[0], indent=2))

‚úÖ Parsed 5 results

üìã Sample result:
{
  "para_id": "ewhc_ch_2009_1229#para_35",
  "section_id": "id/ukpga/1998/42_section-8",
  "extracted_phrases": [
    {
      "case_law_term": "public interest generally requires the precise facts relevant to the decision to be a matter of public record",
      "legislation_term": "any act (or proposed act) of a public authority which the court finds is (or would be) unlawful, it may grant such relief or remedy",
      "confidence": "Medium",
      "reasoning": "The case law reflects the complexity and sensitivity of public authority decisions, aligning with the legislation's framework for judicial remedies concerning public authorities."
    }
  ],
  "reason": "The case law emphasizes the public interest and the importance of public record in tax disputes, which relates to judicial remedies against public authority actions deemed unlawful."
}


In [None]:
# Run validation on the results
# NOTE(review): `merged_df` is not created by any cell in this notebook as
# saved, so it must already exist in the kernel — define it before running
# this cell on a fresh kernel.
validation_results = PhraseValidator.validate_extractions_df(merged_df)

print("üîç Validation Results:")
# len(validation_results) counts validation records, not paragraphs: the
# recorded output shows 7 here against 2 paragraphs in the summary below,
# so the label must not say "paragraphs".
print(f"Total validation records: {len(validation_results)}")

# Show validation summary
# The summary is read through the keys 'total_paragraphs', 'valid_paragraphs',
# 'invalid_paragraphs', 'success_rate' and 'failure_reasons'.
summary = PhraseValidator.get_validation_summary(validation_results)
print(f"\nüìà Validation Summary:")
print(f"Total paragraphs: {summary['total_paragraphs']}")
print(f"Valid paragraphs: {summary['valid_paragraphs']}")
print(f"Invalid paragraphs: {summary['invalid_paragraphs']}")
print(f"Success rate: {summary['success_rate']:.1f}%")

# One line per failure reason with the number of times it occurred.
print(f"\n‚ùå Failure reasons:")
for reason, count in summary['failure_reasons'].items():
    print(f"  {reason}: {count}")

üîç Validation Results:
Total paragraphs: 7

üìà Validation Summary:
Total paragraphs: 2
Valid paragraphs: 0
Invalid paragraphs: 2
Success rate: 0.0%

‚ùå Failure reasons:
  No extracted phrases found: 4
  No case law excerpt extracted: 3


In [None]:
# Run validation on the results
# NOTE(review): `merged_df` is not created by any cell in this notebook as
# saved, so it must already exist in the kernel — define it before running
# this cell on a fresh kernel.
validation_results = PhraseValidator.validate_extractions_df(merged_df)

print("üîç Validation Results:")
# len(validation_results) counts validation records, not paragraphs: the
# recorded output shows 7 here against 2 paragraphs in the summary below,
# so the label must not say "paragraphs".
print(f"Total validation records: {len(validation_results)}")

# Show validation summary
# The summary is read through the keys 'total_paragraphs', 'valid_paragraphs',
# 'invalid_paragraphs', 'success_rate' and 'failure_reasons'.
summary = PhraseValidator.get_validation_summary(validation_results)
print(f"\nüìà Validation Summary:")
print(f"Total paragraphs: {summary['total_paragraphs']}")
print(f"Valid paragraphs: {summary['valid_paragraphs']}")
print(f"Invalid paragraphs: {summary['invalid_paragraphs']}")
print(f"Success rate: {summary['success_rate']:.1f}%")

# One line per failure reason with the number of times it occurred.
print(f"\n‚ùå Failure reasons:")
for reason, count in summary['failure_reasons'].items():
    print(f"  {reason}: {count}")

NameError: name 'merged_df' is not defined

In [None]:
# Run validation on the results
# NOTE(review): `merged_df` is not created by any cell in this notebook as
# saved, so it must already exist in the kernel — define it before running
# this cell on a fresh kernel.
validation_results = PhraseValidator.validate_extractions_df(merged_df)

print("üîç Validation Results:")
# len(validation_results) counts validation records, not paragraphs: the
# recorded output shows 7 here against 2 paragraphs in the summary below,
# so the label must not say "paragraphs".
print(f"Total validation records: {len(validation_results)}")

# Show validation summary
# The summary is read through the keys 'total_paragraphs', 'valid_paragraphs',
# 'invalid_paragraphs', 'success_rate' and 'failure_reasons'.
summary = PhraseValidator.get_validation_summary(validation_results)
print(f"\nüìà Validation Summary:")
print(f"Total paragraphs: {summary['total_paragraphs']}")
print(f"Valid paragraphs: {summary['valid_paragraphs']}")
print(f"Invalid paragraphs: {summary['invalid_paragraphs']}")
print(f"Success rate: {summary['success_rate']:.1f}%")

# One line per failure reason with the number of times it occurred.
print(f"\n‚ùå Failure reasons:")
for reason, count in summary['failure_reasons'].items():
    print(f"  {reason}: {count}")

üîç Validation Results:
Total paragraphs: 7

üìà Validation Summary:
Total paragraphs: 2
Valid paragraphs: 0
Invalid paragraphs: 2
Success rate: 0.0%

‚ùå Failure reasons:
  No extracted phrases found: 4
  No case law excerpt extracted: 3


In [4]:
# Convert results to DataFrame format for validation
# Built with a comprehension so no per-row temporaries leak into the notebook
# namespace: a plain loop assigning `para_id` / `section_id` would overwrite
# the notebook-level variables of the same name that the single-example
# extraction cell passes to getInterPretations.
validation_data = [
    {
        'para_id': result.get('para_id', ''),
        'section_id': result.get('section_id', ''),
        # Convert extracted_phrases back to string for validator
        'extracted_phrases': json.dumps(result.get('extracted_phrases', [])),
    }
    for result in results
]

# Naming the columns keeps the frame's schema stable even when `results` is
# empty (otherwise an empty list yields a frame with no columns at all).
results_df = pd.DataFrame(
    validation_data,
    columns=['para_id', 'section_id', 'extracted_phrases'],
)
print(f"üìä Created DataFrame with {len(results_df)} rows")
print(f"Columns: {list(results_df.columns)}")

üìä Created DataFrame with 5 rows
Columns: ['para_id', 'section_id', 'extracted_phrases']


In [10]:
# 7. Smoke-test the validator helpers on a tiny hand-written example
print("Testing phrase validator...")

# Fixture: a case-law sentence, a legislation sentence, and one term from each.
sample_text, sample_section = (
    "The court applied the principle of natural justice.",
    "Natural justice requires fair procedures.",
)
sample_case_law_term, sample_legislation_term = (
    "principle of natural justice",
    "natural justice",
)

# Show the input next to what clean_text returns for it.
cleaned_text = PhraseValidator.clean_text(sample_text)
print(f"Original: '{sample_text}'", f"Cleaned: '{cleaned_text}'", sep="\n")

# validate_phrase_match's return value is unpacked as (is_valid, reason).
is_valid, reason = PhraseValidator.validate_phrase_match(
    sample_case_law_term,
    sample_legislation_term,
    sample_text,
    sample_section,
)
print(f"\nValidation result: {is_valid} - {reason}")

Testing phrase validator...
Original: 'The court applied the principle of natural justice.'
Cleaned: 'the court applied the principle of natural justice'

Validation result: True - Both terms found in respective texts


In [None]:
# 9. Now try creating the JSONL again with the files created, then inspect
# the first request of each file to confirm section_id made it into the prompt.
try:
    created_files = create_batch_jsonl_for_phrase_extraction(
        model_name=model_name,
        prompt_file=prompt_file,
        examples_file=examples_file,
        df=test_df,
        output_path=output_path
    )

    print("‚úÖ Successfully created JSONL files:")
    for file_path in created_files:
        print(f"   - {file_path}")

        # Show sample content
        # Explicit UTF-8 so reading does not depend on the platform default
        # encoding (the prompt file is read as UTF-8 elsewhere in the notebook).
        with open(file_path, 'r', encoding='utf-8') as f:
            first_line = f.readline().strip()

        # An empty file has no request to inspect; say so rather than letting
        # json.loads('') fail and be reported as a creation error.
        if not first_line:
            print("   (file is empty - no sample request to show)")
            continue

        sample_data = json.loads(first_line)
        # The last message of the request body is treated as the user message.
        user_content = sample_data['body']['messages'][-1]['content']
        print(f"   Sample user content: {user_content[:150]}...")

        # Check if section_id is included
        if 'section_id:' in user_content:
            print(f"   ‚úÖ section_id is included in the prompt")
        else:
            print(f"   ‚ùå section_id is missing from the prompt")

except Exception as e:
    print(f"‚ùå Error creating JSONL files: {e}")