# Experiment Handler for Legal Text Analysis

This notebook implements experiments to measure the F1 score of law application detection and the accuracy of legal phrase extraction.

In [1]:
import os
import json
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import sys
import os
import json
import pandas as pd
import pickle

sys.path.append(os.getcwd())  # Add current directory to Python path

# Standard library imports
import matplotlib.pyplot as plt

# Import functions from experiment_utils.py
from experiment_utils import (
    load_csv_files,
    prepare_training_data,
    run_experiment,
    calculate_metrics,
    generate_experiment_report,
    visualize_results,
    process_legislation_references,
    extract_legislation_references,
    downloadThelegislationIfNotExist,
    measure_phrase_extraction_accuracy,
    run_full_case_experiment
)

## Step 1: Data Preparation

In this step, we prepare the training and testing files for our experiments. We'll use false positives as negative examples and false negatives as positive examples. For each case law, we'll use it as testing data and all other case laws as training examples.

In [2]:
# Define paths
notebook_dir = os.path.abspath('..')
print(notebook_dir)
input_folder_path = os.path.join(notebook_dir, 'data/test2/csv_cases/Experiment1-byPara')
output_folder_path = os.path.join(notebook_dir, 'data/test3')
training_data_path = os.path.join(notebook_dir, 'data/test2/training_data')
false_positives_path = os.path.join(notebook_dir, 'data/test2/csv_cases/Experiment2-fullcaselaw/false_positives.csv')
false_negatives_path = os.path.join(notebook_dir, 'data/test2/csv_cases/Experiment2-fullcaselaw/false_negatives.csv')

# Create output directories if they don't exist
os.makedirs(output_folder_path, exist_ok=True)
os.makedirs(training_data_path, exist_ok=True)

/Users/apple/Documents/Swansea/Projects/Odyssey-Terms-Extraction-Journal


In [3]:
# Load all CSV files from the input folder
case_files = load_csv_files(input_folder_path)
print(f"Loaded {len(case_files)} case files")

Loaded ewca_civ_2025_215 with 46 rows
Loaded ewhc_scco_2025_374 with 16 rows
Loaded ukftt_grc_2025_287 with 45 rows
Loaded ukftt_grc_2025_251 with 57 rows
Loaded ukftt_grc_2025_284 with 45 rows
Loaded ukftt_grc_2025_282 with 14 rows
Loaded ukftt_grc_2025_283 with 39 rows
Loaded eat_2025_29 with 57 rows
Loaded 8 case files


In [11]:
list(case_files.keys())

['ewca_civ_2025_215',
 'ewhc_scco_2025_374',
 'ukftt_grc_2025_287',
 'ukftt_grc_2025_251',
 'ukftt_grc_2025_284',
 'ukftt_grc_2025_282',
 'ukftt_grc_2025_283',
 'eat_2025_29']

In [4]:
# Load false positives and false negatives
try:
    false_positives_df = pd.read_csv(false_positives_path)
    print(f"Loaded {len(false_positives_df)} false positives")
except Exception as e:
    print(f"Error loading false positives: {str(e)}")
    false_positives_df = pd.DataFrame()

try:
    false_negatives_df = pd.read_csv(false_negatives_path)
    print(f"Loaded {len(false_negatives_df)} false negatives")
except Exception as e:
    print(f"Error loading false negatives: {str(e)}")
    false_negatives_df = pd.DataFrame()

Loaded 21 false positives
Loaded 3 false negatives


In [5]:
# Prepare training data for each case
#read data/test2/testing-data.json and data/test2/training-data.json
#combine them into one json file
with open(os.path.join(notebook_dir, 'data/test2/testing-data.json'), 'r') as f:
    testing_data = json.load(f)

with open(os.path.join(notebook_dir, 'data/test2/training-data.json'), 'r') as f:
    training_data = json.load(f)

# combine testing and training
all_cases_data = testing_data + training_data

#convert all_cases_data_list to dictionary by picking by para_id from each json object and making it a key in the dictionary
all_cases_data_dict = {case["para_id"]: case for case in all_cases_data}


for test_case_name in case_files.keys():
    print(f"\nPreparing training data for test case: {test_case_name}")
    training_examples = prepare_training_data(case_files, test_case_name, false_positives_df, false_negatives_df,all_cases_data_dict)
    
    # Save training data to JSON file
    training_file_path = os.path.join(training_data_path, f"{test_case_name}_training.json")
    with open(training_file_path, 'w') as f:
        json.dump(training_examples, f, indent=2)
    
    print(f"Saved training data to {training_file_path}")


Preparing training data for test case: ewca_civ_2025_215
Initial training data: 3 positive, 18 negative
para_13
para_14
para_15
para_32
para_33
para_35
para_42
para_43
para_44
para_45
para_46
para_47
para_48
para_36
para_37
Final training data: 18 positive, 18 negative
Saved training data to /Users/apple/Documents/Swansea/Projects/Odyssey-Terms-Extraction-Journal/data/test2/training_data/ewca_civ_2025_215_training.json

Preparing training data for test case: ewhc_scco_2025_374
Initial training data: 3 positive, 21 negative
para_15
para_20
para_25
para_26
para_32
para_33
para_35
para_42
para_43
para_44
para_45
para_46
para_47
para_48
para_36
para_37
para_12
para_13
Final training data: 21 positive, 21 negative
Saved training data to /Users/apple/Documents/Swansea/Projects/Odyssey-Terms-Extraction-Journal/data/test2/training_data/ewhc_scco_2025_374_training.json

Preparing training data for test case: ukftt_grc_2025_287
Initial training data: 2 positive, 18 negative
para_15
para_20
para

## Step 2: Run Experiments and Measure F1 Scores

In this step, we run the experiments using the training data prepared in Step 1. We'll use different LLMs (Claude, GPT-4o, GPT-4o-mini, and Llama-70B) and measure the F1 scores for each case law.

In [4]:
# Define experiment paths
experiment_folder_path = os.path.join(notebook_dir, 'data/test2/Experiment1')
os.makedirs(experiment_folder_path, exist_ok=True)

In [4]:
# Define models to test
models = ['gpt-4o-mini','gpt-4o','llama-3.3-70b-versatile']
input_folder_path = os.path.join(notebook_dir, 'data/test3')
experiment_folder_path= os.path.join(notebook_dir, 'data/test3')
# Run experiments for each case and model
all_results = []

for test_case_name in case_files.keys():
    for model_name in models:
        
        result = run_experiment(test_case_name, model_name, training_data_path, input_folder_path, experiment_folder_path)
        all_results.append(result)

# Save all results to a JSON file
results_file_path = os.path.join(experiment_folder_path, "experiment_results.json")
with open(results_file_path, 'w') as f:
    json.dump(all_results, f, indent=2)

print(f"Saved all experiment results to {results_file_path}")

Running experiment for ewca_civ_2025_215 with gpt-4o-mini
batch size is 30
starting again...
starting again...
Done
Index(['case_uri', 'para_id', 'paragraphs', 'references',
       'if_law_applied_actual', 'application_of_law_phrases_actual',
       'if_law_applied_gpt-4o-mini', 'application_of_law_phrases_gpt-4o-mini',
       'reason_gpt-4o-mini'],
      dtype='object')
Running experiment for ewca_civ_2025_215 with gpt-4o
batch size is 30
starting again...
starting again...
Done
Index(['case_uri', 'para_id', 'paragraphs', 'references',
       'if_law_applied_actual', 'application_of_law_phrases_actual',
       'if_law_applied_gpt-4o-mini', 'application_of_law_phrases_gpt-4o-mini',
       'reason_gpt-4o-mini', 'if_law_applied_gpt-4o',
       'application_of_law_phrases_gpt-4o', 'reason_gpt-4o'],
      dtype='object')
Running experiment for ewca_civ_2025_215 with llama-3.3-70b-versatile
batch size is 30
starting again...
starting again...
Done
Index(['case_uri', 'para_id', 'paragraphs',

In [6]:
# Generate and display experiment report
report_df = generate_experiment_report(all_results)
report_df

Unnamed: 0,Test Case,Model,Precision,Recall,F1 Score,Accuracy,True Positives,False Positives,True Negatives,False Negatives
0,ewca_civ_2025_215,gpt-4o-mini,0.136364,0.75,0.230769,0.565217,3.0,19.0,23.0,1.0
1,ewca_civ_2025_215,gpt-4o,0.333333,0.75,0.461538,0.847826,3.0,6.0,36.0,1.0
2,ewca_civ_2025_215,llama-3.3-70b-versatile,0.333333,1.0,0.5,0.826087,4.0,8.0,34.0,0.0
3,ewhc_scco_2025_374,gpt-4o-mini,0.5,0.666667,0.571429,0.8125,2.0,2.0,11.0,1.0
4,ewhc_scco_2025_374,gpt-4o,1.0,0.666667,0.8,0.9375,2.0,0.0,13.0,1.0
5,ewhc_scco_2025_374,llama-3.3-70b-versatile,0.375,1.0,0.545455,0.6875,3.0,5.0,8.0,0.0
6,ukftt_grc_2025_287,gpt-4o-mini,0.083333,0.666667,0.148148,0.488889,2.0,22.0,20.0,1.0
7,ukftt_grc_2025_287,gpt-4o,0.0,0.0,0.0,0.711111,0.0,10.0,32.0,3.0
8,ukftt_grc_2025_287,llama-3.3-70b-versatile,0.09375,1.0,0.171429,0.355556,3.0,29.0,13.0,0.0
9,ukftt_grc_2025_251,gpt-4o-mini,0.363636,0.571429,0.444444,0.824561,4.0,7.0,43.0,3.0


In [7]:
# Visualize results
visualize_results(report_df, experiment_folder_path)

## Step 3: Process Legislation References and Extract Phrases

In this step, we process legislation references from the case laws and extract key phrases that show how the law is applied. We'll also measure the accuracy of the extracted phrases.

In [None]:
# Define paths for legislation processing
legislation_dir = os.path.join(notebook_dir, 'data/test2/csv_cases/legislation')
temp_folder_path = os.path.join(notebook_dir, 'data/test2/csv_cases/temp')
os.makedirs(legislation_dir, exist_ok=True)
os.makedirs(temp_folder_path, exist_ok=True)

In [None]:
# Process legislation references
process_legislation_references(input_folder_path, legislation_dir, temp_folder_path)

In [None]:
# Extract phrases from case laws
import keyPhraseExtractor

case_act_pickle_file = os.path.join(temp_folder_path, 'cleaned_case_legislation_map.pkl')
input_dir = temp_folder_path
output_dir = os.path.join(temp_folder_path, 'output')
output_folder_path_for_aggregated_result = output_dir

os.makedirs(output_dir, exist_ok=True)

keyPhraseExtractor.extractThePhrases(case_act_pickle_file, input_dir, output_dir, legislation_dir, output_folder_path_for_aggregated_result)

In [None]:
# Measure accuracy of phrase extraction
actual_file = os.path.join(notebook_dir, 'data/test2/Experiments2- Full caselaw/temp/output/ExplodedPhrases.csv')
predicted_file = os.path.join(output_dir, 'ExplodedPhrases.csv')

accuracy_metrics = measure_phrase_extraction_accuracy(actual_file, predicted_file)
print("Phrase extraction accuracy metrics:")
for metric, value in accuracy_metrics.items():
    print(f"{metric}: {value}")

## Step 4: Full Case Law Experiments

In this step, we run experiments with full case laws instead of sending paragraphs one by one. We'll measure the precision, recall, and F1 score for the law application detection and also measure the accuracy of the extracted phrases.

In [5]:
# Define paths for full case experiments
json_cases_dir = os.path.join(notebook_dir, 'data/test2/Json_cases')
full_case_output_dir = os.path.join(notebook_dir, 'data/test4/Full_case_experiments')
os.makedirs(json_cases_dir, exist_ok=True)
os.makedirs(full_case_output_dir, exist_ok=True)

In [6]:
# Convert XML cases to JSON
from JudgementHandler import JudgmentParser
import json

xml_cases_dir = os.path.join(notebook_dir, 'data/test2/xml_cases')
xml_files = [f for f in os.listdir(xml_cases_dir) if f.endswith('.xml')]

for xml_file in xml_files:
    xml_path = os.path.join(xml_cases_dir, xml_file)
    json_path = os.path.join(json_cases_dir, f"{xml_file.replace('.xml', '')}.json")
    
    handler = JudgmentParser(xml_path)
    results = handler.get_judgment_body_paragraphs_text()
    
    output = []
    for case_uri, para_id, text, refs in results:
        output.append({
            'caseUri': case_uri,
            'paragraphId': para_id, 
            'text': text,
            'references': refs
        })
    
    with open(json_path, 'w') as f:
        json.dump(output, f, indent=2)
    
    print(f"Converted {xml_file} to JSON")

Found paragraphs: 57
Converted eat_2025_29.xml to JSON
Found paragraphs: 52
Converted ewfc_2025_41.xml to JSON
Found paragraphs: 18
Converted ukftt_grc_2025_289.xml to JSON
Found paragraphs: 37
Converted ewhc_admin_2025_462.xml to JSON
Found paragraphs: 45
Converted ukftt_grc_2025_284.xml to JSON
Found paragraphs: 0
Converted ukftt_grc_2025_251.xml to JSON
Found paragraphs: 0
Converted ukftt_grc_2025_287.xml to JSON
Found paragraphs: 39
Converted ukftt_grc_2025_283.xml to JSON
Found paragraphs: 0
Converted ukftt_grc_2025_282.xml to JSON
Found paragraphs: 16
Converted ewhc_scco_2025_374.xml to JSON
Found paragraphs: 0
Converted ewca_civ_2025_215.xml to JSON


In [7]:
# Run full case experiments
full_case_results = []

for json_file in os.listdir(json_cases_dir):
    if json_file.endswith('.json'):
        case_name = json_file.replace('.json', '')
        json_path = os.path.join(json_cases_dir, json_file)
        
        # Load training examples for this case
        training_file_path = os.path.join(training_data_path, f"{case_name}_training.json")
        if os.path.exists(training_file_path):
            with open(training_file_path, 'r') as f:
                training_examples = json.load(f)
            
            # Run experiment for gpt-4o model
            output_path = os.path.join(full_case_output_dir, f"{case_name}.json")
            result = run_full_case_experiment(json_path, training_examples, output_path)
            full_case_results.append(result)
        else:
            print(f"No training data found for {case_name}")


content='```json\n[\n    {"para_id": "para_1", "if_law_applied": 0, "section": "Facts"},\n    {"para_id": "para_2", "if_law_applied": 0, "section": "Facts"},\n    {"para_id": "para_3", "if_law_applied": 0, "section": "Facts"},\n    {"para_id": "para_4", "if_law_applied": 0, "section": "Facts"},\n    {"para_id": "para_5", "if_law_applied": 0, "section": "Facts"},\n    {"para_id": "para_6", "if_law_applied": 0, "section": "Facts"},\n    {"para_id": "para_7", "if_law_applied": 0, "section": "Facts"},\n    {"para_id": "para_8", "if_law_applied": 0, "section": "Facts"},\n    {"para_id": "para_9", "if_law_applied": 0, "section": "Facts"},\n    {"para_id": "para_10", "if_law_applied": 0, "section": "Facts"},\n    {"para_id": "para_11", "if_law_applied": 0, "section": "Facts"},\n    {"para_id": "para_12", "if_law_applied": 0, "section": "Facts"},\n    {"para_id": "para_13", "if_law_applied": 0, "section": "Facts"},\n    {"para_id": "para_14", "if_law_applied": 0, "section": "Facts"},\n    {"pa

In [19]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Read the csv file with actual values
f1_list = []
base_input_folder = os.path.join(notebook_dir, 'data/test2/csv_cases/Experiment1-byPara')
experiment_folder_path = os.path.join(notebook_dir, 'data/test4/Full_case_experiments')

cases = list(case_files.keys())
for case_name in cases:
    base_input_file = os.path.join(base_input_folder, case_name)
    experiment_file = os.path.join(experiment_folder_path, case_name)

    base_df = pd.read_csv(base_input_file+'.csv',index_col=False)
    experiment_df = pd.read_csv(experiment_file+'.csv',index_col=False)
    # Merge the dataframes on 'para_id'

    merged_df = pd.merge(base_df[['para_id', 'if_law_applied_actual']],
                        experiment_df[['para_id', 'if_law_applied']],
                        on='para_id')
    
    



    # Find false negatives (actual=1, predicted=0)
    false_negatives = merged_df[
        (merged_df['if_law_applied_actual'] == 1) & 
        (merged_df['if_law_applied'] == False)
    ]

    # Find false positives (actual=0, predicted=1)
    false_positives = merged_df[
        (merged_df['if_law_applied_actual'] == 0) & 
        (merged_df['if_law_applied'] == True)
    ]

    print("False Negatives (missed applications of law):")
    print(false_negatives[['para_id']])
    
    print("\ncase_name:", case_name)
    #measure precision ,recall and f1 score using scikit-learn
    y_true = merged_df['if_law_applied_actual']
    y_pred = merged_df['if_law_applied'].astype(int)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")
    f1_list.append(f1)
    

False Negatives (missed applications of law):
    para_id
14  para_15
19  para_20

case_name: ewca_civ_2025_215
Precision: 0.2
Recall: 0.5
F1 Score: 0.2857142857142857
False Negatives (missed applications of law):
Empty DataFrame
Columns: [para_id]
Index: []

case_name: ewhc_scco_2025_374
Precision: 0.75
Recall: 1.0
F1 Score: 0.8571428571428571
False Negatives (missed applications of law):
Empty DataFrame
Columns: [para_id]
Index: []

case_name: ukftt_grc_2025_287
Precision: 0.23076923076923078
Recall: 1.0
F1 Score: 0.375
False Negatives (missed applications of law):
Empty DataFrame
Columns: [para_id]
Index: []

case_name: ukftt_grc_2025_251
Precision: 0.7
Recall: 1.0
F1 Score: 0.8235294117647058
False Negatives (missed applications of law):
Empty DataFrame
Columns: [para_id]
Index: []

case_name: ukftt_grc_2025_284
Precision: 0.2222222222222222
Recall: 1.0
F1 Score: 0.36363636363636365
False Negatives (missed applications of law):
Empty DataFrame
Columns: [para_id]
Index: []

case_nam

In [18]:
f1_list

[0.2857142857142857,
 0.8571428571428571,
 0.375,
 0.8235294117647058,
 0.36363636363636365,
 0.8,
 0.0,
 0.42857142857142855]

In [17]:
average_f1 = sum(f1_list) / len(f1_list)
print(f"Average F1 score across all cases: {average_f1}")

Average F1 score across all cases: 0.49169929335370516


In [None]:
# Compare full case results with paragraph-by-paragraph results
full_case_metrics = []

for result in full_case_results:
    if "error" in result:
        continue
    
    case_name = result["case"]
    model_name = result["model"]
    output_path = result["output_path"]
    
    # Load actual data
    actual_file_path = os.path.join(input_folder_path, f"{case_name}.csv")
    if os.path.exists(actual_file_path):
        actual_df = pd.read_csv(actual_file_path)
        
        # Load predicted data
        predicted_df = pd.read_csv(output_path)
        
        # Calculate metrics
        metrics = calculate_metrics(actual_df, predicted_df)
        
        full_case_metrics.append({
            "case": case_name,
            "model": model_name,
            "metrics": metrics
        })

# Generate report
full_case_report_df = generate_experiment_report(full_case_metrics)
full_case_report_df

In [None]:
# Compare paragraph-by-paragraph vs full case approach
paragraph_report_df = report_df[report_df["Test Case"] != "Average"].copy()
paragraph_report_df["Approach"] = "Paragraph-by-paragraph"

full_case_report_df_filtered = full_case_report_df[full_case_report_df["Test Case"] != "Average"].copy()
full_case_report_df_filtered["Approach"] = "Full case"

combined_df = pd.concat([paragraph_report_df, full_case_report_df_filtered])

# Visualize comparison
plt.figure(figsize=(14, 8))
sns.barplot(x="Model", y="F1 Score", hue="Approach", data=combined_df)
plt.title("F1 Score by Model and Approach")
plt.ylim(0, 1)
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.legend(title="Approach")
plt.tight_layout()
plt.savefig(os.path.join(full_case_output_dir, "approach_comparison.png"))
plt.show()