# Legal Data Viewer

This notebook displays the original legal documents alongside the clauses extracted from them, filtered by topic (Privacy, Liability).

In [None]:
import pandas as pd

# Never truncate cell text: clauses are long and need to be read in full.
pd.options.display.max_colwidth = None

In [None]:
# Load Data
# Both paths are relative to the notebook's working directory.
original_path = '../data/parquets/mistral_instruction_data.parquet'
processed_path = '../data/preprocessed/filtered_clauses.parquet'

# Later cells decide whether to run by testing `'df_orig' in locals()` /
# `'df_proc' in locals()`, so a failed load must leave the name undefined.
# When this cell is re-run and a file has gone missing, the frame from the
# earlier run would otherwise linger and be displayed as if it were current,
# so it is explicitly dropped on failure.
try:
    df_orig = pd.read_parquet(original_path)
    print(f"Original Data Loaded: {df_orig.shape}")
except FileNotFoundError:
    globals().pop('df_orig', None)  # discard any stale frame from a previous run
    print("Original data file not found.")

try:
    df_proc = pd.read_parquet(processed_path)
    print(f"Processed Data Loaded: {df_proc.shape}")
except FileNotFoundError:
    globals().pop('df_proc', None)  # discard any stale frame from a previous run
    print("Processed data file not found. Please run src/preprocessing.py first.")

## Statistics

In [None]:
# Count how many extracted clauses were matched to each topic.
# Skipped entirely when the processed data was not loaded.
if 'df_proc' in locals():
    print("Topic Distribution:")
    topic_counts = df_proc['matched_topic'].value_counts()
    print(topic_counts)

## Sample Verification

Let's look at a few extracted clauses and, where a clause maps back to a source document through `original_index`, its original context.

In [None]:
# Show a handful of randomly chosen extracted clauses for a quick sanity check.
if 'df_proc' in locals():
    print("--- Sample Extracted Clauses ---")
    # DataFrame.sample raises ValueError when asked for more rows than the
    # frame holds (sampling without replacement), so cap the request at the
    # number of rows available. An empty frame simply displays as empty.
    display(df_proc.sample(min(5, len(df_proc))))

In [None]:
# Pick one source document at random and show it next to every clause that was
# extracted from it. Runs only when both frames were loaded.
if 'df_proc' in locals() and 'df_orig' in locals():
    if df_proc.empty:
        # Sampling one row from an empty frame would raise ValueError.
        print("No extracted clauses available to sample.")
    else:
        # Sample from the column itself rather than from a whole row: a
        # row-wise Series takes a common dtype across all columns, which can
        # alter the value that is then used as a label into df_orig.loc.
        sample_idx = df_proc['original_index'].sample(1).iloc[0]

        # Maximum number of characters of the source document to print.
        preview_chars = 1000

        print(f"--- Original Document (Index: {sample_idx}) ---")
        doc_text = df_orig.loc[sample_idx, 'input']
        if len(doc_text) > preview_chars:
            # Truncate for display; the ellipsis marks that text was cut.
            print(doc_text[:preview_chars] + "...")
        else:
            print(doc_text)

        print(f"\n--- Extracted Clauses for Index {sample_idx} ---")
        display(df_proc[df_proc['original_index'] == sample_idx])