# OPPT Dataset Analysis Examples

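This notebook demonstrates how to analyze the OPPT (Open Privacy Policy Taxonomy) dataset of privacy-policy segments. It covers the category and consensus distributions, per-company segment counts, jurisdiction-siloed disclosures, annotator attributes, and export to pandas.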

In [None]:
# Install dependencies if needed (%pip installs into the running kernel's environment)
# %pip install datasets pandas matplotlib

In [None]:
from datasets import load_dataset
from collections import Counter
import json

# Load the OPPT dataset by its repository id and report the size of the
# "train" split, which every later cell reads from.
DATASET_REPO = "Open-Privacy-Policy-Taxonomy/oppt-privacy-policies"
dataset = load_dataset(DATASET_REPO)

segment_total = len(dataset["train"])
print(f"Loaded {segment_total} segments")

## Category Distribution

In [None]:
# Tally how many segments carry each primary category label.
train_split = dataset["train"]
categories = Counter(train_split["primary_category"])
total_segments = len(train_split)

print("Category Distribution:")
print("-" * 40)
# Most frequent label first; the share is relative to all segments in the split.
for label, n_segments in categories.most_common():
    share = n_segments / total_segments * 100
    print(f"{label:25} {n_segments:5} ({share:5.1f}%)")

## Consensus Analysis

In [None]:
# Count segments per consensus type, then print them from most to least common
# together with their percentage of the whole training split.
consensus = Counter(dataset["train"]["category_consensus_type"])
n_rows = len(dataset["train"])

print("Consensus Types:")
print("-" * 40)
for kind, tally in consensus.most_common():
    print(f"{kind:25} {tally:5} ({tally / n_rows * 100:5.1f}%)")

## Company Analysis

In [None]:
# Number of companies to list. One constant drives both the header text and
# the most_common() cut-off so the two cannot drift apart.
TOP_N_COMPANIES = 10

# Count how many segments each company contributes to the training split.
company_counts = Counter(dataset["train"]["company"])

print(f"Total companies: {len(company_counts)}")
print(f"\nTop {TOP_N_COMPANIES} by segment count:")
print("-" * 40)
for company, count in company_counts.most_common(TOP_N_COMPANIES):
    print(f"{company:25} {count:5}")

## Finding Jurisdiction-Siloed Disclosures

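Look for segments whose primary category is `REGIONAL` but whose secondary categories include a substantive category (for example `SALE_SHARING` or `SENSITIVE_DATA`), i.e. substantive disclosures placed inside region-specific sections.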

In [None]:
# Categories that should be universal, not hidden in regional sections
substantive_categories = {
    "FIRST_PARTY",
    "THIRD_PARTY",
    "SALE_SHARING",
    "AUTOMATED_DECISIONS",
    "SENSITIVE_DATA",
    "TRACKING",
}


def _secondary_labels(row):
    """Decode a row's JSON-encoded secondary categories; a falsy value gives []."""
    raw = row["secondary_categories"]
    return json.loads(raw) if raw else []


# Keep only the segments whose primary label is REGIONAL
regional_segments = dataset["train"].filter(
    lambda row: row["primary_category"] == "REGIONAL"
)
print(f"REGIONAL segments: {len(regional_segments)}")

# Count REGIONAL segments that carry at least one substantive secondary label
siloed_count = sum(
    1
    for row in regional_segments
    if any(label in substantive_categories for label in _secondary_labels(row))
)
print(f"REGIONAL segments with substantive secondary categories: {siloed_count}")

## Working with Attributes

In [None]:
# Find all data types mentioned in FIRST_PARTY segments
data_types = Counter()

for segment in dataset["train"]:
    if segment["primary_category"] != "FIRST_PARTY":
        continue

    # A missing or empty annotation is treated as "no attributes": json.loads
    # raises TypeError on None and JSONDecodeError on an empty string.
    raw_attrs = segment["attributes_annotator_1"]
    attrs = json.loads(raw_attrs) if raw_attrs else {}

    if "FIRST_PARTY" in attrs:
        # `or` fallbacks also cover keys that are present but hold a JSON null.
        first_party = attrs["FIRST_PARTY"] or {}
        for dt in first_party.get("personal_information_type") or []:
            data_types[dt] += 1

print("Data types collected (FIRST_PARTY segments):")
print("-" * 40)
# Show only the 15 most frequently mentioned data types
for dt, count in data_types.most_common(15):
    print(f"{dt:35} {count:5}")

## Export for Further Analysis

In [None]:
# Convert to pandas for more analysis options
import pandas as pd

df = dataset["train"].to_pandas()

# Save to CSV if needed
# df.to_csv("oppt_dataset.csv", index=False)

# Preview the first rows. As the cell's last expression the frame is rendered
# through the notebook's rich display instead of a plain-text print().
df.head()