# Explore Temporal Awareness Datasets

Interactive exploration of the CAA-style contrastive pairs for temporal scope detection.

In [None]:
import json
from pathlib import Path
import pandas as pd

# Raw dataset files are read from ../data/raw, relative to the working directory.
DATA_DIR = Path("..", "data", "raw")

# List every JSON file in that folder, sorted by path, with its size in KB.
print("Available datasets:")
for dataset_file in sorted(DATA_DIR.glob("*.json")):
    size_kb = dataset_file.stat().st_size / 1024
    print(f"  {dataset_file.name:45} ({size_kb:.1f} KB)")

## 1. Main Dataset: Explicit Temporal Pairs (CAA Training)

In [None]:
# Load the explicit-pairs dataset.
# The encoding is passed explicitly so the read does not depend on the
# platform's locale default (which is not UTF-8 on every system); JSON
# files are conventionally UTF-8.
with open(DATA_DIR / "temporal_scope_caa.json", encoding="utf-8") as f:
    explicit = json.load(f)

# The file is read as an object with a "metadata" mapping and a "pairs" list.
print("Metadata:")
for k, v in explicit["metadata"].items():
    print(f"  {k}: {v}")

pairs = explicit["pairs"]
print(f"\nTotal pairs: {len(pairs)}")

In [None]:
# Tabulate the explicit pairs and add shortened previews of both answers:
# the first 60 characters of each, always followed by "...".
df_explicit = pd.DataFrame(pairs)
immediate_head = df_explicit["immediate"].str.slice(stop=60)
df_explicit["immediate_preview"] = immediate_head + "..."
long_term_head = df_explicit["long_term"].str.slice(stop=60)
df_explicit["long_term_preview"] = long_term_head + "..."

# Number of pairs per category, most frequent first.
print("Categories:")
category_counts = df_explicit["category"].value_counts()
print(category_counts)

In [None]:
# Print the first five explicit pairs under a banner.
rule = "=" * 80
print(rule, "SAMPLE EXPLICIT PAIRS (with temporal keywords)", rule, sep="\n")

# Samples are numbered from 1; each shows the question and both answers.
for number, sample in enumerate(pairs[:5], start=1):
    print(
        f"\n[{number}] Category: {sample['category']}",
        f"Question: {sample['question']}",
        f"  IMMEDIATE: {sample['immediate']}",
        f"  LONG-TERM: {sample['long_term']}",
        sep="\n",
    )

## 2. Test Dataset: Implicit Temporal Pairs

These pairs use **semantic cues** without explicit temporal keywords.

In [None]:
# Load the implicit-pairs dataset.
# The encoding is passed explicitly so the read does not depend on the
# platform's locale default (which is not UTF-8 on every system); JSON
# files are conventionally UTF-8.
with open(DATA_DIR / "temporal_scope_implicit.json", encoding="utf-8") as f:
    implicit = json.load(f)

pairs_impl = implicit["pairs"]
print(f"Implicit pairs: {len(pairs_impl)}")

# Tabulate the pairs and show how many fall into each category.
df_implicit = pd.DataFrame(pairs_impl)
print("\nCategories:")
print(df_implicit["category"].value_counts())

In [None]:
# Print the first five implicit pairs under a banner.
rule = "=" * 80
print(rule, "SAMPLE IMPLICIT PAIRS (semantic cues only)", rule, sep="\n")

# Samples are numbered from 1; each shows the question and both answers.
for number, sample in enumerate(pairs_impl[:5], start=1):
    print(
        f"\n[{number}] Category: {sample['category']}",
        f"Question: {sample['question']}",
        f"  IMMEDIATE: {sample['immediate']}",
        f"  LONG-TERM: {sample['long_term']}",
        sep="\n",
    )

## 3. Dataset Quality Check

In [None]:
# Check for keyword leakage in implicit dataset
import re

temporal_keywords = [
    "now", "immediate", "urgent", "today", "soon", "quick", "instant",
    "future", "long-term", "years", "decade", "lasting", "permanent", "eventually"
]

# Match each keyword only where it starts a word. A plain substring test
# also fires on unrelated words ("now" inside "know" or "snow"), which
# inflates the contamination count. Anchoring on a leading word boundary
# avoids that while still catching inflected forms such as "immediately",
# "quickly" or "decades". Patterns are compiled once, outside the loop.
keyword_patterns = {
    kw: re.compile(rf"\b{re.escape(kw)}") for kw in temporal_keywords
}

print("Checking implicit dataset for temporal keyword contamination...\n")

# Each entry is (pair index, pair dict, list of matched keywords).
contaminated = []
for i, pair in enumerate(pairs_impl):
    # Only the two answers are scanned (case-insensitively); the question
    # text is not part of the check.
    text = f"{pair['immediate']} {pair['long_term']}".lower()
    found = [
        kw for kw, pattern in keyword_patterns.items() if pattern.search(text)
    ]
    if found:
        contaminated.append((i, pair, found))

if contaminated:
    print(f"⚠️  Found {len(contaminated)} potentially contaminated pairs:\n")
    # Show at most the first five hits to keep the output short.
    for idx, pair, keywords in contaminated[:5]:
        print(f"  [{idx}] Keywords: {keywords}")
        print(f"      Q: {pair['question'][:50]}...")
else:
    print("✓ No temporal keywords found in implicit dataset")

## 4. Full Dataset Table

In [None]:
# Tag each frame with the dataset it came from, then stack both into one
# table with a fresh 0..n-1 index.
for frame, source in ((df_explicit, "explicit"), (df_implicit, "implicit")):
    frame["dataset"] = source

df_all = pd.concat([df_explicit, df_implicit], ignore_index=True)
print(f"Total pairs: {len(df_all)}")

# Cross-tabulate pair counts: one row per dataset, one column per category,
# with 0 where a dataset has no pairs in a category.
print("\nBy dataset and category:")
counts_by_category = df_all.groupby(["dataset", "category"]).size().unstack(fill_value=0)
print(counts_by_category)

In [None]:
# Allow up to 100 characters per text column, then show the combined table.
# The final bare expression is what the notebook renders as the cell output.
pd.set_option("display.max_colwidth", 100)
overview_columns = ["dataset", "category", "question", "immediate", "long_term"]
df_all[overview_columns]

## 5. Export for Review

In [None]:
# Export to CSV for easy review
output_path = Path("..") / "results" / "tables" / "all_pairs.csv"
# DataFrame.to_csv does not create missing directories, so make sure the
# target folder exists first (a no-op when it is already there).
output_path.parent.mkdir(parents=True, exist_ok=True)
# index=False leaves the row index out of the file.
df_all.to_csv(output_path, index=False)
print(f"Exported to {output_path}")