# Data Quickstart & Sanity Checks
Use this notebook to confirm required datasets exist and preview a few rows per domain before running flows.

In [None]:
# Path setup
import sys
from pathlib import Path
# Resolve the repository root relative to the notebook's working directory
# (assumes the kernel was started in a direct subfolder of the project root).
project_root = Path('..').resolve()
# Make the packages under `src/` importable. The membership check keeps the
# cell idempotent: re-running it does not stack duplicate entries on sys.path.
src_dir = str(project_root / 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)
print('Project root:', project_root)

## Expected raw data paths
Update this list if you use different filenames.

In [None]:
from pprint import pprint
# Expected location of every raw dataset, keyed by a short label.
paths = dict(
    fraud=project_root / 'data/raw/fraud/creditcard.csv',
    cyber=project_root / 'data/raw/cyber/kitsune_mirai.csv',
    behavior=project_root / 'data/raw/behavior/online_shoppers_intention.csv',
    nlp_enron=project_root / 'data/raw/nlp/enron_emails.csv',
    nlp_fakenews_dir=project_root / 'data/raw/nlp/fakenews',
    vision_root=project_root / 'data/raw/vision',
)
# Keep only the entries whose file or directory is absent on disk.
missing = dict(filter(lambda item: not item[1].exists(), paths.items()))
if missing:
    print('Missing paths:')
else:
    print('All listed paths exist')
pprint(missing)

## Preview tabular datasets (fraud, cyber, behavior)
Guarded reads: skips if file not found.

In [None]:
import pandas as pd
def safe_head(path, n=5):
    """Report the shape of a CSV file and display its first ``n`` rows.

    Prints a ``Missing:`` notice when ``path`` does not exist. Any exception
    raised while reading or displaying the file is caught and printed, so the
    call never raises for a bad file. Always returns ``None``.
    """
    if path.exists():
        try:
            frame = pd.read_csv(path)
            n_rows, n_cols = frame.shape
            print(f'File: {path.name}, rows={n_rows}, cols={n_cols}')
            display(frame.head(n))
        except Exception as err:
            print(f'Failed to read {path}:', err)
    else:
        print(f'Missing: {path}')

# Preview each tabular dataset in turn.
for domain_key in ('fraud', 'cyber', 'behavior'):
    safe_head(paths[domain_key])

## Preview NLP (Enron or Fake News)
Shows a few texts if available.

In [None]:
# --- Enron preview ---
# The import lives inside the try so that a missing or broken `uais` package
# is reported as a skip instead of aborting the whole cell.
try:
    from uais.data.load_datasets import load_enron_emails
    df_enron = load_enron_emails(subset=3)
    display(df_enron.head(3))
except Exception as e:
    print('Enron load skipped:', e)

# --- Fake News preview ---
# Looks for Fake.csv / True.csv anywhere below the fake-news folder.
fake_news_dir = paths['nlp_fakenews_dir']
if fake_news_dir.exists():
    # Sorted so that the file picked with [0] is the same on every run when
    # several files match.
    fake_csv = sorted(fake_news_dir.rglob('Fake.csv'))
    true_csv = sorted(fake_news_dir.rglob('True.csv'))
    if fake_csv and true_csv:
        try:
            import pandas as pd
            # nrows=2 parses only the rows that are previewed instead of
            # loading each file in full.
            df_fake = pd.read_csv(fake_csv[0], nrows=2)
            df_true = pd.read_csv(true_csv[0], nrows=2)
            display(df_fake)
            display(df_true)
        except Exception as e:
            print('Fake/True preview skipped:', e)
    else:
        # The folder exists but one or both expected files are absent; say so
        # rather than finishing silently.
        print('Fake.csv/True.csv not found under', fake_news_dir)
else:
    print('Fake news dataset not found')

## Preview vision layout
Lists a few subfolders/files (no heavy image load).

In [None]:
# List up to five sub-directories of the vision folder (names only).
vision_root = paths['vision_root']
if not vision_root.exists():
    print('Vision root not found')
else:
    entries = sorted(child for child in vision_root.iterdir() if child.is_dir())[:5]
    print('Vision subdirs:', [entry.name for entry in entries])

## Optional: CIFAR-10 shape check (if downloaded via download_nlp_vision)

In [None]:
# Optional check: the import, the load and the shape print all sit inside one
# try, so any failure is reported as a skip instead of stopping the notebook.
try:
    from uais.data.load_datasets import load_cifar10

    X, y = load_cifar10('train')
    print('CIFAR-10 train shape:', X.shape, y.shape)
except Exception as exc:
    print('CIFAR check skipped:', exc)

## Load all CSVs per domain (quick head)
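Attempts to read up to five CSVs found (recursively) under each domain folder and shows the first two rows of each. Files that fail to load are skipped with a message; any CSVs beyond the first five are only counted.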

In [None]:
import pandas as pd
from itertools import islice
# Raw-data folder for each domain; each one is searched recursively for CSVs.
domains_dirs = {
    'fraud': project_root / 'data/raw/fraud',
    'cyber': project_root / 'data/raw/cyber',
    'behavior': project_root / 'data/raw/behavior',
    'nlp': project_root / 'data/raw/nlp',
    'vision': project_root / 'data/raw/vision',
}
# Upper bound on CSVs read per domain, to keep the cell light.
MAX_FILES_PER_DOMAIN = 5
for dom, root in domains_dirs.items():
    print(f"\n== {dom.upper()} ==")
    if not root.exists():
        print('missing root', root)
        continue
    # Sorted so the same files are previewed on every run; the order in which
    # rglob yields matches is not something to rely on.
    csvs = sorted(root.rglob('*.csv'))
    if not csvs:
        print('no CSVs found')
        continue
    for csv_path in islice(csvs, MAX_FILES_PER_DOMAIN):
        try:
            # Full read: the row count printed below needs the whole file.
            df_tmp = pd.read_csv(csv_path)
            print(f"{csv_path.name}: rows={len(df_tmp)}, cols={len(df_tmp.columns)}")
            display(df_tmp.head(2))
        except Exception as e:
            # Best-effort preview: report the failure and move to the next file.
            print(f"skip {csv_path.name}: {e}")
    if len(csvs) > MAX_FILES_PER_DOMAIN:
        print(f"...skipped {len(csvs)-MAX_FILES_PER_DOMAIN} more files")

## Notes
- If a dataset is missing, use the download scripts under `src/uais/data/`.
- Re-run this notebook after adding data to confirm everything is in place.