In [None]:
# Clean EDA setup for entities_dataset_v2 (safe version)
import os
import json
import gzip
import subprocess
from pathlib import Path
from typing import List, Dict, Any, Tuple, Optional
import pandas as pd

# Dataset and output locations. Either one can be redirected without editing
# the notebook by setting ENTITIES_DATA_DIR / ENTITIES_OUTPUT_DIR in the
# environment; when unset, the original hard-coded locations are used.
DATA_DIR = Path(
	os.environ.get("ENTITIES_DATA_DIR", "/home/sherin/Image_Captioning/entities_dataset_v2")
).resolve()
OUTPUT_DIR = Path(
	os.environ.get("ENTITIES_OUTPUT_DIR", "/home/sherin/Image_Captioning")
).resolve()
# Create the output directory up front so later cells can write their CSVs into it.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print("DATA_DIR:", DATA_DIR)
print("Exists:", DATA_DIR.exists())


def bytes_to_mb(num_bytes: int) -> float:
	"""Convert a byte count to mebibytes (1024 * 1024 bytes), rounded to 3 decimals."""
	bytes_per_mb = 1024 * 1024
	megabytes = num_bytes / bytes_per_mb
	return round(megabytes, 3)


def _count_newlines(path: Path, chunk_size: int = 1 << 20) -> int:
	"""Count newline bytes in ``path`` by streaming it in binary chunks."""
	total = 0
	with open(path, "rb") as f:
		for chunk in iter(lambda: f.read(chunk_size), b""):
			total += chunk.count(b"\n")
	return total


def count_lines_fast(paths: List[Path]) -> Dict[str, int]:
	"""
	Count lines in each file, using ``wc -l`` when possible.

	A "line" here is a newline character, exactly as ``wc -l`` defines it, so a
	final line without a trailing newline is not counted. The pure-Python
	fallback uses the same definition so both paths report the same numbers.

	Args:
		paths: Files to count. An empty list returns an empty dict.

	Returns:
		Mapping of file *basename* -> line count. If two paths share a
		basename the later one wins. In the fallback path, a file that cannot
		be read is reported as -1.
	"""
	result: Dict[str, int] = {}
	if not paths:
		return result
	try:
		# "--" ends option parsing so a path beginning with "-" is not read as a flag.
		cmd = ["wc", "-l", "--", *[str(p) for p in paths]]
		proc = subprocess.run(cmd, capture_output=True, text=True, check=True)
		out_lines = [ln for ln in proc.stdout.splitlines() if ln.strip()]
		# wc prints one row per operand, in order, plus a trailing "total" row
		# when given more than one file. Pair rows with inputs by position
		# instead of parsing the file name back out of the row: names may
		# contain spaces or even end in " total".
		expected_rows = len(paths) + (1 if len(paths) > 1 else 0)
		if len(out_lines) != expected_rows:
			raise ValueError(
				f"unexpected wc output: {len(out_lines)} rows for {len(paths)} files"
			)
		counts: Dict[str, int] = {}
		for p, row in zip(paths, out_lines):
			counts[p.name] = int(row.split(None, 1)[0])
		result.update(counts)
	except (OSError, subprocess.SubprocessError, ValueError) as e:
		# OSError: wc missing or argument list too long; SubprocessError: wc
		# exited non-zero; ValueError: output could not be decoded or parsed.
		print("[warn] wc -l failed, falling back to Python:", e)
		for p in paths:
			try:
				result[p.name] = _count_newlines(p)
			except OSError as e2:
				print(f"[warn] count lines failed for {p.name}: {e2}")
				result[p.name] = -1
	return result


def try_parse_json(path: Path, sample_limit: int = 50) -> Tuple[str, Optional[int], List[Dict[str, Any]]]:
	"""
	Classify a file as a JSON document or JSON Lines and return sample records.

	The whole file is first parsed as a single JSON document. If that fails,
	the file is read line by line as JSON Lines until ``sample_limit`` records
	are collected or a line fails to parse.

	Args:
		path: File to inspect.
		sample_limit: Maximum number of samples to return; negative values are
			treated as 0.

	Returns:
		``(type_str, length_or_None, samples)`` where ``type_str`` is:
		- ``"array"``: top-level list; length is ``len(list)`` and samples are
		  its first ``sample_limit`` elements, returned as-is.
		- ``"object"``: top-level dict; samples is ``[that_dict]``.
		- a Python type name (e.g. ``"int"``): any other top-level JSON value.
		- ``"jsonl"``: at least one line parsed as JSON; non-dict values are
		  wrapped as ``{"value": obj}``.
		- ``"unknown"``: the file could not be read, or no JSON could be
		  parsed from it at all (including an empty file).
	"""
	limit = max(sample_limit, 0)

	# Try standard JSON
	try:
		with open(path, "r", encoding="utf-8", errors="replace") as f:
			data = json.load(f)
	except (OSError, ValueError, RecursionError, MemoryError):
		# Not a single JSON document (or too large/deep to load in one go):
		# fall through to line-by-line sampling. An unreadable file fails
		# again below and is reported as "unknown".
		pass
	else:
		if isinstance(data, list):
			return ("array", len(data), data[:limit])
		if isinstance(data, dict):
			return ("object", None, [data])
		return (type(data).__name__, None, [])

	# Try JSON Lines
	samples: List[Dict[str, Any]] = []
	parsed_any = False
	try:
		with open(path, "r", encoding="utf-8", errors="replace") as f:
			for line in f:
				if not line.strip():
					continue
				try:
					obj = json.loads(line)
				except (ValueError, RecursionError):
					# Stop at the first bad line; keep whatever parsed before it.
					break
				parsed_any = True
				if len(samples) < limit:
					samples.append(obj if isinstance(obj, dict) else {"value": obj})
				if len(samples) >= limit:
					break
	except (OSError, MemoryError):
		return ("unknown", None, [])
	if not parsed_any:
		# Nothing in the file parsed as JSON, so do not label it "jsonl".
		return ("unknown", None, [])
	return ("jsonl", None, samples)


def infer_keys(samples: List[Dict[str, Any]], max_keys: int = 200) -> List[str]:
	"""
	Return the distinct keys found across ``samples`` in first-seen order.

	Args:
		samples: Records to scan; entries that are not dicts are skipped.
		max_keys: Upper bound on the number of keys returned. The cap is
			enforced per key, so the result never exceeds it even when a
			single record has more keys than the cap.

	Returns:
		At most ``max_keys`` key names, ordered by first appearance.
	"""
	# A dict doubles as an insertion-ordered set.
	ordered: Dict[str, None] = {}
	for s in samples:
		if not isinstance(s, dict):
			continue
		for k in s:
			if len(ordered) >= max_keys:
				return list(ordered)
			ordered.setdefault(k, None)
	return list(ordered)

# Printed last in the setup cell: seeing it means the paths and helper functions above were defined without error.
print("Setup OK.")


In [None]:
# List JSON files and gather basic stats (size, lines)
from datetime import datetime

# Discover the shard files (regular files only) in sorted path order.
files = sorted(filter(Path.is_file, DATA_DIR.glob("*_entities_dataset_v2.json")))
print(f"Found {len(files)} files")

# Per-file size in MB, keyed by basename.
sizes_mb = {}
for p in files:
	sizes_mb[p.name] = bytes_to_mb(p.stat().st_size)

# Line counts are gathered for the first 200 files only to keep this quick
# (adjust as needed); files past that get lines=None in the table below.
line_counts = count_lines_fast(files[:200])

# One record per file: name, size, line count (if counted) and last-modified time.
rows = [
	{
		"filename": p.name,
		"size_mb": sizes_mb.get(p.name),
		"lines": line_counts.get(p.name),
		"modified": datetime.fromtimestamp(p.stat().st_mtime).isoformat(timespec="seconds"),
	}
	for p in files
]

if pd is None:
	print(rows[:5])
else:
	df_files = pd.DataFrame(rows)
	print(df_files.head(10))
	out_csv = OUTPUT_DIR / "entities_dataset_v2_file_stats.csv"
	df_files.to_csv(out_csv, index=False)
	print("Saved:", out_csv)


In [None]:
# Peek into a few files to infer structure and show samples
from itertools import islice

# Inspect at most the first five shards (slicing an empty list already gives []).
sample_files = files[:5]
summary = []

for p in sample_files:
	kind, length, samples = try_parse_json(p, sample_limit=5)
	keys_union = infer_keys(samples, max_keys=200)
	# One summary record per inspected file.
	summary.append(
		dict(
			filename=p.name,
			kind=kind,
			length=length,
			num_samples=len(samples),
			keys=keys_union,
		)
	)
	print("====", p.name)
	print("type:", kind, "length:", length, "keys (up to 20):", keys_union[:20])
	# Show up to three raw samples from this file.
	for i, s in enumerate(samples[:3]):
		print(f"sample[{i}] =", s)

if summary and pd is not None:
	df_summary = pd.DataFrame(summary)
	display(df_summary)


In [None]:
# Aggregate keys across many shards (union) to see overall schema
from collections import Counter

# Scan at most the first 50 shards (adjust for speed).
max_files = min(50, len(files))
key_counter = Counter()

for p in files[:max_files]:
	kind, length, samples = try_parse_json(p, sample_limit=30)
	for s in samples:
		if not isinstance(s, dict):
			continue
		# Each sampled record contributes +1 to every key it carries.
		key_counter.update(s.keys())


def _by_count_then_name(item):
	"""Sort key: most frequent first, ties broken alphabetically by key name."""
	name, count = item
	return (-count, name)


key_stats = sorted(key_counter.items(), key=_by_count_then_name)
print("Top 30 keys:")
for k, c in key_stats[:30]:
	print(f"{k}: {c}")

if key_stats and pd is not None:
	df_keys = pd.DataFrame(key_stats, columns=["key", "count_in_samples"])
	out_csv = OUTPUT_DIR / "entities_dataset_v2_key_counts.csv"
	df_keys.to_csv(out_csv, index=False)
	print("Saved:", out_csv)


In [None]:
# Optional: Build a small sample DataFrame for downstream EDA.
# Collects up to `max_items` dict records from the first shards as-is (no field normalization is applied).

common_rows = []
max_items = 2000
col_guess = set()

for p in files[:10]:
	kind, length, samples = try_parse_json(p, sample_limit=300)
	# Take dict records from this shard until the overall cap is reached.
	remaining = max_items - len(common_rows)
	picked = [obj for obj in samples if isinstance(obj, dict)][:remaining]
	for obj in picked:
		col_guess.update(obj.keys())
	common_rows.extend(picked)
	if len(common_rows) >= max_items:
		break

if common_rows and pd is not None:
	df_sample = pd.DataFrame(common_rows)
	print(df_sample.shape)
	display(df_sample.head(5))
	out_csv = OUTPUT_DIR / "entities_dataset_v2_sample_rows.csv"
	df_sample.to_csv(out_csv, index=False)
	print("Saved:", out_csv)
else:
	print("No rows or pandas not available.")
