In [1]:
import pandas as pd
from pathlib import Path
import json

DATA_DIR = Path("data")

def summarize_df(df):
    """Return the column names and first row of a DataFrame as a dict.

    Args:
        df: A pandas DataFrame, or None.

    Returns:
        A dict with two keys: "columns" (list of column labels, empty
        when df is None) and "first_row" (the first row as a
        label-to-value dict, or None when df is None or has no rows).
    """
    if df is None:
        return {"columns": [], "first_row": None}
    if df.empty:
        # Nothing to preview, but the column labels are still reported.
        return {"columns": list(df.columns), "first_row": None}
    first_record = df.iloc[0].to_dict()
    return {"columns": df.columns.tolist(), "first_row": first_record}

def summarize_file(path: Path):
    """Build preview summaries (columns and first row) for one data file.

    The reader is chosen from the file extension: delimited text
    (.csv, .txt, .tsv), Excel workbooks (.xls, .xlsx; one entry per
    sheet), .parquet, .json, and .feather. Any other extension gets a
    best-effort ``read_csv`` attempt.

    Args:
        path: Location of the file to inspect.

    Returns:
        A list of ``(file_name, sheet_name_or_None, summary)`` tuples.
        ``summary`` is one of: the dict produced by ``summarize_df``
        ("columns" / "first_row"), ``{"error": message}`` when reading
        failed, or ``{"info": message}`` when the content could not be
        shown as a table. Read failures are reported in the result
        rather than raised.
    """
    ext = path.suffix.lower()
    summaries = []

    def add(summary, sheet=None):
        # Every entry shares the same (file name, sheet, summary) layout.
        summaries.append((path.name, sheet, summary))

    try:
        if ext in {".csv", ".txt"}:
            add(summarize_df(pd.read_csv(path, nrows=1)))
        elif ext == ".tsv":
            add(summarize_df(pd.read_csv(path, sep="\t", nrows=1)))
        elif ext in {".xls", ".xlsx"}:
            # Open the workbook once and close it deterministically; the
            # sheets are parsed from the already-open handle instead of
            # re-opening the file for each sheet.
            with pd.ExcelFile(path) as xls:
                for sheet in xls.sheet_names:
                    add(summarize_df(xls.parse(sheet_name=sheet, nrows=1)), sheet)
        elif ext == ".parquet":
            try:
                df = pd.read_parquet(path)
            except Exception as exc:
                # Keep the underlying cause so the report is actionable
                # (e.g. a missing parquet engine vs. a corrupt file).
                add({"error": f"failed to read parquet: {exc}"})
            else:
                add(summarize_df(df.head(1)))
        elif ext == ".json":
            # Try JSON Lines first, then a regular JSON document.
            # NOTE(review): a compact single-line JSON array may be accepted
            # by the JSON Lines attempt and reported with positional columns
            # instead of the record keys -- confirm against real inputs.
            try:
                df = pd.read_json(path, lines=True)
                add(summarize_df(df.head(1)))
            except ValueError:
                try:
                    df = pd.read_json(path)
                    add(summarize_df(df.head(1)))
                except Exception:
                    # Fallback: load with the json module and describe the
                    # top-level structure instead of a table.
                    with open(path, "r", encoding="utf-8") as f:
                        obj = json.load(f)
                    if isinstance(obj, list) and obj:
                        first = obj[0]
                        columns = list(first.keys()) if isinstance(first, dict) else []
                        add({"columns": columns, "first_row": first})
                    else:
                        add({"info": f"json top-level type: {type(obj).__name__}"})
        elif ext == ".feather":
            add(summarize_df(pd.read_feather(path).head(1)))
        else:
            # Unknown extension: it may still be a delimited text file.
            try:
                df = pd.read_csv(path, nrows=1)
                add(summarize_df(df))
            except Exception:
                add({"info": f"unsupported or unreadable file type: {ext}"})
    except Exception as e:
        # Boundary handler: one unreadable file must not abort the scan.
        add({"error": str(e)})
    return summaries

# Collect summaries for every regular file directly inside DATA_DIR,
# visiting the entries in sorted path order.
results = []
if DATA_DIR.exists() and DATA_DIR.is_dir():
    for p in sorted(DATA_DIR.iterdir()):
        if not p.is_file():
            continue
        results.extend(summarize_file(p))
else:
    print(f"Directory not found: {DATA_DIR}")

# Report each summary under a ruled header: an error or info message when
# present, otherwise the column names and the first row.
for fname, sheet, summary in results:
    header = f"{fname} (sheet: {sheet})" if sheet else f"{fname}"
    print("=" * 80)
    print(header)
    if "error" in summary:
        print("  Error:", summary["error"])
    elif "info" in summary:
        print("  Info:", summary["info"])
    else:
        print("  Columns:", summary["columns"])
        print("  First row:", summary["first_row"])

appointments.csv
  Columns: ['Appointment ID', 'Name', 'Phone Number', 'Service Booked', 'Preferred Employee', 'Appointment Date', 'Time Slot', 'Duration', 'Status', 'Source']
  First row: {'Appointment ID': 'APT1000', 'Name': 'Divya Rao', 'Phone Number': 918221053161, 'Service Booked': 'Facial', 'Preferred Employee': 'Kavita', 'Appointment Date': '2025-11-06', 'Time Slot': '17:30', 'Duration': 90, 'Status': 'Cancelled', 'Source': 'Website'}
attendance.csv
  Columns: ['Staff Name', 'Date', 'Status', 'Check In', 'Check Out']
  First row: {'Staff Name': 'Priya Sharma', 'Date': '2025-09-05', 'Status': 'Present', 'Check In': '10:15', 'Check Out': '19:34'}
branches.csv
  Columns: ['Branch ID', 'Branch Name', 'Location', 'Manager']
  First row: {'Branch ID': 'BR001', 'Branch Name': 'Main Branch', 'Location': 'MG Road', 'Manager': 'Priya Sharma'}
employees.csv
  Columns: ['Employee Name', 'Role', 'Available']
  First row: {'Employee Name': 'Priya', 'Role': 'Senior Stylist', 'Available': True}