# TESS Confirmed Planets — Mini Dataset (300 TICs)

This notebook:
1) Loads the TESS Project Candidates (TOI) table and filters **confirmed planets** only  
2) Selects up to **684 unique TIC IDs**  
3) Downloads SPOC light curves with Lightkurve, keeps **PDCSAP** flux, masks **QUALITY**, lightly **flattens**  
4) Saves **one Parquet per TIC** (columns: `time, flux[, flux_err]`)  
5) Writes `labels.parquet` (all 1’s), `tic_meta.parquet`, and `manifest.json`

In [5]:
!pip -q install lightkurve astroquery pandas pyarrow fastparquet numpy

import os, json, datetime as dt
import numpy as np
import pandas as pd
from pathlib import Path

import lightkurve as lk
from astroquery.mast import Catalogs



In [6]:
# How many confirmed planets to keep (unique TICs)
N_TICS = 684

# Output folders (use Colab local disk; copy to Drive at the end if needed)
BASE = Path("/content/data")
RAW = BASE / "raw" / "tess"
PROC = BASE / "processed"
LC_OUT = PROC / "lightcurves"
META_OUT = BASE / "metadata"
for p in [RAW, LC_OUT, META_OUT]:
    p.mkdir(parents=True, exist_ok=True)

# Lightkurve search options (SPOC = official pipeline LCs)
SEARCH_KW = dict(mission="TESS", author="SPOC")

In [7]:
TOI_CSV_PATH = "/content/TOI_2025.10.04_08.58.14.csv"

df_toi = pd.read_csv(
    TOI_CSV_PATH,
    comment="#",
    engine="python",   # tolerant
    sep=None,          # sniff delimiter
    encoding="utf-8-sig",
    on_bad_lines="skip"
)
print(df_toi.shape)
df_toi.head(3)

(684, 65)


Unnamed: 0,toi,tid,tfopwg_disp,rastr,ra,decstr,dec,st_pmra,st_pmraerr1,st_pmraerr2,...,st_logg,st_loggerr1,st_loggerr2,st_logglim,st_rad,st_raderr1,st_raderr2,st_radlim,toi_created,rowupdate
0,1052.01,317060587,CP,22h30m02.47s,337.510274,-75d38m47.62s,-75.646561,-63.327,0.046,-0.046,...,4.34,0.12,-0.12,0,1.58,0.165123,-0.165123,0,2019-08-16 20:20:47,2023-07-24 12:03:31
1,1054.01,366989877,CP,20h08m27.4s,302.114174,-54d19m03s,-54.317501,1.594,0.054,-0.054,...,4.30881,0.268555,-0.268555,0,1.17,0.05,-0.05,0,2019-08-16 20:20:48,2023-09-14 16:02:01
2,1055.01,320004517,CP,19h33m08.79s,293.286615,-54d31m57.82s,-54.532728,108.439,0.074,-0.074,...,4.49925,0.019231,-0.019231,0,0.974669,0.054698,-0.054698,0,2019-08-16 20:20:47,2024-10-01 10:08:01


In [8]:
import re
cols_lower = {c.lower(): c for c in df_toi.columns}

# 1) disposition column (confirmed/CP)
disp_col = (
    cols_lower.get("tfopwg_disp")
    or cols_lower.get("disposition")
    or cols_lower.get("disp")
    or None
)
assert disp_col is not None, "Couldn't find a disposition column (tfopwg_disp / disposition / disp)."

def is_confirmed(val):
    s = str(val).strip().upper()
    return s in {"CP", "CONFIRMED"}

df_confirmed = df_toi[df_toi[disp_col].apply(is_confirmed)].copy()

# 2) TIC column: try many common spellings; else guess by substring 'tic'
tic_col = None
candidates = [
    "tic", "tic_id", "ticid", "tid", "target", "tic id", "tic-id", "tic number"
]
for cand in candidates:
    if cand in cols_lower:
        tic_col = cols_lower[cand]
        break

# If still not found, pick any column whose name contains 'tic'
if tic_col is None:
    for c in df_toi.columns:
        if "tic" in c.lower():
            tic_col = c
            break

assert tic_col is not None, "Couldn't find a TIC column—inspect df_toi.columns and pick the right one."
print("Using TIC column:", tic_col)

Using TIC column: tid


In [9]:
# assumes df_toi is already loaded

# Disposition column
cols_lower = {c.lower(): c for c in df_toi.columns}
disp_col = (cols_lower.get("tfopwg_disp")
            or cols_lower.get("disposition")
            or cols_lower.get("disp"))
assert disp_col, "Couldn't find a disposition column."

# Keep confirmed rows only
def is_confirmed(v):
    return (str(v).strip().upper() in {"CP", "CONFIRMED"}) if pd.notna(v) else False
df_confirmed = df_toi[df_toi[disp_col].apply(is_confirmed)].copy()

# Your TIC column is 'tid'
N_TICS = 684
tic_list = (
    df_confirmed["tid"].astype(str)            # handles "TIC 123..."
    .str.extract(r"(\d+)", expand=False)       # take the digits
    .dropna().astype("int64")
    .drop_duplicates()
    .head(N_TICS)
    .tolist()
)
print(f"Selected {len(tic_list)} TICs (confirmed). Example:", tic_list[:10])


Selected 584 TICs (confirmed). Example: [317060587, 366989877, 320004517, 299799658, 79748331, 158297421, 351601843, 370133522, 360630575, 383390264]


In [10]:
from pathlib import Path
import pandas as pd, numpy as np, time, json
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.auto import tqdm
import lightkurve as lk
from astroquery.mast import conf as mast_conf
mast_conf.timeout = 60  # avoid long hangs

# paths
BASE = Path("/content/data")
RAW = BASE / "raw"                   # local cache for FITS (speeds reruns)
PROC = BASE / "processed"
LC_OUT = PROC / "lightcurves"
META = BASE / "metadata"
for p in [RAW, LC_OUT, META]: p.mkdir(parents=True, exist_ok=True)

# speed knobs
WORKERS = 4                 # try 3–6; network-bound
MAX_SECTORS_PER_TIC = 1     # <-- key speedup
PREFER_EXPTIME = 120        # prefer 2-min cadence

In [11]:
def process_one_tic_fast(tic_id: int) -> dict:
    """Download one fast SPOC LC (1 sector), quick clean, save TIC-<id>.parquet."""
    try:
        out_path = LC_OUT / f"TIC-{tic_id}.parquet"
        if out_path.exists():
            return {"tic_id": tic_id, "status": "cached"}

        # Prefer 2-min cadence; fall back to any SPOC LC
        sr = lk.search_lightcurve(f"TIC {tic_id}", mission="TESS", author="SPOC", exptime=PREFER_EXPTIME)
        if len(sr) == 0:
            sr = lk.search_lightcurve(f"TIC {tic_id}", mission="TESS", author="SPOC")
            if len(sr) == 0:
                return {"tic_id": tic_id, "status": "no_lc"}

        sr_use = sr[:MAX_SECTORS_PER_TIC]  # only one sector
        lcs = [it.download(download_dir=str(RAW)) for it in sr_use]  # local cache

        lc = lk.LightCurveCollection(lcs).stitch().remove_nans()

        # QUALITY mask + quick clean
        try:
            if getattr(lc, "quality", None) is not None:
                lc = lc[lc.quality == 0]
        except Exception:
            pass
        try:
            lc = lc.remove_outliers(sigma=10).flatten(window_length=401)
        except Exception:
            pass

        df = pd.DataFrame({
            "time": np.asarray(lc.time.value, "float64"),
            "flux": np.asarray(lc.flux.value, "float32")
        })
        if getattr(lc, "flux_err", None) is not None:
            df["flux_err"] = np.asarray(lc.flux_err.value, "float32")

        df.to_parquet(out_path, index=False)
        return {"tic_id": tic_id, "status": "ok", "rows": len(df)}
    except Exception as e:
        return {"tic_id": tic_id, "status": "error", "error": str(e)}

In [12]:
import logging, lightkurve as lk
logging.getLogger("astroquery").setLevel(logging.ERROR)
lk.log.setLevel(logging.ERROR)

In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.auto import tqdm
import pandas as pd, time, logging, sys
import lightkurve as lk

logging.getLogger("astroquery").setLevel(logging.ERROR)
lk.log.setLevel(logging.ERROR)

results = []
t0 = time.time()

print("Step 1: Creating executor...")
executor = ThreadPoolExecutor(max_workers=WORKERS)

print("Step 2: Submitting jobs...")
futs = [executor.submit(process_one_tic_fast, t) for t in tic_list]

print("Step 3: Processing results...")
completed_count = 0
pbar = tqdm(total=len(futs), unit="star", desc="Downloading + cleaning")

for f in as_completed(futs):
    try:
        r = f.result(timeout=0.1)  # very short since it's already done
        results.append(r)
    except Exception as e:
        results.append({"tic_id": None, "status": "error"})

    completed_count += 1
    pbar.update(1)

    # FORCE EXIT when all done
    if completed_count >= len(futs):
        pbar.close()
        break  # <-- THIS IS KEY

print("Step 4: Loop exited successfully!")
sys.stdout.flush()

executor.shutdown(wait=False, cancel_futures=True)
print("Step 5: Executor shut down")

res_df = pd.DataFrame(results)
print("\n--- Summary ---")
print(res_df.value_counts(["status"]))
print(f"Elapsed: {time.time()-t0:.1f}s")

from glob import glob
saved = len(glob(str(LC_OUT / "TIC-*.parquet")))
print(f"Saved parquet files: {saved}/{len(tic_list)}")
print("✅ All done - cell complete!")

Step 1: Creating executor...
Step 2: Submitting jobs...
Step 3: Processing results...


Downloading + cleaning:   0%|          | 0/584 [00:00<?, ?star/s]

No data found for target "TIC 70524163".
ERROR:lightkurve.search:No data found for target "TIC 70524163".
No data found for target "TIC 70524163".
ERROR:lightkurve.search:No data found for target "TIC 70524163".
No data found for target "TIC 268532343".
ERROR:lightkurve.search:No data found for target "TIC 268532343".
No data found for target "TIC 268532343".
ERROR:lightkurve.search:No data found for target "TIC 268532343".
No data found for target "TIC 176314383".
ERROR:lightkurve.search:No data found for target "TIC 176314383".
No data found for target "TIC 332534326".
ERROR:lightkurve.search:No data found for target "TIC 332534326".
No data found for target "TIC 176314383".
ERROR:lightkurve.search:No data found for target "TIC 176314383".
No data found for target "TIC 332534326".
ERROR:lightkurve.search:No data found for target "TIC 332534326".
No data found for target "TIC 443556801".
ERROR:lightkurve.search:No data found for target "TIC 443556801".
No data found for target "TIC 44

In [13]:
from pathlib import Path
import re

# paths (adjust if different)
BASE = Path("/content/data")
PROC = BASE / "processed"
LC_OUT = PROC / "lightcurves"
META = BASE / "metadata"
PROC.mkdir(parents=True, exist_ok=True); META.mkdir(parents=True, exist_ok=True)

# find TIC-<id>.parquet files and extract the numeric TIC ID
saved_files = list(LC_OUT.glob("TIC-*.parquet"))
tic_ok = []
for p in saved_files:
    m = re.search(r"TIC-(\d+)\.parquet$", p.name)
    if m: tic_ok.append(int(m.group(1)))

tic_ok = sorted(set(tic_ok))
print(f"Saved light curves found: {len(tic_ok)}")
print("Example TICs:", tic_ok[:10])

Saved light curves found: 558
Example TICs: [1003831, 1167538, 1528696, 4646810, 4897275, 4918918, 7548817, 8260536, 8348911, 8599009]


In [14]:
import pandas as pd

labels_path = PROC / "labels.parquet"
labels = pd.DataFrame({"tic_id": tic_ok, "label": 1})
labels.to_parquet(labels_path, index=False)
print("Wrote:", labels_path, "| rows:", len(labels))
labels.head()

Wrote: /content/data/processed/labels.parquet | rows: 558


Unnamed: 0,tic_id,label
0,1003831,1
1,1167538,1
2,1528696,1
3,4646810,1
4,4897275,1


In [16]:
# --- REPLACEMENT FOR STEP C: fetch tic_meta safely, per TIC ID ---

from astroquery.mast import Catalogs
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd, numpy as np, time, logging

logging.getLogger("astroquery").setLevel(logging.ERROR)

def fetch_one_tic_meta(tic_id: int):
    """Fetch metadata for a single TIC ID, with a fallback method."""
    try:
        # Primary: by numeric ID
        r = Catalogs.query_criteria(catalog="TIC", ID=int(tic_id))
        if len(r) > 0:
            return r.to_pandas()
    except Exception:
        pass
    try:
        # Fallback: by object name "TIC <id>"
        r = Catalogs.query_object(f"TIC {int(tic_id)}", catalog="TIC")
        if len(r) > 0:
            return r.to_pandas()
    except Exception:
        pass
    return pd.DataFrame()  # nothing found / error

# Run in parallel (be polite; 4–6 workers is fine)
WORKERS_META = 5
frames = []
with ThreadPoolExecutor(max_workers=WORKERS_META) as ex:
    futs = {ex.submit(fetch_one_tic_meta, t): t for t in tic_ok}
    for fut in as_completed(futs):
        df = fut.result()
        if not df.empty:
            frames.append(df)

# Combine and clean
if frames:
    tic_meta = pd.concat(frames, ignore_index=True)
    tic_meta.columns = [c.lower() for c in tic_meta.columns]
    # Choose commonly useful fields if present
    wanted = ["tic_id","tic","id","tmag","ra","dec","teff","rad","crowdsap","contratio"]
    keep = [c for c in wanted if c in tic_meta.columns]
    tic_meta = tic_meta[keep].copy() if keep else tic_meta.copy()
    # Standardize TIC column name
    for cand in ["tic_id","tic","id"]:
        if cand in tic_meta.columns:
            tic_meta.rename(columns={cand: "tic_id"}, inplace=True)
            break
    tic_meta["tic_id"] = tic_meta["tic_id"].astype(int)
    tic_meta = tic_meta.drop_duplicates(subset=["tic_id"])
else:
    tic_meta = pd.DataFrame(columns=["tic_id","tmag","ra","dec","teff","rad","crowdsap","contratio"])

# Save
tic_meta_path = PROC / "tic_meta.parquet"
tic_meta.to_parquet(tic_meta_path, index=False)
print("Wrote:", tic_meta_path, "| rows:", len(tic_meta))
display(tic_meta.head())

Wrote: /content/data/processed/tic_meta.parquet | rows: 558


Unnamed: 0,tic_id,tmag,ra,dec,teff,rad,contratio
0,1003831,10.6701,130.295165,-16.03628,5550.0,1.12196,0.034352
1,1167538,10.0127,70.997589,-31.906501,5825.0,1.03876,0.008923
2,4646810,8.8723,38.272011,-10.351781,4884.0,0.74413,4.2e-05
3,4897275,7.6474,148.160537,35.111655,5853.7,1.09271,0.000418
4,1528696,13.1686,75.795429,-30.399366,4669.0,0.782888,


In [17]:
import json, datetime as dt

manifest = {
    "created_utc": dt.datetime.utcnow().isoformat() + "Z",
    "counts": {
        "lightcurves_saved": len(tic_ok),
        "labels_rows": len(labels),
        "tic_meta_rows": len(tic_meta),
    },
    "paths": {
        "lightcurves_dir": str(LC_OUT),
        "labels": str(PROC / "labels.parquet"),
        "tic_meta": str(PROC / "tic_meta.parquet"),
    },
    "notes": "Light curves are PDCSAP-based, QUALITY-masked, lightly flattened. Labels=1 (confirmed).",
}
(manifest_path := META / "manifest.json").write_text(json.dumps(manifest, indent=2))
print("Wrote:", manifest_path)
print(manifest)

Wrote: /content/data/metadata/manifest.json
{'created_utc': '2025-10-04T18:55:30.515013Z', 'counts': {'lightcurves_saved': 558, 'labels_rows': 558, 'tic_meta_rows': 558}, 'paths': {'lightcurves_dir': '/content/data/processed/lightcurves', 'labels': '/content/data/processed/labels.parquet', 'tic_meta': '/content/data/processed/tic_meta.parquet'}, 'notes': 'Light curves are PDCSAP-based, QUALITY-masked, lightly flattened. Labels=1 (confirmed).'}


  "created_utc": dt.datetime.utcnow().isoformat() + "Z",


In [18]:
import pandas as pd, random
sample_tic = random.choice(tic_ok) if tic_ok else None
if sample_tic:
    fp = LC_OUT / f"TIC-{sample_tic}.parquet"
    print("Sample:", fp)
    display(pd.read_parquet(fp).head())
else:
    print("No TICs found in", LC_OUT)

Sample: /content/data/processed/lightcurves/TIC-101011575.parquet


Unnamed: 0,time,flux,flux_err
0,1517.921417,1.000773,0.000498
1,1517.922806,0.999864,0.000497
2,1517.924195,1.000023,0.000497
3,1517.925584,1.000243,0.000497
4,1517.926973,1.000691,0.000497


In [19]:
# 1) Check size first
!du -sh /content/data

# 2) Zip the folder (excludes raw FITS cache to keep it smaller; remove --exclude to include)
!cd /content && zip -r data_processed_only.zip data -x "data/raw/*"

# 3) Download to your computer
from google.colab import files
files.download("/content/data_processed_only.zip")

1.2G	/content/data
  adding: data/ (stored 0%)
  adding: data/processed/ (stored 0%)
  adding: data/processed/lightcurves/ (stored 0%)
  adding: data/processed/lightcurves/TIC-327369524.parquet (deflated 19%)
  adding: data/processed/lightcurves/TIC-29960110.parquet (deflated 20%)
  adding: data/processed/lightcurves/TIC-169765334.parquet (deflated 19%)
  adding: data/processed/lightcurves/TIC-119584412.parquet (deflated 20%)
  adding: data/processed/lightcurves/TIC-229742722.parquet (deflated 20%)
  adding: data/processed/lightcurves/TIC-257060897.parquet (deflated 18%)
  adding: data/processed/lightcurves/TIC-467179528.parquet (deflated 19%)
  adding: data/processed/lightcurves/TIC-306263608.parquet (deflated 21%)
  adding: data/processed/lightcurves/TIC-36724087.parquet (deflated 20%)
  adding: data/processed/lightcurves/TIC-349488688.parquet (deflated 20%)
  adding: data/processed/lightcurves/TIC-219016883.parquet (deflated 18%)
  adding: data/processed/lightcurves/TIC-230001847.pa

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [20]:
!cd /content && zip -r tess-data.zip data -x "data/raw/*"
!mv /content/tess-data.zip "/content/drive/MyDrive/tess-data.zip"

  adding: data/ (stored 0%)
  adding: data/processed/ (stored 0%)
  adding: data/processed/lightcurves/ (stored 0%)
  adding: data/processed/lightcurves/TIC-327369524.parquet (deflated 19%)
  adding: data/processed/lightcurves/TIC-29960110.parquet (deflated 20%)
  adding: data/processed/lightcurves/TIC-169765334.parquet (deflated 19%)
  adding: data/processed/lightcurves/TIC-119584412.parquet (deflated 20%)
  adding: data/processed/lightcurves/TIC-229742722.parquet (deflated 20%)
  adding: data/processed/lightcurves/TIC-257060897.parquet (deflated 18%)
  adding: data/processed/lightcurves/TIC-467179528.parquet (deflated 19%)
  adding: data/processed/lightcurves/TIC-306263608.parquet (deflated 21%)
  adding: data/processed/lightcurves/TIC-36724087.parquet (deflated 20%)
  adding: data/processed/lightcurves/TIC-349488688.parquet (deflated 20%)
  adding: data/processed/lightcurves/TIC-219016883.parquet (deflated 18%)
  adding: data/processed/lightcurves/TIC-230001847.parquet (deflated 18%