In [None]:
import os
import re
import lightkurve as lk
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# ------------------------------
# Parameters
# ------------------------------
NUM_WORKERS = 12                     # size of the download thread pool
OUTPUT_DIR = "output_lightcurves"    # folder receiving one merged CSV per target

# ------------------------------
# Load dataset (ALL KOIs)
# ------------------------------
df = pd.read_csv("kepler_core.csv")   # update path
# unique() drops repeated IDs so each Kepler ID is processed only once.
all_kepids = df["kepid"].unique()
print(f"🔍 Found {len(all_kepids)} total Kepler IDs in dataset")

# ------------------------------
# Output folder
# ------------------------------
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ------------------------------
# Track already downloaded
# ------------------------------
# The Kepler ID is recovered from each saved file name ("...KIC<digits>...").
# Names that do not contain "KIC" followed by digits are skipped instead of
# crashing the scan with an AttributeError on a failed regex match.
_KIC_PATTERN = re.compile(r"KIC(\d+)")
existing_files = os.listdir(OUTPUT_DIR)
already_downloaded = {
    int(match.group(1))
    for match in map(_KIC_PATTERN.search, existing_files)
    if match is not None
}
remaining_kepids = [kid for kid in all_kepids if kid not in already_downloaded]

print(f"✅ {len(already_downloaded)} already downloaded")
print(f"⬇️ {len(remaining_kepids)} left to download")

# ------------------------------
# Download + Merge function
# ------------------------------
def _lightcurve_frame(lc):
    """Return ``lc.to_pandas()`` with its time index kept as a regular column."""
    frame = lc.to_pandas()
    index_name = frame.index.name
    # lightkurve documents to_pandas() as returning a frame indexed by time.
    # The caller concatenates with ignore_index=True and saves with
    # index=False, which would silently drop those time values, so move a
    # named index into the columns (unless a column of that name already
    # exists, in which case nothing is lost).
    if index_name is None or index_name in frame.columns:
        return frame
    return frame.reset_index()


def download_lightcurve_merged(kepid, idx):
    """Download every Kepler light curve for one target and save them as one CSV.

    Parameters
    ----------
    kepid
        Kepler Input Catalog ID; searched as ``"KIC <kepid>"``.
    idx
        Running number embedded in the output file name.

    Returns
    -------
    str
        A human-readable status line: ``"✅ Saved ..."`` on success, otherwise
        a ``"❌ ..."`` or ``"⚠️ ..."`` message. Any ``Exception`` raised along
        the way is caught and reported in the returned string rather than
        propagated.
    """
    try:
        search = lk.search_lightcurve(f"KIC {kepid}", mission="Kepler")
        if len(search) == 0:
            return f"❌ No data for KIC {kepid}"

        lcs = search.download_all()
        if lcs is None or len(lcs) == 0:
            return f"⚠️ Failed for KIC {kepid}"

        # Merge all downloaded light curves into one DataFrame.
        frames = [_lightcurve_frame(lc) for lc in lcs if lc is not None]
        if not frames:
            # Every entry was None: report it as a failed download instead of
            # letting pd.concat raise "No objects to concatenate".
            return f"⚠️ Failed for KIC {kepid}"
        merged_df = pd.concat(frames, ignore_index=True)

        # Save single merged CSV. Write under a temporary name first and then
        # rename, so an interrupted run never leaves a truncated
        # "...KIC<id>.csv" that a later run would mistake for a finished one.
        outpath = os.path.join(OUTPUT_DIR, f"lightcurve_{idx}_KIC{kepid}.csv")
        tmp_path = os.path.join(OUTPUT_DIR, f".partial_{idx}_{kepid}.tmp")
        try:
            merged_df.to_csv(tmp_path, index=False)
            os.replace(tmp_path, outpath)
        finally:
            # Only present here if writing or renaming failed.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)

        return f"✅ Saved merged lightcurve ({len(merged_df)} rows) for KIC {kepid}"
    except Exception as e:
        # Worker boundary: turn any failure into a status line so one bad
        # target cannot abort the whole batch.
        return f"❌ Error for KIC {kepid}: {str(e)}"

# ------------------------------
# Run parallel downloads
# ------------------------------
# One status string per processed Kepler ID, in completion order.
results = []
if len(remaining_kepids) > 0:
    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
        # Map each future back to its Kepler ID; the 1-based position is
        # passed through as the file-name index.
        futures = {
            executor.submit(download_lightcurve_merged, kepid, i): kepid
            for i, kepid in enumerate(remaining_kepids, start=1)
        }
        for future in tqdm(as_completed(futures), total=len(futures), desc="Downloading ALL KOIs"):
            results.append(future.result())

# ------------------------------
# Summary
# ------------------------------
print("\nDownload Summary:")
for r in results[:30]:  # show first 30 logs
    print(r)

# Success lines start with "✅"; every other outcome ("❌ No data", "❌ Error",
# "⚠️ Failed") counts as a failure, so the two totals always add up to the
# number of processed IDs. Counting only "❌" used to drop the "⚠️" results.
completed_count = sum(r.startswith("✅") for r in results)
failed_count = len(results) - completed_count

print(f"\n✅ Completed: {completed_count}")
print(f"❌ Failed: {failed_count}")




🔍 Found 8214 total Kepler IDs in dataset
✅ 1752 already downloaded
⬇️ 6462 left to download


Downloading ALL KOIs:   3%|▎         | 205/6462 [14:06<3:38:24,  2.09s/it]