In [None]:
import os
import glob
import numpy as np
from astropy.io import fits

# === CONFIG ===
# Root of the directory tree that is searched for .fits light-curve files.
root_folder = "transit/kepler"
# Number of consecutive clean samples kept from each accepted file.
desired_length = 2000
# Seed for NumPy's global random generator. The extraction loop draws every
# window start with np.random.randint, so without a fixed seed each run of the
# notebook would save a different dataset.
random_seed = 42
np.random.seed(random_seed)

# Accumulators filled by the extraction loop: one normalized flux window per
# accepted file, and the integer ID of the star folder that window came from.
flux_sequences = []
kepler_ids = []


# === EXTRACT ===
# Walk the tree below root_folder. For every folder that holds exactly the
# expected number of .fits files, cut one z-scored window of desired_length
# clean samples out of each file and record it together with the star ID.
expected_files_per_star = 5  # folders with any other number of .fits files are ignored

for dirpath, dirnames, filenames in os.walk(root_folder):
    # os.walk yields entries in filesystem-dependent order. Sorting the
    # sub-directories in place (os.walk honours the mutation) and the file
    # names gives the saved rows the same order on every machine.
    dirnames.sort()
    fits_files = sorted(f for f in filenames if f.endswith(".fits"))

    if len(fits_files) != expected_files_per_star:
        continue

    star_id = os.path.basename(dirpath)  # e.g., '000757076'
    # Validate the ID once per folder, before any file is opened, instead of
    # failing on every single file after it has already been read.
    try:
        kepler_id = int(star_id)
    except ValueError:
        print(f"{dirpath} skipped — folder name is not a numeric Kepler ID")
        continue

    for file_name in fits_files:
        file_path = os.path.join(dirpath, file_name)
        try:
            with fits.open(file_path, memmap=False) as hdul:
                data = hdul[1].data

                column_names = data.columns.names
                if 'PDCSAP_FLUX' not in column_names or 'SAP_QUALITY' not in column_names:
                    print(f"{file_path} skipped — missing required columns")
                    continue

                flux = data['PDCSAP_FLUX']
                quality = data['SAP_QUALITY']
                # Keep samples whose SAP_QUALITY is 0 and whose flux is finite.
                # np.isfinite rejects +/-inf as well as NaN; a single infinite
                # sample would turn the whole normalized window into NaN.
                mask = (quality == 0) & np.isfinite(flux)
                flux_clean = flux[mask]

                if len(flux_clean) < desired_length:
                    print(f"{file_path} skipped — not enough clean data")
                    continue

                # Random window start, drawn from NumPy's global random state.
                start_idx = np.random.randint(0, len(flux_clean) - desired_length + 1)
                flux_segment = flux_clean[start_idx:start_idx + desired_length]

                # A window with zero spread cannot be z-scored: dividing by a
                # zero standard deviation would store NaN/inf values.
                segment_std = np.std(flux_segment)
                if segment_std == 0:
                    print(f"{file_path} skipped — constant flux, cannot normalize")
                    continue
                flux_segment = (flux_segment - np.mean(flux_segment)) / segment_std

                flux_sequences.append(flux_segment)
                kepler_ids.append(kepler_id)

                print(f"{file_path}: Valid points = {len(flux_clean)}")

        # Deliberately broad: an unreadable or truncated file is reported and
        # skipped so that one bad file does not abort the whole scan.
        except Exception as e:
            print(f"Error processing {file_path}: {e}")

# === SAVE ===
# Convert both accumulator lists to NumPy arrays, rebinding the same names.
flux_sequences, kepler_ids = np.array(flux_sequences), np.array(kepler_ids)

# Write each array to its own .npy file in the current working directory.
np.save(file="flux_sequences.npy", arr=flux_sequences)
np.save(file="flux_kepler_ids.npy", arr=kepler_ids)

# Report the shape of what was written, one line per file.
print(
    f" flux_sequences.npy: shape = {flux_sequences.shape}",
    f"flux_kepler_ids.npy: shape = {kepler_ids.shape}",
    sep="\n",
)


transit/kepler\0007\000757076\kplr000757076-2009166043257_llc.fits skipped — not enough clean data
transit/kepler\0007\000757076\kplr000757076-2009259160929_llc.fits: Valid points = 2366
transit/kepler\0007\000757076\kplr000757076-2009350155506_llc.fits: Valid points = 3407
transit/kepler\0007\000757076\kplr000757076-2010078095331_llc.fits: Valid points = 3897
transit/kepler\0007\000757076\kplr000757076-2010174085026_llc.fits: Valid points = 3718
transit/kepler\0007\000757099\kplr000757099-2009166043257_llc.fits skipped — not enough clean data
transit/kepler\0007\000757099\kplr000757099-2009259160929_llc.fits skipped — not enough clean data
transit/kepler\0007\000757099\kplr000757099-2009350155506_llc.fits: Valid points = 3648
transit/kepler\0007\000757099\kplr000757099-2010078095331_llc.fits: Valid points = 3778
transit/kepler\0007\000757099\kplr000757099-2010174085026_llc.fits: Valid points = 4103
transit/kepler\0007\000757137\kplr000757137-2009166043257_llc.fits skipped — not enough



Error processing transit/kepler\0020\002011214\kplr002011214-2009350155506_llc.fits: cannot reshape array of size 3702 into shape (4370,)
transit/kepler\0020\002011214\kplr002011214-2010078095331_llc.fits: Valid points = 3390
transit/kepler\0020\002011214\kplr002011214-2010174085026_llc.fits: Valid points = 4117
transit/kepler\0020\002011347\kplr002011347-2009166043257_llc.fits skipped — not enough clean data
transit/kepler\0020\002011347\kplr002011347-2009259160929_llc.fits: Valid points = 3830
transit/kepler\0020\002011347\kplr002011347-2009350155506_llc.fits: Valid points = 3704
transit/kepler\0020\002011347\kplr002011347-2010078095331_llc.fits: Valid points = 3861
transit/kepler\0020\002011347\kplr002011347-2010174085026_llc.fits: Valid points = 4045
transit/kepler\0020\002011352\kplr002011352-2009166043257_llc.fits skipped — not enough clean data
transit/kepler\0020\002011352\kplr002011352-2009259160929_llc.fits: Valid points = 3756
transit/kepler\0020\002011352\kplr002011352-2009

KeyboardInterrupt: 

In [14]:
import numpy as np

# Read back the array of Kepler IDs stored in flux_kepler_ids.npy.
kepler_ids = np.load(file="flux_kepler_ids.npy")

# Collapse repeated IDs so that each one appears exactly once.
unique_ids = np.unique(kepler_ids)

# Export the distinct IDs, formatted as integers, to a CSV file.
np.savetxt(fname="filtered_kepler_ids.csv", X=unique_ids, fmt="%d", delimiter=",")

print(f"✅ Done! Saved {len(unique_ids)} unique Kepler IDs to filtered_kepler_ids.csv")


✅ Done! Saved 5519 unique Kepler IDs to filtered_kepler_ids.csv
