In [1]:
import os, re
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed

import numpy as np
import polars as pl
from tqdm import tqdm

# Try optional CuPy (GPU arrays)
try:
    import cupy as cp
    HAS_CUPY = True
except ImportError:
    HAS_CUPY = False
    cp = None


# -----------------------
# Helper functions
# -----------------------
def extract_id_from_filename(filename: str):
    """Extract kepid/KIC ID from filename like lightcurve_0_KIC6020753.csv"""
    bn = os.path.basename(filename).replace(".csv", "")
    return bn.split("_")[-1]  # e.g. KIC6020753


def normalize_and_resample_gpu(flux, target_length=2000):
    flux = cp.asarray(flux, dtype=cp.float32)

    if flux.size < 10 or cp.all(cp.isnan(flux)):
        raise ValueError("Flux array too short or all NaNs")

    # Clean bad values
    flux = cp.nan_to_num(flux, nan=0.0, posinf=0.0, neginf=0.0)
    flux = cp.clip(flux, -5, 5)

    # Normalize
    med = cp.nanmedian(flux)
    if not cp.isfinite(med) or med == 0:
        med = 1.0
    flux = (flux / med) - 1.0

    # Resample to fixed length
    x_old = cp.arange(flux.size, dtype=cp.float32)
    x_new = cp.linspace(0, flux.size - 1, target_length, dtype=cp.float32)
    flux_res = cp.interp(x_new, x_old, flux)

    return cp.asnumpy(flux_res)



def process_file(file_path, catalog_df, target_length, tabular_cols, use_gpu=False):
    """Process single lightcurve file and join with catalog"""
    try:
        # Use Polars instead of pandas for speed
        df = pl.read_csv(file_path)
        if "flux" not in df.columns:
            print(f"⚠️ Skipping {file_path}: no 'flux' column")
            return None

        flux = df["flux"].to_numpy()
        token = extract_id_from_filename(file_path)

        # Match with catalog
        kepid_digits = re.sub(r"\D", "", token)
        match = catalog_df[catalog_df["kepid"].astype(str).str.contains(kepid_digits)]
        if len(match) == 0:
            print(f"⚠️ No catalog match for {file_path} (token={token})")
            return None
        row = match.iloc[0]

        # Normalize + resample
        if use_gpu and HAS_CUPY:
            flux_res = normalize_and_resample_gpu(flux, target_length)
        else:
            # CPU fallback
            flux = flux.astype(np.float32)
            med = np.nanmedian(flux) if np.isfinite(np.nanmedian(flux)) else 1.0
            if med == 0: med = 1.0
            flux = (flux / med) - 1.0
            x_old = np.arange(len(flux))
            x_new = np.linspace(0, len(flux) - 1, target_length)
            flux_res = np.interp(x_new, x_old, flux).astype(np.float32)

        # Collect
        label = int(row["target_encoded"])
        tabular = row[tabular_cols].to_numpy(dtype=np.float32)

        return flux_res, tabular, label, row["rowid"], row["kepid"], row["kepoi_name"], os.path.basename(file_path)

    except Exception as e:
        print(f"❌ Error processing {file_path}: {e}")
        return None


# -----------------------
# Main Builder
# -----------------------
def build_dataset(catalog_path="kepler_with_lightcurves.csv",
                  lc_folder="folder",
                  out_dir="dataset_out",
                  target_length=2000,
                  workers=8,
                  use_gpu=True):

    print("📂 Loading catalog...")
    import pandas as pd
    catalog_df = pd.read_csv(catalog_path)
    print(f"✅ Catalog loaded with {len(catalog_df)} rows")

    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    exclude_cols = set(["rowid", "kepid", "kepoi_name", "koi_disposition", "target_encoded"])
    tabular_cols = [c for c in catalog_df.columns if c not in exclude_cols and 
                    np.issubdtype(catalog_df[c].dtype, np.number)]
    print(f"📊 Using {len(tabular_cols)} tabular features")

    files = sorted(Path(lc_folder).glob("*.csv"))
    print(f"🔍 Found {len(files)} lightcurve files")

    X_lc, X_tab, y, meta = [], [], [], []
    error_count = 0

    print(f"🚀 Starting sequential processing (GPU={use_gpu and HAS_CUPY})...")
    for f in tqdm(files, desc="Processing"):
        res = process_file(f, catalog_df, target_length, tabular_cols, use_gpu)
        if res is None:
            error_count += 1
            continue
        flux_res, tabular, label, rowid, kepid, kepoi_name, fname = res
        X_lc.append(flux_res)
        X_tab.append(tabular)
        y.append(label)
        meta.append([rowid, kepid, kepoi_name, fname])

    print(f"✅ Finished. Successful: {len(X_lc)}, Errors: {error_count}")

    # Convert to arrays
    print("📦 Stacking arrays...")
    X_lc = np.stack(X_lc).astype(np.float32)
    X_tab = np.stack(X_tab).astype(np.float32)
    y = np.array(y, dtype=np.int8)

    # Save
    print(f"💾 Saving to {out_dir} ...")
    np.save(out_dir / "X_lightcurves.npy", X_lc)
    np.save(out_dir / "X_catalog.npy", X_tab)
    np.save(out_dir / "y.npy", y)

    import pandas as pd
    meta_df = pd.DataFrame(meta, columns=["rowid", "kepid", "kepoi_name", "filename"])
    meta_df.to_csv(out_dir / "meta.csv", index=False)

    print("🎉 Done.")
    print(f"Shapes: X_lightcurves={X_lc.shape}, X_catalog={X_tab.shape}, y={y.shape}")

    return X_lc, X_tab, y, meta_df


In [2]:
X_lc, X_tab, y, meta = build_dataset(
    catalog_path="kepler_with_lightcurves.csv",
    lc_folder="small",
    out_dir="dataset_out",
    target_length=2000,
    workers=12,
    use_gpu=True  # set True if you want CuPy arrays
)   


📂 Loading catalog...
✅ Catalog loaded with 2165 rows
📊 Using 51 tabular features
🔍 Found 5019 lightcurve files
🚀 Starting sequential processing (GPU=True)...


Processing:  50%|████▉     | 2494/5019 [00:15<00:16, 152.90it/s]

❌ Error processing small\lightcurve_4230_KIC3749365.csv: Flux array too short or all NaNs


Processing: 100%|██████████| 5019/5019 [00:35<00:00, 142.38it/s]


✅ Finished. Successful: 5018, Errors: 1
📦 Stacking arrays...
💾 Saving to dataset_out ...
🎉 Done.
Shapes: X_lightcurves=(5018, 2000), X_catalog=(5018, 51), y=(5018,)


In [4]:
import cupy as cp
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader, random_split
from sklearn.metrics import confusion_matrix
import numpy as np

# ======================
# Load Preprocessed Data
# ======================
print("📂 Loading dataset (GPU)...")
X_lc = cp.load("dataset_out/X_lightcurves.npy")
X_tab = cp.load("dataset_out/X_catalog.npy")
y = cp.load("dataset_out/y.npy")

print("✅ Shapes:", X_lc.shape, X_tab.shape, y.shape)

# Convert to torch tensors (GPU)
X_lc_t = torch.tensor(cp.asnumpy(X_lc), dtype=torch.float32, device="cuda")
X_tab_t = torch.tensor(cp.asnumpy(X_tab), dtype=torch.float32, device="cuda")
y_t = torch.tensor(cp.asnumpy(y), dtype=torch.long, device="cuda")

# ======================
# Dataset + Dataloader
# ======================
dataset = TensorDataset(X_lc_t, X_tab_t, y_t)

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_ds, val_ds = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=64)

# ======================
# Model Definition
# ======================
class DualInputModel(nn.Module):
    def __init__(self, lc_length, tab_features, num_classes=3):
        super().__init__()
        # Lightcurve branch (1D CNN)
        self.lc_branch = nn.Sequential(
            nn.Conv1d(1, 16, kernel_size=7, stride=2, padding=3),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Conv1d(16, 32, kernel_size=5, stride=2, padding=2),
            nn.ReLU(),
            nn.MaxPool1d(2),
            nn.Conv1d(32, 64, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(16)
        )
        self.lc_fc = nn.Linear(64 * 16, 128)

        # Tabular branch
        self.tab_branch = nn.Sequential(
            nn.Linear(tab_features, 64),
            nn.ReLU(),
            nn.Linear(64, 64),
            nn.ReLU()
        )

        # Combined
        self.combined_fc = nn.Sequential(
            nn.Linear(128 + 64, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes)
        )

    def forward(self, lc, tab):
        lc = lc.unsqueeze(1)  # (batch, 1, length)
        lc = self.lc_branch(lc)
        lc = lc.view(lc.size(0), -1)
        lc = self.lc_fc(lc)

        tab = self.tab_branch(tab)

        combined = torch.cat([lc, tab], dim=1)
        out = self.combined_fc(combined)
        return out

# ======================
# Training Setup
# ======================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = DualInputModel(lc_length=X_lc.shape[1], tab_features=X_tab.shape[1]).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()

# ======================
# Training Loop
# ======================
EPOCHS = 10
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for lc_batch, tab_batch, y_batch in train_loader:
        optimizer.zero_grad()
        out = model(lc_batch, tab_batch)
        loss = criterion(out, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{EPOCHS} - Loss: {total_loss/len(train_loader):.4f}")

# ======================
# Evaluation
# ======================
model.eval()
all_preds, all_labels = [], []
with torch.no_grad():
    for lc_batch, tab_batch, y_batch in val_loader:
        out = model(lc_batch, tab_batch)
        preds = out.argmax(dim=1)
        all_preds.append(preds)
        all_labels.append(y_batch)

all_preds = torch.cat(all_preds).cpu().numpy()
all_labels = torch.cat(all_labels).cpu().numpy()

# Confusion matrix
cm = confusion_matrix(all_labels, all_preds, labels=[0,1,2])  # CANDIDATE=0, CONFIRMED=1, FALSE POSITIVE=2
from sklearn.metrics import classification_report

# Confusion matrix
cm = confusion_matrix(all_labels, all_preds, labels=[0,1,2])
print("\n📊 Confusion Matrix (rows=true, cols=pred):")
print(cm)

# Detailed metrics per class
print("\n📊 Classification Report:")
print(classification_report(all_labels, all_preds, target_names=["CANDIDATE","CONFIRMED","FALSE POSITIVE"]))

# Manual summary (macro-averaged)
accuracy = np.trace(cm) / np.sum(cm)
error_rate = 1 - accuracy

print(f"\n✅ Overall Accuracy: {accuracy:.4f}")
print(f"❌ Error Rate: {error_rate:.4f}")



📂 Loading dataset (GPU)...
✅ Shapes: (5018, 2000) (5018, 51) (5018,)
Epoch 1/10 - Loss: nan
Epoch 2/10 - Loss: nan
Epoch 3/10 - Loss: nan
Epoch 4/10 - Loss: nan
Epoch 5/10 - Loss: nan
Epoch 6/10 - Loss: nan
Epoch 7/10 - Loss: nan
Epoch 8/10 - Loss: nan
Epoch 9/10 - Loss: nan
Epoch 10/10 - Loss: nan

📊 Confusion Matrix (rows=true, cols=pred):
[[  7   0   0]
 [992   0   0]
 [  5   0   0]]

📊 Classification Report:
                precision    recall  f1-score   support

     CANDIDATE       0.01      1.00      0.01         7
     CONFIRMED       0.00      0.00      0.00       992
FALSE POSITIVE       0.00      0.00      0.00         5

      accuracy                           0.01      1004
     macro avg       0.00      0.33      0.00      1004
  weighted avg       0.00      0.01      0.00      1004


✅ Overall Accuracy: 0.0070
❌ Error Rate: 0.9930


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
