In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Load raw data
# `flux` holds one flux sequence per row and `ids` the Kepler ID for each row
# (the alignment loop further down indexes `flux` by position in `ids`).
flux = np.load("flux_sequences.npy")
ids = np.load("flux_kepler_ids.npy")
df = pd.read_csv("stellar_parameters_filtered.csv")

# Filter rows and map labels
# Keep only catalogue rows whose kepid has a flux sequence. The explicit
# .copy() makes the filtered frame independent of the one read from disk, so
# adding the "label" column below does not raise SettingWithCopyWarning.
df = df[df['kepid'].isin(ids)].copy()

# CONFIRMED -> 1, FALSE POSITIVE -> 0; any other (or missing) disposition
# maps to NaN and is then encoded as -1.
label_map = {"CONFIRMED": 1, "FALSE POSITIVE": 0}
df["label"] = df["koi_disposition"].map(label_map).fillna(-1)

# Required features
# Rows missing any of these columns are dropped, so every remaining row has a
# complete tabular feature vector.
features = ["koi_period", "koi_duration", "koi_prad", "koi_steff", "koi_srad"]
df = df.dropna(subset=features)

# Align
# Pair every flux row with the tabular features and label of the same Kepler ID.
# `flux` is indexed by position in `ids`, so both must have the same number of
# rows; otherwise trailing flux rows would be silently ignored or the lookup
# would fail part-way through.
if len(flux) != len(ids):
    raise ValueError(
        f"flux has {len(flux)} rows but ids has {len(ids)} entries; they must match"
    )

# Build the kepid lookups from column arrays instead of walking the frame with
# iterrows() twice. The feature vectors get an explicit float dtype (iterrows()
# on a mixed-type frame yields object-dtype rows). When a kepid occurs on
# several rows the last row wins, exactly as a row-by-row dict fill would.
kepids = df["kepid"].tolist()
id_to_features = dict(zip(kepids, df[features].to_numpy(dtype=float)))
id_to_label = dict(zip(kepids, df["label"].tolist()))

# (flux row index, kepid) for every ID that has a complete feature row,
# in the order the IDs appear in `ids`.
matched = [
    (i, kepid) for i, kepid in enumerate(ids.tolist()) if kepid in id_to_features
]

aligned_flux = [flux[i] for i, _ in matched]
aligned_tabular = [id_to_features[kepid] for _, kepid in matched]
# Both dicts share the same keys, so a direct lookup is safe here.
aligned_labels = [id_to_label[kepid] for _, kepid in matched]

X_flux = np.array(aligned_flux)
# Standardise each flux row independently to zero mean and unit standard
# deviation; the 1e-8 term keeps a constant row from dividing by zero.
# NOTE(review): a NaN anywhere in a row would turn the whole row into NaN -
# confirm the flux sequences are gap-filled upstream.
row_mean = X_flux.mean(axis=1, keepdims=True)
row_std = X_flux.std(axis=1, keepdims=True)
X_flux = (X_flux - row_mean) / (row_std + 1e-8)

X_tabular_raw = np.array(aligned_tabular, dtype=float)
# Labels are stored as floats after fillna(-1); cast back to integers.
y = np.array(aligned_labels).astype(int)

# Scale tabular
# Standardise the tabular feature columns with a scaler fitted on these rows.
# NOTE(review): the scaler is fitted on every aligned row; if a train/test
# split happens downstream, consider fitting on the training rows only.
scaler = StandardScaler()
scaler.fit(X_tabular_raw)
X_tabular = scaler.transform(X_tabular_raw)

# Save all npy files
# Write each array to the working directory under its fixed file name.
for target_path, array in (
    ("X_flux_aligned.npy", X_flux),
    ("X_tabular.npy", X_tabular),
    ("y.npy", y),
):
    np.save(target_path, array)

# Report what was written together with each array's shape.
print("✅ Saved:")
print(" - X_flux_aligned.npy:", X_flux.shape)
print(" - X_tabular.npy:", X_tabular.shape)
print(" - y.npy:", y.shape)


✅ Saved:
 - X_flux_aligned.npy: (1088, 2000)
 - X_tabular.npy: (1088, 5)
 - y.npy: (1088,)


In [15]:
import numpy as np
# Reload the saved labels and show how many samples carry each label value.
y = np.load("y.npy")
label_values, label_counts = np.unique(y, return_counts=True)
print((label_values, label_counts))


(array([-1,  0,  1]), array([259, 632, 197], dtype=int64))
