In [1]:
import os
import cupy as cp
import pandas as pd
import numpy as np
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import xgboost as xgb

# =====================================================
# 1. Load & preprocess train_ready.csv
# =====================================================
print("📂 Loading train_ready.csv ...")
df = pd.read_csv("train_ready.csv")

# Encode categorical/text columns
cat_cols = df.select_dtypes(include=["object"]).columns.tolist()
for col in cat_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))

y = df["target"]
X = df.drop(["target"], axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

print("✅ Tabular data shape:", X_train.shape)

# =====================================================
# 2. XGBoost GPU model
# =====================================================
print("🚀 Training XGBoost (GPU) ...")
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

params = {
    "objective": "binary:logistic",
    "tree_method": "gpu_hist",
    "predictor": "gpu_predictor",
    "eval_metric": "logloss",
    "scale_pos_weight": len(y_train[y_train==0]) / len(y_train[y_train==1])
}

bst = xgb.train(params, dtrain, num_boost_round=200, evals=[(dtest, "test")])
preds_xgb = (bst.predict(dtest) > 0.5).astype(int)

print("📊 XGBoost Classification Report:")
print(classification_report(y_test, preds_xgb))


📂 Loading train_ready.csv ...
✅ Tabular data shape: (7651, 117)
🚀 Training XGBoost (GPU) ...
[0]	test-logloss:0.47778
[1]	test-logloss:0.35735
[2]	test-logloss:0.28115
[3]	test-logloss:0.23098
[4]	test-logloss:0.19511
[5]	test-logloss:0.17033
[6]	test-logloss:0.15121
[7]	test-logloss:0.13867
[8]	test-logloss:0.12907



    E.g. tree_method = "hist", device = "cuda"

Parameters: { "predictor" } are not used.



[9]	test-logloss:0.12312
[10]	test-logloss:0.11719
[11]	test-logloss:0.11301
[12]	test-logloss:0.10921
[13]	test-logloss:0.10649
[14]	test-logloss:0.10458
[15]	test-logloss:0.10332
[16]	test-logloss:0.10279
[17]	test-logloss:0.10224
[18]	test-logloss:0.10128
[19]	test-logloss:0.10012
[20]	test-logloss:0.09988
[21]	test-logloss:0.09947
[22]	test-logloss:0.09942
[23]	test-logloss:0.09964
[24]	test-logloss:0.09947
[25]	test-logloss:0.09956
[26]	test-logloss:0.09924
[27]	test-logloss:0.09879
[28]	test-logloss:0.09936
[29]	test-logloss:0.09997
[30]	test-logloss:0.10102
[31]	test-logloss:0.10098
[32]	test-logloss:0.10119
[33]	test-logloss:0.10093
[34]	test-logloss:0.10090
[35]	test-logloss:0.10103
[36]	test-logloss:0.10120
[37]	test-logloss:0.10127
[38]	test-logloss:0.10241
[39]	test-logloss:0.10206
[40]	test-logloss:0.10226
[41]	test-logloss:0.10223
[42]	test-logloss:0.10226
[43]	test-logloss:0.10234
[44]	test-logloss:0.10266
[45]	test-logloss:0.10250
[46]	test-logloss:0.10253
[47]	test-log


    E.g. tree_method = "hist", device = "cuda"



In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder

# ======================
# Load Dataset
# ======================
df = pd.read_csv("train_ready.csv")

# Drop columns that are identifiers or comments
drop_cols = ["rowid", "kepid", "kepoi_name", "koi_disposition", 
             "koi_vet_date", "koi_comment"]  # adjust if needed
df = df.drop(columns=[c for c in drop_cols if c in df.columns], errors="ignore")

# Encode categorical columns
label_encoders = {}
for col in df.select_dtypes(include=["object"]).columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le

# Separate target
y = df["target"]
X = df.drop(columns=["target"], errors="ignore")

# ======================
# Train/Val Split
# ======================
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ======================
# RandomForest Model
# ======================
rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    class_weight="balanced",
    n_jobs=-1,
    random_state=42
)

print("🚀 Training RandomForest...")
rf.fit(X_train, y_train)

# ======================
# Evaluation
# ======================
y_pred = rf.predict(X_val)

print("\n📊 Confusion Matrix:")
print(confusion_matrix(y_val, y_pred))

print("\n📊 Classification Report:")
print(classification_report(y_val, y_pred, digits=4))


🚀 Training RandomForest...

📊 Confusion Matrix:
[[1336   28]
 [  52  497]]

📊 Classification Report:
              precision    recall  f1-score   support

           0     0.9625    0.9795    0.9709      1364
           1     0.9467    0.9053    0.9255       549

    accuracy                         0.9582      1913
   macro avg     0.9546    0.9424    0.9482      1913
weighted avg     0.9580    0.9582    0.9579      1913



In [3]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

class KeplerDataset(Dataset):
    def __init__(self, csv_file, lightcurve_dir, seq_len=2000):
        """
        Args:
            csv_file (str): Path to train_ready.csv
            lightcurve_dir (str): Directory with lightcurve CSVs
            seq_len (int): Fixed sequence length for lightcurves
        """
        self.df = pd.read_csv(csv_file)
        self.lightcurve_dir = lightcurve_dir
        self.seq_len = seq_len

        # Drop text/date fields
        drop_cols = ["rowid", "kepid", "kepoi_name", "koi_disposition", 
                     "koi_vet_date", "koi_comment"]
        self.df = self.df.drop(columns=[c for c in drop_cols if c in self.df.columns], errors="ignore")

        # Encode categoricals
        from sklearn.preprocessing import LabelEncoder
        for col in self.df.select_dtypes(include=["object"]).columns:
            le = LabelEncoder()
            self.df[col] = le.fit_transform(self.df[col].astype(str))

        # Split features/target
        self.y = torch.tensor(self.df["target"].values, dtype=torch.float32)
        self.X = self.df.drop(columns=["target"], errors="ignore").values.astype(np.float32)

        self.kepids = self.df["kepid"].values if "kepid" in self.df.columns else np.arange(len(self.df))

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Tabular features
        tabular = torch.tensor(self.X[idx], dtype=torch.float32)

        # Try to load lightcurve
        kepid = self.kepids[idx]
        lc_file = os.path.join(self.lightcurve_dir, f"lightcurve_{idx}_KIC{kepid}.csv")
        if os.path.exists(lc_file):
            lc = pd.read_csv(lc_file)
            flux = lc["flux"].values.astype(np.float32)

            # Pad or truncate
            if len(flux) < self.seq_len:
                pad = np.zeros(self.seq_len, dtype=np.float32)
                pad[:len(flux)] = flux
                flux = pad
            else:
                flux = flux[:self.seq_len]
        else:
            # Missing LC → return zeros
            flux = np.zeros(self.seq_len, dtype=np.float32)

        flux = torch.tensor(flux, dtype=torch.float32).unsqueeze(0)  # (1, seq_len)

        return flux, tabular, self.y[idx]


In [4]:
import torch.nn as nn
import torch.nn.functional as F

class DualBranchNN(nn.Module):
    def __init__(self, tabular_dim, seq_len=2000, num_classes=1):
        super(DualBranchNN, self).__init__()

        # Lightcurve CNN branch
        self.cnn_branch = nn.Sequential(
            nn.Conv1d(1, 16, kernel_size=5, stride=2, padding=2),
            nn.BatchNorm1d(16),
            nn.ReLU(),
            nn.MaxPool1d(2),

            nn.Conv1d(16, 32, kernel_size=5, stride=2, padding=2),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.MaxPool1d(2),

            nn.Conv1d(32, 64, kernel_size=3, stride=2, padding=1),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(32)
        )
        self.cnn_fc = nn.Linear(64 * 32, 128)

        # Tabular MLP branch
        self.mlp_branch = nn.Sequential(
            nn.Linear(tabular_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),

            nn.Linear(256, 128),
            nn.ReLU()
        )

        # Fusion
        self.fusion = nn.Sequential(
            nn.Linear(128 + 128, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, num_classes)
        )

    def forward(self, flux, tabular):
        cnn_out = self.cnn_branch(flux)
        cnn_out = cnn_out.view(cnn_out.size(0), -1)
        cnn_out = self.cnn_fc(cnn_out)

        mlp_out = self.mlp_branch(tabular)

        combined = torch.cat((cnn_out, mlp_out), dim=1)
        out = self.fusion(combined)

        return torch.sigmoid(out).squeeze()


In [7]:
import torch
from torch.utils.data import random_split, DataLoader
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm

# ====================
# Dataset & DataLoader
# ====================
dataset = KeplerDataset("train_ready.csv", "output_lightcurves", seq_len=2000)

train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(
    train_dataset, batch_size=16, shuffle=True, num_workers=0, pin_memory=True
)
val_loader = DataLoader(
    val_dataset, batch_size=16, shuffle=False, num_workers=0, pin_memory=True
)

# ====================
# Model
# ====================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"🚀 Using {device}")

tabular_dim = dataset.X.shape[1]
model = DualBranchNN(tabular_dim).to(device)

# use BCEWithLogitsLoss (more stable than BCELoss with sigmoid)
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

# ====================
# Training Loop
# ====================
EPOCHS = 10
for epoch in range(EPOCHS):
    model.train()
    running_loss = 0.0

    train_pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS} [Train]", leave=False)
    for flux, tab, labels in train_pbar:
        flux = flux.to(device, non_blocking=True)
        tab = tab.to(device, non_blocking=True)
        labels = labels.float().unsqueeze(1).to(device, non_blocking=True)  # (B,1)

        optimizer.zero_grad()
        outputs = model(flux, tab).view(-1, 1)   # ensure (B,1)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        train_pbar.set_postfix({"loss": f"{loss.item():.4f}"})

    avg_train_loss = running_loss / len(train_loader)

    # Validation
    model.eval()
    val_loss = 0.0
    y_true, y_pred = [], []
    val_pbar = tqdm(val_loader, desc=f"Epoch {epoch+1}/{EPOCHS} [Val]", leave=False)
    with torch.no_grad():
        for flux, tab, labels in val_pbar:
            flux = flux.to(device, non_blocking=True)
            tab = tab.to(device, non_blocking=True)
            labels = labels.float().unsqueeze(1).to(device, non_blocking=True)

            outputs = model(flux, tab).view(-1, 1)   # ensure (B,1)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            preds = (torch.sigmoid(outputs) > 0.5).int().cpu().numpy()
            y_true.extend(labels.cpu().numpy())
            y_pred.extend(preds)

    avg_val_loss = val_loss / len(val_loader)

    print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

# ====================
# Final Evaluation
# ====================
print("\n📊 Confusion Matrix:")
print(confusion_matrix(y_true, y_pred))

print("\n📊 Classification Report:")
print(classification_report(y_true, y_pred, digits=4))


🚀 Using cuda


                                                                                   

Epoch 1/10 | Train Loss: nan | Val Loss: nan


                                                                                

Epoch 2/10 | Train Loss: nan | Val Loss: nan


                                                                                

Epoch 3/10 | Train Loss: nan | Val Loss: nan


                                                                                

Epoch 4/10 | Train Loss: nan | Val Loss: nan


                                                                                

Epoch 5/10 | Train Loss: nan | Val Loss: nan


                                                                                

Epoch 6/10 | Train Loss: nan | Val Loss: nan


                                                                                

Epoch 7/10 | Train Loss: nan | Val Loss: nan


                                                                                

Epoch 8/10 | Train Loss: nan | Val Loss: nan


                                                                                

Epoch 9/10 | Train Loss: nan | Val Loss: nan


                                                                                 

Epoch 10/10 | Train Loss: nan | Val Loss: nan

📊 Confusion Matrix:
[[1357    0]
 [ 556    0]]

📊 Classification Report:
              precision    recall  f1-score   support

         0.0     0.7094    1.0000    0.8300      1357
         1.0     0.0000    0.0000    0.0000       556

    accuracy                         0.7094      1913
   macro avg     0.3547    0.5000    0.4150      1913
weighted avg     0.5032    0.7094    0.5887      1913



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
