In [6]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    classification_report, confusion_matrix, roc_curve, auc
)
from imblearn.over_sampling import RandomOverSampler
from xgboost import XGBClassifier

*A prediktáláshoz szükséges bementi featurek:*

koi_period – keringési periódus (napokban).

koi_time0bk / koi_time0 – az első tranzit kezdőidőpontja (BJD = Barycentric Julian Date).

koi_eccen – excentricitás (pálya elliptikussága).

koi_longp – periasztron hossztengelye.

koi_impact – a tranzit ütközési paramétere (központi vagy széli áthaladás).

koi_duration – a tranzit időtartama (órákban).

koi_depth – a fényességcsökkenés mértéke ppm-ben (parts per million).

koi_ror – a bolygó és a csillag sugarának aránya.

koi_prad – bolygó sugara (Föld-sugarakban).

koi_sma – fél nagytengely (csillag–bolygó távolság, csillag-sugarakban).

koi_incl – inklináció (pályahajlás fokban).

*Amire predikálunk:*

koi_disposition – végső besorolás:

CANDIDATE = bolygójelölt

CONFIRMED = megerősített exobolygó

FALSE POSITIVE = hamis találat

In [7]:
def load_dataset(split: bool = False) -> pd.DataFrame:
    df = pd.read_csv("C:\\Users\\Bence\\Documents\\Github\\NASA_SpaceApps_challenge_2025\\data\\kepler_KOI_full_dataset.csv")
    features = [
        "koi_period",
        "koi_time0bk",
        "koi_impact",
        "koi_duration",
        "koi_depth",
        "koi_ror",
        "koi_prad",
        "koi_sma",
        "koi_incls",
        "koi_disposition",
    ]
    df = df.filter(features)
    df["koi_disposition"] = np.where(df["koi_disposition"] == "CONFIRMED", 1, 0)

    if split:
        y = df["koi_disposition"]
        X = df.drop(columns=["koi_disposition"])
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        return X_train, X_test, y_train, y_test
    
    return df

def correlation_heatmap(df: pd.DataFrame) -> None:
    corr = df.corr()
    sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm")
    sns.set(rc={"figure.figsize":(11.7,8.27)})

def oversample_dataframe(df: pd.DataFrame, label_col: str) -> pd.DataFrame:
    ros = RandomOverSampler(random_state=42)
    X = df.drop(columns=[label_col])
    y = df[label_col]
    
    X_resampled, y_resampled = ros.fit_resample(X, y)
    df_resampled = pd.DataFrame(X_resampled, columns=X.columns)
    df_resampled[label_col] = y_resampled
    return df_resampled.sample(frac=1, random_state=42).reset_index(drop=True)

def balanced_sampler(df: pd.DataFrame, label_col: str) -> pd.DataFrame:
    grouped = df.groupby(label_col)
    min_count = grouped.size().min()
    
    # Minden csoportból min_count darabot véletlenszerűen mintavételezünk
    sampled = grouped.apply(lambda x: x.sample(min_count, random_state=42)).reset_index(drop=True)
    
    # Az eredmény sorainak véletlenszerű összekeverése, hogy ne legyen blokkonként 0 vagy 1
    shuffled = sampled.sample(frac=1, random_state=42).reset_index(drop=True)
    return shuffled

def evaluate_random_forest(model, X_test, y_test, class_names=None, show_plot: bool = False) -> None:
    # Predikciók
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:,1] if hasattr(model, "predict_proba") else None
    
    # Metrikák számítása
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='binary')
    recall = recall_score(y_test, y_pred, average='binary')
    f1 = f1_score(y_test, y_pred, average='binary')
    roc_auc = roc_auc_score(y_test, y_proba) if y_proba is not None else None
    
    # Kiírás
    print("Classification Report:\n", classification_report(y_test, y_pred, target_names=class_names))
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    if roc_auc is not None:
        print(f"ROC AUC Score: {roc_auc:.4f}")
    if show_plot:
        # Confusion matrix kirajzolás
        cm = confusion_matrix(y_test, y_pred)
        plt.figure(figsize=(6,5))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
        plt.xlabel('Predicted')
        plt.ylabel('Actual')
        plt.title('Confusion Matrix')
        plt.show()
        
        # ROC görbe kirajzolása ha lehetséges
        if y_proba is not None:
            fpr, tpr, _ = roc_curve(y_test, y_proba)
            plt.figure(figsize=(6,5))
            plt.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc:.2f})')
            plt.plot([0,1], [0,1], linestyle='--', color='gray')
            plt.xlabel('False Positive Rate')
            plt.ylabel('True Positive Rate')
            plt.title('ROC Curve')
            plt.legend(loc='lower right')
            plt.show()

In [8]:
df = load_dataset()
#correlation_heatmap(df)
print(df["koi_disposition"].value_counts())
print(df.head())

koi_disposition
0    6818
1    2746
Name: count, dtype: int64
   koi_period  koi_time0bk  koi_impact  koi_duration  koi_depth   koi_ror  \
0    9.488036   170.538750       0.146       2.95750      615.8  0.022344   
1   54.418383   162.513840       0.586       4.50700      874.8  0.027954   
2   19.899140   175.850252       0.969       1.78220    10829.0  0.154046   
3    1.736952   170.307565       1.276       2.40641     8079.2  0.387394   
4    2.525592   171.595550       0.701       1.65450      603.3  0.024064   

   koi_prad  koi_sma  koi_disposition  
0      2.26   0.0853                1  
1      2.83   0.2734                1  
2     14.60   0.1419                0  
3     33.46   0.0267                0  
4      2.75   0.0374                1  


In [9]:
X_train, X_test, y_train, y_test = load_dataset(split=True)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
evaluate_random_forest(model, X_test, y_test, class_names=["FALSE POSITIVE", "CONFIRMED"])

Classification Report:
                 precision    recall  f1-score   support

FALSE POSITIVE       0.87      0.91      0.89      1344
     CONFIRMED       0.77      0.67      0.71       569

      accuracy                           0.84      1913
     macro avg       0.82      0.79      0.80      1913
  weighted avg       0.84      0.84      0.84      1913

Accuracy: 0.8400
Precision: 0.7657
Recall: 0.6661
F1 Score: 0.7124
ROC AUC Score: 0.9156


In [10]:
df = load_dataset()
df = balanced_sampler(df=df, label_col="koi_disposition")
y = df["koi_disposition"]
X = df.drop(columns=["koi_disposition"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
evaluate_random_forest(model, X_test, y_test, class_names=["FALSE POSITIVE", "CONFIRMED"])

  sampled = grouped.apply(lambda x: x.sample(min_count, random_state=42)).reset_index(drop=True)


Classification Report:
                 precision    recall  f1-score   support

FALSE POSITIVE       0.87      0.79      0.82       547
     CONFIRMED       0.81      0.88      0.84       552

      accuracy                           0.83      1099
     macro avg       0.84      0.83      0.83      1099
  weighted avg       0.84      0.83      0.83      1099

Accuracy: 0.8335
Precision: 0.8060
Recall: 0.8804
F1 Score: 0.8416
ROC AUC Score: 0.9162


In [11]:
df = load_dataset()
df = oversample_dataframe(df=df, label_col="koi_disposition")
y = df["koi_disposition"]
X = df.drop(columns=["koi_disposition"])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
evaluate_random_forest(model, X_test, y_test, class_names=["FALSE POSITIVE", "CONFIRMED"])



Classification Report:
                 precision    recall  f1-score   support

FALSE POSITIVE       0.97      0.89      0.93      1378
     CONFIRMED       0.90      0.97      0.93      1350

      accuracy                           0.93      2728
     macro avg       0.93      0.93      0.93      2728
  weighted avg       0.93      0.93      0.93      2728

Accuracy: 0.9322
Precision: 0.9003
Recall: 0.9704
F1 Score: 0.9340
ROC AUC Score: 0.9837


In [12]:
X_train, X_test, y_train, y_test = load_dataset(split=True)
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)
evaluate_random_forest(model, X_test, y_test, class_names=["FALSE POSITIVE", "CONFIRMED"])

Classification Report:
                 precision    recall  f1-score   support

FALSE POSITIVE       0.88      0.90      0.89      1344
     CONFIRMED       0.74      0.70      0.72       569

      accuracy                           0.84      1913
     macro avg       0.81      0.80      0.80      1913
  weighted avg       0.84      0.84      0.84      1913

Accuracy: 0.8374
Precision: 0.7407
Recall: 0.6977
F1 Score: 0.7186
ROC AUC Score: 0.9146


Parameters: { "use_label_encoder" } are not used.



In [13]:
df = load_dataset()
df = balanced_sampler(df=df, label_col="koi_disposition")
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=["koi_disposition"]), df["koi_disposition"], test_size=0.2, random_state=42)
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)
evaluate_random_forest(model, X_test, y_test, class_names=["FALSE POSITIVE", "CONFIRMED"])

Classification Report:
                 precision    recall  f1-score   support

FALSE POSITIVE       0.87      0.79      0.83       547
     CONFIRMED       0.81      0.88      0.85       552

      accuracy                           0.84      1099
     macro avg       0.84      0.84      0.84      1099
  weighted avg       0.84      0.84      0.84      1099

Accuracy: 0.8380
Precision: 0.8106
Recall: 0.8841
F1 Score: 0.8458
ROC AUC Score: 0.9140


  sampled = grouped.apply(lambda x: x.sample(min_count, random_state=42)).reset_index(drop=True)
Parameters: { "use_label_encoder" } are not used.



Buckle up for this rage induced down spiraling shit tornado called deep learning

*yeeeeehaaaw*

In [14]:
import tensorflow as tf

In [56]:
df = load_dataset()
y = df["koi_disposition"]
X = df.drop(columns=["koi_disposition"])
y = y.to_numpy()
X = X.to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [57]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

def build_mlp(input_dim=8):
    model = Sequential([
        Dense(64, activation='relu', input_shape=(input_dim,)),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(learning_rate=0.001),
                  loss='binary_crossentropy',
                  metrics=['accuracy', 'Precision', 'Recall'])
    return model

# Tanítás
model = build_mlp()
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2, callbacks=[early_stop])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
