In [1]:
pip install pandas numpy scikit-learn joblib plotly ipywidgets

Note: you may need to restart the kernel to use updated packages.




In [2]:
pip install google-genai

Collecting google-genai
  Downloading google_genai-1.41.0-py3-none-any.whl.metadata (45 kB)
Downloading google_genai-1.41.0-py3-none-any.whl (245 kB)
Installing collected packages: google-genai
Successfully installed google-genai-1.41.0
Note: you may need to restart the kernel to use updated packages.




In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
import numpy as np
import joblib

# --- 1. Data Import (Kepler/KOI Data) ---
# We use skiprows=53 to bypass the metadata at the top of the NASA CSV file.
KEPLER_FILE = "cumulative_2025.10.03_00.23.38.csv"
SKIP_ROWS = 53

# Load the data
df = pd.read_csv(KEPLER_FILE, skiprows=SKIP_ROWS)

print("--- 1. Data Imported Successfully ---")
print(f"Initial shape: {df.shape}")

# --- 2. Feature and Target Selection (Simplification for MVP MLP) ---
# TARGET: koi_pdisposition (Disposition using Kepler Data: CANDIDATE or FALSE POSITIVE)
# FEATURES: A concise set of key physical/observable parameters.

TARGET_COLUMN = 'koi_pdisposition'
FEATURE_COLUMNS = [
    'koi_period',     # Orbital Period [days]
    'koi_prad',       # Planet Radius [Earth radii]
    'koi_teq',        # Equilibrium Temperature [K]
    'koi_duration',   # Transit Duration [hrs]
    'koi_impact',     # Impact Parameter
    'koi_insol',      # Insolation Flux [Earth flux]
]

# Select necessary columns
df_model = df[[TARGET_COLUMN] + FEATURE_COLUMNS].copy()

# --- 3. Data Cleaning (Crucial Step for ML) ---
# For MVP simplicity, we drop rows with ANY missing values in the selected columns.
# (In a real scenario, you might use imputation, but dropping is faster here.)
df_model.dropna(inplace=True)

print(f"2. Shape after cleaning missing values: {df_model.shape}")

# --- 4. Target Transformation (Creating Binary Y) ---
# Convert the categorical target ('CANDIDATE'/'FALSE POSITIVE') to binary (1/0)
df_model['y'] = df_model[TARGET_COLUMN].apply(lambda x: 1 if x == 'CANDIDATE' else 0)

# --- 5. Data Splitting and Scaling (Prep for MLP) ---
X = df_model[FEATURE_COLUMNS]
y = df_model['y']

# Split data into 80% training and 20% testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Scale the data (StandardScaler is mandatory for MLP/Neural Networks performance)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- 6. MLP Model Setup (Ready for Training) ---
# Setup a simple Multi-Layer Perceptron (MLP) Classifier:
# - Two hidden layers with 16 neurons each: (16, 16)
# - Max iterations set to 500 for stable convergence.
mlp = MLPClassifier(hidden_layer_sizes=(16, 16), max_iter=500, random_state=42)

print("\n--- Data Preparation Complete. Ready for MLP Training ---")
print(f"X_train_scaled shape: {X_train_scaled.shape}")

--- 1. Data Imported Successfully ---
Initial shape: (9564, 49)
2. Shape after cleaning missing values: (9201, 7)

--- Data Preparation Complete. Ready for MLP Training ---
X_train_scaled shape: (7360, 6)


In [5]:
# Train the MLP model
mlp.fit(X_train_scaled, y_train)

# Evaluate model performance (Crucial for judging the 'Working' part of MVP)
accuracy = mlp.score(X_test_scaled, y_test)
print(f"MLP Model Accuracy on Test Set: {accuracy:.4f}")

from sklearn.metrics import classification_report
y_pred = mlp.predict(X_test_scaled)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

MLP Model Accuracy on Test Set: 0.8121

Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.77      0.80       918
           1       0.79      0.86      0.82       923

    accuracy                           0.81      1841
   macro avg       0.81      0.81      0.81      1841
weighted avg       0.81      0.81      0.81      1841



In [6]:
# Save the trained model and the scaler object
joblib.dump(mlp, 'mlp_exoplanet_model.pkl')
joblib.dump(scaler, 'scaler_object.pkl')
print("\nModel and Scaler saved successfully.")


Model and Scaler saved successfully.


In [7]:
# --- STEP 7: Train the MLP Model ---
# This is the training process itself. It should run quickly since we used a simple model.
print("Starting MLP training...")
mlp.fit(X_train_scaled, y_train)
print("MLP Training Complete.")

# --- STEP 8: Evaluate Model Performance (Check the 'Working' part of the MVP) ---
from sklearn.metrics import classification_report, accuracy_score

# Make predictions on the unseen test set
y_pred = mlp.predict(X_test_scaled)
y_prob = mlp.predict_proba(X_test_scaled)[:, 1] # Probability of being a 'CANDIDATE' (class 1)

# Report results
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy on Test Set: {accuracy:.4f}")
print("\nClassification Report (Key Metric for Judges):")
print(classification_report(y_test, y_pred))

# Note: Focus on the 'precision' and 'recall' for class 1 (CANDIDATE) 
# as this shows how well the model finds true exoplanets.

Starting MLP training...
MLP Training Complete.

Model Accuracy on Test Set: 0.8121

Classification Report (Key Metric for Judges):
              precision    recall  f1-score   support

           0       0.84      0.77      0.80       918
           1       0.79      0.86      0.82       923

    accuracy                           0.81      1841
   macro avg       0.81      0.81      0.81      1841
weighted avg       0.81      0.81      0.81      1841



In [8]:
# --- STEP 9: Identify New Candidates ---
# Re-run prediction on the *original full, cleaned dataframe* to get confidence scores for all objects.

# 1. Scale the full cleaned feature set
X_all_scaled = scaler.transform(X) # Use the same scaler object trained previously

# 2. Get the probability of being a CANDIDATE (class 1)
df_model['confidence'] = mlp.predict_proba(X_all_scaled)[:, 1]

# 3. Filter for candidates with high confidence but not yet confirmed
# We look for objects currently labeled as 'FALSE POSITIVE' or those we want to re-examine, 
# where the AI confidence is high (e.g., > 90% confidence).
new_candidates = df_model[
    (df_model['koi_pdisposition'] == 'FALSE POSITIVE') & 
    (df_model['confidence'] >= 0.90)
].sort_values(by='confidence', ascending=False)

print("\n--- AI-Identified High-Confidence Candidates (Top 5) ---")
print(new_candidates.head(5))

# --- 10. Save Candidate List for the Web App ---
# This file will be the primary data source for your Streamlit/Flask app visualization.
candidates_file_name = 'ai_identified_candidates.csv'
new_candidates.to_csv(candidates_file_name, index=False)
print(f"\nCandidate list saved to: {candidates_file_name}")


--- AI-Identified High-Confidence Candidates (Top 5) ---
     koi_pdisposition  koi_period  koi_prad  koi_teq  koi_duration  \
9486   FALSE POSITIVE   14.340036      0.68    248.0         5.700   
6245   FALSE POSITIVE   40.880154      0.92    265.0         5.472   
6913   FALSE POSITIVE   24.500270      0.73    409.0         4.079   
2963   FALSE POSITIVE   25.920009      1.28    515.0         5.609   
6240   FALSE POSITIVE   37.078668      0.80    419.0         4.403   

      koi_impact  koi_insol  y  confidence  
9486      0.0060       0.89  0    0.911346  
6245      0.0160       1.17  0    0.908679  
6913      0.4050       6.61  0    0.907768  
2963      0.0540      16.59  0    0.906995  
6240      0.5101       7.27  0    0.903501  

Candidate list saved to: ai_identified_candidates.csv


In [9]:
# --- STEP 11: Save Model and Scaler ---
# Import the joblib library (you should have done this in the first step's imports)
import joblib 

# Save the trained MLP model
joblib.dump(mlp, 'mlp_exoplanet_model.pkl')

# Save the scaler object (MANDATORY, as your web app must scale new data before prediction)
joblib.dump(scaler, 'scaler_object.pkl')

print("\nModel ('mlp_exoplanet_model.pkl') and Scaler ('scaler_object.pkl') saved successfully.")


Model ('mlp_exoplanet_model.pkl') and Scaler ('scaler_object.pkl') saved successfully.
