# Exoplanet Detection with Machine Learning

## Dataset Exploration and ML Model Design

This notebook explores the NASA Kepler Objects of Interest (KOI) dataset, then trains and compares several machine learning models that classify each KOI by its disposition (the `koi_disposition` column).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set style for plots.
# Matplotlib 3.6 renamed its bundled seaborn style sheets from 'seaborn' to
# 'seaborn-v0_8'. Pick whichever name this installation provides so the cell
# does not raise OSError on older Matplotlib; skip styling if neither exists.
_seaborn_style = next(
    (name for name in ('seaborn-v0_8', 'seaborn') if name in plt.style.available),
    None,
)
if _seaborn_style is not None:
    plt.style.use(_seaborn_style)
sns.set_palette("husl")

In [None]:
# Read the Kepler KOI table from disk, then report its dimensions and schema
df = pd.read_csv('data/koi_data.csv')

print(f"Dataset shape: {df.shape}")
print("\nDataset info:")
df.info()

In [None]:
# Preview the leading ten records of the table (rendered as the cell output)
df.head(n=10)

In [None]:
# Inspect the class labels stored in the target column (koi_disposition)
disposition = df['koi_disposition']
print("Unique values in koi_disposition:", disposition.unique())
print("\nDisposition counts:")
print(disposition.value_counts())

# Bar chart of how many rows fall into each disposition class
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='koi_disposition', ax=ax)
ax.set_title('Distribution of Exoplanet Disposition')
ax.tick_params(axis='x', rotation=45)
plt.show()

In [None]:
# Count the missing entries in every column
print("Missing values in each column:")
missing_per_column = df.isna().sum()
print(missing_per_column)

# Summary statistics of the numeric columns (rendered as the cell output)
print("\nNumerical columns statistics:")
df.describe()

In [None]:
# Histogram of each selected numerical feature, one panel per feature.
# numerical_cols is reused by later cells as the model's feature list.
numerical_cols = ['koi_period', 'koi_time0bk', 'koi_duration', 'koi_depth', 'koi_prad', 'koi_teq']

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
axes = axes.ravel()

for ax, col in zip(axes, numerical_cols):
    # Missing values are dropped so they do not break the binning
    ax.hist(df[col].dropna(), bins=50, edgecolor='black')
    ax.set_title(f'Distribution of {col}')
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')

fig.tight_layout()
plt.show()

In [None]:
# Pairwise correlations between the selected numerical features
corr = df.loc[:, numerical_cols].corr()

# Annotated heatmap with the colour scale centred on zero correlation
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', center=0, square=True, ax=ax)
ax.set_title('Correlation Matrix of Numerical Features')
plt.show()

In [None]:
# Compare every numerical feature across the disposition classes:
# one boxplot panel per feature, grouped by koi_disposition
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(16, 10))
axes = axes.ravel()

for ax, col in zip(axes, numerical_cols):
    sns.boxplot(data=df, x='koi_disposition', y=col, ax=ax)
    ax.set_title(f'{col} by Disposition')
    ax.tick_params(axis='x', rotation=45)

fig.tight_layout()
plt.show()

In [None]:
# Data preprocessing for ML model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.impute import SimpleImputer

# Prepare features and target variable
X = df[numerical_cols].copy()
y = df['koi_disposition'].copy()

# Encode target variable as integer class ids (label_encoder.classes_ keeps the names)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data BEFORE imputing: fitting the imputer on the full table would
# let the held-out test rows influence the medians (data leakage).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Handle missing values with median imputation, learned from the training split only
imputer = SimpleImputer(strategy='median')
X_train = pd.DataFrame(
    imputer.fit_transform(X_train_raw), columns=X.columns, index=X_train_raw.index
)
X_test = pd.DataFrame(
    imputer.transform(X_test_raw), columns=X.columns, index=X_test_raw.index
)

# Full feature table filled with the training medians; kept so the name
# X_imputed remains available to any cell that still refers to it.
X_imputed = pd.DataFrame(imputer.transform(X), columns=X.columns)

print(f"Training set size: {X_train.shape}")
print(f"Test set size: {X_test.shape}")
print(f"Classes: {label_encoder.classes_}")

In [None]:
# Standardise the features; the scaler is fitted on the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate classifiers, keyed by the name used in reports and in `results`
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000)
}

# Test-set accuracy per model name
results = {}

for name, model in models.items():
    # Logistic Regression is trained on the scaled features; the Random Forest
    # is trained on the unscaled ones
    use_scaled = name == 'Logistic Regression'
    fit_features = X_train_scaled if use_scaled else X_train
    eval_features = X_test_scaled if use_scaled else X_test

    model.fit(fit_features, y_train)
    y_pred = model.predict(eval_features)

    # Score on the held-out split and record it for the later comparison
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy

    print(f"\n{name} Results:")
    print(f"Accuracy: {accuracy:.4f}")
    print("Classification Report:")
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

In [None]:
# Rank the input features by the importance scores of the fitted Random Forest
rf_model = models['Random Forest']
feature_importance = (
    pd.DataFrame({'feature': X.columns, 'importance': rf_model.feature_importances_})
    .sort_values(by='importance', ascending=False)
)

# Horizontal bar chart, most important feature first
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=feature_importance, x='importance', y='feature', ax=ax)
ax.set_title('Feature Importance for Exoplanet Classification (Random Forest)')
plt.show()

print("Feature Importance:")
print(feature_importance)

In [None]:
# Try a more sophisticated model with ensemble methods
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier

# Adding Gradient Boosting
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Register the estimator unconditionally. Guarding with
# `if 'Gradient Boosting' not in models` meant that re-running this cell kept
# the estimator object from the previous run in `models` while a different,
# freshly created one was the object actually fitted and scored below.
models['Gradient Boosting'] = gb_model

# Train Gradient Boosting on the unscaled features and score on the test split
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)
accuracy_gb = accuracy_score(y_test, y_pred_gb)
results['Gradient Boosting'] = accuracy_gb

print("\nGradient Boosting Results:")
print(f"Accuracy: {accuracy_gb:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_gb, target_names=label_encoder.classes_))

In [None]:
# Create ensemble of models: soft voting over fresh copies of the three classifiers
ensemble_model = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
        ('lr', LogisticRegression(random_state=42, max_iter=1000)),
        ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42))
    ],
    voting='soft'
)

# Train ensemble model on the scaled features, which the Logistic Regression
# member needs. The former `if 'Logistic Regression' in models` check was always
# true (that key is set when `models` is created), so its unscaled `else`
# branch could never run and has been removed.
ensemble_model.fit(X_train_scaled, y_train)
y_pred_ensemble = ensemble_model.predict(X_test_scaled)

accuracy_ensemble = accuracy_score(y_test, y_pred_ensemble)
results['Ensemble'] = accuracy_ensemble

print("\nEnsemble Model Results:")
print(f"Accuracy: {accuracy_ensemble:.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred_ensemble, target_names=label_encoder.classes_))

In [None]:
# Tabulate the test accuracy recorded so far for every model
model_names = list(results)
model_scores = [results[model_name] for model_name in model_names]

print("Model Comparison:")
for model_name, accuracy in zip(model_names, model_scores):
    print(f"{model_name}: {accuracy:.4f}")

# Bar chart of the same accuracies, one bar per model
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=model_names, y=model_scores, ax=ax)
ax.set_title('Model Accuracy Comparison')
ax.set_ylabel('Accuracy')
ax.tick_params(axis='x', rotation=45)
plt.show()

In [None]:
# Try Deep Learning approach with TensorFlow/Keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

# Seed TensorFlow's global random generator so weight initialisation and
# dropout are repeatable between runs (the sklearn models above already use
# random_state=42).
tf.random.set_seed(42)

# Encode labels for neural network (one indicator column per class).
# NOTE(review): LabelBinarizer emits a single column for a two-class target,
# which would not match the softmax / categorical_crossentropy setup below;
# this cell assumes koi_disposition has at least three classes - confirm.
from sklearn.preprocessing import LabelBinarizer
label_binarizer = LabelBinarizer()
y_train_nn = label_binarizer.fit_transform(y_train)
y_test_nn = label_binarizer.transform(y_test)

# Define and compile the neural network.
# The input width is declared with an explicit keras.Input instead of the
# deprecated `input_shape=` argument on the first Dense layer.
model_nn = keras.Sequential([
    keras.Input(shape=(X_train_scaled.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(32, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(16, activation='relu'),
    layers.Dense(y_train_nn.shape[1], activation='softmax')  # Output layer with softmax for multi-class
])

model_nn.compile(optimizer='adam',
                 loss='categorical_crossentropy',
                 metrics=['accuracy'])

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model_nn.summary()

In [None]:
# Fit the network on the scaled training features; validation_split=0.2 makes
# Keras hold out part of the training data for the validation curves
history = model_nn.fit(
    X_train_scaled,
    y_train_nn,
    validation_split=0.2,
    batch_size=32,
    epochs=50,
    verbose=1,
)

# Score on the held-out test split and record it next to the other models
test_loss, test_accuracy = model_nn.evaluate(X_test_scaled, y_test_nn, verbose=0)
results['Neural Network'] = test_accuracy

print("\nNeural Network Results:")
print(f"Test Accuracy: {test_accuracy:.4f}")

# Learning curves: accuracy on the left panel, loss on the right
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 4))
panels = [
    ('accuracy', 'val_accuracy', 'Accuracy'),
    ('loss', 'val_loss', 'Loss'),
]

for ax, (train_key, val_key, metric_label) in zip(axes, panels):
    ax.plot(history.history[train_key], label=f'Training {metric_label}')
    ax.plot(history.history[val_key], label=f'Validation {metric_label}')
    ax.set_title(f'Model {metric_label}')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(metric_label)
    ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Final model comparison across every entry recorded in `results`
print("Final Model Comparison:")
for model_name in results:
    print(f"{model_name}: {results[model_name]:.4f}")

# Identify the model with the highest recorded accuracy (ties resolve to the
# earliest entry). Only the name is selected here; nothing is written to disk.
best_model_name = max(results.items(), key=lambda entry: entry[1])[0]
print(f"\nBest Model: {best_model_name} with accuracy: {results[best_model_name]:.4f}")

In [None]:
# Final model selection and recommendation, printed as a plain-text report
banner = "=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#=#\n"

# Recommendation text per winning model; any other model gets a generic line
recommendation_by_model = {
    'Ensemble': "- Use ensemble method combining multiple algorithms for robustness",
    'Random Forest': "- Use Random Forest for good balance of accuracy and interpretability",
    'Neural Network': "- Use Neural Network for potentially higher accuracy with more data",
}
recommendation = recommendation_by_model.get(
    best_model_name, f"- Use {best_model_name} model"
)

summary_lines = [
    "\n",
    banner,
    "MODEL DESIGN SUMMARY FOR EXOPLANET DETECTION",
    banner,
    "Dataset: NASA Kepler Objects of Interest (KOI)",
    f"Dataset Size: {len(df)} samples",
    f"Features: {len(numerical_cols)} numerical features",
    f"Target Classes: {list(label_encoder.classes_)}",
    "",
    "Features Used:",
    *[f"- {feature}" for feature in numerical_cols],
    "",
    "Best Performing Model:",
    f"- {best_model_name}",
    f"- Accuracy: {results[best_model_name]:.4f}",
    "",
    "Recommended Approach:",
    recommendation,
    "",
    "Additional Considerations:",
    "- Implement feature engineering to extract more meaningful patterns from light curves",
    "- Consider using CNNs or LSTMs for direct analysis of light curve time-series data",
    "- Apply cross-validation for more robust model evaluation",
    banner,
]

# One print call per entry, so the output matches a sequence of print statements
for line in summary_lines:
    print(line)