# Cosmic Planet Classification

This notebook implements a machine learning pipeline to classify planets based on their features using a combination of deep learning (TensorFlow), Random Forest, and XGBoost, followed by an ensemble prediction. The goal is to predict planet types from a dataset and generate a submission file.

In [19]:
import pandas as pd #type: ignore
import numpy as np #type: ignore
import tensorflow as tf #type: ignore
from sklearn.preprocessing import StandardScaler #type: ignore
from sklearn.impute import KNNImputer #type: ignore
from sklearn.model_selection import train_test_split #type: ignore
from sklearn.metrics import accuracy_score #type: ignore
import xgboost as xgb #type: ignore
import lightgbm as lgb
from tensorflow.keras.utils import to_categorical #type: ignore
from tensorflow.keras.regularizers import l2 #type: ignore
import os

# Display names for the planet classes, keyed by 1-based class ID.
# The tuple position (starting at 1) is the class ID.
# NOTE(review): the labels for IDs 8-10 look misspelled/garbled; they are
# runtime strings and are kept verbatim here - confirm against the dataset
# description before changing them.
PLANET_TYPES = dict(enumerate((
    "Bewohnbar",
    "Terraformierbar",
    "Rohstoffreich",
    "Wissenschaftlich",
    "Gasriese",
    "Wüstenplanet",
    "Eiswelt",
    "Toxischetmosäre",
    "Hohestrahlung",
    "Toterahswelt",
), start=1))

# Seed NumPy and TensorFlow so repeated runs start from the same random state.
np.random.seed(42)
tf.random.set_seed(42)

# Module-level KNN imputer. The functions visible in this notebook build or
# receive their own imputer, so this instance is not used by them.
imputer = KNNImputer(n_neighbors=5)

## Data Preprocessing Function

This function loads and preprocesses the data, handling missing values, generating interaction features, and scaling numerical columns.

In [20]:
def _decode_category_codes(df, columns=('Magnetic Field Strength', 'Radiation Levels')):
    """Turn 'Category_<n>' strings in the given columns into floats, in place.

    Any value that is not a string containing 'Category_' becomes NaN.
    Columns that are absent from ``df`` are skipped.
    """
    def _to_code(value):
        if isinstance(value, str) and 'Category_' in value:
            return float(value.replace('Category_', ''))
        return np.nan

    for col in columns:
        if col in df.columns:
            df[col] = df[col].apply(_to_code)
    return df


def _add_interaction_features(df, feature_cols):
    """Return ``df`` with one product column appended per unordered feature pair."""
    interactions = {
        f'{col1}_{col2}_interaction': df[col1] * df[col2]
        for i, col1 in enumerate(feature_cols)
        for col2 in feature_cols[i + 1:]
    }
    if not interactions:
        return df
    # A single concat keeps the original column order (existing columns first,
    # then the pairs in loop order) without inserting columns one at a time.
    return pd.concat([df, pd.DataFrame(interactions, index=df.index)], axis=1)


def load_and_preprocess_data(filepath, is_train=True, imputer=None, scaler=None, feature_cols=None):
    """Load a CSV and apply decoding, imputation, interaction features and scaling.

    Training mode (``is_train=True``) drops rows whose 'Prediction' is not
    >= 0, fits a new ``KNNImputer`` and ``StandardScaler``, and derives the
    feature list from the file. Inference mode reuses the fitted objects and
    selects exactly ``feature_cols`` (in that order) from the file, so the
    column layout always matches what the imputer and scaler were fitted on.

    Args:
        filepath: Path of the CSV file to read.
        is_train: Whether to fit preprocessing objects (True) or reuse them.
        imputer: Fitted imputer; required when ``is_train`` is False.
        scaler: Fitted scaler; required when ``is_train`` is False.
        feature_cols: Base feature names returned by the training call;
            required when ``is_train`` is False.

    Returns:
        Training mode: ``(df, imputer, scaler, feature_cols, planet_ids)``
        where ``df`` still contains the unscaled 'Prediction' column.
        Inference mode: ``(df, planet_ids)``.
        ``planet_ids`` is ``1..len(rows)`` for the rows that were kept.

    Raises:
        ValueError: In inference mode, if a fitted object or ``feature_cols``
            is missing, or the file lacks one of the training features.
    """
    df = pd.read_csv(filepath)
    print(f"\nData loaded from {filepath}.")
    print(f"Dataset dimensions: {df.shape[0]} rows, {df.shape[1]} columns")

    if is_train:
        # Copy the filtered rows so the column assignments below write to an
        # independent frame instead of a slice of the frame that was read.
        df = df[df['Prediction'] >= 0].copy()
    elif imputer is None or scaler is None or feature_cols is None:
        raise ValueError(
            "imputer, scaler and feature_cols from the training call are "
            "required when is_train=False"
        )

    planet_ids = np.arange(len(df)) + 1

    # Clean categorical features encoded as strings
    df = _decode_category_codes(df)

    # Handle missing values with KNN imputation
    if is_train:
        feature_cols = [col for col in df.columns if col != 'Prediction']
        imputer = KNNImputer(n_neighbors=5)
        imputed_values = imputer.fit_transform(df[feature_cols])
    else:
        feature_cols = list(feature_cols)
        missing = [col for col in feature_cols if col not in df.columns]
        if missing:
            raise ValueError(f"Columns missing from {filepath}: {missing}")
        # Select the training features explicitly: extra columns are ignored
        # and the order matches the one the imputer was fitted with.
        imputed_values = imputer.transform(df[feature_cols])

    df_imputed = pd.DataFrame(imputed_values, columns=feature_cols, index=df.index)
    if is_train:
        df_imputed = df_imputed.join(df['Prediction'])

    # Generate interaction features (same code path for train and inference)
    df_imputed = _add_interaction_features(df_imputed, feature_cols)

    # Scale numerical features; the target column is never scaled
    numerical_cols = df_imputed.select_dtypes(include=['float64', 'int64']).columns.tolist()
    if 'Prediction' in numerical_cols:
        numerical_cols.remove('Prediction')

    if is_train:
        scaler = StandardScaler()
        df_imputed[numerical_cols] = scaler.fit_transform(df_imputed[numerical_cols])
        return df_imputed, imputer, scaler, feature_cols, planet_ids

    df_imputed[numerical_cols] = scaler.transform(df_imputed[numerical_cols])
    return df_imputed, planet_ids

## Deep Learning Model

Define a deep neural network with Gaussian input noise, three L2-regularized dense hidden layers with ELU activations, batch normalization, and dropout for regularization.

In [21]:
def build_deep_model(input_shape, num_classes=10):
    """Build and compile the dense softmax classifier.

    The network adds Gaussian noise to the inputs, then applies three
    Dense -> BatchNormalization -> ELU -> Dropout stages (256, 128 and 64
    units with dropout rates 0.3, 0.2 and 0.1) followed by a softmax layer.

    Args:
        input_shape: Shape tuple passed to ``tf.keras.Input``.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A compiled ``tf.keras.Model`` that uses AdamW, categorical
        cross-entropy loss and an accuracy metric.
    """
    # The notebook imports only `tf` (and `l2`); the bare names `layers` and
    # `regularizers` were never imported, so resolve them through `tf` here.
    layers = tf.keras.layers
    regularizers = tf.keras.regularizers

    inputs = tf.keras.Input(shape=input_shape)

    # Add Gaussian noise to handle data noise
    x = layers.GaussianNoise(stddev=0.01)(inputs)

    # Hidden stages: (units, dropout rate), widest first
    for units, dropout_rate in ((256, 0.3), (128, 0.2), (64, 0.1)):
        x = layers.Dense(units, kernel_regularizer=regularizers.l2(0.001))(x)
        x = layers.BatchNormalization()(x)
        x = layers.ELU(alpha=0.1)(x)
        x = layers.Dropout(dropout_rate)(x)

    # Output layer
    outputs = layers.Dense(num_classes, activation='softmax')(x)

    # Compile the model
    model = tf.keras.Model(inputs=inputs, outputs=outputs)
    model.compile(
        optimizer=tf.keras.optimizers.AdamW(learning_rate=0.001, weight_decay=0.001),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

## Evaluation and Training Functions

Functions to evaluate models and train classifiers (Neural Network, Random Forest, XGBoost).

In [22]:
def evaluate_model(y_true, y_pred, model_name="Neural Network"):
    """Print accuracy and a few sample predictions; return the accuracy.

    Both inputs are reduced to class indices with ``argmax`` over axis 1.
    Indices are shifted by one to look up names in ``PLANET_TYPES``.

    Args:
        y_true: Per-row class scores/one-hot rows for the true labels.
        y_pred: Per-row class scores predicted by the model.
        model_name: Label used in the printed header.

    Returns:
        The value returned by ``accuracy_score`` for the class indices.
    """
    actual_idx = np.argmax(y_true, axis=1)
    predicted_idx = np.argmax(y_pred, axis=1)
    accuracy = accuracy_score(actual_idx, predicted_idx)

    print(f'\n{model_name} Performance Metrics:')
    print(f'Accuracy: {accuracy:.4f}')
    print("\nSample Predictions (first 5):")

    # Show at most the first five rows
    sample_count = min(5, len(actual_idx), len(predicted_idx))
    for row in range(sample_count):
        actual_name = PLANET_TYPES[actual_idx[row] + 1]
        predicted_name = PLANET_TYPES[predicted_idx[row] + 1]
        print(f'True: {actual_name}, Predicted: {predicted_name}')

    return accuracy

## Ensemble Prediction

Combine predictions from all models using a weighted ensemble approach.

## Main Execution

The main function orchestrates the entire pipeline: loading data, training models, evaluating performance, and generating predictions.

In [23]:
def main(train_filepath='train/train.csv', test_filepath='test_data/cosmictest.csv',
         submission_path='submission.csv'):
    """Run the pipeline: preprocess, train the network, evaluate, write a submission.

    Args:
        train_filepath: CSV with the features and the 'Prediction' target.
        test_filepath: CSV with the rows to predict. If the file does not
            exist, a warning is printed and no submission is written.
        submission_path: Where the submission CSV ('Planet_ID',
            'Predicted_Class') is written.

    Raises:
        ValueError: If the preprocessed test data lacks a feature column the
            model was trained on.
    """
    print("\nStarting Cosmic Planet Classification with Neural Network...")

    # Load and preprocess training data
    # NOTE(review): the imputer and scaler are fitted on all training rows
    # before the validation split below, so the validation rows influence the
    # preprocessing statistics.
    df_train, imputer, scaler, feature_cols, _ = load_and_preprocess_data(train_filepath, is_train=True)

    # Prepare features and target
    X_train_full = df_train.drop('Prediction', axis=1)
    y_train_full = to_categorical(df_train['Prediction'].astype(int), num_classes=10)

    # Remove highly correlated features (|corr| > 0.95 with an earlier column)
    corr_matrix = X_train_full.corr().abs()
    upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]
    X_train_full = X_train_full.drop(to_drop, axis=1)
    print(f"Removed {len(to_drop)} highly correlated features")

    # Split into training and validation sets (12000 validation rows, stratified)
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_full, y_train_full, test_size=12000, random_state=42,
        stratify=np.argmax(y_train_full, axis=1)
    )

    # Build and train the neural network model
    nn_model = build_deep_model((X_train.shape[1],))
    nn_model.fit(
        X_train, y_train, epochs=150, batch_size=64, validation_data=(X_val, y_val),
        callbacks=[
            tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=20, restore_best_weights=True, mode='max'),
            # The scheduler sets the learning rate at the start of every
            # epoch. A ReduceLROnPlateau callback used alongside it would have
            # its reductions overwritten each epoch, so only the explicit
            # schedule is kept.
            tf.keras.callbacks.LearningRateScheduler(lambda epoch: 0.0005 * (0.9 ** (epoch // 10)))
        ],
        verbose=1
    )

    # Evaluate the model on validation data
    nn_pred = nn_model.predict(X_val)
    nn_accuracy = evaluate_model(y_val, nn_pred, "Neural Network")
    print(f"\nNeural Network Accuracy: {nn_accuracy:.4f}")

    # Generate predictions for test data if available
    if os.path.exists(test_filepath):
        df_test, planet_ids = load_and_preprocess_data(
            test_filepath, is_train=False, imputer=imputer, scaler=scaler, feature_cols=feature_cols
        )

        # Feed the network exactly the training columns, in training order.
        # (Selecting through a set would yield an arbitrary column order.)
        model_features = list(X_train_full.columns)
        missing = [col for col in model_features if col not in df_test.columns]
        if missing:
            raise ValueError(f"Test data is missing model features: {missing}")
        df_test = df_test[model_features]

        # Generate predictions; class IDs in the submission are 1-based
        nn_test_pred = nn_model.predict(df_test)
        predicted_classes = np.argmax(nn_test_pred, axis=1) + 1

        # Create submission file
        submission_df = pd.DataFrame({'Planet_ID': planet_ids, 'Predicted_Class': predicted_classes})
        submission_df.to_csv(submission_path, index=False)
        print(f"\nSubmission file created: {submission_path}")
        print("\nSample predictions (first 5):")
        for idx, pred in enumerate(predicted_classes[:5]):
            print(f"Planet ID: {planet_ids[idx]}, Predicted Class: {pred} ({PLANET_TYPES[pred]})")
    else:
        print("\nWARNING: Test file not found.")

    print("\nExecution completed!")

if __name__ == "__main__":
    main()


Starting Cosmic Planet Classification with Neural Network...

Data loaded from train/train.csv.
Dataset dimensions: 60000 rows, 11 columns
Removed 16 highly correlated features
Epoch 1/150
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.6622 - loss: 1.3992 - val_accuracy: 0.8783 - val_loss: 0.6534 - learning_rate: 5.0000e-04
Epoch 2/150
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8447 - loss: 0.7449 - val_accuracy: 0.8904 - val_loss: 0.5530 - learning_rate: 5.0000e-04
Epoch 3/150
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8606 - loss: 0.6420 - val_accuracy: 0.8969 - val_loss: 0.4906 - learning_rate: 5.0000e-04
Epoch 4/150
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8704 - loss: 0.5731 - val_accuracy: 0.9016 - val_loss: 0.4471 - learning_rate: 5.0000e-04
Epoch 5/150
[1m703/703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m