In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import f1_score

def fill_nan_with_statistic(df, stat_type='mean'):
    """Return a copy of ``df`` with NaNs in numeric columns filled per column.

    Every numeric column has its missing entries replaced by that column's
    own mean or median. Non-numeric columns are carried over unchanged and
    the input frame itself is left untouched.

    Args:
        df: Input DataFrame.
        stat_type: Which statistic to fill with, ``'mean'`` or ``'median'``.

    Returns:
        A new DataFrame with the same columns as ``df``.

    Raises:
        ValueError: If ``stat_type`` is neither ``'mean'`` nor ``'median'``.
    """
    if stat_type not in ('mean', 'median'):
        raise ValueError("stat_type must be either 'mean' or 'median'")

    numeric_part = df.select_dtypes(include=[np.number])
    numeric_cols = numeric_part.columns
    if stat_type == 'mean':
        replacements = numeric_part.mean()
    else:
        replacements = numeric_part.median()

    # Work on a copy so the caller's frame keeps its NaNs.
    result = df.copy()
    result[numeric_cols] = result[numeric_cols].fillna(replacements)
    return result

# --- Kepler KOI disposition classifier: load data, preprocess, train ---
# Imported inside this cell so the cell stays runnable on its own.
from tensorflow.keras.layers import Input

data_path = '/content/cumulative.csv'
data = pd.read_csv(data_path)
# Columns excluded from the model.
data = data.drop(columns=['kepler_name', 'koi_tce_delivname', 'koi_teq_err1', 'koi_teq_err2'])
# Replace numeric NaNs with each column's median.
data = fill_nan_with_statistic(data, stat_type='median')

# Target: string disposition -> integer id -> one-hot matrix for the softmax head.
label_encoder = LabelEncoder()
data['koi_disposition'] = label_encoder.fit_transform(data['koi_disposition'])
one_hot_encoder = OneHotEncoder(sparse_output=False)
y_encoded = one_hot_encoder.fit_transform(data[['koi_disposition']])

# Per-row identifiers are not features. In particular `kepoi_name` is a
# string that differs on every row, so one-hot encoding it would add one
# column per sample. errors='ignore' keeps this working if a column is absent.
identifier_cols = ['rowid', 'kepid', 'kepoi_name']
X = data.drop(columns=['koi_disposition']).drop(columns=identifier_cols, errors='ignore')
# NOTE(review): `koi_pdisposition` / `koi_score` look closely tied to the
# target and may leak the label -- confirm they are meant to be features.
non_numeric_cols = X.select_dtypes(include=['object']).columns
X = pd.get_dummies(X, columns=non_numeric_cols, drop_first=True)

X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Fit the scaler on the training split only, then reuse it for the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Five hidden Dense blocks (64-64-32-32-16), each followed by batch norm and
# dropout, then a softmax over the disposition classes. The input shape is
# declared with an explicit Input layer; passing `input_dim` to the first
# Dense layer is deprecated and emits a warning.
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.5),
    Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.5),
    Dense(32, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.5),
    Dense(32, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.5),
    Dense(16, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.5),
    Dense(y_encoded.shape[1], activation='softmax')
])

model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Halve the learning rate when validation loss stalls for 3 epochs.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-5, verbose=1)

# Validate on a slice of the training data rather than on the test split:
# the learning-rate schedule reacts to `val_loss`, so validating on
# (X_test, y_test) would let the test set steer training and bias the
# final test metrics.
history = model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.1,
    callbacks=[reduce_lr]
)

def preprocess_input_data(input_df):
    """Convert raw input rows into the scaled feature matrix the model expects.

    Steps: fill numeric NaNs with the batch's own per-column median, drop the
    columns that were excluded before training, one-hot encode the
    categorical columns, align to the training feature columns
    (``X.columns``) and apply the fitted ``scaler``.

    Relies on the notebook-level ``non_numeric_cols``, ``X`` and ``scaler``
    created by the training cell.

    Args:
        input_df: DataFrame of raw rows. It is not modified.

    Returns:
        Array of scaled features, one row per input row, with columns in the
        order of ``X.columns``.
    """
    features = fill_nan_with_statistic(input_df, stat_type='median')
    features = features.drop(
        columns=['kepler_name', 'koi_tce_delivname', 'koi_teq_err1', 'koi_teq_err2'],
        errors='ignore',
    )
    # Encode every category (drop_first=False). With drop_first=True pandas
    # drops whichever category sorts first *in this batch*, which is not
    # necessarily the one dropped at training time; a batch lacking the
    # training base category would lose a real indicator column and be
    # encoded as the base category. The reindex below keeps exactly the
    # training columns, which discards the base-category column.
    features = pd.get_dummies(features, columns=non_numeric_cols, drop_first=False)
    features = features.reindex(columns=X.columns, fill_value=0)
    scaled = scaler.transform(features)
    # A column that is entirely NaN in the batch (e.g. a single-row input)
    # has no median to fill with and is still NaN here. After standardisation
    # 0 corresponds to the training mean, so use that instead of feeding NaN
    # to the model.
    return np.where(np.isnan(scaled), 0.0, scaled)

def evaluate_model(input_df=None):
    """Run the trained model and print its predictions.

    With no argument the held-out test split is scored and accuracy plus
    weighted F1 are printed. When ``input_df`` is given it is preprocessed
    with ``preprocess_input_data`` and, since no labels are available, only
    the predictions are printed.

    In both cases one line per row is printed with the winning class
    probability, the class index and the original class label.

    Args:
        input_df: Optional DataFrame of raw rows to score.
    """
    if input_df is None:
        X_eval, y_eval = X_test, y_test
    else:
        X_eval, y_eval = preprocess_input_data(input_df), None

    y_pred_prob = model.predict(X_eval)
    y_pred = np.argmax(y_pred_prob, axis=1)

    if y_eval is None:
        print("No true labels provided. Displaying predicted probabilities:")
    else:
        # Labels are one-hot, so argmax recovers the integer class ids.
        y_true = np.argmax(y_eval, axis=1)
        test_accuracy = np.mean(y_true == y_pred)
        f1 = f1_score(y_true, y_pred, average='weighted')
        print(f"Test Accuracy: {test_accuracy:.4f}")
        print(f"F1 Score: {f1:.4f}")

    # Map every class index back to its original string label.
    n_classes = len(one_hot_encoder.categories_[0])
    class_labels = label_encoder.inverse_transform(np.arange(n_classes))
    for class_idx, row_probs in zip(y_pred, y_pred_prob):
        confidence = row_probs[class_idx]
        print(f"{confidence * 100:.2f}% Class {class_idx}: {class_labels[class_idx]}")


# Persist the trained network to disk and report where it was written.
saved_model_path = 'trained_model.h5'
model.save(saved_model_path)
print(f"Model saved as '{saved_model_path}'")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 19ms/step - accuracy: 0.3592 - loss: 1.9634 - val_accuracy: 0.5269 - val_loss: 1.1521 - learning_rate: 0.0010
Epoch 2/50
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 17ms/step - accuracy: 0.4460 - loss: 1.4658 - val_accuracy: 0.5269 - val_loss: 1.1973 - learning_rate: 0.0010
Epoch 3/50
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.4887 - loss: 1.3270 - val_accuracy: 0.5269 - val_loss: 1.2244 - learning_rate: 0.0010
Epoch 4/50
[1m237/240[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 13ms/step - accuracy: 0.5224 - loss: 1.2723
Epoch 4: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 14ms/step - accuracy: 0.5224 - loss: 1.2722 - val_accuracy: 0.5269 - val_loss: 1.1968 - learning_rate: 0.0010
Epoch 5/50
[1m240/240[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m



Model saved as 'trained_model.h5'


In [None]:
# Load the sample rows to score and show them.
sample_input_path = '/content/sample_input.csv'
sample_input_df = pd.read_csv(sample_input_path)
print(sample_input_df)

    rowid     kepid kepoi_name koi_pdisposition  koi_score  koi_fpflag_nt  \
0       1  10797460  K00752.01        CANDIDATE      1.000              0   
1       2  10797460  K00752.02        CANDIDATE      0.969              0   
2       3  10811496  K00753.01   FALSE POSITIVE      0.000              0   
3       4  10848459  K00754.01   FALSE POSITIVE      0.000              0   
4       5  10854555  K00755.01        CANDIDATE      1.000              0   
5       6  10872983  K00756.01        CANDIDATE      1.000              0   
6       7  10872983  K00756.02        CANDIDATE      1.000              0   
7       8  10872983  K00756.03        CANDIDATE      0.992              0   
8       9   6721123  K00114.01   FALSE POSITIVE      0.000              0   
9      10  10910878  K00757.01        CANDIDATE      1.000              0   
10     11  11446443  K00001.01        CANDIDATE      0.811              0   
11     12  10666592  K00002.01        CANDIDATE      1.000              0   

In [None]:
# Score the unlabeled sample rows; only predictions are printed.
evaluate_model(input_df=sample_input_df)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 174ms/step
No true labels provided. Displaying predicted probabilities:
97.74% Class 1: CONFIRMED
99.90% Class 1: CONFIRMED
99.90% Class 2: FALSE POSITIVE
99.85% Class 2: FALSE POSITIVE
99.31% Class 1: CONFIRMED
99.93% Class 1: CONFIRMED
99.81% Class 1: CONFIRMED
99.90% Class 1: CONFIRMED
99.90% Class 2: FALSE POSITIVE
99.79% Class 1: CONFIRMED
97.61% Class 1: CONFIRMED
99.93% Class 1: CONFIRMED
95.61% Class 1: CONFIRMED
99.83% Class 1: CONFIRMED
99.86% Class 2: FALSE POSITIVE


In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import ReduceLROnPlateau
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import f1_score

# --- SDSS object classifier: load data, preprocess, train, evaluate ---
# Imported inside this cell so the cell stays runnable on its own.
from tensorflow.keras.layers import Input

# Load and preprocess dataset
data_path = '/content/SDSS_DR18.csv'  # Replace with your actual file path
data = pd.read_csv(data_path)

# Drop rows with missing values
data = data.dropna()

# Encode target variable: string class -> integer id -> one-hot matrix
label_encoder = LabelEncoder()
data['class'] = label_encoder.fit_transform(data['class'])
one_hot_encoder = OneHotEncoder(sparse_output=False)
y_encoded = one_hot_encoder.fit_transform(data[['class']])

# Separate features and target variable
X = data.drop(columns=['class', 'objid', 'specobjid', 'run'])  # Exclude non-feature columns
non_numeric_cols = X.select_dtypes(include=['object']).columns
X = pd.get_dummies(X, columns=non_numeric_cols, drop_first=True)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Scale features (fit on the training split only, reuse for the test split)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build the ANN model with 5 hidden layers.
# The input shape is declared with an explicit Input layer; passing
# `input_dim` to the first Dense layer is deprecated and emits a warning.
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.5),
    Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.5),
    Dense(32, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.5),
    Dense(32, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.5),
    Dense(16, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.5),
    Dense(y_encoded.shape[1], activation='softmax')
])

# Compile model
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Callback for learning rate reduction: halve the rate when val_loss stalls for 3 epochs
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-5, verbose=1)

# Train the model.
# Validation uses a slice of the training data, not the test split: the
# learning-rate schedule reacts to `val_loss`, so validating on
# (X_test, y_test) would let the test set steer training and bias the test
# metrics reported below.
history = model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=64,
    validation_split=0.1,
    callbacks=[reduce_lr],
    verbose=2
)

# Evaluate the model on the test set
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"Test accuracy: {accuracy * 100:.2f}%")

# Predict the classes for the test set (argmax turns probabilities / one-hot rows into class ids)
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true_classes = np.argmax(y_test, axis=1)

# Calculate F1 score
f1 = f1_score(y_true_classes, y_pred_classes, average='weighted')
print(f"F1 Score: {f1:.2f}")
model.save('trained_model_Galaxy_classification.h5')
print("Model saved as 'trained_model_Galaxy_classification.h5'")

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


1250/1250 - 10s - 8ms/step - accuracy: 0.7906 - loss: 0.7454 - val_accuracy: 0.9780 - val_loss: 0.7557 - learning_rate: 0.0010
Epoch 2/50
1250/1250 - 4s - 3ms/step - accuracy: 0.9618 - loss: 0.3064 - val_accuracy: 0.9849 - val_loss: 0.5374 - learning_rate: 0.0010
Epoch 3/50
1250/1250 - 6s - 5ms/step - accuracy: 0.9762 - loss: 0.2106 - val_accuracy: 0.9855 - val_loss: 0.3745 - learning_rate: 0.0010
Epoch 4/50
1250/1250 - 9s - 7ms/step - accuracy: 0.9792 - loss: 0.1638 - val_accuracy: 0.9877 - val_loss: 0.2343 - learning_rate: 0.0010
Epoch 5/50
1250/1250 - 6s - 5ms/step - accuracy: 0.9810 - loss: 0.1399 - val_accuracy: 0.9876 - val_loss: 0.2358 - learning_rate: 0.0010
Epoch 6/50
1250/1250 - 9s - 7ms/step - accuracy: 0.9814 - loss: 0.1232 - val_accuracy: 0.9872 - val_loss: 0.2264 - learning_rate: 0.0010
Epoch 7/50
1250/1250 - 5s - 4ms/step - accuracy: 0.9822 - loss: 0.1225 - val_accuracy: 0.9877 - val_loss: 0.1416 - learning_rate: 0.0010
Epoch 8/50
1250/1250 - 5s - 4ms/step - accuracy: 0.



F1 Score: 0.99
Model saved as 'trained_model_Galaxy_classification.h5'


In [None]:
import pandas as pd
import numpy as np

# Score the rows of the sample file with the trained model and save the result.
sample_input_df = pd.read_csv('/content/sample_input.csv')

# Remove the target and the columns excluded before training, if present.
sample_features = sample_input_df.drop(columns=['class', 'objid', 'specobjid', 'run'], errors='ignore')

# One-hot encode every category (drop_first=False). With drop_first=True
# pandas drops whichever category sorts first *in this file*, which need not
# be the one dropped at training time; the reindex below keeps exactly the
# training columns instead. A separate name is used so the training-time
# `non_numeric_cols` is not overwritten.
sample_object_cols = sample_features.select_dtypes(include=['object']).columns
sample_features = pd.get_dummies(sample_features, columns=sample_object_cols, drop_first=False)

# Align to the training feature layout: add missing columns as 0, drop
# unknown columns, and match the training column order in one step.
X_train_columns = X.columns
sample_features = sample_features.reindex(columns=X_train_columns, fill_value=0)

sample_scaled = scaler.transform(sample_features)

sample_predictions = model.predict(sample_scaled)
sample_pred_classes = np.argmax(sample_predictions, axis=1)
predicted_classes = label_encoder.inverse_transform(sample_pred_classes)

# The saved table holds the scaled feature values plus the predicted label.
sample_input_df = pd.DataFrame(sample_scaled, columns=X_train_columns)
sample_input_df['predicted_class'] = predicted_classes

sample_input_df.to_csv('predicted_classes_output.csv', index=False)
print("Predictions for the first 5 rows:")
print(predicted_classes[:5])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 212ms/step
Predictions for the first 5 rows:
['STAR' 'STAR' 'STAR' 'STAR' 'STAR']
