In [11]:
# import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers.legacy import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import LearningRateScheduler, EarlyStopping, ModelCheckpoint
import math

# References: "Multi-Class Classification Tutorial with the Keras Deep Learning Library" (Machine Learning Mastery); "neural-network-multiclass-classification-model-using-tensorflow"; "Multiclass Classification with Keras" (HackerNoon).

# Load the training set and discard the row identifier in one step.
# Datathon team: point this path at your own local copy of the file.
train_data = pd.read_csv('/Users/arina/Downloads/celestial_train.csv').drop(columns=['id'])

# Separate the target column from the feature columns.
y = train_data['class']
X = train_data.drop(columns=['class'])

# Map class labels to integers, then one-hot encode them so they can be used
# with the categorical cross-entropy loss.
label_encoder = LabelEncoder()
y_encoded = to_categorical(label_encoder.fit_transform(y))

# Standardise every feature column.
# NOTE(review): the scaler is fit on the full training set before the
# cross-validation split, so each fold's validation rows contribute to the
# scaling statistics; consider fitting per fold if strict CV estimates matter.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 'balanced' class weights to counter class imbalance, keyed by each label's
# position in np.unique(y).
# NOTE(review): this should line up with the LabelEncoder indices since both
# sort the labels -- confirm if the encoder is ever swapped out.
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weights_dict = {index: weight for index, weight in enumerate(class_weights)}

# the neural network 
def create_model(input_shape, num_classes):
    """Build and compile the dense classifier used for every CV fold.

    The network is three ReLU hidden layers (128, 128 and 64 units), each
    followed by batch normalisation and 30% dropout, and a softmax output
    layer with one unit per class.

    Args:
        input_shape: Shape tuple passed to the first Dense layer.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A Sequential model compiled with Adam (learning rate 0.001),
        categorical cross-entropy loss and the accuracy metric.
    """
    dropout_rate = 0.3

    network = Sequential()

    # First hidden block carries the input shape.
    network.add(Dense(128, input_shape=input_shape, activation='relu'))
    network.add(BatchNormalization())
    network.add(Dropout(dropout_rate))

    # Remaining hidden blocks share the same Dense -> BatchNorm -> Dropout pattern.
    for width in (128, 64):
        network.add(Dense(width, activation='relu'))
        network.add(BatchNormalization())
        network.add(Dropout(dropout_rate))

    # Output layer: class probabilities.
    network.add(Dense(num_classes, activation='softmax'))

    network.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(learning_rate=0.001),
        metrics=['accuracy'],
    )
    return network

# Function to define a learning rate schedule using common standards
def step_decay(epoch):
    """Return the learning rate for a given epoch under a step-decay schedule.

    The rate starts at 0.001 and is halved each time ``1 + epoch`` reaches a
    further multiple of 10, so epochs 0-8 use 0.001, epochs 9-18 use 0.0005,
    and so on.

    Args:
        epoch: Zero-based epoch index.

    Returns:
        The learning rate as a float.
    """
    base_rate, decay_factor, step_size = 0.001, 0.5, 10.0
    # Number of halvings applied so far.
    completed_steps = math.floor((epoch + 1) / step_size)
    return base_rate * math.pow(decay_factor, completed_steps)

# Learning-rate schedule callback: applies step_decay at the start of every
# epoch. The same instance is passed to every fold's fit() call.
lr_schedule = LearningRateScheduler(step_decay)

# Stratified 5-fold cross-validation to estimate out-of-sample accuracy.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cvscores = []  # validation accuracy (in percent) of each fold

# StratifiedKFold needs 1-D class labels, so recover them from the one-hot
# matrix with argmax.
for train, val in kfold.split(X_scaled, y_encoded.argmax(axis=1)):
    # Split data
    X_train, X_val = X_scaled[train], X_scaled[val]
    y_train, y_val = y_encoded[train], y_encoded[val]

    # Build the early-stopping and checkpoint callbacks fresh for every fold.
    # ModelCheckpoint keeps the best monitored value on the callback instance,
    # so an instance reused across fit() calls compares this fold's val_loss
    # against the best value of *earlier* folds. When no epoch beats that
    # value the file is never rewritten, and load_weights() below would
    # restore a previous fold's network -- one that was trained on rows that
    # are now in this fold's validation split.
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=1)
    model_checkpoint = ModelCheckpoint('best_model.keras', monitor='val_loss', save_best_only=True, verbose=1)

    # Create a new, untrained model for this fold
    model = create_model((X_train.shape[1],), y_encoded.shape[1])

    # Train the model with learning rate scheduler, early stopping, and model checkpointing
    model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_val, y_val),
              class_weight=class_weights_dict, verbose=1, callbacks=[lr_schedule, early_stopping, model_checkpoint])

    # Restore this fold's best epoch (lowest val_loss) before scoring
    model.load_weights('best_model.keras')

    # Evaluate the model; scores is [loss, accuracy] as compiled in create_model
    scores = model.evaluate(X_val, y_val, verbose=0)
    print(f"{model.metrics_names[1]}: {scores[1] * 100}%")
    cvscores.append(scores[1] * 100)

# Calculate and print average performance
print(f"Average Accuracy: {np.mean(cvscores)}% (+/- {np.std(cvscores)})%")


Epoch 1/100
Epoch 1: val_loss improved from inf to 0.18947, saving model to best_model.keras
Epoch 2/100
Epoch 2: val_loss improved from 0.18947 to 0.15358, saving model to best_model.keras
Epoch 3/100
Epoch 3: val_loss improved from 0.15358 to 0.15173, saving model to best_model.keras
Epoch 4/100
Epoch 4: val_loss did not improve from 0.15173
Epoch 5/100
Epoch 5: val_loss did not improve from 0.15173
Epoch 6/100
Epoch 6: val_loss did not improve from 0.15173
Epoch 7/100
Epoch 7: val_loss did not improve from 0.15173
Epoch 8/100
Epoch 8: val_loss improved from 0.15173 to 0.12835, saving model to best_model.keras
Epoch 9/100
Epoch 9: val_loss did not improve from 0.12835
Epoch 10/100
Epoch 10: val_loss improved from 0.12835 to 0.12339, saving model to best_model.keras
Epoch 11/100
Epoch 11: val_loss did not improve from 0.12339
Epoch 12/100
Epoch 12: val_loss improved from 0.12339 to 0.12063, saving model to best_model.keras
Epoch 13/100
Epoch 13: val_loss improved from 0.12063 to 0.120

In [17]:
# Read the test set (adjust the path to wherever the test file lives).
test_data = pd.read_csv('/Users/arina/Downloads/celestial_test.csv')

# Keep the ids for the submission, then remove them from the feature frame.
test_ids = test_data['id']
test_data = test_data.drop(columns=['id'])

# Scale with the scaler fitted on the training data and predict.
# `model` is whatever the training cell bound last -- after the
# cross-validation loop that is the final fold's network.
X_test_scaled = scaler.transform(test_data)
y_pred_probabilities = model.predict(X_test_scaled)
y_pred_classes = np.argmax(y_pred_probabilities, axis=1)

# Convert the predicted class indices back to the original labels.
predicted_class_labels = label_encoder.inverse_transform(y_pred_classes)

# Assemble the submission table: one row per test id.
submission = pd.DataFrame({'id': test_ids, 'output': predicted_class_labels})

# Destination of the submission file.
submission_file_path = '/Users/arina/Desktop/celestial_submission-2.csv'

# Write the CSV without the DataFrame index column.
submission.to_csv(submission_file_path, index=False)

print(f"Submission file saved to {submission_file_path}")

Submission file saved to /Users/arina/Desktop/celestial_submission-2.csv
