In [1]:
import pandas as pd
import numpy as np

In [2]:
df=pd.read_csv('Asteroid_Modified.csv',low_memory=False)

# Test-Train Splitting

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
input_features = df.columns.values.tolist()
input_features.remove('diameter')

In [5]:
X = df[[i for i in input_features]]

In [6]:
y = df[['diameter']]

In [7]:
X_train, X_rem, y_train, y_rem = train_test_split(X, y , random_state=42, train_size=0.7, shuffle=True)

In [8]:
X_valid, X_test, y_valid, y_test = train_test_split(X, y , random_state=42, train_size=0.3, shuffle=True)

In [9]:
X_train.shape

(96345, 17)

# Prediction Metrics

In [10]:
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, median_absolute_error, explained_variance_score, r2_score

In [11]:
def prediction_metrics(prediction):
    print("Mean Absolute Error :     ",mean_absolute_error(y_test, prediction))
    print("Mean Squared Error :      ",mean_squared_error(y_test, prediction))
    print("Median Absolute Error :   ",median_absolute_error(y_test, prediction))
    print("Explained Variance Score :",explained_variance_score(y_test, prediction))
    print("r2-Score :                ",r2_score(y_test, prediction))

In [12]:
def cv_display_scores(scores):
    print("Scores: ", scores)
    print("Mean:              ", scores.mean())
    print("Standard deviation:", scores.std())

# Neural Network

In [13]:
import tensorflow as tf
from tensorflow import keras

2022-10-28 16:59:32.428815: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## DNN 1

In [14]:
def build_model(n_hidden=1, n_neurons=30, learning_rate=3e-3, input_shape=X_train.shape[1:]):
    model = keras.models.Sequential()
    options = {"input_shape": input_shape}
    for layer in range(n_hidden):
        model.add(keras.layers.Dense(n_neurons, activation="relu", **options))
        options = {}
    model.add(keras.layers.Dense(1, activation="linear", **options))
    lr_adp = keras.optimizers.schedules.ExponentialDecay(initial_learning_rate=learning_rate,decay_steps=100,decay_rate=0.9)
    optimizer = keras.optimizers.SGD(lr_adp,momentum=0.9)
    model.compile(loss="mean_absolute_error", optimizer=optimizer, metrics=['mean_absolute_error'])
    return model

In [15]:
keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_model)

  keras_reg = keras.wrappers.scikit_learn.KerasRegressor(build_model)


In [16]:
keras_reg.fit(X_train, y_train, epochs=100, validation_data=(X_valid, y_valid), callbacks=[keras.callbacks.EarlyStopping(patience=6)])

2022-10-28 16:59:45.833138: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100


<keras.callbacks.History at 0x1333ab9a0>

In [17]:
y_pred = keras_reg.predict(X_test)



In [18]:
y_pred

array([3.9322813, 3.9322813, 3.9322813, ..., 3.9322813, 3.9322813,
       3.9322813], dtype=float32)

In [19]:
prediction_metrics(y_pred)

Mean Absolute Error :      2.5385208360572893
Mean Squared Error :       64.39782464999193
Median Absolute Error :    1.3692812557220457
Explained Variance Score : 0.33302221172337076
r2-Score :                 0.3200793609938918


## DNN 2

In [20]:
def build_model2(n_hidden=1, n_neurons=30, learning_rate=0.05, input_shape=X_train.shape[1:]):
    model = keras.models.Sequential()
    options = {"input_shape": input_shape}
    for layer in range(n_hidden):
        model.add(keras.layers.Dense(n_neurons, activation="relu", **options))
        options = {}
    model.add(keras.layers.Dense(1, activation="linear", **options))
    lr_adp = keras.optimizers.schedules.ExponentialDecay(initial_learning_rate=learning_rate,decay_steps=100,decay_rate=0.9)
    optimizer = keras.optimizers.SGD(lr_adp,momentum=0.9)
    model.compile(loss="mean_absolute_error", optimizer=optimizer, metrics=['mean_absolute_error'])
    return model

In [21]:
keras_reg2 = keras.wrappers.scikit_learn.KerasRegressor(build_model2)

  keras_reg2 = keras.wrappers.scikit_learn.KerasRegressor(build_model2)


In [22]:
keras_reg2.fit(X_train, y_train, epochs=100, validation_data=(X_valid, y_valid), callbacks=[keras.callbacks.EarlyStopping(patience=6)])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


<keras.callbacks.History at 0x133855f00>

In [23]:
y_pred2 = keras_reg2.predict(X_test)



In [24]:
y_pred2

array([3.955882, 3.955882, 3.955882, ..., 3.955882, 3.955882, 3.955882],
      dtype=float32)

In [25]:
prediction_metrics(y_pred2)

Mean Absolute Error :      2.7597784843957167
Mean Squared Error :       97.03110376805024
Median Absolute Error :    1.3771179275512697
Explained Variance Score : 0.0
r2-Score :                 -0.024467059811607905


## DNN 3

In [26]:
def build_model3(n_hidden=1, n_neurons=25, learning_rate=1e-3, input_shape=X_train.shape[1:]):
    model = keras.models.Sequential()
    options = {"input_shape": input_shape}
    for layer in range(n_hidden):
        model.add(keras.layers.Dense(n_neurons, activation="relu", **options))
        options = {}
    model.add(keras.layers.Dense(1, activation="linear", **options))
    lr_adp = keras.optimizers.schedules.ExponentialDecay(initial_learning_rate=learning_rate,decay_steps=100,decay_rate=0.99)
    optimizer = keras.optimizers.SGD(lr_adp,momentum=0.9999)
    model.compile(loss="mean_absolute_error", optimizer=optimizer, metrics=['mean_absolute_error'])
    return model

In [27]:
keras_reg3 = keras.wrappers.scikit_learn.KerasRegressor(build_model3)

  keras_reg3 = keras.wrappers.scikit_learn.KerasRegressor(build_model3)


In [28]:
keras_reg3.fit(X_train, y_train, epochs=100, validation_data=(X_valid, y_valid), callbacks=[keras.callbacks.EarlyStopping(patience=6)])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100


<keras.callbacks.History at 0x133856440>

In [29]:
y_pred3 = keras_reg3.predict(X_test)



In [30]:
y_pred3

array([4.500564, 4.500564, 4.500564, ..., 4.500564, 4.500564, 4.500564],
      dtype=float32)

In [31]:
prediction_metrics(y_pred3)

Mean Absolute Error :      2.8136640159658874
Mean Squared Error :       95.66945433705428
Median Absolute Error :    1.5865640983581542
Explained Variance Score : 0.0
r2-Score :                 -0.010090587372407311


## DNN 4

In [32]:
def build_model4(n_hidden=1, n_neurons=50, learning_rate=0.01, input_shape=X_train.shape[1:]):
    model = keras.models.Sequential()
    options = {"input_shape": input_shape}
    for layer in range(n_hidden):
        model.add(keras.layers.Dense(n_neurons, activation="relu", **options))
        options = {}
    model.add(keras.layers.Dense(1, activation="linear", **options))
    lr_adp = keras.optimizers.schedules.ExponentialDecay(initial_learning_rate=learning_rate,decay_steps=100,decay_rate=0.9)
    optimizer = keras.optimizers.SGD(lr_adp,momentum=0.999)
    model.compile(loss="mean_absolute_error", optimizer=optimizer, metrics=['mean_absolute_error'])
    return model

In [33]:
keras_reg4 = keras.wrappers.scikit_learn.KerasRegressor(build_model4)

  keras_reg4 = keras.wrappers.scikit_learn.KerasRegressor(build_model4)


In [34]:
keras_reg4.fit(X_train, y_train, epochs=100, validation_data=(X_valid, y_valid), callbacks=[keras.callbacks.EarlyStopping(patience=6)])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100


<keras.callbacks.History at 0x133d93400>

In [35]:
y_pred4 = keras_reg4.predict(X_test)



In [36]:
y_pred4

array([3.6142397, 3.6142397, 3.6142397, ..., 3.6142397, 3.6142397,
       3.6142397], dtype=float32)

In [37]:
prediction_metrics(y_pred4)

Mean Absolute Error :      2.780967273249466
Mean Squared Error :       98.18798080132058
Median Absolute Error :    1.2982396926879884
Explained Variance Score : 0.0
r2-Score :                 -0.03668151854507906


## DNN 5

In [38]:
def build_model5(n_hidden=1, n_neurons=50, learning_rate=0.1, input_shape=X_train.shape[1:]):
    model = keras.models.Sequential()
    options = {"input_shape": input_shape}
    for layer in range(n_hidden):
        model.add(keras.layers.Dense(n_neurons, activation="relu", **options))
        options = {}
    model.add(keras.layers.Dense(1, activation="linear", **options))
    lr_adp = keras.optimizers.schedules.ExponentialDecay(initial_learning_rate=learning_rate,decay_steps=100,decay_rate=0.99)
    optimizer = keras.optimizers.SGD(lr_adp,momentum=0.999)
    model.compile(loss="mean_absolute_error", optimizer=optimizer, metrics=['mean_absolute_error'])
    return model

In [39]:
keras_reg5 = keras.wrappers.scikit_learn.KerasRegressor(build_model5)

  keras_reg5 = keras.wrappers.scikit_learn.KerasRegressor(build_model5)


In [40]:
keras_reg5.fit(X_train, y_train, epochs=100, validation_data=(X_valid, y_valid), callbacks=[keras.callbacks.EarlyStopping(patience=6)])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100


<keras.callbacks.History at 0x1340c7f40>

In [41]:
y_pred5 = keras_reg5.predict(X_test)



In [42]:
y_pred5

array([3.69353, 3.69353, 3.69353, ..., 3.69353, 3.69353, 3.69353],
      dtype=float32)

In [43]:
prediction_metrics(y_pred5)

Mean Absolute Error :      2.77198418081098
Mean Squared Error :       97.89868392560474
Median Absolute Error :    1.3115300827026366
Explained Variance Score : 0.0
r2-Score :                 -0.03362708436708872


# Randomized Search CV

In [44]:
from scipy.stats import reciprocal
from sklearn.model_selection import RandomizedSearchCV

In [45]:
param_distr = { "n_hidden": [0, 1, 2, 3], "n_neurons": np.arange(1, 100), "learning_rate": reciprocal(3e-4, 3e-2)}

In [46]:
rnd_search_cv = RandomizedSearchCV(keras_reg, param_distr, n_iter=10, cv=3)

In [47]:
rnd_search_cv.fit(X_train, y_train, epochs=100, validation_data=(X_valid, y_valid), callbacks=[keras.callbacks.EarlyStopping(patience=4)])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100


Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100


Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 1/100


 -2.62507176e+00 -2.86290606e+00             nan             nan
             nan -2.57551678e+00]


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100


In [48]:
rnd_search_cv.best_params_

{'learning_rate': 0.0006624546927521231, 'n_hidden': 1, 'n_neurons': 36}

In [49]:
rnd_search_cv.best_score_

-2.564793268839518

In [50]:
nn_model = rnd_search_cv.best_estimator_.model

In [51]:
y_pred_best = nn_model.predict(X_test)



In [52]:
y_pred_best

array([[3.5938022],
       [3.5938022],
       [3.5938022],
       ...,
       [3.5938022],
       [3.5938022],
       [3.5938022]], dtype=float32)

In [53]:
prediction_metrics(y_pred_best)

Mean Absolute Error :      2.5484457546463024
Mean Squared Error :       75.40833696900332
Median Absolute Error :    1.2918022136688232
Explained Variance Score : 0.2249331798100721
r2-Score :                 0.20382893464151997


# Inference

The best model was the Random Forest Regressor with the following statistics:

### Cross Validation:
* Scores:  0.68802357, 0.28007676, 0.06561844, 0.0558667,  0.04707463, 0.03946168, 0.03526181, 0.04808714, 0.06202095, 0.06453636
* Mean:               0.13860280285633117
* Standard deviation: 0.19557082408696733


### Test Data:
* Mean Absolute Error :      0.04189141065507549
* Mean Squared Error :       0.06317341583802455
* Median Absolute Error :    0.02113960507459816
* Explained Variance Score : 0.9432657426983421
* r2-Score :                 0.9432657325718565