In [1]:
# Initial imports.
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import tensorflow as tf

In [2]:
# Data loading
df_raw = pd.read_csv("star_classification_raw.csv")
df_raw.head(10)

Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,class,redshift,plate,MJD,fiber_ID
0,1.237661e+18,135.689107,32.494632,23.87882,22.2753,20.39501,19.16573,18.79371,3606,301,2,79,6.543777e+18,GALAXY,0.634794,5812,56354,171
1,1.237665e+18,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,4518,301,5,119,1.176014e+19,GALAXY,0.779136,10445,58158,427
2,1.237661e+18,142.18879,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,3606,301,2,120,5.1522e+18,GALAXY,0.644195,4576,55592,299
3,1.237663e+18,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.2501,4192,301,3,214,1.030107e+19,GALAXY,0.932346,9149,58039,775
4,1.23768e+18,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,8102,301,3,137,6.891865e+18,GALAXY,0.116123,6121,56187,842
5,1.23768e+18,340.995121,20.589476,23.48827,23.33776,21.32195,20.25615,19.54544,8102,301,3,110,5.658977e+18,QSO,1.424659,5026,55855,741
6,1.237679e+18,23.234926,11.418188,21.46973,21.17624,20.92829,20.60826,20.42573,7773,301,2,462,1.246262e+19,QSO,0.586455,11069,58456,113
7,1.237679e+18,5.433176,12.065186,22.24979,22.02172,20.34126,19.48794,18.84999,7773,301,2,346,6.961443e+18,GALAXY,0.477009,6183,56210,15
8,1.237661e+18,200.290475,47.199402,24.40286,22.35669,20.61032,19.4649,18.95852,3716,301,5,108,7.459285e+18,GALAXY,0.660012,6625,56386,719
9,1.237671e+18,39.149691,28.102842,21.74669,20.03493,19.17553,18.81823,18.65422,5934,301,4,122,2.751763e+18,STAR,-8e-06,2444,54082,232


In [3]:
df_raw.dtypes

obj_ID         float64
alpha          float64
delta          float64
u              float64
g              float64
r              float64
i              float64
z              float64
run_ID           int64
rerun_ID         int64
cam_col          int64
field_ID         int64
spec_obj_ID    float64
class           object
redshift       float64
plate            int64
MJD              int64
fiber_ID         int64
dtype: object

In [4]:
# Remove the Object ID, Run ID, Rerun ID, Unique ID, and MJD Columns. ID and date columns were
    # determined to be irrelevant to catigorization of stellar objects.
df_mod = df_raw.drop(columns=["obj_ID","run_ID","rerun_ID","spec_obj_ID","MJD","cam_col","plate","field_ID","fiber_ID"])
df_mod.head(10)

Unnamed: 0,alpha,delta,u,g,r,i,z,class,redshift
0,135.689107,32.494632,23.87882,22.2753,20.39501,19.16573,18.79371,GALAXY,0.634794
1,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,GALAXY,0.779136
2,142.18879,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,GALAXY,0.644195
3,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.2501,GALAXY,0.932346
4,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,GALAXY,0.116123
5,340.995121,20.589476,23.48827,23.33776,21.32195,20.25615,19.54544,QSO,1.424659
6,23.234926,11.418188,21.46973,21.17624,20.92829,20.60826,20.42573,QSO,0.586455
7,5.433176,12.065186,22.24979,22.02172,20.34126,19.48794,18.84999,GALAXY,0.477009
8,200.290475,47.199402,24.40286,22.35669,20.61032,19.4649,18.95852,GALAXY,0.660012
9,39.149691,28.102842,21.74669,20.03493,19.17553,18.81823,18.65422,STAR,-8e-06


In [5]:
df_mod = df_mod.sort_values(by='z')
df_mod = df_mod.iloc[1:]
df_mod

Unnamed: 0,alpha,delta,u,g,r,i,z,class,redshift
14498,84.230172,7.138590,14.15199,10.73097,9.82207,9.469903,9.612333,STAR,0.000051
4472,239.423127,28.548438,12.26240,10.51139,10.06854,13.417860,10.225510,STAR,0.000025
75784,239.473361,27.594369,12.10168,10.49820,10.11604,10.008650,10.441310,STAR,0.000008
75620,239.207155,27.898444,12.30349,10.67180,10.19460,10.055090,10.650560,STAR,0.000003
85285,164.321732,40.221137,15.44840,11.79892,10.86379,10.566470,10.778890,STAR,-0.000159
...,...,...,...,...,...,...,...,...,...
29689,146.345791,24.566331,18.50142,18.51986,20.75484,30.163590,27.673360,GALAXY,1.086811
85950,255.213997,28.549537,20.65719,19.14651,20.99419,29.889210,27.805190,GALAXY,0.144278
40201,36.908884,-1.982394,20.00673,20.29219,18.54149,17.813190,28.238290,GALAXY,0.593243
93321,16.455371,0.321688,18.64831,18.43714,17.49240,32.141470,28.790550,QSO,1.773661


In [6]:
df_mod.isnull().sum()

alpha       0
delta       0
u           0
g           0
r           0
i           0
z           0
class       0
redshift    0
dtype: int64

In [7]:
# Transform Class column
def changeStatus(stellar_class):
    if stellar_class == "GALAXY":
        return 0
    elif stellar_class == "STAR":
        return 1
    else:
        return 2
    
df_mod["class"] = df_mod["class"].apply(changeStatus)
df_mod.head(10)

Unnamed: 0,alpha,delta,u,g,r,i,z,class,redshift
14498,84.230172,7.13859,14.15199,10.73097,9.82207,9.469903,9.612333,1,5.1e-05
4472,239.423127,28.548438,12.2624,10.51139,10.06854,13.41786,10.22551,1,2.5e-05
75784,239.473361,27.594369,12.10168,10.4982,10.11604,10.00865,10.44131,1,8e-06
75620,239.207155,27.898444,12.30349,10.6718,10.1946,10.05509,10.65056,1,3e-06
85285,164.321732,40.221137,15.4484,11.79892,10.86379,10.56647,10.77889,1,-0.000159
81298,8.51163,-9.705282,14.5601,12.68849,11.77229,11.31937,10.89738,0,0.01245
4401,184.28205,7.624347,14.50678,12.67902,11.74664,11.29956,10.91847,0,0.00759
75583,239.702668,27.467439,12.99664,11.33897,10.98255,10.87374,11.19448,1,0.000174
57763,208.358688,33.825699,10.99623,13.66217,12.35763,12.63744,11.30247,2,3.955592
76986,240.602807,26.945961,15.32899,11.47435,11.09069,10.95665,11.41484,1,4.9e-05


In [8]:
# Define the features set.
X = df_mod.copy()
X = X.drop(columns=["class"])

In [9]:
# Define the target set.
y = df_mod["class"]

In [10]:
# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [11]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Random Forest Model

In [12]:
# Define number of models to create
n = 10

# Creating multiple iterations of Random Forest classifiers
counter = 0
for i in range(n):
    # Create a random forest classifier.
    rf_model = RandomForestClassifier(n_estimators=128, random_state=counter)
    
    # Fitting the model
    rf_model = rf_model.fit(X_train_scaled, y_train)
    
    # Making predictions using the testing data.
    predictions = rf_model.predict(X_test_scaled)
    print(accuracy_score(y_test, predictions))
    
    counter = counter + 1

0.97824
0.97764
0.97816
0.97776
0.97764
0.97804
0.97796
0.9774
0.97756
0.97804


In [13]:
# Creating multiple iterations of Random Forest classifiers to determine average feature ranking
counter = 0
for i in range(n):
    # Create a random forest classifier.
    rf_model = RandomForestClassifier(n_estimators=128, random_state=counter)
    
    # Fitting the model
    rf_model = rf_model.fit(X_train_scaled, y_train)
    
    # Calculate and print feature importance in the Random Forest model.
    importances = rf_model.feature_importances_
    print(sorted(zip(rf_model.feature_importances_, X.columns), reverse=True))
    
    counter = counter + 1

[(0.6106375079796702, 'redshift'), (0.09637759773203174, 'z'), (0.07362710620907867, 'g'), (0.06981839386304108, 'u'), (0.06623430095659907, 'i'), (0.05419064608624569, 'r'), (0.01483185522970595, 'alpha'), (0.01428259194362764, 'delta')]
[(0.6001849232698282, 'redshift'), (0.09945183916440586, 'z'), (0.07677320035787322, 'g'), (0.07593210049324095, 'i'), (0.06899735246750055, 'u'), (0.04949194032770027, 'r'), (0.01464431490629991, 'alpha'), (0.014524329013151014, 'delta')]
[(0.6177446158639454, 'redshift'), (0.10850317037944229, 'z'), (0.07616897814754314, 'g'), (0.062294508100581486, 'u'), (0.061822650892336586, 'i'), (0.04450552372623519, 'r'), (0.014720061977755093, 'alpha'), (0.01424049091216085, 'delta')]
[(0.6147785017963691, 'redshift'), (0.09794093233547778, 'z'), (0.07276711749336114, 'g'), (0.06717036454935843, 'u'), (0.061477562695789224, 'i'), (0.056569507255350975, 'r'), (0.014920496704573651, 'alpha'), (0.014375517169719645, 'delta')]
[(0.6159400513973021, 'redshift'), (

# K-Means Model

In [14]:
# Standardize the data with StandardScaler().
X_scaled_K = StandardScaler().fit_transform(X)

In [15]:
# Using PCA to reduce dimension to three principal components.
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_scaled_K)

In [16]:
# Create a DataFrame with the three principal components.
pcs_df = pd.DataFrame(data=X_pca, index=df_mod.index, columns=["PC 1", "PC 2", "PC 3"])

In [17]:
# Define number of models to create
n = 10

# Creating multiple iterations of K-Means classifiers
counter = 0
for i in range(n):
    # Initialize the K-Means model.
    model = KMeans(n_clusters=3, random_state=counter)

    # Fit the model
    model.fit(pcs_df)

    # Predict clusters
    predictions = model.predict(pcs_df)

    # Determine accuracy of K-Means
    acc = predictions-y
    print(np.count_nonzero(acc)/999.99)
    
    counter = counter + 1

84.73684736847369
84.75784757847579
55.213552135521354
55.307553075530755
82.72882728827288
35.20435204352044
55.32055320553206
87.03987039870398
78.34278342783428
85.10785107851079


# Neural Network Model

In [18]:
# Define the number of input features and hidden layers
number_input_features = 8
hidden_nodes_layer1 = [25,25,50,50,75,75,100,100,150,150]
hidden_nodes_layer2 = [5,9,5,9,5,9,5,9,5,9]

# Predefine arrays
model_loss = [0,0,0,0,0,0,0,0,0,0]
model_accuracy = [0,0,0,0,0,0,0,0,0,0]

In [19]:
# Creating multiple iterations of Neural Networks to determine 
counter = 0
for i in hidden_nodes_layer1:
    # Define the model - deep neural net
    nn = tf.keras.models.Sequential()

    # First hidden layer
    nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer1[counter], input_dim=number_input_features, activation="relu"))

    # Second hidden layer
    nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2[counter], activation="relu"))

    # Output layer
    nn.add(tf.keras.layers.Dense(units=1, activation="tanh"))

    # Compile the model
    nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

    # Train the model
    fit_model = nn.fit(X_train_scaled,y_train,epochs=25)

    # Evaluate the model using the test data
    model_loss[counter], model_accuracy[counter] = nn.evaluate(X_test_scaled,y_test,verbose=2)
    
    print(f"\n Iteration {counter} completed. \n")
    counter = counter + 1

# Neatly display model accuracies
counter = 0
for j in model_accuracy:
    print(model_accuracy[counter])
    counter = counter + 1

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
782/782 - 1s - loss: -2.3479e+00 - accuracy: 0.7732 - 544ms/epoch - 696us/step

 Iteration 0 completed. 

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
782/782 - 1s - loss: -2.4000e+00 - accuracy: 0.7823 - 517ms/epoch - 660us/step

 Iteration 1 completed. 

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25


 Iteration 2 completed. 

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
782/782 - 1s - loss: -2.3227e+00 - accuracy: 0.7803 - 524ms/epoch - 671us/step

 Iteration 3 completed. 

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
782/782 - 1s - loss: -2.4302e+00 - accuracy: 0.7851 - 527ms/epoch - 673us/step

 Iteration 4 completed. 

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16


 Iteration 5 completed. 

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
782/782 - 1s - loss: -2.4690e+00 - accuracy: 0.7892 - 531ms/epoch - 680us/step

 Iteration 6 completed. 

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
782/782 - 0s - loss: -2.1525e+00 - accuracy: 0.7876 - 489ms/epoch - 625us/step

 Iteration 7 completed. 

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16


 Iteration 8 completed. 

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
782/782 - 1s - loss: 9.1846 - accuracy: 0.5950 - 525ms/epoch - 671us/step

 Iteration 9 completed. 

0.7731599807739258
0.7822800278663635
0.7759199738502502
0.7802799940109253
0.7851200103759766
0.5950000286102295
0.7892000079154968
0.7875999808311462
0.6960399746894836
0.5950000286102295
