In [1]:
# Initial imports.
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import plotly.express as px
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Data loading: read the raw table from a CSV file. The path is relative, so
# it is resolved against the kernel's current working directory.
df_raw = pd.read_csv(filepath_or_buffer="star_classification_raw.csv")
# Preview the first ten rows.
df_raw.head(n=10)

Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,class,redshift,plate,MJD,fiber_ID
0,1.237661e+18,135.689107,32.494632,23.87882,22.2753,20.39501,19.16573,18.79371,3606,301,2,79,6.543777e+18,GALAXY,0.634794,5812,56354,171
1,1.237665e+18,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,4518,301,5,119,1.176014e+19,GALAXY,0.779136,10445,58158,427
2,1.237661e+18,142.18879,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,3606,301,2,120,5.1522e+18,GALAXY,0.644195,4576,55592,299
3,1.237663e+18,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.2501,4192,301,3,214,1.030107e+19,GALAXY,0.932346,9149,58039,775
4,1.23768e+18,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,8102,301,3,137,6.891865e+18,GALAXY,0.116123,6121,56187,842
5,1.23768e+18,340.995121,20.589476,23.48827,23.33776,21.32195,20.25615,19.54544,8102,301,3,110,5.658977e+18,QSO,1.424659,5026,55855,741
6,1.237679e+18,23.234926,11.418188,21.46973,21.17624,20.92829,20.60826,20.42573,7773,301,2,462,1.246262e+19,QSO,0.586455,11069,58456,113
7,1.237679e+18,5.433176,12.065186,22.24979,22.02172,20.34126,19.48794,18.84999,7773,301,2,346,6.961443e+18,GALAXY,0.477009,6183,56210,15
8,1.237661e+18,200.290475,47.199402,24.40286,22.35669,20.61032,19.4649,18.95852,3716,301,5,108,7.459285e+18,GALAXY,0.660012,6625,56386,719
9,1.237671e+18,39.149691,28.102842,21.74669,20.03493,19.17553,18.81823,18.65422,5934,301,4,122,2.751763e+18,STAR,-8e-06,2444,54082,232


In [3]:
# Drop the columns listed below, leaving alpha, delta, u, g, r, i, z and the
# target `class`. ID and date columns were determined to be irrelevant to
# categorization of stellar objects.
# NOTE(review): `redshift` is dropped here too, although it is neither an ID
# nor a date column -- confirm that excluding it from the features is intended.
df_mod = df_raw.drop(
    columns=[
        "obj_ID", "run_ID", "rerun_ID", "spec_obj_ID",  # ID columns
        "redshift",
        "MJD",  # date column
        "cam_col", "plate", "field_ID", "fiber_ID",
    ]
)
df_mod.head(10)

Unnamed: 0,alpha,delta,u,g,r,i,z,class
0,135.689107,32.494632,23.87882,22.2753,20.39501,19.16573,18.79371,GALAXY
1,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,GALAXY
2,142.18879,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,GALAXY
3,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.2501,GALAXY
4,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,GALAXY
5,340.995121,20.589476,23.48827,23.33776,21.32195,20.25615,19.54544,QSO
6,23.234926,11.418188,21.46973,21.17624,20.92829,20.60826,20.42573,QSO
7,5.433176,12.065186,22.24979,22.02172,20.34126,19.48794,18.84999,GALAXY
8,200.290475,47.199402,24.40286,22.35669,20.61032,19.4649,18.95852,GALAXY
9,39.149691,28.102842,21.74669,20.03493,19.17553,18.81823,18.65422,STAR


In [4]:
# Order the rows by ascending `z`, then discard the first row, i.e. the one
# with the smallest `z` value.
# NOTE(review): presumably that row is an outlier / placeholder reading -- confirm.
# Caution: this rebinds df_mod, so every re-run of this cell removes one more row.
df_mod = df_mod.sort_values(by='z').iloc[1:]
df_mod

Unnamed: 0,alpha,delta,u,g,r,i,z,class
14498,84.230172,7.138590,14.15199,10.73097,9.82207,9.469903,9.612333,STAR
4472,239.423127,28.548438,12.26240,10.51139,10.06854,13.417860,10.225510,STAR
75784,239.473361,27.594369,12.10168,10.49820,10.11604,10.008650,10.441310,STAR
75620,239.207155,27.898444,12.30349,10.67180,10.19460,10.055090,10.650560,STAR
85285,164.321732,40.221137,15.44840,11.79892,10.86379,10.566470,10.778890,STAR
...,...,...,...,...,...,...,...,...
29689,146.345791,24.566331,18.50142,18.51986,20.75484,30.163590,27.673360,GALAXY
85950,255.213997,28.549537,20.65719,19.14651,20.99419,29.889210,27.805190,GALAXY
40201,36.908884,-1.982394,20.00673,20.29219,18.54149,17.813190,28.238290,GALAXY
93321,16.455371,0.321688,18.64831,18.43714,17.49240,32.141470,28.790550,QSO


In [5]:
# Transform Class column
def changeStatus(stellar_class):
    """Encode a class label as an integer code.

    Args:
        stellar_class: Label to encode. It is compared with ``==`` against
            the strings "GALAXY" and "STAR", in that order.

    Returns:
        0 for "GALAXY", 1 for "STAR", and 2 for every other value. No
        validation is performed, so any unexpected or missing label is
        also encoded as 2.
    """
    for code, label in enumerate(("GALAXY", "STAR")):
        if stellar_class == label:
            return code
    # Catch-all bucket for anything that is neither "GALAXY" nor "STAR".
    return 2
    
# Replace the string labels in `class` with the integer codes from changeStatus.
# Caution: the column is overwritten in place, so running this cell a second
# time feeds the integer codes back through changeStatus, which maps them all to 2.
df_mod["class"] = df_mod["class"].map(changeStatus)
df_mod.head(10)

Unnamed: 0,alpha,delta,u,g,r,i,z,class
14498,84.230172,7.13859,14.15199,10.73097,9.82207,9.469903,9.612333,1
4472,239.423127,28.548438,12.2624,10.51139,10.06854,13.41786,10.22551,1
75784,239.473361,27.594369,12.10168,10.4982,10.11604,10.00865,10.44131,1
75620,239.207155,27.898444,12.30349,10.6718,10.1946,10.05509,10.65056,1
85285,164.321732,40.221137,15.4484,11.79892,10.86379,10.56647,10.77889,1
81298,8.51163,-9.705282,14.5601,12.68849,11.77229,11.31937,10.89738,0
4401,184.28205,7.624347,14.50678,12.67902,11.74664,11.29956,10.91847,0
75583,239.702668,27.467439,12.99664,11.33897,10.98255,10.87374,11.19448,1
57763,208.358688,33.825699,10.99623,13.66217,12.35763,12.63744,11.30247,2
76986,240.602807,26.945961,15.32899,11.47435,11.09069,10.95665,11.41484,1


In [6]:
# Define the features set: an independent copy of df_mod with the target
# column `class` removed.
X = df_mod.copy().drop(columns=["class"])

In [7]:
# Define the target set: the `class` column of df_mod, taken as a Series.
# It carries df_mod's row index, the same index X was derived from.
y = df_mod["class"]

In [8]:
# Splitting into Train and Test sets. random_state=1 fixes the shuffle so the
# same rows land in each subset on every run. No test_size is passed, so
# scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [9]:
# Standardise the features. The scaler is fitted on the training split only
# and the same fitted scaler is then applied to both splits.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Transform the training split first, then the test split.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [10]:
# Define number of models to create
n = 10

# Train n Random Forest classifiers that differ only in their random seed
# (0 .. n-1) and print each one's accuracy on the test split, to show how much
# the score varies from seed to seed.
# The loop variable is used directly as the seed; no separate counter is needed.
for seed in range(n):
    # Create and fit a random forest classifier (fit returns the fitted model).
    rf_model = RandomForestClassifier(n_estimators=128, random_state=seed).fit(
        X_train_scaled, y_train
    )

    # Making predictions using the testing data.
    predictions = rf_model.predict(X_test_scaled)
    print(accuracy_score(y_test, predictions))

# After the loop, rf_model and predictions hold the results of the last seed (n - 1).

0.87624
0.87644
0.87692
0.87544
0.87616
0.87564
0.87568
0.87612
0.8762
0.87552


In [11]:
# Creating multiple iterations of Random Forest classifiers to determine
# feature importance. Each forest uses the same settings and a seed from
# 0 .. n-1, and its features are printed from most to least important.
# The loop variable is used directly as the seed; no separate counter is needed.
for seed in range(n):
    # Create and fit a random forest classifier (fit returns the fitted model).
    rf_model = RandomForestClassifier(n_estimators=128, random_state=seed).fit(
        X_train_scaled, y_train
    )

    # Pair each importance with its feature name (X.columns is in the same
    # column order as the training matrix) and sort in descending order.
    importances = rf_model.feature_importances_
    print(sorted(zip(importances, X.columns), reverse=True))

[(0.21309824532689198, 'z'), (0.17822857464860886, 'g'), (0.17440300819766755, 'u'), (0.1530850431704202, 'i'), (0.13970103772461936, 'r'), (0.07301326715122956, 'alpha'), (0.06847082378056267, 'delta')]
[(0.21706263897925282, 'z'), (0.1778391007876177, 'g'), (0.17760651824787574, 'u'), (0.1456936904395251, 'i'), (0.13954996535019487, 'r'), (0.07374258190117544, 'alpha'), (0.06850550429435827, 'delta')]
[(0.22360883164726608, 'z'), (0.1799681339574319, 'g'), (0.1681555854182293, 'u'), (0.14835525421304604, 'i'), (0.13717254555096858, 'r'), (0.07354513866349029, 'alpha'), (0.06919451054956775, 'delta')]
[(0.21740779320226683, 'z'), (0.17965155345259715, 'g'), (0.17675147659122142, 'u'), (0.14356698418739416, 'r'), (0.14116023370630396, 'i'), (0.07279187138060032, 'alpha'), (0.06867008747961624, 'delta')]
[(0.2160202269422846, 'z'), (0.1792618582194088, 'g'), (0.17385634786737772, 'u'), (0.1447273875816456, 'i'), (0.1439067536263959, 'r'), (0.07351105848197104, 'alpha'), (0.0687163672809