In [1]:
# Initial imports.
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Read the raw star-classification CSV into a DataFrame and preview the first ten rows.
df_raw = pd.read_csv(filepath_or_buffer="star_classification_raw.csv")
df_raw.head(n=10)

Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,class,redshift,plate,MJD,fiber_ID
0,1.237661e+18,135.689107,32.494632,23.87882,22.2753,20.39501,19.16573,18.79371,3606,301,2,79,6.543777e+18,GALAXY,0.634794,5812,56354,171
1,1.237665e+18,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,4518,301,5,119,1.176014e+19,GALAXY,0.779136,10445,58158,427
2,1.237661e+18,142.18879,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,3606,301,2,120,5.1522e+18,GALAXY,0.644195,4576,55592,299
3,1.237663e+18,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.2501,4192,301,3,214,1.030107e+19,GALAXY,0.932346,9149,58039,775
4,1.23768e+18,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,8102,301,3,137,6.891865e+18,GALAXY,0.116123,6121,56187,842
5,1.23768e+18,340.995121,20.589476,23.48827,23.33776,21.32195,20.25615,19.54544,8102,301,3,110,5.658977e+18,QSO,1.424659,5026,55855,741
6,1.237679e+18,23.234926,11.418188,21.46973,21.17624,20.92829,20.60826,20.42573,7773,301,2,462,1.246262e+19,QSO,0.586455,11069,58456,113
7,1.237679e+18,5.433176,12.065186,22.24979,22.02172,20.34126,19.48794,18.84999,7773,301,2,346,6.961443e+18,GALAXY,0.477009,6183,56210,15
8,1.237661e+18,200.290475,47.199402,24.40286,22.35669,20.61032,19.4649,18.95852,3716,301,5,108,7.459285e+18,GALAXY,0.660012,6625,56386,719
9,1.237671e+18,39.149691,28.102842,21.74669,20.03493,19.17553,18.81823,18.65422,5934,301,4,122,2.751763e+18,STAR,-8e-06,2444,54082,232


In [3]:
# Inspect the dtype pandas inferred for each raw column.
df_raw.dtypes

obj_ID         float64
alpha          float64
delta          float64
u              float64
g              float64
r              float64
i              float64
z              float64
run_ID           int64
rerun_ID         int64
cam_col          int64
field_ID         int64
spec_obj_ID    float64
class           object
redshift       float64
plate            int64
MJD              int64
fiber_ID         int64
dtype: object

In [4]:
# Drop the object ID, run ID, rerun ID, unique ID (spec_obj_ID), and MJD columns.
# These identifier and date columns were determined to be irrelevant to the
# categorization of stellar objects.
df_mod = df_raw.drop(
    labels=["obj_ID", "run_ID", "rerun_ID", "spec_obj_ID", "MJD"],
    axis="columns",
)
df_mod.head(n=10)

Unnamed: 0,alpha,delta,u,g,r,i,z,cam_col,field_ID,class,redshift,plate,fiber_ID
0,135.689107,32.494632,23.87882,22.2753,20.39501,19.16573,18.79371,2,79,GALAXY,0.634794,5812,171
1,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,5,119,GALAXY,0.779136,10445,427
2,142.18879,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,2,120,GALAXY,0.644195,4576,299
3,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.2501,3,214,GALAXY,0.932346,9149,775
4,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,3,137,GALAXY,0.116123,6121,842
5,340.995121,20.589476,23.48827,23.33776,21.32195,20.25615,19.54544,3,110,QSO,1.424659,5026,741
6,23.234926,11.418188,21.46973,21.17624,20.92829,20.60826,20.42573,2,462,QSO,0.586455,11069,113
7,5.433176,12.065186,22.24979,22.02172,20.34126,19.48794,18.84999,2,346,GALAXY,0.477009,6183,15
8,200.290475,47.199402,24.40286,22.35669,20.61032,19.4649,18.95852,5,108,GALAXY,0.660012,6625,719
9,39.149691,28.102842,21.74669,20.03493,19.17553,18.81823,18.65422,4,122,STAR,-8e-06,2444,232


In [5]:
# Count missing values per remaining column.
df_mod.isna().sum()

alpha       0
delta       0
u           0
g           0
r           0
i           0
z           0
cam_col     0
field_ID    0
class       0
redshift    0
plate       0
fiber_ID    0
dtype: int64

In [6]:
# Transform Class column
def changeStatus(stellar_class):
    """Encode a stellar class label as an integer code.

    The mapping is GALAXY -> 0, STAR -> 1, QSO -> 2.

    Values that are already one of the integer codes (0, 1, 2) are returned
    unchanged, so applying the function to an already-encoded column is a
    no-op instead of collapsing every row to 2.

    Parameters
    ----------
    stellar_class : str or int
        A class label ("GALAXY", "STAR", "QSO") or an existing code.

    Returns
    -------
    int
        The integer code for the class.

    Raises
    ------
    ValueError
        If the value is neither a known label nor a known code. Failing
        loudly avoids silently labelling unexpected input as a quasar.
    """
    class_codes = {"GALAXY": 0, "STAR": 1, "QSO": 2}

    if isinstance(stellar_class, str):
        if stellar_class in class_codes:
            return class_codes[stellar_class]
    elif stellar_class in class_codes.values():
        # Already encoded (e.g. the cell was re-run): keep the code as-is.
        return int(stellar_class)

    raise ValueError(f"Unrecognized stellar class: {stellar_class!r}")
    
# Encode from the untouched raw labels (same index as df_mod) so that re-running
# this cell always encodes the original strings, not the previous run's output.
df_mod["class"] = df_raw["class"].apply(changeStatus)
df_mod.head(10)

Unnamed: 0,alpha,delta,u,g,r,i,z,cam_col,field_ID,class,redshift,plate,fiber_ID
0,135.689107,32.494632,23.87882,22.2753,20.39501,19.16573,18.79371,2,79,0,0.634794,5812,171
1,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,5,119,0,0.779136,10445,427
2,142.18879,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,2,120,0,0.644195,4576,299
3,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.2501,3,214,0,0.932346,9149,775
4,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,3,137,0,0.116123,6121,842
5,340.995121,20.589476,23.48827,23.33776,21.32195,20.25615,19.54544,3,110,2,1.424659,5026,741
6,23.234926,11.418188,21.46973,21.17624,20.92829,20.60826,20.42573,2,462,2,0.586455,11069,113
7,5.433176,12.065186,22.24979,22.02172,20.34126,19.48794,18.84999,2,346,0,0.477009,6183,15
8,200.290475,47.199402,24.40286,22.35669,20.61032,19.4649,18.95852,5,108,0,0.660012,6625,719
9,39.149691,28.102842,21.74669,20.03493,19.17553,18.81823,18.65422,4,122,1,-8e-06,2444,232


In [7]:
# Feature matrix: every column except the target. `drop` returns a new frame,
# so df_mod itself is left untouched.
X = df_mod.drop(columns=["class"])

In [8]:
# Target vector: the encoded class column.
y = df_mod.loc[:, "class"]

In [9]:
# Hold out a test set. No test_size is given, so scikit-learn's default
# hold-out fraction applies; random_state pins the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

In [10]:
# Build the scaler and learn its statistics from the training split only,
# so nothing from the test split influences the fit.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the same fitted transform to both splits.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)

In [11]:
# Random forest with 128 trees; random_state fixes the seed for repeatable runs.
rf_model = RandomForestClassifier(
    n_estimators=128,
    random_state=1,
)

In [12]:
# Train the forest on the scaled training features and their labels.
rf_model = rf_model.fit(X=X_train_scaled, y=y_train)

In [13]:
# Predict class codes for the held-out, scaled test features.
predictions = rf_model.predict(X=X_test_scaled)

In [14]:
# Fraction of test rows whose predicted class matches the true class.
accuracy_score(y_true=y_test, y_pred=predictions)

0.98004

In [15]:
# Read the per-feature importance scores from the fitted random forest and
# display them. The values are positional; the next cell pairs them with
# X.columns to attach feature names.
importances = rf_model.feature_importances_
importances

array([0.01061671, 0.01103102, 0.05960461, 0.06482391, 0.04176732,
       0.05506448, 0.07696146, 0.00285047, 0.00684749, 0.61694768,
       0.04621942, 0.00726544])

In [16]:
# Pair each importance score with its feature name and list them from most
# to least important.
sorted(
    ((score, feature) for score, feature in zip(rf_model.feature_importances_, X.columns)),
    reverse=True,
)

[(0.6169476782740584, 'redshift'),
 (0.07696145503807554, 'z'),
 (0.06482390694361198, 'g'),
 (0.059604609232729096, 'u'),
 (0.05506448398605786, 'i'),
 (0.04621941630859073, 'plate'),
 (0.04176732191107626, 'r'),
 (0.01103101820638251, 'delta'),
 (0.010616709930062028, 'alpha'),
 (0.00726544031050859, 'fiber_ID'),
 (0.006847488220653701, 'field_ID'),
 (0.0028504716381934263, 'cam_col')]