In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split as tts
from sklearn.linear_model import LogisticRegression as LogReg
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
import category_encoders as ce

In [5]:
# Load the SDSS dataset from the author's GitHub repository into a dataframe.

SDSS_CSV_URL = 'https://raw.githubusercontent.com/SeanAntosiak/LS-DS6-Unit-2-Project/master/SDSSdataset.csv'
df0 = pd.read_csv(SDSS_CSV_URL)

In [26]:
# Redshift values for QSO are much higher while the band intensities are
# similar, so add one redshift-weighted column per band ('<band>RS') to try
# to account for this.

cols = ['u', 'g', 'r', 'i', 'z']

# assign() returns a new frame, so df0 itself is left unmodified.
df1 = df0.assign(**{f'{band}RS': df0[band] * df0['redshift'] for band in cols})

In [27]:
# Preview the first five rows to confirm the new *RS columns were added.

df1.head(n=5)

Unnamed: 0,objid,ra,dec,u,g,r,i,z,run,rerun,...,class,redshift,plate,mjd,fiberid,uRS,gRS,rRS,iRS,zRS
0,1.23765e+18,183.531326,0.089693,19.47406,17.0424,15.94699,15.50342,15.22531,752,301,...,STAR,-9e-06,3306,54922,491,-0.000174,-0.000153,-0.000143,-0.000139,-0.000136
1,1.23765e+18,183.598371,0.135285,18.6628,17.21449,16.67637,16.48922,16.3915,752,301,...,STAR,-5.5e-05,323,51615,541,-0.001025,-0.000945,-0.000916,-0.000905,-0.0009
2,1.23765e+18,183.680207,0.126185,19.38298,18.19169,17.47428,17.08732,16.80125,752,301,...,GALAXY,0.123111,287,52023,513,2.386262,2.239601,2.15128,2.10364,2.068422
3,1.23765e+18,183.870529,0.049911,17.76536,16.60272,16.16116,15.98233,15.90438,752,301,...,STAR,-0.000111,3306,54922,510,-0.001965,-0.001837,-0.001788,-0.001768,-0.001759
4,1.23765e+18,183.883288,0.102557,17.55025,16.26342,16.43869,16.55492,16.61326,752,301,...,STAR,0.00059,3306,54922,512,0.010361,0.009601,0.009705,0.009773,0.009808


In [28]:
# Set the initial X features and y labels.
# Features are the intensity (brightness) bands, their redshift-weighted
# counterparts, and the redshift (distance) column itself.

bands = ['u', 'g', 'r', 'i', 'z']
feature_names = bands + [f'{band}RS' for band in bands] + ['redshift']

X0 = df1[feature_names]
y0 = df1['class']

In [29]:
# Hold out 10% of the data as the test set, then split the remaining 90%
# again (90/10) into training and validation sets.

X0train, X0test, y0train, y0test = tts(X0, y0, train_size=0.90, test_size=0.10, random_state=8)

X1train, X1val, y1train, y1val = tts(X0train, y0train, train_size=0.90, test_size=0.10, random_state=8)

# The validation labels were originally bound to `y0val`, which does not match
# the X1train / X1val / y1train naming. Keep the old name as an alias because
# the model-scoring cells refer to it.
y0val = y1val

In [30]:
# Majority-class baseline: the share of each class among the labels.
# Predicting GALAXY every time would result in an accuracy of almost 50%.

class_shares = y0.value_counts(normalize=True)
class_shares

GALAXY    0.4998
STAR      0.4152
QSO       0.0850
Name: class, dtype: float64

In [31]:
# Fit a logistic regression model and score it on the validation split.

# solver / multi_class / max_iter are passed explicitly to silence warnings
mod0 = LogReg(solver='lbfgs', multi_class='auto', max_iter=1000).fit(X1train, y1train)
mod0.score(X1val, y0val)


0.9822222222222222

In [32]:
# Fit a basic RandomForestClassifier and score it on the validation split.

# random_state is fixed (same seed as the train/test splits) so the forest,
# its validation score and its feature importances are reproducible after a
# kernel restart. n_jobs=-1 trains the trees on all available cores.
mod1 = RandomForestClassifier(max_depth=20, n_estimators=1000, n_jobs=-1, random_state=8)
mod1.fit(X1train, y1train)
mod1.score(X1val, y0val)


0.9866666666666667

In [33]:
# Show the fitted random forest's importance for each feature, labelled by
# column name.

importance = mod1.feature_importances_
feature_importance = pd.Series(data=importance, index=X1val.columns)
feature_importance

u           0.004548
g           0.003190
r           0.005232
i           0.010165
z           0.008466
uRS         0.142294
gRS         0.179247
rRS         0.170724
iRS         0.149693
zRS         0.127106
redshift    0.199335
dtype: float64