# Preprocessing

In [13]:
import pandas as pd  
data  = pd.read_csv('data/SDSS_DR18.csv')
data.head()

Unnamed: 0,objid,specobjid,ra,dec,u,g,r,i,z,run,...,psfMag_g,psfMag_i,psfMag_z,expAB_u,expAB_g,expAB_r,expAB_i,expAB_z,redshift,class
0,1.24e+18,3.24e+17,184.950869,0.733068,18.87062,17.59612,17.11245,16.83899,16.70908,756,...,19.96352,19.25145,19.0523,0.479021,0.518483,0.520474,0.508502,0.488969,0.041691,GALAXY
1,1.24e+18,3.25e+17,185.729201,0.679704,19.5956,19.92153,20.34448,20.66213,20.59599,756,...,19.92417,20.65535,20.57387,0.573926,0.531728,0.403072,0.999874,0.189495,-0.000814,STAR
2,1.24e+18,3.24e+17,185.68769,0.82348,19.26421,17.87891,17.09593,16.65159,16.35329,756,...,19.33645,18.16669,17.78844,0.701666,0.743386,0.770897,0.778642,0.736771,0.113069,GALAXY
3,1.24e+18,2.88e+18,185.677904,0.768362,19.49739,17.96166,17.41269,17.20545,17.11567,756,...,17.96176,17.21564,17.12367,0.999818,0.78776,0.745611,0.399718,0.986137,8.7e-05,STAR
4,1.24e+18,2.88e+18,185.814763,0.77694,18.31519,16.83033,16.26352,16.0632,15.97527,756,...,16.85104,16.08275,15.98694,0.999795,0.83445,0.723526,0.712259,0.527055,1.8e-05,STAR


In [14]:
data.columns.values

array(['objid', 'specobjid', 'ra', 'dec', 'u', 'g', 'r', 'i', 'z', 'run',
       'rerun', 'camcol', 'field', 'plate', 'mjd', 'fiberid',
       'petroRad_u', 'petroRad_g', 'petroRad_i', 'petroRad_r',
       'petroRad_z', 'petroFlux_u', 'petroFlux_g', 'petroFlux_i',
       'petroFlux_r', 'petroFlux_z', 'petroR50_u', 'petroR50_g',
       'petroR50_i', 'petroR50_r', 'petroR50_z', 'psfMag_u', 'psfMag_r',
       'psfMag_g', 'psfMag_i', 'psfMag_z', 'expAB_u', 'expAB_g',
       'expAB_r', 'expAB_i', 'expAB_z', 'redshift', 'class'], dtype=object)

We can remove all the non numerical columns which are:
- obj_ID
- run_ID
- rerun_ID
- cam_col
- field_ID
- spec_obj_ID
- class
- plate
- MJD
- fiber_ID

These columns are used for image IDS and other such ids for a larger dataset which we do not need for our purposes

In [15]:
data = data.drop(
    ['objid', 'specobjid', 'fiberid', 'run', 'class', 'camcol', 'field', 'rerun', 'fiberid', 'mjd'],
    axis=1
)

Now we can also drop all nan values or empty values

In [16]:
data.dropna(inplace=True)

Now finally we just need to seperate features and target

In [17]:
features = data.drop(['redshift'], axis=1)
target = data['redshift']

Now we can train test split our data

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Model training

We will use a 

In [19]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [20]:
model.score(X_test, y_test)

0.7808525524486037