## Data Preprocessing

The dataset is obtained from the seventeenth data release of the Sloan Digital Sky Survey (SDSS DR17). It consists of 100,000 observations with 18 attributes, including the class of each astronomical object.

In [1]:
import pandas as pd # For DataFrame manipulation
import numpy as np # For array manipulation

#### Data Cleansing

In [2]:
# Load the astronomical-objects dataset into a DataFrame.
# NOTE(review): this is an absolute path on one machine; adjust it before running elsewhere.
df = pd.read_csv(
    filepath_or_buffer="D:/Academic/MSc_Data_Science/Course/8.Technology Dissertation/4. Dissertation/Datasets/sgq_classification.csv"
)
df

Unnamed: 0,obj_ID,alpha,delta,u,g,r,i,z,run_ID,rerun_ID,cam_col,field_ID,spec_obj_ID,class,redshift,plate,MJD,fiber_ID
0,1.237661e+18,135.689107,32.494632,23.87882,22.27530,20.39501,19.16573,18.79371,3606,301,2,79,6.543777e+18,GALAXY,0.634794,5812,56354,171
1,1.237665e+18,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,4518,301,5,119,1.176014e+19,GALAXY,0.779136,10445,58158,427
2,1.237661e+18,142.188790,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,3606,301,2,120,5.152200e+18,GALAXY,0.644195,4576,55592,299
3,1.237663e+18,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.25010,4192,301,3,214,1.030107e+19,GALAXY,0.932346,9149,58039,775
4,1.237680e+18,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,8102,301,3,137,6.891865e+18,GALAXY,0.116123,6121,56187,842
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,1.237679e+18,39.620709,-2.594074,22.16759,22.97586,21.90404,21.30548,20.73569,7778,301,2,581,1.055431e+19,GALAXY,0.000000,9374,57749,438
99996,1.237679e+18,29.493819,19.798874,22.69118,22.38628,20.45003,19.75759,19.41526,7917,301,1,289,8.586351e+18,GALAXY,0.404895,7626,56934,866
99997,1.237668e+18,224.587407,15.700707,21.16916,19.26997,18.20428,17.69034,17.35221,5314,301,4,308,3.112008e+18,GALAXY,0.143366,2764,54535,74
99998,1.237661e+18,212.268621,46.660365,25.35039,21.63757,19.91386,19.07254,18.62482,3650,301,4,131,7.601080e+18,GALAXY,0.455040,6751,56368,470


In [3]:
# Missing-value check on the raw data

# Number of missing entries per column
null_counts = df.isna().sum(axis=0)

# Show the per-column totals
print(null_counts)

obj_ID         0
alpha          0
delta          0
u              0
g              0
r              0
i              0
z              0
run_ID         0
rerun_ID       0
cam_col        0
field_ID       0
spec_obj_ID    0
class          0
redshift       0
plate          0
MJD            0
fiber_ID       0
dtype: int64


There are no null values in the dataset.

In [4]:
# Duplicate-row check

# A row counts as a duplicate when it repeats an earlier row in every column
duplicate_count = df.duplicated(keep='first').sum()

duplicate_count

0

There are no duplicate rows in the dataset.

In [5]:
# Handling Outliers

# Outlier filter based on the interquartile range (IQR) rule
def remove_outliers(column, factor=1.5):
    """Return the values of ``column`` that lie inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where
    ``Q1`` and ``Q3`` are the 25th and 75th percentiles of ``column`` and
    ``IQR = Q3 - Q1``. Values exactly on a fence are kept.

    Parameters
    ----------
    column : pandas.Series
        Numeric values to filter.
    factor : float, default 1.5
        Fence width expressed in multiples of the IQR.

    Returns
    -------
    pandas.Series
        The in-fence values with their original index labels. Rows outside
        the fences are omitted, so assigning the result back to a DataFrame
        column leaves NaN at those positions.
    """
    q1 = column.quantile(0.25)
    q3 = column.quantile(0.75)
    fence_width = factor * (q3 - q1)
    within_fences = (column >= q1 - fence_width) & (column <= q3 + fence_width)
    return column[within_fences]

# Apply the IQR filter to alpha, delta, u, g, r, i, z and redshift.
# The filtered Series is re-aligned on the DataFrame index when assigned back,
# so out-of-fence values become NaN rather than shortening the frame.
# NOTE(review): this blanks 8,990 redshift values (see the null counts in the
# next cell) and those rows are dropped afterwards; confirm this does not
# disproportionately remove one class.
for feature in ['alpha', 'delta', 'u', 'g', 'r', 'i', 'z', 'redshift']:
    df[feature] = remove_outliers(df[feature])



In [6]:
# Missing-value check after the outlier filter (filtered-out values are now NaN)

# Number of missing entries per column
null_counts = df.isna().sum(axis=0)

# Show the per-column totals
print(null_counts)

obj_ID            0
alpha             0
delta             0
u                56
g                99
r               132
i               198
z               320
run_ID            0
rerun_ID          0
cam_col           0
field_ID          0
spec_obj_ID       0
class             0
redshift       8990
plate             0
MJD               0
fiber_ID          0
dtype: int64


In [7]:
# Drop every row that contains at least one missing value
df = df.dropna(axis=0, how='any')

# Re-count missing entries per column to confirm none remain
null_counts = df.isna().sum(axis=0)

print(null_counts)

obj_ID         0
alpha          0
delta          0
u              0
g              0
r              0
i              0
z              0
run_ID         0
rerun_ID       0
cam_col        0
field_ID       0
spec_obj_ID    0
class          0
redshift       0
plate          0
MJD            0
fiber_ID       0
dtype: int64


#### Data Reduction

In [8]:
# Keep only the eight feature columns and the target `class`; every other
# column of the raw file is discarded from here on.
columns_to_keep = ['alpha', 'delta', 'u', 'g', 'r', 'i', 'z', 'redshift', 'class']

# .copy() makes the subset an independent frame, so later column assignments
# (e.g. overwriting `class` with its encoded values) do not raise
# SettingWithCopyWarning.
df = df[columns_to_keep].copy()

df

Unnamed: 0,alpha,delta,u,g,r,i,z,redshift,class
0,135.689107,32.494632,23.87882,22.27530,20.39501,19.16573,18.79371,0.634794,GALAXY
1,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,0.779136,GALAXY
2,142.188790,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,0.644195,GALAXY
3,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.25010,0.932346,GALAXY
4,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,0.116123,GALAXY
...,...,...,...,...,...,...,...,...,...
99995,39.620709,-2.594074,22.16759,22.97586,21.90404,21.30548,20.73569,0.000000,GALAXY
99996,29.493819,19.798874,22.69118,22.38628,20.45003,19.75759,19.41526,0.404895,GALAXY
99997,224.587407,15.700707,21.16916,19.26997,18.20428,17.69034,17.35221,0.143366,GALAXY
99998,212.268621,46.660365,25.35039,21.63757,19.91386,19.07254,18.62482,0.455040,GALAXY


In [9]:
# Persist the cleaned, not-yet-transformed dataset for exploratory analysis.
# The row index is not written to the file.
df.to_csv(
    path_or_buf="D:/Academic/MSc_Data_Science/Course/8.Technology Dissertation/4. Dissertation/Datasets/sgq_classification_exp.csv",
    index=False,
)

#### Data Transformation

In [10]:
from sklearn.preprocessing import LabelEncoder

# Encoder that maps each distinct class label to an integer code
le = LabelEncoder()

# Fit on the label column and overwrite it with the integer codes.
# Re-running this cell re-fits the encoder on the already-encoded integers,
# so run it once per kernel session.
class_codes = le.fit_transform(df['class'])
df['class'] = class_codes
df

Unnamed: 0,alpha,delta,u,g,r,i,z,redshift,class
0,135.689107,32.494632,23.87882,22.27530,20.39501,19.16573,18.79371,0.634794,0
1,144.826101,31.274185,24.77759,22.83188,22.58444,21.16812,21.61427,0.779136,0
2,142.188790,35.582444,25.26307,22.66389,20.60976,19.34857,18.94827,0.644195,0
3,338.741038,-0.402828,22.13682,23.77656,21.61162,20.50454,19.25010,0.932346,0
4,345.282593,21.183866,19.43718,17.58028,16.49747,15.97711,15.54461,0.116123,0
...,...,...,...,...,...,...,...,...,...
99995,39.620709,-2.594074,22.16759,22.97586,21.90404,21.30548,20.73569,0.000000,0
99996,29.493819,19.798874,22.69118,22.38628,20.45003,19.75759,19.41526,0.404895,0
99997,224.587407,15.700707,21.16916,19.26997,18.20428,17.69034,17.35221,0.143366,0
99998,212.268621,46.660365,25.35039,21.63757,19.91386,19.07254,18.62482,0.455040,0


In [11]:
# Show the original labels known to the encoder; a label's position in this
# array is its integer code (GALAXY is first and appears as 0 in the frame above).
print(le.classes_)

['GALAXY' 'QSO' 'STAR']


#### Data Normalization

##### Normalization using min-max method

In [12]:
# Independent copy of df for min-max scaling, leaving df itself unchanged
dfn = df.copy(deep=True)

In [13]:
# Feature columns to rescale (the encoded `class` column is left as is)
cols_to_normalize = ['alpha', 'delta', 'u', 'g', 'r', 'i', 'z', 'redshift']

# Per-column extremes, computed over all rows of dfn
# NOTE(review): if a train/test split happens downstream, confirm that using
# whole-dataset statistics here is intended.
min_vals = dfn[cols_to_normalize].min(axis=0)
max_vals = dfn[cols_to_normalize].max(axis=0)

# Min-max scaling: (x - min) / (max - min), applied column by column
dfn[cols_to_normalize] = (
    dfn[cols_to_normalize]
    .sub(min_vals, axis='columns')
    .div(max_vals - min_vals, axis='columns')
)

In [14]:
# Display the min-max-scaled frame
dfn

Unnamed: 0,alpha,delta,u,g,r,i,z,redshift,class
0,0.376905,0.503802,0.642552,0.636604,0.562453,0.503890,0.510806,0.381862,0
1,0.402286,0.491812,0.710331,0.681134,0.754730,0.694137,0.797847,0.467349,0
2,0.394960,0.534139,0.746942,0.667693,0.581312,0.521262,0.526535,0.387430,0
3,0.940947,0.180600,0.511182,0.756715,0.669296,0.631090,0.557252,0.558088,0
4,0.959118,0.392679,0.307593,0.260969,0.220169,0.200940,0.180154,0.074679,0
...,...,...,...,...,...,...,...,...,...
99995,0.110044,0.159072,0.513502,0.692653,0.694976,0.707187,0.708436,0.005905,0
99996,0.081913,0.379072,0.552988,0.645483,0.567285,0.560123,0.574060,0.245705,0
99997,0.623848,0.338810,0.438207,0.396156,0.370062,0.363714,0.364108,0.090814,0
99998,0.589629,0.642974,0.753527,0.585581,0.520198,0.495036,0.493619,0.275403,0


In [15]:
# Write the min-max-scaled dataset to its own CSV file (row index omitted)
dfn.to_csv(
    path_or_buf="D:/Academic/MSc_Data_Science/Course/8.Technology Dissertation/4. Dissertation/Datasets/sgq_classification_mm.csv",
    index=False,
)

##### Normalization using Z-Score method

In [16]:
# Independent copy of df for z-score scaling, leaving df itself unchanged
dfz = df.copy(deep=True)

In [17]:
# Feature columns to standardize (the encoded `class` column is left as is)
cols_to_normalize = ['alpha', 'delta', 'u', 'g', 'r', 'i', 'z', 'redshift']

# Per-column mean and sample standard deviation (ddof=1 is the pandas default)
mean_vals = dfz[cols_to_normalize].mean(axis=0)
std_vals = dfz[cols_to_normalize].std(axis=0, ddof=1)

# Z-score scaling: (x - mean) / std, applied column by column
dfz[cols_to_normalize] = (
    dfz[cols_to_normalize]
    .sub(mean_vals, axis='columns')
    .div(std_vals, axis='columns')
)


In [18]:
# Display the z-score-scaled frame
dfz

Unnamed: 0,alpha,delta,u,g,r,i,z,redshift,class
0,-0.431243,0.435239,0.766208,0.800355,0.456317,0.126944,0.102913,0.621070,0
1,-0.336439,0.373162,1.158204,1.067884,1.636447,1.292873,1.751187,0.993048,0
2,-0.363803,0.592296,1.369945,0.987137,0.572069,0.233406,0.193235,0.645296,0
3,1.675575,-1.238043,0.006439,1.521959,1.112084,0.906491,0.369618,1.387878,0
4,1.743448,-0.140067,-1.171002,-1.456380,-1.644506,-1.729691,-1.795789,-0.715575,0
...,...,...,...,...,...,...,...,...,...
99995,-1.428025,-1.349498,0.019859,1.137090,1.269703,1.372854,1.237764,-1.014829,0
99996,-1.533099,-0.210513,0.248222,0.853699,0.485973,0.471565,0.466134,0.028608,0
99997,0.491145,-0.418960,-0.415603,-0.644204,-0.724514,-0.732130,-0.739468,-0.645368,0
99998,0.363328,1.155758,1.408030,0.493820,0.196971,0.072682,0.004218,0.157834,0


In [19]:
# Write the z-score-scaled dataset to its own CSV file (row index omitted)
dfz.to_csv(
    path_or_buf="D:/Academic/MSc_Data_Science/Course/8.Technology Dissertation/4. Dissertation/Datasets/sgq_classification_zc.csv",
    index=False,
)