In [1]:
#Importing libraries

import plotly.express as px
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
#import Random forest classifiers
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

In [2]:
import pandas as pd 

# get the objid and corresponding asset_id from gz2_filename_mapping.csv
columns_to_keep = ['objid', 'asset_id']

# Read the selected columns from the file
name_map = pd.read_csv("data/gz2_filename_mapping.csv", usecols=columns_to_keep)

# display the first few rows
print(name_map.head(5))

name_map.info()

                objid  asset_id
0  587722981736120347         1
1  587722981736579107         2
2  587722981741363294         3
3  587722981741363323         4
4  587722981741559888         5
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 355990 entries, 0 to 355989
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype
---  ------    --------------   -----
 0   objid     355990 non-null  int64
 1   asset_id  355990 non-null  int64
dtypes: int64(2)
memory usage: 5.4 MB


In [3]:
# select columns dr7objid and gz2class from zoo2MainSpecz.csv
columns_to_keep = ['dr7objid', 'gz2class']

# Read the selected columns from the file
labels = pd.read_csv("data/zoo2MainSpecz.csv", usecols=columns_to_keep)

# change the name of column dr7objid to objid for merging later
labels.rename(columns={'dr7objid':'objid'}, inplace=True)

# display
print(labels.head(5))

labels.info()

                objid gz2class
0  588017703996096547    SBb?t
1  587738569780428805      Ser
2  587735695913320507     Sc+t
3  587742775634624545   SBc(r)
4  587732769983889439      Ser
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243500 entries, 0 to 243499
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   objid     243500 non-null  int64 
 1   gz2class  243500 non-null  object
dtypes: int64(1), object(1)
memory usage: 3.7+ MB


In [4]:
# Sequential implementation. Loops through one image at a time. This is embarassingly 
# parallelizable. The task which here consists of 1. processing images, converting to grayscale 
# and flattening pixel values is CPU-bound, ie performance is determined promarily by how
# CPU can process it in contrast to I/O bound. 
# We can parallelize using Multiprocessing library or Dask. 

import os 
from PIL import Image, ImageOps
from numpy import asarray

# Directory containing the images
image_dir = "data/images"

# List to store image data 
image_data = []
image_names = []

# Iterate over all files in the directory 
for filename in os.listdir(image_dir):
    if filename.endswith(('.jpg', '.png')): #filter the image files
        image_path = os.path.join(image_dir, filename)

        # Open image and convert to grayscale
        img = Image.open(image_path)
        img_gray = ImageOps.grayscale(img)

        # Convert to a numpy array and flatter it to 1D
        img_array = np.asarray(img_gray).flatten()

        #store the image data and filename
        image_data.append(img_array)

        # Extract the base name without the extension
        image_name = os.path.splitext(filename)[0] # Get only the root, ie w/o extension
    
        image_names.append(image_name)

# convert to DataFrame
image_data = pd.DataFrame(image_data)
image_data.insert(0, "asset_id", image_names) # NOTE: asset_id values are object type. Need to convert to int64 before merging later. 
# print(image_data['asset_id'].dtype)

#display the data frame
print(image_data.head())
image_data.info()

# Save to CSV
#image_data("image_pixel_data.csv", index=False)

  asset_id   0   1   2   3   4   5   6  7  8  ...  179766  179767  179768  \
0   175547   3   3   2   1   0   0   0  0  0  ...       9      11       7   
1   275362  17  16  13  10   7   4   2  1  0  ...       1       1       0   
2    44579  15  11   6   3   3   3   3  3  5  ...       9       9       5   
3   246001   6   6   7   7   6   5   3  1  1  ...      31      31      29   
4   218127   0   1   5  10  13  13  10  7  5  ...       2       2       1   

   179769  179770  179771  179772  179773  179774  179775  
0       5       2       0       0       2       6       9  
1       0       1       2       3       5       5       6  
2       3       1       0       0       1       3       5  
3      30      31      33      34      34      34      34  
4       1       2       3       6       7       5       3  

[5 rows x 179777 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Columns: 179777 entries, asset_id to 179775
dtypes: object(1), uint8(179776)
m

In [5]:
# Merge labels and name_map dataframes to map asset_id to gz2class
# merge based on objid. use an inner join (only matching rows) 
# since only a subset of points in labels are in name_map, ann inner join 
# will include the rows from name_map that have matching gz2class values
# this will avoid NaNs

labels_mapped = pd.merge(name_map, labels, on='objid', how='inner' ) 

print(labels_mapped.head(5))

labels_mapped.info() # should have the same number of rows as the dataframe labels

                objid  asset_id gz2class
0  587722981741363294         3       Ei
1  587722981741363323         4       Sc
2  587722981741559888         5       Er
3  587722981741625481         6       Er
4  587722981741625484         7       Ei
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243500 entries, 0 to 243499
Data columns (total 3 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   objid     243500 non-null  int64 
 1   asset_id  243500 non-null  int64 
 2   gz2class  243500 non-null  object
dtypes: int64(2), object(1)
memory usage: 5.6+ MB


In [6]:
# Merge labels_mapped with image_data to insert gz2class columnt to the latter 
# Merge based on asset_id and use an inner join. image_data which is our 
# main data frame will only have, in general, a subset of data points (galaxies)
# in labels_mapped. 

# convert asset_id values in image_data from object to int64 before mergeing
image_data['asset_id'] = labels_mapped['asset_id'].astype(int)

#merge
galaxy_data = pd.merge(labels_mapped, image_data, on='asset_id', how='inner' ) 

# Move gz2class to the last position to serve as labels
galaxy_data['gz2class'] = galaxy_data.pop('gz2class')  

# print
print(galaxy_data)

galaxy_data.info()

                  objid  asset_id   0   1   2   3   4   5   6  7  ...  179767  \
0    587722981741363294         3   3   3   2   1   0   0   0  0  ...      11   
1    587722981741363323         4  17  16  13  10   7   4   2  1  ...       1   
2    587722981741559888         5  15  11   6   3   3   3   3  3  ...       9   
3    587722981741625481         6   6   6   7   7   6   5   3  1  ...      31   
4    587722981741625484         7   0   1   5  10  13  13  10  7  ...       2   
..                  ...       ...  ..  ..  ..  ..  ..  ..  .. ..  ...     ...   
495  587722982300058037       545  12  10   7   4   2   2   3  3  ...       1   
496  587722982300123441       546  21   4   1   1   1   2   6  1  ...       9   
497  587722982300123469       547   6   5   3   1   2   3   5  7  ...       3   
498  587722982300123578       548   8   7   4   2   1   0   1  1  ...       4   
499  587722982300188997       549   5   2   0   0   4   6   5  3  ...       6   

     179768  179769  179770

In [7]:
galaxy_data.to_csv("galaxy_data.csv", index=False)

In [8]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset
galaxy_data = pd.read_csv("galaxy_data.csv")

# Drop asset_id
galaxy_data.drop(columns=['asset_id'], inplace=True)

In [9]:
# Count missing values
print(galaxy_data.isnull().sum().sum())

0


In [10]:
galaxy_data["gz2class"].unique().shape

(70,)

In [11]:
# Preprocess the features
# Encode categorical variables
from sklearn.preprocessing import LabelEncoder

def map_gz2class_to_numeric(galaxy_data):
    """
    Maps the categorical values in the 'gz2class' column to numerical values using LabelEncoder.
    """
    # Initialize LabelEncoder
    le = LabelEncoder()

    # Fit and transform the target 'gz2class' column and create a new 'gz2class_mapped' column
    galaxy_data['gz2class_mapped'] = le.fit_transform(galaxy_data['gz2class'])

    return galaxy_data

In [12]:
galaxy_data = map_gz2class_to_numeric(galaxy_data)

# Print the updated DataFrame with the mapped column
print(galaxy_data[['gz2class', 'gz2class_mapped']].head())

  gz2class  gz2class_mapped
0       Ei                2
1       Sc               47
2       Er                7
3       Er                7
4       Ei                2


In [13]:
# In this step we check different values of variance (0.001,0.1, 0.5) but the shape remain same.
# Variancethreshold has no effect on the feature
from sklearn.feature_selection import VarianceThreshold

# Drop ID and target columns first
columns_to_drop = ['objid', 'gz2class', 'gz2class_mapped']
feature_data = galaxy_data.drop(columns=columns_to_drop)

# Remove low-variance features
selector = VarianceThreshold(threshold=0.1)
feature_data = feature_data.loc[:, selector.fit(feature_data).get_support()]

print("Shape after removing low-variance features:", feature_data.shape)

Shape after removing low-variance features: (500, 179776)


In [None]:
from sklearn.feature_selection import mutual_info_classif

# Drop unwanted columns first
columns_to_drop = ['objid', 'gz2class']
feature_data = galaxy_data.drop(columns=[col for col in columns_to_drop if col in galaxy_data.columns])

# Define features (X) and target (y)
X = feature_data.drop(columns=['gz2class_mapped'])  # Ensure target is not in features
y = feature_data['gz2class_mapped']  # Use the numerical label for MI computation

# Compute Mutual Information Scores
mi_scores = mutual_info_classif(X, y, discrete_features=False)

# Convert to DataFrame for better visualization
mi_scores_df = pd.DataFrame({'Feature': X.columns, 'MI_Score': mi_scores})

# Select only important features (MI > 0.01)
important_features = mi_scores_df[mi_scores_df['MI_Score'] > 0.01]['Feature']

# Create reduced dataset with selected features
df_reduced = feature_data[['gz2class_mapped'] + list(important_features)]

# Print results
print(f"Kept {len(important_features)} important pixels out of {X.shape[1]} total features.")
print("New shape of feature data:", df_reduced.shape)

In [None]:
from sklearn.decomposition import PCA
import numpy as np

# Standardize features for PCA
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_reduced.drop(columns=['gz2class_mapped']))

# Apply PCA by keep enough components to retain 95% variance
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)

# Print number of components selected
print(f"PCA reduced features from {X_scaled.shape[1]} to {X_pca.shape[1]} components.")

# Convert back to DataFrame
df_pca = pd.DataFrame(X_pca)
df_pca['gz2class_mapped'] = df_reduced['gz2class_mapped'].values