<a href="https://colab.research.google.com/github/RaincallerMei/CSC413/blob/main/Astro_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
!unzip /content/drive/MyDrive/AstroCNN/galaxy-zoo-the-galaxy-challenge.zip -d /content/dataset


Archive:  /content/drive/MyDrive/AstroCNN/galaxy-zoo-the-galaxy-challenge.zip
  inflating: /content/dataset/all_ones_benchmark.zip  
  inflating: /content/dataset/all_zeros_benchmark.zip  
  inflating: /content/dataset/central_pixel_benchmark.zip  
  inflating: /content/dataset/images_test_rev1.zip  
  inflating: /content/dataset/images_training_rev1.zip  
  inflating: /content/dataset/training_solutions_rev1.zip  


In [4]:
import os
import glob

# Go to the directory where the 6 zips live
os.chdir("/content/dataset")

# Find all the .zip files
zip_files = glob.glob("*.zip")

# Unzip each
for zipf in zip_files:
    folder_name = os.path.splitext(zipf)[0]  # e.g., "folder1" from "folder1.zip"
    !unzip -q {zipf} -d {folder_name}

In [5]:
!rm *.zip

#Match Images to CSV Rows

Load and Index the CSV

In [7]:
import pandas as pd

solutions_df = pd.read_csv("/content/dataset/training_solutions_rev1/training_solutions_rev1.csv")

# Suppose the columns are like:
# GalaxyID, p1, p2, ..., p37
# We'll create a dict { galaxy_id: [p1, p2, ..., p37], ... }

id_to_probs = {}
for row in solutions_df.itertuples(index=False):
    galaxy_id = str(row.GalaxyID)
    # row has p1, p2, ..., p37 as subsequent columns
    # Convert them to a list or tensor
    probabilities = list(row[1:])  # row[0] is GalaxyID, row[1:] are the 37 prob columns
    id_to_probs[galaxy_id] = probabilities

#Now id_to_probs["123456"] might be [0.123, 0.456, ..., 0.999], etc.

Custom Dataset Class:


1.   Lists all image files in training/.
2. For each file, extracts GalaxyID from the filename (e.g., 123456 from 123456.jpg).
3. Finds the matching probabilities from id_to_probs.
4. Loads the image, applies transformations, and returns (image_tensor, label_tensor).



In [8]:
import os
import torch
from torch.utils.data import Dataset
from PIL import Image

class GalaxyDataset(Dataset):
    def __init__(self,
                 images_dir,
                 id_to_probs,
                 transform=None):
        self.images_dir = images_dir
        self.id_to_probs = id_to_probs
        self.transform = transform

        # List all JPG files in images_dir
        self.image_paths = [
            os.path.join(images_dir, f)
            for f in os.listdir(images_dir)
            if f.lower().endswith('.jpg')
        ]

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        # Filename is something like "123456.jpg"
        filename = os.path.basename(img_path)
        galaxy_id = os.path.splitext(filename)[0]  # "123456"

        # Load image
        image = Image.open(img_path).convert("RGB")

        # Convert probabilities to a tensor
        # If galaxy_id doesn't exist in id_to_probs,
        # you might want to handle KeyError or skip it
        probabilities = self.id_to_probs.get(galaxy_id, None)
        if probabilities is None:
            # If no label is found, you could handle it
            # (e.g., raise an Exception, or return dummy data)
            raise ValueError(f"GalaxyID {galaxy_id} not found in id_to_probs")

        # Transform image
        if self.transform:
            image = self.transform(image)

        # Convert list of probabilities to a FloatTensor
        label = torch.tensor(probabilities, dtype=torch.float32)

        return image, label


#4. Creating a DataLoader


Create a Dataset and a DataLoader

In [14]:
train_dir = "/content/dataset/images_training_rev1/images_training_rev1"
train_dataset = GalaxyDataset(
    images_dir=train_dir,
    id_to_probs=id_to_probs
)

from torch.utils.data import DataLoader

train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True,
    num_workers=2  # or more, depending on environment
)


#5. Defining the CNN Model


GOATED CNN MODEL IMPLEMENTATION

In [12]:
import torch.nn as nn
import torch.nn.functional as F

class GalaxyCNN(nn.Module):
    def __init__(self, num_outputs=37):
        super(GalaxyCNN, self).__init__()
        # Example: small CNN
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2)
        # After a couple of pools, the feature map shrinks.
        # Let's do a rough down-sampling approach:

        self.fc1 = nn.Linear(32 * 56 * 56, 128)
        self.fc2 = nn.Linear(128, num_outputs)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))  # shape: [batch,16,112,112]
        x = self.pool(F.relu(self.conv2(x)))  # shape: [batch,32,56,56]
        x = x.view(x.size(0), -1)            # flatten
        x = F.relu(self.fc1(x))
        x = self.fc2(x)  # shape: [batch, 37]
        # We'll apply a sigmoid later, or use BCEWithLogitsLoss
        return x

model = GalaxyCNN(num_outputs=37)


#6. Training: Loss Function and Optimizer


BCEWithLogitsLoss is typically used for multi-label classification of the 37 features that can be present (probability close to 1) or absent (close to 0)

In [13]:
import torch.optim as optim

criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

#A straightforward training loop in PyTorch:

num_epochs = 5
model.train()

for epoch in range(num_epochs):
    running_loss = 0.0
    for images, labels in train_loader:
        # images shape: [batch, 3, 224, 224]
        # labels shape: [batch, 37]

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * images.size(0)

    epoch_loss = running_loss / len(train_dataset)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")


TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/worker.py", line 351, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
           ^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/fetch.py", line 55, in fetch
    return self.collate_fn(data)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/collate.py", line 398, in default_collate
    return collate(batch, collate_fn_map=default_collate_fn_map)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/collate.py", line 211, in collate
    return [
           ^
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/collate.py", line 212, in <listcomp>
    collate(samples, collate_fn_map=collate_fn_map)
  File "/usr/local/lib/python3.11/dist-packages/torch/utils/data/_utils/collate.py", line 240, in collate
    raise TypeError(default_collate_err_msg_format.format(elem_type))
TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'PIL.Image.Image'>
