In [20]:
# # 13. Astronomy: Cosmic Curator – Decoding Galaxy Shapes

# **Stakeholders:** Abhay Upparwal, Abdul Qadir Ronak  

# **Platform:** [Kaggle](https://www.kaggle.com/competitions/cosmic-curator)

# **Team Size:** 1  

# ## The Astronomical Observatory

# Welcome to the International Virtual Observatory, humanity's most advanced center for galactic research. As a member of the elite Morphological Analysis Team, you're surrounded by massive screens displaying telescope data from across the universe. The observatory has recently received an unprecedented influx of galaxy images from the latest deep-field survey, capturing over 100,000 previously undocumented galaxies. Your director has tasked your team with classifying these cosmic structures, but the volume is overwhelming for human analysis alone. The future of galactic cartography depends on automating this process with precision and reliability.

# ## The GalaxyVision System

# Your team has been granted access to GalaxyVision, a cutting-edge computational platform specifically designed for astronomical image analysis. This system can process high-resolution, multi-wavelength imagery and extract complex visual features that distinguish different galaxy types. GalaxyVision incorporates specialized image preprocessing techniques to handle the unique challenges of astronomical data, including noise reduction, background subtraction, and normalization across different telescope observations. However, its classification algorithm needs significant improvement to match the expertise of human astronomers.

# ## Detailed Problem Statement

# Galaxies represent some of the most complex structures in our universe, with morphologies shaped by billions of years of evolution. Edwin Hubble's classification system (the "tuning fork") categorizes galaxies primarily as elliptical, spiral, or irregular, but modern astronomy recognizes many more nuanced variations.

# Your challenge is to develop a sophisticated machine learning model that can identify these morphological types from telescope images. The model must be robust to variations in image quality, galaxy orientation, redshift effects, and the presence of nearby objects. It should extract meaningful features that astronomers use for classification, such as bulge-to-disk ratio, spiral arm tightness, presence of bars or rings, and signs of interaction.

# The classification system you develop will be integrated into the next generation of astronomical surveys, helping scientists understand galaxy formation and evolution across cosmic time. It will also enable researchers to identify rare or unusual galaxies that merit further investigation with more powerful instruments.

# ## Dataset

# - **Source:** [Kaggle](https://www.kaggle.com/competitions/cosmic-curator)  
# - **Format:** JPG images of galaxies (each image is a few hundred pixels wide, with 3 color channels)  
# - **Classes:** Multiple galaxy types based on morphological classification  
# - **Size:** 916 labelled images  
# - **Labels:**  
#   - 0: SPIRAL  
#   - 1: ELLIPTICAL  
#   - 2: UNCERTAIN  
# - **Characteristics:** Images vary in resolution, contrast, and background noise, reflecting real-world astronomical data collection challenges  

# ## Submission Format

# - CSV file with two columns: `asset_id`, `GalaxyType`  

# ## Technical Challenges

# Your model must address several specific technical challenges:

# - Distinguishing subtle differences between similar galaxy types  
# - Processing images with varying levels of detail and quality  
# - Extracting meaningful features that correlate with astronomical classification criteria  
# - Achieving high accuracy while maintaining computational efficiency  

# ## Judging Criteria

# Models will be evaluated on a held-out test set of galaxy images with hidden labels. The primary metric is classification accuracy (or macro-averaged F1-score, to account for class imbalance); in other words, the score reflects how many galaxies are correctly classified into the right category. The highest-scoring model (the one with the most correct classifications) wins the contest.

In [21]:
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torchvision.transforms as transforms
from PIL import Image
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from torch.utils.data import DataLoader, Dataset, Subset
from torchvision import models
from tqdm import tqdm

# Use the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)


def set_all_seed(seed=42):
    """Seed every random number generator this notebook can draw from.

    Seeds Python's built-in ``random`` module, NumPy's legacy global generator
    and PyTorch (CPU, plus every CUDA device when CUDA is available).

    Args:
        seed: Integer seed applied to all generators. Defaults to 42.
    """
    # Python's own generator must be seeded too; otherwise any code that draws
    # from `random` produces different values on every run.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


# Single seed reused below (e.g. as `random_state` for the train/val split).
seed = 42
set_all_seed(seed=seed)

Using device: cuda


In [None]:
# --- Input locations ---
TRAIN_DIR = './cosmic-curator/train_images'
TEST_DIR = './cosmic-curator/test_images'
TRAIN_CSV = './cosmic-curator/train2.csv'

# --- Output location ---
# WARNING: the output directory is wiped on every run of this cell, so results
# from a previous run (checkpoints, plots, CSVs) are deleted.
OUT_DIR = './results'
if os.path.exists(OUT_DIR):
    import shutil
    shutil.rmtree(OUT_DIR)
os.makedirs(OUT_DIR, exist_ok=True)
OUT_TRAIN_CSV, OUT_TEST_CSV = (
    os.path.join(OUT_DIR, csv_name) for csv_name in ('train.csv', 'test.csv')
)

# Device used for the model and for every batch.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# --- Hyper-parameters ---
BATCH_SIZE = 32          # samples per mini-batch
NUM_CLASSES = 3          # size of the classifier output layer
NUM_EPOCHS = 50          # passes over the training split
LEARNING_RATE = 1e-3     # optimizer step size
IMG_SIZE = 424           # images are resized to IMG_SIZE x IMG_SIZE
VAL_SPLIT = 0.2          # fraction of labelled data held out for validation
SAVE_CHECKPOINT = True   # enables the periodic checkpoints in the training loop

In [None]:
# --- Custom Dataset ---
class GalaxyDataset(Dataset):
    """Dataset that serves galaxy JPEGs listed in a DataFrame.

    Every row's ``asset_id`` names the file ``<img_dir>/<asset_id>.jpg``.
    In labelled mode an item is ``(image, label)``, with the label read from the
    ``GalaxyType`` column and cast to ``int``. In test mode an item is
    ``(image, asset_id)``.

    Args:
        df: DataFrame with an ``asset_id`` column (and ``GalaxyType`` unless
            ``is_test`` is set).
        img_dir: Directory containing the ``.jpg`` files.
        transform: Optional callable applied to the loaded RGB PIL image.
        is_test: When true, items carry the asset id instead of a label.
    """

    def __init__(self, df, img_dir, transform=None, is_test=False):
        self.img_dir = img_dir
        self.transform = transform
        self.is_test = is_test
        # Re-number rows from 0 so a positional index is also a valid .loc label.
        self.df = df.reset_index(drop=True)

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        asset_id = self.df.loc[idx, 'asset_id']
        filename = f"{asset_id}.jpg"
        image = Image.open(os.path.join(self.img_dir, filename)).convert("RGB")

        if self.transform:
            image = self.transform(image)

        # Test rows have no label; hand back the id so predictions can be keyed.
        if self.is_test:
            return image, asset_id
        return image, int(self.df.loc[idx, 'GalaxyType'])

# --- Transforms ---
# Per-channel mean/std used to normalise inputs in both pipelines.
# NOTE(review): presumably the ImageNet statistics matching the pretrained
# backbone - confirm.
_NORM_MEAN = [0.485, 0.456, 0.406]
_NORM_STD = [0.229, 0.224, 0.225]
_TARGET_SIZE = (IMG_SIZE, IMG_SIZE)

# Training pipeline: resize, random augmentation, then tensor + normalisation.
_train_steps = [
    transforms.Resize(_TARGET_SIZE),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(30),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.RandomResizedCrop(IMG_SIZE, scale=(0.8, 1.0)),
    transforms.ToTensor(),
    transforms.Normalize(_NORM_MEAN, _NORM_STD),
]
train_transform = transforms.Compose(_train_steps)

# Evaluation pipeline: no random augmentation, only resize + tensor + normalisation.
_eval_steps = [
    transforms.Resize(_TARGET_SIZE),
    transforms.ToTensor(),
    transforms.Normalize(_NORM_MEAN, _NORM_STD),
]
test_transform = transforms.Compose(_eval_steps)


In [24]:
# --- Load CSV ---
def _asset_ids_in(img_dir):
    """Return the integer asset ids of the ``.jpg`` files found in ``img_dir``.

    The id is the part of the file name before the first dot.
    """
    return [int(name.split('.')[0]) for name in os.listdir(img_dir) if name.endswith('.jpg')]


train_df = pd.read_csv(TRAIN_CSV)
# Reported before filtering: this is the number of rows in the CSV.
print("Images in train set:", len(train_df))

# Keep only the labelled rows whose image file is actually present on disk.
train_img_asset_ids = _asset_ids_in(TRAIN_DIR)
has_image = train_df['asset_id'].isin(train_img_asset_ids)
train_df = train_df[has_image].reset_index(drop=True)

# The test set is defined purely by the files in TEST_DIR.
test_asset_ids = _asset_ids_in(TEST_DIR)
test_df = pd.DataFrame({'asset_id': test_asset_ids})
print("Images in test set:", len(test_df))


# --- Train/Val Split ---
# Stratified on the label so both parts keep the class proportions.
train_df, val_df = (
    part.reset_index(drop=True)
    for part in train_test_split(
        train_df,
        test_size=VAL_SPLIT,
        stratify=train_df['GalaxyType'],
        random_state=seed,
    )
)
print("After splitting into train and val sets:")
print("Train size:", len(train_df))
print("Val size:", len(val_df))
print("Test size:", len(test_df))

Images in train set: 912
Images in test set: 412
After splitting into train and val sets:
Train size: 727
Val size: 182
Test size: 412


In [13]:
# One dataset per split; only the training split uses the augmenting transform.
train_dataset = GalaxyDataset(df=train_df, img_dir=TRAIN_DIR, transform=train_transform)
val_dataset = GalaxyDataset(df=val_df, img_dir=TRAIN_DIR, transform=test_transform)
test_dataset = GalaxyDataset(df=test_df, img_dir=TEST_DIR, transform=test_transform, is_test=True)
for _split_name, _split_ds in (
    ("Train", train_dataset),
    ("Validation", val_dataset),
    ("Test", test_dataset),
):
    print(f"{_split_name} dataset size:", len(_split_ds))

# Only the training loader shuffles; validation and test keep a fixed order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=BATCH_SIZE)

# Loader length is the number of batches per pass.
for _split_name, _split_loader in (
    ("Train", train_loader),
    ("Validation", val_loader),
    ("Test", test_loader),
):
    print(f"{_split_name} loader size:", len(_split_loader))

Train dataset size: 727
Validation dataset size: 182
Test dataset size: 412
Train loader size: 23
Validation loader size: 6
Test loader size: 13


In [None]:
# --- Model ---
# ResNet-18 initialised with pretrained weights; its final fully connected
# layer is replaced by a fresh NUM_CLASSES-way linear head.
# NOTE(review): newer torchvision releases may warn that `pretrained=` is
# deprecated in favour of `weights=` - confirm against the installed version.
model = models.resnet18(pretrained=True)
head_in_features = model.fc.in_features
model.fc = nn.Linear(head_in_features, NUM_CLASSES)
model = model.to(DEVICE)

# All parameters are optimised (no layers are frozen).
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()



In [None]:
# --- Training Loop ---
# Per-epoch history (one entry per completed epoch), used for plotting later.
train_f1_scores = []
val_f1_scores = []
train_losses = []
val_losses = []

# Start below any attainable F1 so the first completed epoch always writes
# `best_model_path`, even when its macro-F1 is exactly 0.0. Starting at 0.0 with
# a strict ">" comparison could leave the file missing for the loading cell.
best_val_f1 = float("-inf")
best_model_path = os.path.join(OUT_DIR, 'best_model.pth')

# Interval (in epochs) between periodic checkpoints when SAVE_CHECKPOINT is on.
CHECKPOINT_EVERY = 5

for epoch in range(NUM_EPOCHS):
    # --- Training ---
    model.train()
    train_loss = 0
    train_preds, train_labels = [], []

    for images, labels in tqdm(train_loader, desc=f"Epoch {epoch+1} [Train]"):
        images, labels = images.to(DEVICE), labels.to(DEVICE)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        # Predicted class = index of the largest logit.
        train_preds.extend(outputs.argmax(dim=1).cpu().numpy())
        train_labels.extend(labels.cpu().numpy())

    train_f1 = f1_score(train_labels, train_preds, average='macro')
    train_f1_scores.append(train_f1)
    # Mean of the per-batch losses (batches are weighted equally).
    train_losses.append(train_loss / len(train_loader))

    # --- Validation ---
    model.eval()
    val_loss = 0
    val_preds, val_labels = [], []
    with torch.no_grad():
        for images, labels in tqdm(val_loader, desc=f"Epoch {epoch+1} [Val]"):
            images, labels = images.to(DEVICE), labels.to(DEVICE)
            outputs = model(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            val_preds.extend(outputs.argmax(dim=1).cpu().numpy())
            val_labels.extend(labels.cpu().numpy())

    val_f1 = f1_score(val_labels, val_preds, average='macro')
    val_f1_scores.append(val_f1)
    val_losses.append(val_loss / len(val_loader))  # Mean of the per-batch losses

    print(f"Epoch {epoch+1}: Train Loss={train_losses[-1]:.4f}, Train F1={train_f1:.4f}, Val Loss={val_losses[-1]:.4f}, Val F1={val_f1:.4f}")

    # Keep the weights of the epoch with the highest validation macro-F1.
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        torch.save(model.state_dict(), best_model_path)
        print(f"✅ Best model saved with Val F1={best_val_f1:.4f}")

    # Periodic checkpoint every CHECKPOINT_EVERY epochs, if enabled.
    if SAVE_CHECKPOINT and (epoch + 1) % CHECKPOINT_EVERY == 0:
        checkpoint_path = os.path.join(OUT_DIR, f"model_epoch_{epoch+1}.pth")
        torch.save(model.state_dict(), checkpoint_path)
        

In [None]:
# load best model
# Rebuild the same architecture without fetching pretrained weights: the state
# dict loaded below replaces the freshly initialised parameters anyway.
model = models.resnet18()
model.fc = nn.Linear(model.fc.in_features, NUM_CLASSES)
# map_location lets a checkpoint written during a GPU run load on a CPU-only
# machine (and vice versa) instead of failing while deserialising.
model.load_state_dict(torch.load(best_model_path, map_location=DEVICE))
model = model.to(DEVICE)


Model saved to ./results\model.pth


In [None]:
# --- Plot training history ---
def _plot_history(train_values, val_values, series_label, ylabel, title, filename):
    """Plot a train/validation metric per epoch, save it to OUT_DIR and show it.

    The x-axis is derived from the number of recorded values rather than from
    NUM_EPOCHS, so a history shorter than NUM_EPOCHS (for example after an
    interrupted run) still plots instead of raising a shape mismatch.

    Args:
        train_values: Per-epoch values for the training split.
        val_values: Per-epoch values for the validation split.
        series_label: Short metric name used in the legend ("Train <label>").
        ylabel: Label of the y-axis.
        title: Figure title.
        filename: File name of the saved image inside OUT_DIR.

    Returns:
        The path the figure was written to.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(range(1, len(train_values) + 1), train_values,
            label=f'Train {series_label}', marker='o')
    ax.plot(range(1, len(val_values) + 1), val_values,
            label=f'Validation {series_label}', marker='o')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()
    ax.grid(True)
    out_path = os.path.join(OUT_DIR, filename)
    # Save before showing so the file is written from the live figure.
    fig.savefig(out_path)
    plt.show()
    return out_path


# --- Plot F1 Scores ---
plot_path = _plot_history(train_f1_scores, val_f1_scores,
                          'F1', 'F1 Score', 'F1 Score Over Epochs', 'f1_scores_plot.png')
print(f"✅ F1 score plot saved at {plot_path}")

# --- Plot Losses ---
plot_path = _plot_history(train_losses, val_losses,
                          'Loss', 'Loss', 'Loss Over Epochs', 'losses_plot.png')
print(f"✅ Loss plot saved at {plot_path}")

In [None]:
# --- Test Prediction ---
# Inference only: eval mode, and no gradients are tracked.
model.eval()
predictions = []
asset_ids = []

with torch.no_grad():
    for batch_images, batch_ids in tqdm(test_loader, desc="Predicting"):
        logits = model(batch_images.to(DEVICE))
        # Predicted class = index of the largest logit.
        predictions.extend(logits.argmax(dim=1).cpu().numpy())
        # The loader yields the batch's asset ids as a tensor; unpack to scalars.
        asset_ids.extend(batch_ids.cpu().numpy())

# --- Save Submission ---
# NOTE(review): the problem statement at the top of this notebook lists the
# submission columns as `asset_id, GalaxyType`, but this file is written with an
# `id` column - confirm against the competition's sample submission.
submission_df = pd.DataFrame({'id': asset_ids, 'GalaxyType': predictions}).sort_values('id')
submission_df.to_csv(OUT_TEST_CSV, index=False)
print(f"Submission saved to {OUT_TEST_CSV}")
print(submission_df.head())
