In [4]:
# 02_image_download.ipynb

# -----------------------------
# 1️⃣ Imports
# -----------------------------
import os
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
from tqdm import tqdm

# -----------------------------
# 2️⃣ Paths and Config
# -----------------------------
# Input table loaded in section 3 below. The download loop reads its 'ra' and
# 'dec' columns and every column whose name contains 't0'.
CLEANED_CSV = "../data/merged_gz2_sdss.csv"
IMG_DIR = "data/images"      # one JPEG cutout per downloaded row is written here
os.makedirs(IMG_DIR, exist_ok=True)
MAPPING_CSV = "data/image_mapping.csv"   # row -> image file table written at the end

# SDSS cutout parameters
IMG_SIZE = 128   # pixels
SCALE = 0.2      # arcsec/pixel

# -----------------------------
# 3️⃣ Load cleaned dataset
# -----------------------------
# Read through the constant so the configured path and the loaded file cannot
# drift apart.
df = pd.read_csv(CLEANED_CSV)
print("Dataset loaded:", df.shape)
# Bare expression: Jupyter only renders it when it is the last statement of a cell.
df.head()

# -----------------------------
# 4️⃣ Function to download SDSS images
# -----------------------------
def fetch_sdss_image(ra, dec, filename, scale=SCALE, size=IMG_SIZE):
    """Download one SDSS DR16 image cutout and save it as an RGB image.

    Args:
        ra: Right ascension sent as the ``ra`` query parameter.
        dec: Declination sent as the ``dec`` query parameter.
        filename: Destination path; the image format is inferred from its
            extension by ``Image.save``.
        scale: Value of the ``scale`` query parameter (arcsec/pixel).
        size: Value used for both ``width`` and ``height`` (pixels).

    Returns:
        True if the image was fetched, decoded and written to ``filename``;
        False otherwise (the reason is printed). On failure no file is
        created or modified at ``filename``.
    """
    # The service is reached over HTTPS (plain-http requests end up on
    # port 443), so call it directly.
    url = "https://skyserver.sdss.org/dr16/SkyServerWS/ImgCutout/getjpeg"
    params = {
        "ra": ra,
        "dec": dec,
        "scale": scale,
        "width": size,
        "height": size,
    }

    # Write to a sibling temp path that keeps the extension (so the format is
    # still inferred), then rename: a failed save never leaves a truncated
    # file that a later os.path.exists() check would mistake for a download.
    root, ext = os.path.splitext(filename)
    tmp_filename = f"{root}.part{ext}"

    try:
        response = requests.get(url, params=params, timeout=10)
        # Surface HTTP errors explicitly instead of as an image-decode failure.
        response.raise_for_status()
        img = Image.open(BytesIO(response.content)).convert("RGB")
        img.save(tmp_filename)
        os.replace(tmp_filename, filename)
        return True
    except Exception as e:
        # Deliberately broad: this is a best-effort download, and one bad row
        # must not abort a long-running batch.
        print(f"Failed to fetch {filename}: {e}")
        try:
            os.remove(tmp_filename)
        except OSError:
            # Nothing to clean up (the temp file was never created).
            pass
        return False

# -----------------------------
# 5️⃣ Download loop
# -----------------------------
# For demonstration, you can limit to first 1000 rows (or full dataset)
df_to_download = df.head(1000)

# Morphology label columns, selected once instead of once per row.
# NOTE(review): the substring test 't0' can match names containing t01..t09
# but never t10/t11 — confirm that is intended for this table's columns.
label_cols = [col for col in df.columns if 't0' in col]

mapping = []

for idx, row in tqdm(df_to_download.iterrows(), total=len(df_to_download)):
    ra, dec = row['ra'], row['dec']
    # Images are named after the DataFrame index label of the row.
    filename = os.path.join(IMG_DIR, f"{idx}.jpg")

    # Files already on disk are reused without a new request; rows whose
    # download fails are left out of the mapping.
    if not os.path.exists(filename) and not fetch_sdss_image(ra, dec, filename):
        continue

    # store mapping info
    mapping.append({
        "idx": idx,
        "image_filename": filename,
        "ra": ra,
        "dec": dec,
        **{col: row[col] for col in label_cols},  # morphology labels
    })

# -----------------------------
# 6️⃣ Save mapping CSV
# -----------------------------
# Create the output folder explicitly rather than relying on it happening to
# be a parent of IMG_DIR.
os.makedirs(os.path.dirname(MAPPING_CSV) or ".", exist_ok=True)
mapping_df = pd.DataFrame(mapping)
mapping_df.to_csv(MAPPING_CSV, index=False)
print("Mapping CSV saved:", MAPPING_CSV)


Dataset loaded: (48664, 248)


 78%|██████████████████████████████████████████████████████████████████▏                  | 778/1000 [27:16<17:50,  4.82s/it]

Failed to fetch data/images/777.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Max retries exceeded with url: /dr16/SkyServerWS/ImgCutout/getjpeg?ra=229.85281372070312&dec=4.579733848571777&scale=0.2&width=128&height=128 (Caused by ConnectTimeoutError(<urllib3.connection.HTTPSConnection object at 0x152412a50>, 'Connection to skyserver.sdss.org timed out. (connect timeout=10)'))


100%|████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [34:49<00:00,  2.09s/it]

Mapping CSV saved: data/image_mapping.csv



