In [1]:
# -----------------------------
# 1️⃣ Imports
# -----------------------------
import os
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
from tqdm import tqdm


In [2]:
# -----------------------------
# 2️⃣ Paths and Config
# -----------------------------
CLEANED_CSV = "merged_gz2_sdss.csv"   # your cleaned dataset CSV
IMG_DIR = "datafinal2/images"               # folder to save images
MAPPING_CSV = "datafinal2/image_mapping_new.csv"

os.makedirs(IMG_DIR, exist_ok=True)

# SDSS cutout parameters
IMG_SIZE = 128   # pixels
SCALE = 0.2      # arcsec/pixel

# Balanced target images per class
TARGET_PER_CLASS = 2000   # set based on smallest class you want to include


In [3]:
# -----------------------------
# 3️⃣ Load cleaned dataset
# -----------------------------
df = pd.read_csv(CLEANED_CSV)
print("Dataset loaded:", df.shape)
df.head()

Dataset loaded: (48664, 248)


Unnamed: 0,dr7objid,ra,dec,rastring,decstring,sample,gz2_class,total_classifications,total_votes,t01_smooth_or_features_a01_smooth_count,...,z,petroR50_r,petroR90_r,fracDeV_r,concentration_index,u_g_color,g_r_color,r_i_color,i_z_color,redshift
0,5.88009e+17,135.084396,52.49424,00:20.3,+52:29:39.3,original,Sb+t,42,332,1,...,11.91701,14.10413,36.87098,0.95881,2.614197,1.871747,0.891748,0.461641,0.281975,0.030118
1,5.8773e+17,246.921387,40.926968,27:41.1,+40:55:37.1,extra,Ei,48,154,41,...,12.0717,11.19578,35.93066,0.864857,3.209305,2.025663,0.82004,0.435435,0.398816,0.031728
2,5.87732e+17,183.062058,56.177532,12:14.9,+56:10:39.1,original,Sb?t,43,275,8,...,12.0424,9.284981,28.55589,1.0,3.075492,1.964868,0.975801,0.456394,0.309238,0.031083
3,5.87729e+17,119.617126,37.786617,58:28.1,+37:47:11.8,original,Ei,42,139,39,...,12.2125,10.26644,32.90302,1.0,3.20491,1.987321,0.892363,0.46651,0.306604,0.040825
4,5.87726e+17,209.473053,64.91098,57:53.5,+64:54:39.5,original,Er,35,102,26,...,12.0669,12.15487,38.81894,1.0,3.193695,1.940805,0.837738,0.391821,0.320923,0.032005


In [None]:
# -----------------------------
# 4️⃣ Assign Extended Labels
# -----------------------------
def assign_extended_label(row):
    fractions = {
        0: row['t01_smooth_or_features_a01_smooth_fraction'],   # Smooth
        1: row['t02_edgeon_a04_yes_fraction'] if 't02_edgeon_a04_yes_fraction' in row else 0,  # Edge-on
        # 2: row['t04_spiral_a08_spiral_fraction'] if 't04_spiral_a08_spiral_fraction' in row else 0,  # Spiral
        # 3: row['t03_bar_a06_bar_fraction'] if 't03_bar_a06_bar_fraction' in row else 0,  # Barred Spiral
    }
    return max(fractions, key=fractions.get)

df['extended_label'] = df.apply(assign_extended_label, axis=1)
print("Label distribution:\n", df['extended_label'].value_counts())


Label distribution:
 extended_label
2    42582
3     6082
Name: count, dtype: int64


In [5]:
# -----------------------------
# 5️⃣ Create balanced dataset
# -----------------------------
balanced_df = pd.DataFrame()

for label in df['extended_label'].unique():
    class_rows = df[df['extended_label'] == label]
    if len(class_rows) >= TARGET_PER_CLASS:
        sampled_rows = class_rows.sample(n=TARGET_PER_CLASS, random_state=42)
    else:
        sampled_rows = class_rows  # include all if fewer than target
    balanced_df = pd.concat([balanced_df, sampled_rows])

# Shuffle dataset
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

print("Balanced label distribution:\n", balanced_df['extended_label'].value_counts())


Balanced label distribution:
 extended_label
2    2000
3    2000
Name: count, dtype: int64


In [6]:
# -----------------------------
# 6️⃣ Function to fetch SDSS image
# -----------------------------
def fetch_sdss_image(ra, dec, filename, scale=SCALE, size=IMG_SIZE):
    url = f"http://skyserver.sdss.org/dr16/SkyServerWS/ImgCutout/getjpeg?ra={ra}&dec={dec}&scale={scale}&width={size}&height={size}"
    try:
        response = requests.get(url, timeout=10)
        img = Image.open(BytesIO(response.content)).convert("RGB")
        img.save(filename)
        return True
    except Exception as e:
        print(f"Failed to fetch {filename}: {e}")
        return False


In [21]:
# -----------------------------
# 7️⃣ Download balanced images
# -----------------------------
mapping = []

for idx, row in tqdm(balanced_df.iterrows(), total=len(balanced_df)):
    ra, dec = row['ra'], row['dec']
    filename = os.path.join(IMG_DIR, f"{idx}.jpg")
    
    if not os.path.exists(filename):
        success = fetch_sdss_image(ra, dec, filename)
        if not success:
            continue
    
    # store mapping info
    mapping.append({
        "idx": idx,
        "image_filename": filename,
        "ra": ra,
        "dec": dec,
        "extended_label": row['extended_label'],
        **{col: row[col] for col in df.columns if 't0' in col}  # optional morphology columns
    })

# Save mapping CSV
mapping_df = pd.DataFrame(mapping) 
mapping_df.to_csv(MAPPING_CSV, index=False)
print("Balanced mapping CSV saved:", MAPPING_CSV)

  0%|          | 0/4000 [00:00<?, ?it/s]

 86%|████████▌ | 3429/4000 [1:27:23<39:20,  4.13s/it]   

Failed to fetch datafinal/images\3428.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


100%|██████████| 4000/4000 [1:40:13<00:00,  1.50s/it]


Balanced mapping CSV saved: datafinal/image_mapping_new.csv


In [None]:
# -----------------------------
# 4️⃣ Assign Extended Labels
# -----------------------------
def assign_extended_label(row):
    fractions = {
        # 0: row['t01_smooth_or_features_a01_smooth_fraction'],   # Smooth
        # 1: row['t02_edgeon_a04_yes_fraction'] if 't02_edgeon_a04_yes_fraction' in row else 0,  # Edge-on
        2: row['t04_spiral_a08_spiral_fraction'] if 't04_spiral_a08_spiral_fraction' in row else 0,  # Spiral
        3: row['t03_bar_a06_bar_fraction'] if 't03_bar_a06_bar_fraction' in row else 0,  # Barred Spiral
    }
    return max(fractions, key=fractions.get)

df['extended_label'] = df.apply(assign_extended_label, axis=1)
print("Label distribution:\n", df['extended_label'].value_counts())


In [7]:
# -----------------------------
# 7️⃣ Download balanced images
# -----------------------------
mapping = []

for idx, row in tqdm(balanced_df.iterrows(), total=len(balanced_df)):
    ra, dec = row['ra'], row['dec']
    filename = os.path.join(IMG_DIR, f"{idx}.jpg")
    
    if not os.path.exists(filename):
        success = fetch_sdss_image(ra, dec, filename)
        if not success:
            continue
    
    # store mapping info
    mapping.append({
        "idx": idx,
        "image_filename": filename,
        "ra": ra,
        "dec": dec,
        "extended_label": row['extended_label'], 
        **{col: row[col] for col in df.columns if 't0' in col}  # optional morphology columns
    })

# Save mapping CSV
mapping_df = pd.DataFrame(mapping)        
mapping_df.to_csv(MAPPING_CSV, index=False)         
print("Balanced mapping CSV saved:", MAPPING_CSV)         

  1%|▏         | 53/4000 [01:32<4:56:07,  4.50s/it]

Failed to fetch datafinal2/images\52.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 79%|███████▉  | 3171/4000 [1:18:41<1:22:25,  5.97s/it]

Failed to fetch datafinal2/images\3170.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 79%|███████▉  | 3177/4000 [1:19:33<2:03:30,  9.00s/it]

Failed to fetch datafinal2/images\3176.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 88%|████████▊ | 3538/4000 [1:29:08<39:10,  5.09s/it]  

Failed to fetch datafinal2/images\3537.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 92%|█████████▏| 3675/4000 [1:33:38<43:30,  8.03s/it]

Failed to fetch datafinal2/images\3674.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 92%|█████████▎| 3700/4000 [1:34:31<26:01,  5.21s/it]

Failed to fetch datafinal2/images\3699.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 93%|█████████▎| 3701/4000 [1:34:42<34:34,  6.94s/it]

Failed to fetch datafinal2/images\3700.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 93%|█████████▎| 3708/4000 [1:35:26<37:02,  7.61s/it]

Failed to fetch datafinal2/images\3707.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 93%|█████████▎| 3714/4000 [1:36:02<33:53,  7.11s/it]

Failed to fetch datafinal2/images\3713.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 93%|█████████▎| 3715/4000 [1:36:12<39:10,  8.25s/it]

Failed to fetch datafinal2/images\3714.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 93%|█████████▎| 3716/4000 [1:36:23<42:50,  9.05s/it]

Failed to fetch datafinal2/images\3715.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 93%|█████████▎| 3717/4000 [1:36:34<45:30,  9.65s/it]

Failed to fetch datafinal2/images\3716.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 94%|█████████▎| 3742/4000 [1:37:25<20:21,  4.73s/it]

Failed to fetch datafinal2/images\3741.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 94%|█████████▎| 3743/4000 [1:37:36<28:15,  6.60s/it]

Failed to fetch datafinal2/images\3742.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 94%|█████████▍| 3752/4000 [1:38:19<31:31,  7.63s/it]

Failed to fetch datafinal2/images\3751.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 94%|█████████▍| 3753/4000 [1:38:30<35:26,  8.61s/it]

Failed to fetch datafinal2/images\3752.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 94%|█████████▍| 3754/4000 [1:38:40<38:08,  9.30s/it]

Failed to fetch datafinal2/images\3753.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 95%|█████████▌| 3801/4000 [1:40:07<21:41,  6.54s/it]

Failed to fetch datafinal2/images\3800.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 95%|█████████▌| 3802/4000 [1:40:18<25:56,  7.86s/it]

Failed to fetch datafinal2/images\3801.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 95%|█████████▌| 3803/4000 [1:40:29<28:53,  8.80s/it]

Failed to fetch datafinal2/images\3802.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 95%|█████████▌| 3804/4000 [1:40:40<30:52,  9.45s/it]

Failed to fetch datafinal2/images\3803.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 95%|█████████▌| 3807/4000 [1:41:05<29:20,  9.12s/it]

Failed to fetch datafinal2/images\3806.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 95%|█████████▌| 3808/4000 [1:41:16<30:53,  9.66s/it]

Failed to fetch datafinal2/images\3807.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 95%|█████████▌| 3809/4000 [1:41:27<32:07, 10.09s/it]

Failed to fetch datafinal2/images\3808.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 95%|█████████▌| 3810/4000 [1:41:38<32:46, 10.35s/it]

Failed to fetch datafinal2/images\3809.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 98%|█████████▊| 3926/4000 [1:44:59<07:47,  6.32s/it]

Failed to fetch datafinal2/images\3925.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 98%|█████████▊| 3927/4000 [1:45:10<09:23,  7.72s/it]

Failed to fetch datafinal2/images\3926.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 98%|█████████▊| 3928/4000 [1:45:21<10:24,  8.67s/it]

Failed to fetch datafinal2/images\3927.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 98%|█████████▊| 3929/4000 [1:45:32<11:03,  9.34s/it]

Failed to fetch datafinal2/images\3928.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 98%|█████████▊| 3930/4000 [1:45:42<11:27,  9.83s/it]

Failed to fetch datafinal2/images\3929.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 98%|█████████▊| 3935/4000 [1:46:09<07:39,  7.07s/it]

Failed to fetch datafinal2/images\3934.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 98%|█████████▊| 3936/4000 [1:46:19<08:46,  8.23s/it]

Failed to fetch datafinal2/images\3935.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 98%|█████████▊| 3937/4000 [1:46:30<09:29,  9.05s/it]

Failed to fetch datafinal2/images\3936.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 98%|█████████▊| 3938/4000 [1:46:42<10:00,  9.68s/it]

Failed to fetch datafinal2/images\3937.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 99%|█████████▊| 3941/4000 [1:47:03<08:20,  8.49s/it]

Failed to fetch datafinal2/images\3940.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 99%|█████████▊| 3945/4000 [1:47:22<06:11,  6.75s/it]

Failed to fetch datafinal2/images\3944.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 99%|█████████▊| 3946/4000 [1:47:33<07:12,  8.01s/it]

Failed to fetch datafinal2/images\3945.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 99%|█████████▊| 3947/4000 [1:47:44<07:50,  8.89s/it]

Failed to fetch datafinal2/images\3946.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 99%|█████████▉| 3952/4000 [1:48:11<05:57,  7.45s/it]

Failed to fetch datafinal2/images\3951.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 99%|█████████▉| 3953/4000 [1:48:22<06:39,  8.51s/it]

Failed to fetch datafinal2/images\3952.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 99%|█████████▉| 3954/4000 [1:48:33<07:04,  9.23s/it]

Failed to fetch datafinal2/images\3953.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 99%|█████████▉| 3970/4000 [1:49:18<03:02,  6.07s/it]

Failed to fetch datafinal2/images\3969.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 99%|█████████▉| 3971/4000 [1:49:29<03:39,  7.56s/it]

Failed to fetch datafinal2/images\3970.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 99%|█████████▉| 3972/4000 [1:49:40<04:01,  8.62s/it]

Failed to fetch datafinal2/images\3971.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 99%|█████████▉| 3977/4000 [1:50:07<02:46,  7.24s/it]

Failed to fetch datafinal2/images\3976.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 99%|█████████▉| 3978/4000 [1:50:18<03:03,  8.34s/it]

Failed to fetch datafinal2/images\3977.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


 99%|█████████▉| 3979/4000 [1:50:29<03:12,  9.15s/it]

Failed to fetch datafinal2/images\3978.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


100%|█████████▉| 3980/4000 [1:50:40<03:14,  9.72s/it]

Failed to fetch datafinal2/images\3979.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


100%|█████████▉| 3983/4000 [1:50:56<02:07,  7.50s/it]

Failed to fetch datafinal2/images\3982.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


100%|█████████▉| 3984/4000 [1:51:07<02:16,  8.53s/it]

Failed to fetch datafinal2/images\3983.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


100%|█████████▉| 3985/4000 [1:51:18<02:18,  9.24s/it]

Failed to fetch datafinal2/images\3984.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


100%|█████████▉| 3986/4000 [1:51:29<02:16,  9.76s/it]

Failed to fetch datafinal2/images\3985.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


100%|█████████▉| 3987/4000 [1:51:40<02:11, 10.12s/it]

Failed to fetch datafinal2/images\3986.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


100%|█████████▉| 3997/4000 [1:52:19<00:18,  6.27s/it]

Failed to fetch datafinal2/images\3996.jpg: HTTPSConnectionPool(host='skyserver.sdss.org', port=443): Read timed out. (read timeout=10)


100%|██████████| 4000/4000 [1:52:23<00:00,  1.69s/it]


Balanced mapping CSV saved: datafinal2/image_mapping_new.csv


In [10]:
import pandas as pd
import os

# Folder to save the merged CSV
output_folder = "datafinalmerged"
os.makedirs(output_folder, exist_ok=True)  # Create folder if it doesn't exist

# Load 4000 records from each CSV
csv1 = pd.read_csv("datafinal/image_mapping_new.csv").head(4000)
csv2 = pd.read_csv("datafinal2/image_mapping_new.csv").head(4000)

# Ensure columns match
csv2 = csv2[csv1.columns]

# Update 'idx' column to continue from csv1
csv2['idx'] = range(csv1['idx'].iloc[-1] + 1, csv1['idx'].iloc[-1] + 1 + len(csv2))

# Concatenate the two CSVs
merged_csv = pd.concat([csv1, csv2], ignore_index=False)

# Save merged CSV in the folder
output_file = os.path.join(output_folder, "merged_8000.csv")
merged_csv.to_csv(output_file, index=False)

print(f"Merged CSV created successfully at '{output_file}' with {len(merged_csv)} records.")


Merged CSV created successfully at 'datafinalmerged\merged_8000.csv' with 7945 records.


In [11]:
import os
import shutil

# Source folders
folder1 = "datafinal/images"
folder2 = "datafinal2/images"

# Destination folder
output_folder = "datafinalmerged/image"
os.makedirs(output_folder, exist_ok=True)  # Create folder if it doesn't exist

# Function to copy and rename images sequentially
def merge_images(src_folder, start_index):
    images = sorted([f for f in os.listdir(src_folder) if f.endswith(".jpg")])
    for i, img_name in enumerate(images):
        src_path = os.path.join(src_folder, img_name)
        dst_name = f"{start_index + i}.jpg"
        dst_path = os.path.join(output_folder, dst_name)
        shutil.copy(src_path, dst_path)
    return start_index + len(images)  # Return next start index

# Copy first folder starting from 0
next_index = merge_images(folder1, start_index=0)

# Copy second folder starting from where first ended
merge_images(folder2, start_index=next_index)

print(f"All images merged successfully into '{output_folder}' with sequential names.")


All images merged successfully into 'datafinalmerged/image' with sequential names.
