In [3]:
import pandas as pd
from astropy.coordinates import SkyCoord
import astropy.units as u

In [5]:
# Input catalogues, addressed relative to the notebook's working directory.
gz2_path = "../data/gz2.csv"
sdss_path = "../data/sdss.csv"

# low_memory=False makes pandas parse each file in one pass instead of in
# chunks, so every column gets a single consistently inferred dtype.
gz2 = pd.read_csv(gz2_path, low_memory=False)
# skiprows=1 discards the first line of the SDSS file before the header is
# read (presumably a non-header preamble line -- TODO confirm against the raw file).
sdss = pd.read_csv(sdss_path, low_memory=False, skiprows=1)

# (rows, columns) of each catalogue as loaded.
print("Galaxy Zoo 2:", gz2.shape)
print("SDSS:", sdss.shape)

# Peek at the first 20 column names of each table; heading and names are
# emitted on separate lines via sep="\n".
print("\n--- GZ2 Columns ---", gz2.columns[:20], sep="\n")
print("\n--- SDSS Columns ---", sdss.columns[:20], sep="\n")


Galaxy Zoo 2: (239695, 231)
SDSS: (500000, 17)

--- GZ2 Columns ---
Index(['dr7objid', 'ra', 'dec', 'rastring', 'decstring', 'sample', 'gz2_class',
       'total_classifications', 'total_votes',
       't01_smooth_or_features_a01_smooth_count',
       't01_smooth_or_features_a01_smooth_weight',
       't01_smooth_or_features_a01_smooth_fraction',
       't01_smooth_or_features_a01_smooth_weighted_fraction',
       't01_smooth_or_features_a01_smooth_debiased',
       't01_smooth_or_features_a01_smooth_flag',
       't01_smooth_or_features_a02_features_or_disk_count',
       't01_smooth_or_features_a02_features_or_disk_weight',
       't01_smooth_or_features_a02_features_or_disk_fraction',
       't01_smooth_or_features_a02_features_or_disk_weighted_fraction',
       't01_smooth_or_features_a02_features_or_disk_debiased'],
      dtype='object')

--- SDSS Columns ---
Index(['objID', 'ra', 'dec', 'u', 'g', 'r', 'i', 'z', 'petroR50_r',
       'petroR90_r', 'fracDeV_r', 'concentration_index'

In [6]:
# Show the dtypes of the coordinate columns in both catalogues before they
# are used for the positional match.
print(gz2.dtypes.loc[['ra', 'dec']])
print(sdss.dtypes.loc[['ra', 'dec']])


ra     float64
dec    float64
dtype: object
ra     float64
dec    float64
dtype: object


In [7]:
# Keep only rows where both coordinates are present; a row missing either
# 'ra' or 'dec' is discarded. The original row index labels are preserved.
gz2 = gz2[gz2[['ra', 'dec']].notna().all(axis=1)]
sdss = sdss[sdss[['ra', 'dec']].notna().all(axis=1)]


In [8]:
# Already imported in the notebook's first cell; repeated here so this cell
# can also be run on its own.
from astropy.coordinates import SkyCoord
import astropy.units as u

# Largest on-sky separation, in arcseconds, at which a GZ2 source and its
# nearest SDSS source are treated as the same object.
MATCH_RADIUS_ARCSEC = 2.0

# Build sky positions from the 'ra'/'dec' columns, interpreted as degrees.
coords_gz2 = SkyCoord(ra=gz2['ra'].values*u.deg, dec=gz2['dec'].values*u.deg)
coords_sdss = SkyCoord(ra=sdss['ra'].values*u.deg, dec=sdss['dec'].values*u.deg)

# For every GZ2 position: the positional index of the nearest SDSS position
# (idx) and the angular separation to it (d2d). The third return value (3-D
# distance) is not used. The match is one-directional, so several GZ2 rows
# may point at the same SDSS row.
idx, d2d, _ = coords_gz2.match_to_catalog_sky(coords_sdss)

# Boolean mask over GZ2 rows whose nearest SDSS neighbour is within tolerance.
matched = d2d.arcsec < MATCH_RADIUS_ARCSEC

# Row i of gz2_matched corresponds to row i of sdss_matched; both indexes are
# reset so the column-wise concat below aligns them positionally.
gz2_matched = gz2[matched].reset_index(drop=True)
sdss_matched = sdss.iloc[idx[matched]].reset_index(drop=True)

# Both catalogues carry columns with the same name (e.g. 'ra', 'dec').
# Concatenating them unchanged would produce duplicate column labels, making
# merged['ra'] ambiguous and the saved CSV header non-unique. Colliding SDSS
# columns therefore get an '_sdss' suffix in the merged table only;
# sdss_matched itself keeps its original column names.
merged = pd.concat(
    [
        gz2_matched,
        sdss_matched.rename(
            columns=lambda name: f"{name}_sdss" if name in gz2_matched.columns else name
        ),
    ],
    axis=1,
)
assert merged.columns.is_unique, "merged catalogue has duplicate column names"
print("Merged dataset:", merged.shape)


Merged dataset: (48664, 248)


In [13]:
# Persist the cross-matched catalogue next to the input files; index=False
# leaves the positional row index out of the CSV.
merged.to_csv(path_or_buf="../data/merged_gz2_sdss.csv", index=False)
