In [1]:
"""
Merge Galaxy Zoo 2 (GZ2) morphology features with SDSS features by `specObjID`.

Workflow
--------
1) Read ../data/features_gz2.csv and ../data/features_sdss.csv with `specObjID` as string.
2) Inner-join on `specObjID` to keep rows present in both sources.
3) Drop the ID column to produce the working feature table `df`.
"""

# ---------------------------------------------------------------------
# Imports
# ---------------------------------------------------------------------
import pandas as pd

# ---------------------------------------------------------------------
# Load source tables (ensure `specObjID` is read as string for exact matching)
# ---------------------------------------------------------------------
# One dtype mapping shared by both reads: the join key stays text in each table.
id_dtype = {"specObjID": str}
morph_df = pd.read_csv("../data/features_gz2.csv", dtype=id_dtype)
sdss_df = pd.read_csv("../data/features_sdss.csv", dtype=id_dtype)

# ---------------------------------------------------------------------
# Keep only the rows whose `specObjID` appears in both tables
# ---------------------------------------------------------------------
merged_df = morph_df.merge(sdss_df, how="inner", on="specObjID")

# ---------------------------------------------------------------------
# Working feature table: the merged columns minus the identifier
# ---------------------------------------------------------------------
df = merged_df.drop("specObjID", axis="columns")


In [2]:
# Quick sanity check of the merged feature table: size, dtypes, summary stats.
print("Shape:", df.shape)

# `DataFrame.info()` writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
df.info()

print(df.describe())

Shape: (176986, 16)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 176986 entries, 0 to 176985
Data columns (total 16 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   bar_prob          176986 non-null  float64
 1   bulge_ev          176986 non-null  float64
 2   round_ev          176986 non-null  float64
 3   arms_num_ev       176986 non-null  float64
 4   arms_wind_ev      176986 non-null  float64
 5   elliptical_prob   176986 non-null  float64
 6   spiral_prob_eff   176986 non-null  float64
 7   odd_prob          176986 non-null  float64
 8   z                 176986 non-null  float64
 9   color_pc1         176986 non-null  float64
 10  oh_p50            176986 non-null  float64
 11  logMass_median    176986 non-null  float64
 12  v_disp            176986 non-null  float64
 13  ssfr_mean         176986 non-null  float64
 14  age_mean          176986 non-null  float64
 15  metallicity_mean  176986 non-null  float64
dtypes: float64(16)

In [3]:
# Persist the merged feature table; the row index is not written to the CSV.
df.to_csv(path_or_buf="../data/main.csv", index=False)