In [None]:
from astropy.io import fits
from astropy.table import Table
from matplotlib.colors import LogNorm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import corner

## Main Data Exploration
## Data source: https://www.horizon-simulation.org/PHOTOCAT/HorizonAGN_LAIGLE-DAVIDZON+2019_EUC-LSST_v1.5.fits

In [None]:
# Open the raw catalogue FITS file and print a summary of the HDUs it contains.
# NOTE(review): the path is absolute and machine-specific, and the handle is not
# closed in any of the cells visible here — consider a configurable data directory.
hdul = fits.open(r'C:\Users\shash\OneDrive\Documents\Dessertation\Astro-DIM\data\raw\Euclid.fits')
hdul.info()

In [None]:
# Data payload of the HDU at index 1 (used below as a table with named columns).
data = hdul[1].data

In [None]:
# List the column names available in the table.
print(data.columns.names)

In [None]:
# Peek at the first row to see what one record looks like.
print(data[0])

In [None]:
# Wrap the HDU data in an astropy Table so it can be converted to pandas below.
table = Table(hdul[1].data)

In [None]:
# Convert the astropy Table into the pandas DataFrame used by the rest of the notebook.
df = table.to_pandas()

In [None]:
# Quick size check (rows, columns) followed by the first three rows.
print(df.shape)
print(df.head(3))

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit an extra "None" line.
df.info()
# Summary statistics (count, mean, std, quartiles, ...) of the numeric columns.
print(df.describe())

In [None]:
def generate_cleanliness_report(df, placeholder_values=None):
    """Summarise missing values and placeholder sentinels for each numeric column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect. Columns whose dtype is not a NumPy numeric dtype are skipped.
    placeholder_values : list of numbers, optional
        Values counted as placeholders and excluded from the Min/Max/Mean/Std
        statistics. Defaults to ``[-999, -99, 9999, 999, -1e10, 1e10]``.

    Returns
    -------
    pandas.DataFrame
        One row per numeric column with the fields ``Column``, ``Type``, ``NaNs``,
        ``Placeholder count``, ``Min``, ``Max``, ``Mean``, ``Std`` and
        ``Unique values``, sorted by ``Placeholder count`` in descending order.
        The frame is empty (but still has these columns) when ``df`` has no
        numeric columns.
    """
    if placeholder_values is None:
        placeholder_values = [-999, -99, 9999, 999, -1e10, 1e10]

    report_columns = [
        "Column", "Type", "NaNs", "Placeholder count",
        "Min", "Max", "Mean", "Std", "Unique values",
    ]
    report = []

    for col in df.columns:
        values = df[col]
        if not np.issubdtype(values.dtype, np.number):
            continue

        # Build the placeholder mask once and reuse it for the count and the filter.
        is_placeholder = values.isin(placeholder_values)
        clean_vals = values[~is_placeholder]

        report.append({
            "Column": col,
            "Type": values.dtype,
            "NaNs": values.isna().sum(),
            "Placeholder count": is_placeholder.sum(),
            "Min": clean_vals.min(),
            "Max": clean_vals.max(),
            "Mean": clean_vals.mean(),
            "Std": clean_vals.std(),
            "Unique values": values.nunique()
        })

    # Passing the column list explicitly keeps the sort key present even when no
    # numeric column was found (an empty list would otherwise give a column-less
    # frame and sort_values would raise KeyError).
    return pd.DataFrame(report, columns=report_columns).sort_values(
        by="Placeholder count", ascending=False
    )

In [None]:
# Build the per-column NaN / placeholder summary for the full catalogue.
cleanliness_report = generate_cleanliness_report(df)

In [None]:
# cleanliness_report.to_csv(r'C:\Users\shash\OneDrive\Documents\Dessertation\Astro-DIM\results\eda\cleanliness_report.csv', index=False)

In [None]:
# Values treated as missing-data sentinels; they are replaced by NaN below.
placeholders = [-999, -99, 999, 9999]

# Every column except the 'ID' and 'NUM' columns is cleaned.
cols_to_clean = [name for name in df.columns if name not in ('ID', 'NUM')]

# NOTE: this rewrites the selected columns of `df` in place.
df[cols_to_clean] = df[cols_to_clean].replace(to_replace=placeholders, value=np.nan)

# NaN count per column, largest first, as a two-column table.
nan_counts = df.isna().sum().sort_values(ascending=False)
nan_report = nan_counts.rename_axis('Column').reset_index(name='NaN_Count')

In [None]:
# nan_report.to_csv(r'C:\Users\shash\OneDrive\Documents\Dessertation\Astro-DIM\results\eda\nan_counts.csv', index=False)

## Core Simulation Truth Features Exploration

In [None]:
# Columns selected for the "core" feature analysis in this section
# (all are read from `df` by name in the cells below).
core_cols = [
    'Z_true',           'Z_dist',
    'MASS_true',        'MASSBC03_true',
    'SFR_true',         'MASS-WEIGHTED_AGE_true',
    'METALLICITY_true'
]

In [None]:
# Drop every row that has a NaN in any of the core columns, then keep only
# the core columns themselves.
df_core_clean = df.dropna(subset=core_cols)
df_core = df_core_clean[core_cols]

In [None]:
# Build the core frame from the NaN-free rows prepared in the previous cell.
# Selecting from the raw `df` here would silently discard that dropna() and let
# NaN rows flow into the log transforms and the exported CSV. The .copy() makes
# df_core independent of df_core_clean.
df_core = df_core_clean[core_cols].copy()

In [None]:
# Sanity check: size of the core frame and the remaining NaN count per column.
print(df_core.shape)
print(df_core.isna().sum())

In [None]:
# Column names to plot and how many there are (used to size the subplot grid).
cols = df_core.columns.tolist()
n = len(cols)

In [None]:
# Grid layout: three panels per row, with as many rows as needed to fit n panels.
n_cols = 3
n_rows = math.ceil(n / n_cols)

In [None]:
# One histogram (with KDE overlay) per core column, laid out on an n_rows x n_cols grid.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows))
axes = axes.ravel()

for panel, feature in zip(axes, cols):
    sns.histplot(df_core[feature], bins=50, kde=True, ax=panel)
    panel.set_title(feature)
    panel.set_xlabel('')
    panel.set_ylabel('Count')

# Panels beyond the n-th have nothing to show; hide them.
for spare in axes[n:]:
    spare.axis('off')

plt.tight_layout()
# plt.savefig(r'C:\Users\shash\OneDrive\Documents\Dessertation\Astro-DIM\results\eda\core_data_histograms.png', dpi=300)
plt.show()

In [None]:
# Pair plot on a reproducible random subsample to keep rendering time manageable.
# DataFrame.sample raises ValueError when asked for more rows than exist (without
# replacement), so the sample size is capped at the frame length.
sample = df_core.sample(min(5000, len(df_core)), random_state=42)

sns.pairplot(sample, vars=cols, corner=True,
             plot_kws={'s':10, 'alpha':0.3})
# plt.savefig(r'C:\Users\shash\OneDrive\Documents\Dessertation\Astro-DIM\results\eda\core_data_pairplot.png', dpi=300)
plt.show()

In [None]:
# Pairwise correlation of the core columns (DataFrame.corr default method),
# drawn as an annotated heatmap centred on zero.
corr = df_core[cols].corr()

fig, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="vlag", center=0, ax=ax)
ax.set_title("Feature Correlation Matrix")
# NOTE(review): absolute, machine-specific output path; the target directory must exist.
fig.savefig(r'C:\Users\shash\OneDrive\Documents\Dessertation\Astro-DIM\results\eda\core_data_correlation_matrix.png', dpi=300)
plt.show()

In [None]:
# Remove 'Z_dist' and 'MASSBC03_true' from the core frame; drop() returns a new
# frame, so df_core itself is left unchanged.
to_drop = ['Z_dist', 'MASSBC03_true']
df_prep = df_core.drop(columns=to_drop)

print("Remaining columns:", df_prep.columns.tolist())

In [None]:
# Floor applied to SFR before the logarithm, so values at or below eps
# (including zero) map to log10(eps) instead of -inf/NaN.
eps = 1e-4

sfr_log   = np.log10(np.clip(df_prep["SFR_true"].to_numpy(), eps, None))
# NOTE(review): the age column is not clipped, so any non-positive value would
# become -inf/NaN here — confirm the column is strictly positive.
age_log   = np.log10(df_prep["MASS-WEIGHTED_AGE_true"].to_numpy())

# Add both log-transformed columns to df_prep in a single assignment.
df_prep[["log_SFR", "log_Mass-Weighted Age"]] = np.column_stack([sfr_log, age_log])

# Transposed summary statistics of the two new columns (shown as the cell output).
df_prep[['log_SFR','log_Mass-Weighted Age']].describe().T

In [None]:
# Show the first rows of df_prep, including the new log columns.
df_prep.head()

In [None]:
# The linear SFR and age columns are dropped now that their log10 versions exist.
to_drop = ['SFR_true', 'MASS-WEIGHTED_AGE_true']
df_core_final = df_prep.drop(columns=to_drop)

print("Remaining columns:", df_core_final.columns.tolist())

In [None]:
# Names and count of the final core features, used by the plotting cells below.
final_cols = df_core_final.columns.tolist()
n = len(final_cols)

In [None]:
# One histogram (with KDE overlay) per final core feature.
# The grid is derived from the number of features instead of being fixed at 2x3,
# so the cell keeps working if final_cols gains or loses columns. With five
# features this reproduces the 2x3 layout and the (15, 8) figure size.
n_hist_cols = 3
n_hist_rows = max(1, math.ceil(len(final_cols) / n_hist_cols))

fig, axes = plt.subplots(
    n_hist_rows, n_hist_cols,
    figsize=(5 * n_hist_cols, 4 * n_hist_rows),
    squeeze=False,  # always a 2-D array, even for a single row
)
axes = axes.flatten()

for ax, feat in zip(axes, final_cols):
    sns.histplot(
        df_core_final[feat],
        bins=50,
        kde=True,
        ax=ax
    )
    ax.set_title(feat)
    ax.set_xlabel('')
    ax.set_ylabel('Count')

# Hide every unused panel (there may be zero, one or two of them).
for ax in axes[len(final_cols):]:
    ax.axis('off')

plt.tight_layout()
# plt.savefig(r'C:\Users\shash\OneDrive\Documents\Dessertation\Astro-DIM\results\eda\final_core_data_histograms.png', dpi=300)
plt.show()

In [None]:
# Pair plot of the final features on a reproducible random subsample.
# The sample size is capped at the frame length because DataFrame.sample raises
# ValueError when asked for more rows than exist (without replacement).
sample = df_core_final[final_cols].sample(min(5000, len(df_core_final)), random_state=42)

sns.pairplot(sample, vars=final_cols, corner=True,
             plot_kws={'s':10, 'alpha':0.3})
# plt.savefig(r'C:\Users\shash\OneDrive\Documents\Dessertation\Astro-DIM\results\eda\final_core_data_pairplot.png', dpi=300)
plt.show()

In [None]:
# Pairwise correlation of the final core features, drawn as an annotated heatmap.
corr = df_core_final[final_cols].corr()

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="vlag", center=0, ax=ax)
ax.set_title("Feature Correlation Matrix")
# fig.savefig(r'C:\Users\shash\OneDrive\Documents\Dessertation\Astro-DIM\results\eda\final_core_data_correlation_matrix.png', dpi=300)
plt.show()

In [None]:
# Write the final core features to CSV without the index column.
# NOTE(review): absolute, machine-specific output path.
df_core_final.to_csv(r'C:\Users\shash\OneDrive\Documents\Dessertation\Astro-DIM\data\processed\core_data.csv', index=False)

In [None]:
def hexbin_plot(x, y, title="", xlabel="", ylabel="", gridsize=80, log=True, ax=None):
    """Draw a hexbin density plot of ``y`` against ``x`` with a colour bar.

    Parameters
    ----------
    x, y : array-like
        Coordinates of the points to bin.
    title, xlabel, ylabel : str
        Text for the Axes title and axis labels.
    gridsize : int
        Passed to ``Axes.hexbin`` as ``gridsize``.
    log : bool
        When true the bin counts are coloured with a ``LogNorm``; otherwise the
        default linear normalisation is used.
    ax : matplotlib Axes, optional
        Axes to draw into. A new 6x5 inch figure is created when omitted.

    Returns
    -------
    matplotlib Axes
        The Axes that was drawn into.
    """
    if ax is None:
        _, ax = plt.subplots(figsize=(6, 5))

    hb = ax.hexbin(
        x, y,
        gridsize=gridsize,
        mincnt=1,  # bins with no points are left blank
        norm=LogNorm() if log else None
    )
    # Attach the colour bar through the Axes' own figure; pyplot's "current
    # figure" can be a different one when the caller supplies `ax`.
    cb = ax.figure.colorbar(hb, ax=ax)
    cb.set_label("counts")
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    return ax

In [None]:
# Number of features shown in the corner plot (one row and one column per feature).
n_features = len(final_cols)

In [None]:
# Corner-style grid: histograms on the diagonal, log-scaled hexbin densities in
# the lower triangle, upper triangle hidden.
# squeeze=False keeps `axes` two-dimensional even for a single feature.
fig, axes = plt.subplots(n_features, n_features, figsize=(15, 15), squeeze=False)

hb = None  # last hexbin drawn; stays None when there are no off-diagonal panels

for i in range(n_features):
    for j in range(n_features):
        ax = axes[i, j]

        if i < j:
            ax.set_visible(False)
        elif i == j:
            ax.hist(df_core_final[final_cols[i]], bins=50, alpha=0.7, density=True)
            ax.set_xlabel(final_cols[i])
            ax.set_ylabel('Density')
            if i == 0:
                ax.set_title(final_cols[i])
        else:
            x_data = df_core_final[final_cols[j]]
            y_data = df_core_final[final_cols[i]]

            hb = ax.hexbin(x_data, y_data, gridsize=30, mincnt=1, norm=LogNorm())

            # Axis labels only on the outer edge of the grid.
            if i == n_features - 1:
                ax.set_xlabel(final_cols[j])
            if j == 0:
                ax.set_ylabel(final_cols[i])

            # Tick labels are dropped on interior panels.
            if i != n_features - 1:
                ax.set_xticklabels([])
            if j != 0:
                ax.set_yticklabels([])

plt.tight_layout()
# The feature count in the title follows the data instead of being hard-coded.
plt.suptitle(f'Hexbin Corner Plot of {n_features} Features', fontsize=16, y=0.98)

if hb is not None:
    # Reserve a strip on the right for the colour bar.
    plt.subplots_adjust(right=0.92)
    cbar_ax = fig.add_axes([0.93, 0.15, 0.02, 0.5])
    # NOTE: each panel has its own LogNorm, so this bar reflects the count scale
    # of the last panel drawn only.
    cb = plt.colorbar(hb, cax=cbar_ax)
    cb.set_label('Counts', rotation=270, labelpad=15)

# plt.savefig(r'C:\Users\shash\OneDrive\Documents\Dessertation\Astro-DIM\results\eda\final_core_data_hexbin_corner_plot.png', dpi=300)
plt.show()

## Observed Data Exploration

In [None]:
# Photometry columns used in this section. The names carry a survey tag
# ('lsst', 'des' or 'euc') that later cells use to group them.
photometry_cols = [
    'ulsst_TOT',    'gdes_TOT',
    'glsst_TOT',    'rdes_TOT',     
    'ides_TOT',     'ilsst_TOT',
    'zlsst_TOT',    'zdes_TOT',     
    'Jeuc_TOT',     'Yeuc_TOT',
    'Heuc_TOT',     'VISeuc_TOT',
    'rlsst_TOT',    'ylsst_TOT'
]

In [None]:
# Photometry-only selection of the catalogue (same rows and index as df).
df_photometry = df[photometry_cols]

In [None]:
# Size of the photometry frame and the NaN count per band.
print(df_photometry.shape)
print(df_photometry.isna().sum())

In [None]:
print("Analyzing SFR values where photometry data is missing:")
print("=" * 60)

# Boolean mask (one entry per row): True where at least one photometry band is NaN.
rows_with_photometry_nan = df_photometry.isnull().any(axis="columns")
# Index labels of the rows flagged by the mask.
nan_row_indices = rows_with_photometry_nan[rows_with_photometry_nan].index

print(
    f"Total rows: {len(df)}",
    f"Rows with NaN in photometry: {len(nan_row_indices)}",
    f"Rows with complete photometry: {len(df) - len(nan_row_indices)}",
    sep="\n",
)

In [None]:
# Split SFR_true by whether the row has any missing photometry band.
# The summary for the NaN-photometry group is printed by the following cell, so
# it is not repeated here (the same four lines used to be printed twice).
sfr_with_nan_photometry = df.loc[nan_row_indices, 'SFR_true']
sfr_with_complete_photometry = df.loc[~rows_with_photometry_nan, 'SFR_true']

In [None]:
# Summary of SFR_true over the rows where at least one photometry band is NaN.
print("\nSFR statistics for rows with NaN photometry:")
print(f"Count: {sfr_with_nan_photometry.size}")
print(f"Non-NaN SFR values: {sfr_with_nan_photometry.count()}")
print(f"NaN SFR values: {sfr_with_nan_photometry.isna().sum()}")

# Descriptive statistics are only meaningful when at least one value is present.
if sfr_with_nan_photometry.notna().any():
    valid_sfr_nan_rows = sfr_with_nan_photometry.dropna()
    for stat_label, stat_value in (
        ("Min", valid_sfr_nan_rows.min()),
        ("Max", valid_sfr_nan_rows.max()),
        ("Mean", valid_sfr_nan_rows.mean()),
        ("Median", valid_sfr_nan_rows.median()),
    ):
        print(f"{stat_label} SFR: {stat_value:.6f}")


In [None]:
# Summary of SFR_true over the rows where every photometry band is present.
print("\nSFR statistics for rows with complete photometry:")
print(f"Count: {sfr_with_complete_photometry.size}")
print(f"Non-NaN SFR values: {sfr_with_complete_photometry.count()}")
print(f"NaN SFR values: {sfr_with_complete_photometry.isna().sum()}")

# Descriptive statistics are only meaningful when at least one value is present.
if sfr_with_complete_photometry.notna().any():
    valid_sfr_complete_rows = sfr_with_complete_photometry.dropna()
    for stat_label, stat_value in (
        ("Min", valid_sfr_complete_rows.min()),
        ("Max", valid_sfr_complete_rows.max()),
        ("Mean", valid_sfr_complete_rows.mean()),
        ("Median", valid_sfr_complete_rows.median()),
    ):
        print(f"{stat_label} SFR: {stat_value:.6f}")

In [None]:
# Exact values to look for in SFR_true among the complete-photometry rows.
# NOTE(review): an earlier cell already replaces -999/-99/999/9999 with NaN in
# every column except ID/NUM, so presumably only the 0 entry can still match.
placeholder_values = [-999, -99, 999, 9999, 0]
print("\nPlaceholder analysis for SFR in complete photometry rows:")
for sentinel in placeholder_values:
    n_hits = sfr_with_complete_photometry.eq(sentinel).sum()
    if n_hits == 0:
        continue  # only values that actually occur are reported
    print(f"SFR = {sentinel}: {n_hits} occurrences")

In [None]:
# Group the photometry columns by the survey tag embedded in their names
# (case-insensitive substring match), preserving the order of photometry_cols.
lsst_cols, des_cols, euc_cols = [
    [name for name in photometry_cols if tag in name.lower()]
    for tag in ('lsst', 'des', 'euc')
]

print("LSST columns:", lsst_cols)
print("DES columns:", des_cols)
print("Euclid columns:", euc_cols)

In [None]:
# Work on an independent copy so that adding colour columns does not touch df_photometry.
df_photometry_with_colors = df_photometry.copy()

In [None]:
def create_intra_survey_colors(df, cols, survey_name):
    """Add adjacent-band colour indices for one survey to ``df`` in place.

    The columns in ``cols`` are ordered by band from blue to red. For every
    neighbouring pair a new column ``<band1>_<band2>_<survey_name>`` holding
    ``df[band1] - df[band2]`` is written into ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``cols``; it is modified in place.
    cols : list of str
        Column names such as ``'ulsst_TOT'`` or ``'VISeuc_TOT'``.
    survey_name : str
        Suffix appended to each generated column name.

    Returns
    -------
    list of str
        Names of the columns that were created, in creation order. Empty when
        fewer than two columns are given.
    """
    # Band labels in approximate blue-to-red order. 'vis' is placed before 'y'
    # so that Euclid bands sort as VIS, Y, J, H.
    wavelength_order = ['u', 'g', 'r', 'vis', 'i', 'z', 'y', 'j', 'h']
    band_rank = {band: rank for rank, band in enumerate(wavelength_order)}

    def get_band_label(col):
        # Strip the '_TOT' suffix and the survey tag, leaving only the band label.
        return col.replace('_TOT', '').replace('lsst', '').replace('des', '').replace('euc', '')

    def get_band_priority(col):
        # Match the extracted band label exactly. A substring test on the whole
        # column name is wrong: 'u' occurs inside 'euc', so every Euclid column
        # would be ranked as the 'u' band and never be reordered.
        return band_rank.get(get_band_label(col).lower(), 999)  # unknown bands sort last

    sorted_cols = sorted(cols, key=get_band_priority)

    colors_created = []
    for band1, band2 in zip(sorted_cols, sorted_cols[1:]):
        color_name = f"{get_band_label(band1)}_{get_band_label(band2)}_{survey_name}"
        df[color_name] = df[band1] - df[band2]
        colors_created.append(color_name)

    return colors_created

In [None]:
# Add the per-survey colour columns to df_photometry_with_colors (in place) and
# keep the names of the created columns for each survey.
lsst_colors = create_intra_survey_colors(df_photometry_with_colors, lsst_cols, 'lsst')
des_colors = create_intra_survey_colors(df_photometry_with_colors, des_cols, 'des')
euc_colors = create_intra_survey_colors(df_photometry_with_colors, euc_cols, 'euc')

print(f"Created {len(lsst_colors)} LSST color indices: {lsst_colors}")
print(f"Created {len(des_colors)} DES color indices: {des_colors}")
print(f"Created {len(euc_colors)} Euclid color indices: {euc_colors}")

In [None]:
# Keep only rows with no NaN in any magnitude or colour column, then renumber
# the index from zero (the original row labels are discarded).
df_photometry_final = (
    df_photometry_with_colors
    .dropna(axis=0, how='any')
    .reset_index(drop=True)
)

print(f"\nFinal photometry dataset shape: {df_photometry_final.shape}")
print(f"NaN counts after cleaning: {df_photometry_final.isna().to_numpy().sum()}")

In [None]:
print("\nBasic statistics for color indices:")
# All colour columns across the three surveys (reused later when exporting).
all_color_cols = lsst_colors + des_colors + euc_colors
print(df_photometry_final[all_color_cols].describe())

In [None]:
# Histogram (with KDE overlay) of every magnitude and colour column, four panels per row.
cols = list(df_photometry_final.columns)
n = len(cols)
n_cols = 4
n_rows = math.ceil(n / n_cols)

fig, axes = plt.subplots(n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows))
axes = axes.ravel()

for panel, feature in zip(axes, cols):
    sns.histplot(df_photometry_final[feature], bins=50, kde=True, ax=panel)
    panel.set_title(feature, fontsize=10)
    panel.set_xlabel('')
    panel.set_ylabel('Count')

# Panels beyond the n-th have nothing to show; hide them.
for spare in axes[n:]:
    spare.axis('off')

plt.tight_layout()
# NOTE(review): absolute, machine-specific output path; the target directory must exist.
plt.savefig(r'C:\Users\shash\OneDrive\Documents\Dessertation\Astro-DIM\results\eda\photometry_with_colors_histograms.png', dpi=300)
plt.show()

In [None]:
# Annotated correlation heatmap over all magnitude and colour columns,
# with the colour scale pinned to [-1, 1] and centred on zero.
heatmap_options = dict(
    annot=True,               # write the correlation value in each cell
    fmt=".2f",
    cmap="vlag",
    center=0,
    vmin=-1,
    vmax=1,
    linewidths=0.1,
    cbar_kws={"shrink": 0.8},
    annot_kws={"size": 8},    # small font so the numbers fit in the cells
)

plt.figure(figsize=(20, 16))
sns.heatmap(df_photometry_final.corr(), **heatmap_options)
plt.title("Correlation Heatmap - Photometry + Color Indices")
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
# NOTE(review): absolute, machine-specific output path; the target directory must exist.
plt.savefig(r'C:\Users\shash\OneDrive\Documents\Dessertation\Astro-DIM\results\eda\photometry_with_colors_correlation_heatmap.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Export the cleaned magnitudes plus colour indices to CSV without the index column.
# NOTE(review): absolute, machine-specific output path.
df_photometry_final.to_csv(
    r'C:\Users\shash\OneDrive\Documents\Dessertation\Astro-DIM\data\processed\photometry_data.csv',
    index=False
)

## Colors Data Exploration

In [None]:
# Per-survey frames holding that survey's magnitudes followed by its colour indices.
# NOTE(review): df_photometry_final was itself produced with dropna() earlier in
# the notebook, so these dropna() calls presumably remove nothing — confirm.
df_lsst_colors = df_photometry_final[lsst_cols + lsst_colors].dropna()
df_des_colors = df_photometry_final[des_cols + des_colors].dropna()
df_euc_colors = df_photometry_final[euc_cols + euc_colors].dropna()

print(f"\nLSST colors dataset shape: {df_lsst_colors.shape}")
print(f"DES colors dataset shape: {df_des_colors.shape}")
print(f"Euclid colors dataset shape: {df_euc_colors.shape}")


In [None]:
def visualize_survey_colors(df, survey_name, color_cols):
    """Plot and save histograms and a correlation heatmap of one survey's colours.

    A grid of histograms (three per row, with KDE overlay) is drawn for the
    columns in ``color_cols`` and saved as ``<survey_name>_colors_histograms.png``.
    When more than one colour column is given, an annotated correlation heatmap
    is also drawn and saved as ``<survey_name>_colors_correlation_heatmap.png``.
    Both files go to the hard-coded results directory used in the body. Nothing
    is done when ``color_cols`` is empty.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``color_cols``.
    survey_name : str
        Used in the output file names and, upper-cased, in the heatmap title.
    color_cols : list of str
        Names of the colour-index columns to plot.

    Returns
    -------
    None
    """
    n_colors = len(color_cols)
    if n_colors == 0:
        return

    n_cols = 3
    n_rows = math.ceil(n_colors / n_cols)

    # squeeze=False always yields a 2-D array of Axes, so flattening gives a flat
    # sequence for every grid shape. Wrapping the array in a list instead would
    # break the single-colour case, where axes[0] must be an Axes, not an array.
    fig, axes = plt.subplots(
        n_rows, n_cols, figsize=(5 * n_cols, 4 * n_rows), squeeze=False
    )
    axes = axes.flatten()

    for ax, col in zip(axes, color_cols):
        sns.histplot(df[col], bins=50, kde=True, ax=ax)
        ax.set_title(col)
        ax.set(xlabel='', ylabel='Count')

    # Hide the unused panels in the last row.
    for ax in axes[n_colors:]:
        ax.axis('off')

    plt.tight_layout()
    plt.savefig(f'C:\\Users\\shash\\OneDrive\\Documents\\Dessertation\\Astro-DIM\\results\\eda\\{survey_name}_colors_histograms.png', dpi=300)
    plt.show()

    # A correlation matrix needs at least two columns to be informative.
    if n_colors > 1:
        plt.figure(figsize=(10, 8))
        sns.heatmap(
            df[color_cols].corr(),
            annot=True,
            fmt=".2f",
            cmap="vlag",
            center=0,
            vmin=-1,
            vmax=1,
            linewidths=0.5,
            cbar_kws={"shrink": 0.8}
        )
        plt.title(f"{survey_name.upper()} Color Indices Correlation")
        plt.savefig(f'C:\\Users\\shash\\OneDrive\\Documents\\Dessertation\\Astro-DIM\\results\\eda\\{survey_name}_colors_correlation_heatmap.png', dpi=300)
        plt.show()

In [None]:
# Plot each survey's colour indices; surveys without any colour column are skipped.
for survey_frame, survey_tag, survey_color_cols in (
    (df_lsst_colors, 'lsst', lsst_colors),
    (df_des_colors, 'des', des_colors),
    (df_euc_colors, 'euc', euc_colors),
):
    if survey_color_cols:
        visualize_survey_colors(survey_frame, survey_tag, survey_color_cols)

In [None]:
# Export one CSV per survey containing only that survey's colour-index columns.
# Each df_*_colors_only frame is defined only when the survey has colour columns;
# the summary cell at the end of the notebook guards its use with the same condition.
# NOTE(review): absolute, machine-specific output paths.
if lsst_colors:
    df_lsst_colors_only = df_photometry_final[lsst_colors].dropna()
    df_lsst_colors_only.to_csv(r'C:\Users\shash\OneDrive\Documents\Dessertation\Astro-DIM\data\processed\lsst_colors_only.csv', index=False)
    print(f"LSST colors-only dataset saved! Shape: {df_lsst_colors_only.shape}")

if des_colors:
    df_des_colors_only = df_photometry_final[des_colors].dropna()
    df_des_colors_only.to_csv(r'C:\Users\shash\OneDrive\Documents\Dessertation\Astro-DIM\data\processed\des_colors_only.csv', index=False)
    print(f"DES colors-only dataset saved! Shape: {df_des_colors_only.shape}")

if euc_colors:
    df_euc_colors_only = df_photometry_final[euc_colors].dropna()
    df_euc_colors_only.to_csv(r'C:\Users\shash\OneDrive\Documents\Dessertation\Astro-DIM\data\processed\euc_colors_only.csv', index=False)
    print(f"Euclid colors-only dataset saved! Shape: {df_euc_colors_only.shape}")


In [None]:
# Export the per-survey magnitudes-plus-colours frames, skipping any that are empty.
# NOTE(review): absolute, machine-specific output paths.
if not df_lsst_colors.empty:
    df_lsst_colors.to_csv(r'C:\Users\shash\OneDrive\Documents\Dessertation\Astro-DIM\data\processed\lsst_colors_data.csv', index=False)
    print("LSST colors dataset saved!")

if not df_des_colors.empty:
    df_des_colors.to_csv(r'C:\Users\shash\OneDrive\Documents\Dessertation\Astro-DIM\data\processed\des_colors_data.csv', index=False)
    print("DES colors dataset saved!")

if not df_euc_colors.empty:
    df_euc_colors.to_csv(r'C:\Users\shash\OneDrive\Documents\Dessertation\Astro-DIM\data\processed\euc_colors_data.csv', index=False)
    print("Euclid colors dataset saved!")

In [None]:
# Export every colour index from all three surveys in a single CSV.
# The .copy() makes all_color_indices independent of df_photometry_final.
all_color_indices = df_photometry_final[all_color_cols].copy()
all_color_indices.to_csv(r'C:\Users\shash\OneDrive\Documents\Dessertation\Astro-DIM\data\processed\all_colors_data.csv', index=False)
print("Combined colors-only dataset saved!")

In [None]:
# Final recap of the shapes of every dataset produced in this section.
print(f"\nSummary:")
print(f"- Complete photometry + colors dataset: {df_photometry_final.shape}")
print(f"- LSST-specific dataset: {df_lsst_colors.shape}")
print(f"- DES-specific dataset: {df_des_colors.shape}")
print(f"- Euclid-specific dataset: {df_euc_colors.shape}")
print(f"- All colors combined: {all_color_indices.shape}")
# The *_colors_only frames exist only when the matching colour list is non-empty,
# hence the conditional expressions below.
print(f"- LSST colors only: {df_lsst_colors_only.shape if lsst_colors else 'No LSST colors'}")
print(f"- DES colors only: {df_des_colors_only.shape if des_colors else 'No DES colors'}")
print(f"- Euclid colors only: {df_euc_colors_only.shape if euc_colors else 'No Euclid colors'}")