In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
import smogn
import h5py
from scipy.ndimage import gaussian_filter1d
from scipy.stats import pearsonr, ks_2samp

In [2]:
c = 3.*10**8

with h5py.File('data/fof_subhalo_tab_033.hdf5', 'r') as f:
    # Positions, velocities and masses of the halos
    M_h =   f['Group/Group_M_Crit200'][:]*1e10      #Msun/h
    R_h =   f['Group/Group_R_Crit200'][:]/c         #kpc/h
    V_h  =  f['Group/GroupVel'][:]                  #km/s
    V_h =   np.linalg.norm(V_h, axis = 1)
    ID_r =  f['Group/GroupFirstSub'][:] #Contains halos without gals as this number = -1
    ID_h =  np.arange(0, M_h.shape[0], 1, dtype = float) #It is the ID of the halos, to match the gal cat

    # Positions, stellar masses,  of the galaxies
    SM     = f['Subhalo/SubhaloMassType'][:, 4]*1e10 #Msun/h
    SFR    = f['Subhalo/SubhaloSFR'][:]*1e10 #Msun/yr
    SR = f['Subhalo/SubhaloHalfmassRadType'][:, 4]
    Colour = f['Subhalo/SubhaloStellarPhotometrics'][:, 5] - f['Subhalo/SubhaloStellarPhotometrics'][:, 6] #g-i 
    ID_g = np.array(f['Subhalo/SubhaloGrNr']) #Gals IDs

indexes = np.where( ID_r != -1)[0]
M_h = M_h[indexes]
R_h = R_h[indexes]
V_h = V_h[indexes]
ID_h = ID_h[indexes]

# Halo catalog
data = np.array( [ M_h, R_h, V_h, ID_h ] ).T
columns = [ 'M_h', 'R_h', 'V_h', 'ID' ] 
halos = pd.DataFrame(data = data, columns = columns)

indexes = np.where(SM > 0)[0]
SM = SM[indexes]
SFR = SFR[indexes]
SR = SR[indexes]
Colour = Colour[indexes]
ID_g = ID_g[indexes]

#Galaxy catalog
data = np.array([SM, SFR, Colour, SR, ID_g]).T
columns = ['SM', 'SFR', 'Colour', 'SR', 'ID']
gals = pd.DataFrame(data=data, columns=columns)

gals = gals.drop_duplicates(subset = ['ID'], keep = 'first')

gals['SFR'] = gals['SFR'].replace(0, 1)
gals['SFR'] = np.log10(gals['SFR'])
gals.loc[gals['SFR'] == 0, 'SFR'] = np.random.normal(8.0, 0.5, len(gals.loc[gals['SFR'] == 0]))
#gals.shape

#SM.shape, M_h.shape

df = pd.merge(left = halos, right = gals, left_on = 'ID', right_on = 'ID')

df['M_h'] = np.log10(df['M_h'] + 0.01)
df['R_h'] = np.log10(df['R_h'] + 0.01)
df['V_h'] = np.log10(df['V_h'] + 0.01)

df['SM']  = np.log10(df['SM'] + 0.01)
#df['SR']  = np.log10(df['SR'] + 0.01)

In [3]:
df.describe()

Unnamed: 0,M_h,R_h,V_h,ID,SM,SFR,Colour,SR
count,1522.0,1522.0,1522.0,1522.0,1522.0,1522.0,1522.0,1522.0
mean,10.708978,-1.99999,2.141859,1105.073587,7.86284,8.308993,0.216815,3.470842
std,0.577714,6e-06,0.242958,1305.457871,0.992273,0.700563,0.058822,3.823537
min,9.15953,-1.999997,0.852127,0.0,6.240346,6.573544,-0.308596,0.0
25%,10.342783,-1.999993,2.001723,380.25,7.110663,7.848868,0.204559,0.0
50%,10.60501,-1.999992,2.158675,785.5,7.534524,8.2184,0.223761,3.179318
75%,10.983194,-1.999989,2.310053,1370.75,8.308563,8.671782,0.238508,5.235649
max,13.653268,-1.999916,2.750756,17953.0,11.630214,10.718828,0.389301,32.569843


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1522 entries, 0 to 1521
Data columns (total 8 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   M_h     1522 non-null   float64
 1   R_h     1522 non-null   float64
 2   V_h     1522 non-null   float64
 3   ID      1522 non-null   float64
 4   SM      1522 non-null   float64
 5   SFR     1522 non-null   float64
 6   Colour  1522 non-null   float64
 7   SR      1522 non-null   float64
dtypes: float64(8)
memory usage: 95.3 KB


In [None]:
fig, axs = plt.subplots(1, 3, figsize = (12, 3), sharey = True, dpi = 100)

fig.suptitle('Halo properties')

axs[0].hist(df['M_h'])
axs[0].set_yscale('log')
axs[0].set_xlabel('M_h')
axs[0].set_ylabel('# halos')

axs[1].hist(df['R_h'])
axs[1].set_yscale('log')
axs[1].set_xlabel('R_h')

axs[2].hist(df['V_h'])
axs[2].set_yscale('log')
axs[2].set_xlabel('V_h')

In [None]:
fig, axs = plt.subplots(1, 4, figsize = (16, 3), sharey = True, dpi = 100)

fig.suptitle('Galaxy properties')

axs[0].hist(df['SM'], bins = 40)
axs[0].set_yscale('log')
axs[0].set_xlabel(r'$M_{\star}$')
axs[0].set_ylabel('# halos')

axs[1].hist(df['Colour'], bins = 40)
axs[1].set_yscale('log')
axs[1].set_xlabel('Color')

axs[2].hist(df['SFR'], bins = 40)
axs[2].set_yscale('log')
axs[2].set_xlabel('SFR')

axs[3].hist(df['SR'], bins = 40)
axs[3].set_yscale('log')
axs[3].set_xlabel('SR')

In [None]:
def correlation_heatmap(df):
    correlations = df.corr(method = 'pearson')

    fig, ax = plt.subplots(figsize = (8,8))
    sns.heatmap(correlations, vmax = 1.0, center = 0, fmt = '.4f',
                square = True, linewidths = .5, annot = True, 
                cbar_kws = {"shrink": .82})
    plt.title('Pearson correlation Heatmap')
    plt.show()
    
    return correlations

correlation_heatmap(df)

In [11]:
class SimpleMLP(nn.Module):
    def __init__(self, input_size, hidden_size, output_size=1):
        super(SimpleMLP, self).__init__()

        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, hidden_size)
        self.fc3 = nn.Linear(hidden_size, hidden_size)
        self.fc4 = nn.Linear(hidden_size, hidden_size)
        
        self.fc_out = nn.Linear(hidden_size, output_size)
        
        self.relu = nn.ReLU()

    def forward(self, x):

        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.relu(self.fc4(x))

        x = self.fc_out(x)
        return x

In [12]:
# UPDATED TRAINING FUNCTIONS WITH LOSS TRACKING

def train_neural_network(X_train, X_test, y_train, y_test, num_epochs=500):
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    if y_train.ndim == 1:
        y_train = y_train.reshape(-1, 1)
    if y_test.ndim == 1:
        y_test = y_test.reshape(-1, 1)

    mean_data = np.mean(X_train, axis=0)
    std_data = np.std(X_train, axis=0)
    
    X_train_norm = (X_train - mean_data) / std_data
    X_test_norm = (X_test - mean_data) / std_data

    mean_targets = np.mean(y_train, axis=0)
    std_targets = np.std(y_train, axis=0)
    
    y_train_norm = (y_train - mean_targets) / std_targets
    y_test_norm = (y_test - mean_targets) / std_targets

    X_train_tensor = torch.FloatTensor(X_train_norm.values if hasattr(X_train_norm, 'values') else X_train_norm)
    y_train_tensor = torch.FloatTensor(y_train_norm)
    X_test_tensor = torch.FloatTensor(X_test_norm.values if hasattr(X_test_norm, 'values') else X_test_norm)
    y_test_tensor = torch.FloatTensor(y_test_norm)

    input_size = X_train_tensor.shape[1]
    model = SimpleMLP(input_size, hidden_size=128)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    # Track losses
    train_losses = []
    test_losses = []
    
    for epoch in range(num_epochs):
        model.train()
        optimizer.zero_grad()
        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)
        loss.backward()
        optimizer.step()
        
        train_losses.append(loss.item())
        
        # Evaluate on test set
        model.eval()
        with torch.no_grad():
            test_outputs = model(X_test_tensor)
            test_loss = criterion(test_outputs, y_test_tensor)
            test_losses.append(test_loss.item())

    model.eval()
    with torch.no_grad():
        y_pred_norm = model(X_test_tensor).numpy()

    y_pred = y_pred_norm * std_targets + mean_targets
    
    metrics = {}
    for i in range(y_test.shape[1]):
        mse = mean_squared_error(y_test[:, i], y_pred[:, i])
        pearson_corr, pearson_pval = pearsonr(y_test[:, i], y_pred[:, i])
        ks_stat, ks_pval = ks_2samp(y_test[:, i], y_pred[:, i])
        
        metrics[f'target_{i}'] = {
            'MSE': mse,
            'Pearson_r': pearson_corr,
            'Pearson_pval': pearson_pval,
            'KS_statistic': ks_stat,
            'KS_pval': ks_pval
        }
    
    return y_pred, metrics, train_losses, test_losses


def train_extra_trees(X_train, X_test, y_train, y_test, num_estimators_list=None):
    """
    ExtraTreesRegressor doesn't have traditional epochs, so we track performance
    as trees are added to the ensemble
    """
    if num_estimators_list is None:
        num_estimators_list = [10, 20, 50, 100, 150, 200]
    
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    if y_train.ndim == 1:
        y_train = y_train.reshape(-1, 1)
    if y_test.ndim == 1:
        y_test = y_test.reshape(-1, 1)

    train_losses = []
    test_losses = []
    
    # Train with increasing number of estimators
    for n_est in num_estimators_list:
        model = ExtraTreesRegressor(
            n_estimators=n_est,
            max_depth=None,
            min_samples_split=2,
            min_samples_leaf=1,
            random_state=42,
            n_jobs=-1
        )

        if y_train.shape[1] == 1:
            y_train_flat = y_train.ravel()
        else:
            y_train_flat = y_train
        
        model.fit(X_train, y_train_flat)
        
        # Training loss
        y_train_pred = model.predict(X_train)
        if y_train_pred.ndim == 1:
            y_train_pred = y_train_pred.reshape(-1, 1)
        train_mse = mean_squared_error(y_train, y_train_pred)
        train_losses.append(train_mse)
        
        # Test loss
        y_test_pred = model.predict(X_test)
        if y_test_pred.ndim == 1:
            y_test_pred = y_test_pred.reshape(-1, 1)
        test_mse = mean_squared_error(y_test, y_test_pred)
        test_losses.append(test_mse)
    
    # Final model with 200 estimators
    model = ExtraTreesRegressor(
        n_estimators=200,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        random_state=42,
        n_jobs=-1
    )

    if y_train.shape[1] == 1:
        y_train = y_train.ravel()
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred = np.array(y_pred)
    if y_pred.ndim == 1:
        y_pred = y_pred.reshape(-1, 1)
    
    metrics = {}
    for i in range(y_test.shape[1]):
        mse = mean_squared_error(y_test[:, i], y_pred[:, i])
        pearson_corr, pearson_pval = pearsonr(y_test[:, i], y_pred[:, i])
        ks_stat, ks_pval = ks_2samp(y_test[:, i], y_pred[:, i])
        
        metrics[f'target_{i}'] = {
            'MSE': mse,
            'Pearson_r': pearson_corr,
            'Pearson_pval': pearson_pval,
            'KS_statistic': ks_stat,
            'KS_pval': ks_pval
        }

    return y_pred, metrics, train_losses, test_losses, num_estimators_list


def train_knn(X_train, X_test, y_train, y_test, k_values=None):
    """
    KNN doesn't have training in traditional sense, so we track performance
    across different k values
    """
    if k_values is None:
        k_values = [3, 5, 7, 10, 15, 20, 30]
    
    y_train = np.array(y_train)
    y_test = np.array(y_test)
    if y_train.ndim == 1:
        y_train = y_train.reshape(-1, 1)
    if y_test.ndim == 1:
        y_test = y_test.reshape(-1, 1)

    scaler_X = StandardScaler()
    scaler_y = StandardScaler()
    
    X_train_scaled = scaler_X.fit_transform(X_train)
    X_test_scaled = scaler_X.transform(X_test)
    y_train_scaled = scaler_y.fit_transform(y_train)
    
    train_losses = []
    test_losses = []
    
    # Try different k values
    for k in k_values:
        model = KNeighborsRegressor(
            n_neighbors=k,
            weights='distance',
            algorithm='auto',
            n_jobs=-1
        )

        if y_train_scaled.shape[1] == 1:
            y_train_scaled_flat = y_train_scaled.ravel()
        else:
            y_train_scaled_flat = y_train_scaled
        
        model.fit(X_train_scaled, y_train_scaled_flat)
        
        # Training loss
        y_train_pred_scaled = model.predict(X_train_scaled)
        if y_train_pred_scaled.ndim == 1:
            y_train_pred_scaled = y_train_pred_scaled.reshape(-1, 1)
        y_train_pred = scaler_y.inverse_transform(y_train_pred_scaled)
        train_mse = mean_squared_error(y_train, y_train_pred)
        train_losses.append(train_mse)
        
        # Test loss
        y_test_pred_scaled = model.predict(X_test_scaled)
        if y_test_pred_scaled.ndim == 1:
            y_test_pred_scaled = y_test_pred_scaled.reshape(-1, 1)
        y_test_pred = scaler_y.inverse_transform(y_test_pred_scaled)
        test_mse = mean_squared_error(y_test, y_test_pred)
        test_losses.append(test_mse)
    
    # Final model with k=10
    model = KNeighborsRegressor(
        n_neighbors=10,
        weights='distance',
        algorithm='auto',
        n_jobs=-1
    )

    # Fit with flattened y if needed
    if y_train_scaled.shape[1] == 1:
        y_train_scaled_flat = y_train_scaled.ravel()
    else:
        y_train_scaled_flat = y_train_scaled
    
    model.fit(X_train_scaled, y_train_scaled_flat)
    y_pred_scaled = model.predict(X_test_scaled)
    
    # FIX: Ensure y_pred_scaled is 2D before inverse_transform
    if y_pred_scaled.ndim == 1:
        y_pred_scaled = y_pred_scaled.reshape(-1, 1)
    
    y_pred = scaler_y.inverse_transform(y_pred_scaled)
    
    metrics = {}
    for i in range(y_test.shape[1]):
        mse = mean_squared_error(y_test[:, i], y_pred[:, i])
        pearson_corr, pearson_pval = pearsonr(y_test[:, i], y_pred[:, i])
        ks_stat, ks_pval = ks_2samp(y_test[:, i], y_pred[:, i])
        
        metrics[f'target_{i}'] = {
            'MSE': mse,
            'Pearson_r': pearson_corr,
            'Pearson_pval': pearson_pval,
            'KS_statistic': ks_stat,
            'KS_pval': ks_pval
        }

    return y_pred, metrics, train_losses, test_losses, k_values

In [13]:
SM1 = df['SM'].values
SFR1 = df['SFR'].values
Colour1 = df['Colour'].values
SR1 = df['SR'].values

df1 = df.copy()
df1.drop(['SFR', 'Colour', 'SR'], axis=1, inplace=True)

X_train_sm, X_test_sm, y_train_sm, y_test_sm = train_test_split(df1, SM1, test_size=0.2, random_state=42)
X_train_sfr, X_test_sfr, y_train_sfr, y_test_sfr = train_test_split(df1, SFR1, test_size=0.2, random_state=42)
X_train_colour, X_test_colour, y_train_colour, y_test_colour = train_test_split(df1, Colour1, test_size=0.2, random_state=42)
X_train_sr, X_test_sr, y_train_sr, y_test_sr = train_test_split(df1, SR1, test_size=0.2, random_state=42)

In [14]:
# Neural Network
y_raw_nn_sm, metrics_raw_nn_sm, train_loss_raw_nn_sm, test_loss_raw_nn_sm = train_neural_network(X_train_sm, X_test_sm, y_train_sm, y_test_sm)
y_raw_nn_sfr, metrics_raw_nn_sfr, train_loss_raw_nn_sfr, test_loss_raw_nn_sfr = train_neural_network(X_train_sfr, X_test_sfr, y_train_sfr, y_test_sfr)
y_raw_nn_colour, metrics_raw_nn_colour, train_loss_raw_nn_colour, test_loss_raw_nn_colour = train_neural_network(X_train_colour, X_test_colour, y_train_colour, y_test_colour)
y_raw_nn_sr, metrics_raw_nn_sr, train_loss_raw_nn_sr, test_loss_raw_nn_sr = train_neural_network(X_train_sr, X_test_sr, y_train_sr, y_test_sr)

# Extra Trees
y_raw_et_sm, metrics_raw_et_sm, train_loss_raw_et_sm, test_loss_raw_et_sm, n_est_raw_sm = train_extra_trees(X_train_sm, X_test_sm, y_train_sm, y_test_sm)
y_raw_et_sfr, metrics_raw_et_sfr, train_loss_raw_et_sfr, test_loss_raw_et_sfr, n_est_raw_sfr = train_extra_trees(X_train_sfr, X_test_sfr, y_train_sfr, y_test_sfr)
y_raw_et_colour, metrics_raw_et_colour, train_loss_raw_et_colour, test_loss_raw_et_colour, n_est_raw_colour = train_extra_trees(X_train_colour, X_test_colour, y_train_colour, y_test_colour)
y_raw_et_sr, metrics_raw_et_sr, train_loss_raw_et_sr, test_loss_raw_et_sr, n_est_raw_sr = train_extra_trees(X_train_sr, X_test_sr, y_train_sr, y_test_sr)

# KNN
y_raw_knn_sm, metrics_raw_knn_sm, train_loss_raw_knn_sm, test_loss_raw_knn_sm, k_vals_raw_sm = train_knn(X_train_sm, X_test_sm, y_train_sm, y_test_sm)
y_raw_knn_sfr, metrics_raw_knn_sfr, train_loss_raw_knn_sfr, test_loss_raw_knn_sfr, k_vals_raw_sfr = train_knn(X_train_sfr, X_test_sfr, y_train_sfr, y_test_sfr)
y_raw_knn_colour, metrics_raw_knn_colour, train_loss_raw_knn_colour, test_loss_raw_knn_colour, k_vals_raw_colour = train_knn(X_train_colour, X_test_colour, y_train_colour, y_test_colour)
y_raw_knn_sr, metrics_raw_knn_sr, train_loss_raw_knn_sr, test_loss_raw_knn_sr, k_vals_raw_sr = train_knn(X_train_sr, X_test_sr, y_train_sr, y_test_sr)

  res = hypotest_fun_out(*samples, **kwds)


In [15]:
df_smogn = df.copy()

X = df_smogn.drop(['SM', 'SFR', 'Colour', 'SR'], axis=1)
y_sm = df_smogn[['SM']]
y_sfr = df_smogn[['SFR']]
y_colour = df_smogn[['Colour']]
y_sr = df_smogn[['SR']]

X_train_sm, X_test_sm, y_train_sm, y_test_sm = train_test_split(X, y_sm, test_size=0.2, random_state=42)
X_train_sfr, X_test_sfr, y_train_sfr, y_test_sfr = train_test_split(X, y_sfr, test_size=0.2, random_state=42)
X_train_colour, X_test_colour, y_train_colour, y_test_colour = train_test_split(X, y_colour, test_size=0.2, random_state=42)
X_train_sr, X_test_sr, y_train_sr, y_test_sr = train_test_split(X, y_sr, test_size=0.2, random_state=42)

df_train_sm = X_train_sm.copy()
df_train_sm['SM'] = y_train_sm.values
df_train_sm = df_train_sm.reset_index(drop=True)

df_train_sfr = X_train_sfr.copy()
df_train_sfr['SFR'] = y_train_sfr.values
df_train_sfr = df_train_sfr.reset_index(drop=True)

df_train_colour = X_train_colour.copy()
df_train_colour['Colour'] = y_train_colour.values
df_train_colour = df_train_colour.reset_index(drop=True)

df_train_sr = X_train_sr.copy()
df_train_sr['SR'] = y_train_sr.values
df_train_sr = df_train_sr.reset_index(drop=True)

df_train_sm = smogn.smoter(
    data=df_train_sm,
    y='SM',
    k=3,                     # Further reduced from 5
    pert=0.01,              # Much more conservative
    samp_method='extreme',
    rel_method='manual',
    rel_ctrl_pts_rg=[
        [6.5, 0, 0],        # Low mass: zero
        [10.8, 0, 0],       # Mid-high mass: STILL zero
        [11.2, 0.8, 0],     # Sharp transition
        [11.4, 1, 0],       # Only the very highest
        [11.63, 1, 0]
    ],
    under_samp=True         # Critical for reducing common regions
)

# SFR: Dual focus on extreme low AND extreme high
df_train_sfr = smogn.smoter(
    data=df_train_sfr,
    y='SFR',
    k=5,
    pert=0.025,              # Reduced perturbation
    samp_method='extreme',
    rel_method='manual',
    rel_ctrl_pts_rg=[
        [-2.5, 1, 0],
        [-0.5, 1, 0],
        [2.0, 0, 0],         # Mid range: zero relevance
        [8.5, 0.8, 0],       # High SFR increase
        [9.2, 1, 0],         # Maximum for extreme high
        [9.57, 1, 0]
    ],
    under_samp=True
)

# Colour: Extreme tails only
df_train_colour = smogn.smoter(
    data=df_train_colour,
    y='Colour',
    k=5,
    pert=0.025,
    samp_method='extreme',
    rel_method='manual',
    rel_ctrl_pts_rg=[
        [-0.13, 0.9, 0],
        [0.3, 0, 0],         # Central zero
        [1.05, 0.7, 0],
        [1.25, 1, 0],
        [1.32, 1, 0]
    ],
    under_samp=True
)

# SR: High values
df_train_sr = smogn.smoter(
    data=df_train_sr,
    y='SR',
    k=5,
    pert=0.025,
    samp_method='extreme',
    rel_method='manual',
    rel_ctrl_pts_rg=[
        [-2.0, 0, 0],
        [0.5, 0, 0],
        [1.0, 0.6, 0],
        [1.3, 0.9, 0],
        [1.48, 1, 0]
    ],
    under_samp=True
)

print(f"Original dataset size: {len(df_smogn)}")
print(f"SM training set size after SMOGN: {len(df_train_sm)} (+{len(df_train_sm) - len(X_train_sm)})")
print(f"SFR training set size after SMOGN: {len(df_train_sfr)} (+{len(df_train_sfr) - len(X_train_sfr)})")
print(f"Colour training set size after SMOGN: {len(df_train_colour)} (+{len(df_train_colour) - len(X_train_colour)})")
print(f"SR training set size after SMOGN: {len(df_train_sr)} (+{len(df_train_sr) - len(X_train_sr)})")

X_train_sm = df_train_sm.drop(['SM'], axis=1)
y_train_sm = df_train_sm[['SM']].values

X_train_sfr = df_train_sfr.drop(['SFR'], axis=1)
y_train_sfr = df_train_sfr[['SFR']].values

X_train_colour = df_train_colour.drop(['Colour'], axis=1)
y_train_colour = df_train_colour[['Colour']].values

X_train_sr = df_train_sr.drop(['SR'], axis=1)
y_train_sr = df_train_sr[['SR']].values

X_test_sm = X_test_sm
y_test_sm = y_test_sm.values

X_test_sfr = X_test_sfr
y_test_sfr = y_test_sfr.values

X_test_colour = X_test_colour
y_test_colour = y_test_colour.values

X_test_sr = X_test_sr
y_test_sr = y_test_sr.values

dist_matrix: 100%|##########| 2/2 [00:00<00:00, 2744.07it/s]
synth_matrix: 100%|##########| 2/2 [00:00<00:00,  2.07it/s]
r_index: 100%|##########| 1/1 [00:00<00:00, 594.77it/s]
dist_matrix: 100%|##########| 8/8 [00:00<00:00, 1005.86it/s]
synth_matrix: 100%|##########| 8/8 [00:00<00:00, 40.88it/s]
r_index: 100%|##########| 3/3 [00:00<00:00, 1122.77it/s]
dist_matrix: 100%|##########| 33/33 [00:00<00:00, 384.16it/s]
synth_matrix: 100%|##########| 33/33 [00:00<00:00, 36.11it/s]
r_index: 100%|##########| 26/26 [00:00<00:00, 1202.64it/s]
dist_matrix: 100%|##########| 457/457 [00:15<00:00, 28.64it/s]
r_index: 100%|##########| 319/319 [00:00<00:00, 1135.58it/s]

Original dataset size: 1522
SM training set size after SMOGN: 2423 (+1206)
SFR training set size after SMOGN: 1217 (+0)
Colour training set size after SMOGN: 2365 (+1148)
SR training set size after SMOGN: 1080 (+-137)





In [16]:
# Neural Network
y_smogn_nn_sm, metrics_smogn_nn_sm, train_loss_smogn_nn_sm, test_loss_smogn_nn_sm = train_neural_network(X_train_sm, X_test_sm, y_train_sm, y_test_sm)
y_smogn_nn_sfr, metrics_smogn_nn_sfr, train_loss_smogn_nn_sfr, test_loss_smogn_nn_sfr = train_neural_network(X_train_sfr, X_test_sfr, y_train_sfr, y_test_sfr)
y_smogn_nn_colour, metrics_smogn_nn_colour, train_loss_smogn_nn_colour, test_loss_smogn_nn_colour = train_neural_network(X_train_colour, X_test_colour, y_train_colour, y_test_colour)
y_smogn_nn_sr, metrics_smogn_nn_sr, train_loss_smogn_nn_sr, test_loss_smogn_nn_sr = train_neural_network(X_train_sr, X_test_sr, y_train_sr, y_test_sr)

# Extra Trees
y_smogn_et_sm, metrics_smogn_et_sm, train_loss_smogn_et_sm, test_loss_smogn_et_sm, n_est_smogn_sm = train_extra_trees(X_train_sm, X_test_sm, y_train_sm, y_test_sm)
y_smogn_et_sfr, metrics_smogn_et_sfr, train_loss_smogn_et_sfr, test_loss_smogn_et_sfr, n_est_smogn_sfr = train_extra_trees(X_train_sfr, X_test_sfr, y_train_sfr, y_test_sfr)
y_smogn_et_colour, metrics_smogn_et_colour, train_loss_smogn_et_colour, test_loss_smogn_et_colour, n_est_smogn_colour = train_extra_trees(X_train_colour, X_test_colour, y_train_colour, y_test_colour)
y_smogn_et_sr, metrics_smogn_et_sr, train_loss_smogn_et_sr, test_loss_smogn_et_sr, n_est_smogn_sr = train_extra_trees(X_train_sr, X_test_sr, y_train_sr, y_test_sr)

# KNN
y_smogn_knn_sm, metrics_smogn_knn_sm, train_loss_smogn_knn_sm, test_loss_smogn_knn_sm, k_vals_smogn_sm = train_knn(X_train_sm, X_test_sm, y_train_sm, y_test_sm)
y_smogn_knn_sfr, metrics_smogn_knn_sfr, train_loss_smogn_knn_sfr, test_loss_smogn_knn_sfr, k_vals_smogn_sfr = train_knn(X_train_sfr, X_test_sfr, y_train_sfr, y_test_sfr)
y_smogn_knn_colour, metrics_smogn_knn_colour, train_loss_smogn_knn_colour, test_loss_smogn_knn_colour, k_vals_smogn_colour = train_knn(X_train_colour, X_test_colour, y_train_colour, y_test_colour)
y_smogn_knn_sr, metrics_smogn_knn_sr, train_loss_smogn_knn_sr, test_loss_smogn_knn_sr, k_vals_smogn_sr = train_knn(X_train_sr, X_test_sr, y_train_sr, y_test_sr)

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(10, 5), sharey=False)
axes = axes.flatten()

properties = [
    ('SM', 'Stellar Mass', y_test_sm, y_train_sm),
    ('SFR', 'Star Formation Rate', y_test_sfr, y_train_sfr),
    ('SR', 'Stellar Radius', y_test_sr, y_train_sr),
    ('Colour', 'r - i', y_test_colour, y_train_colour)
]

for i, (prop, xlabel, ytrue_raw, ysmogn_raw) in enumerate(properties):
    ax = axes[i]
    ytrue_compare = ytrue_raw.flatten()
    ysmogn = ysmogn_raw.flatten()

    hist_true, bin_edges = np.histogram(ytrue_compare, bins=50)
    hist_smogn, _ = np.histogram(ysmogn, bins=bin_edges)

    ax.hist(ytrue_compare, bins=bin_edges, histtype='stepfilled',
            color='mediumpurple', alpha=0.65, label='True', linewidth=0)

    ax.step(bin_edges[:-1], hist_smogn, where='post', label='SMOGN',
            color='black', lw=0.42)

    ax.set_yscale('log')
    ax.set_ylim(0.5, max(hist_true.max(), hist_smogn.max()) * 3)
    ax.set_xlabel(xlabel, fontsize=10)
    ax.set_ylabel('# of central galaxies', fontsize=13)
    ax.legend(loc='upper left', fontsize=7, framealpha=0.6)
    ax.grid(True, alpha=0.3, linestyle='--', which='both')

plt.tight_layout()
plt.show()

In [None]:
# Extract predictions from the new training function outputs (first element of tuple)
properties = [
    ('SM', 'Stellar Mass', y_test_sm, y_raw_nn_sm, y_raw_knn_sm, y_raw_et_sm,
         y_smogn_nn_sm, y_smogn_knn_sm, y_smogn_et_sm),
    ('SFR', 'Star Formation Rate', y_test_sfr, y_raw_nn_sfr, y_raw_knn_sfr, y_raw_et_sfr,
         y_smogn_nn_sfr, y_smogn_knn_sfr, y_smogn_et_sfr),
    ('SR', 'Stellar Radius', y_test_sr, y_raw_nn_sr, y_raw_knn_sr, y_raw_et_sr,
         y_smogn_nn_sr, y_smogn_knn_sr, y_smogn_et_sr),
    ('Colour', 'r - i', y_test_colour, y_raw_nn_colour, y_raw_knn_colour, y_raw_et_colour,
         y_smogn_nn_colour, y_smogn_knn_colour, y_smogn_et_colour)
]

fig, axes = plt.subplots(2, 4, figsize=(20, 7), sharey='row')

model_colors = {
    'NN': 'cornflowerblue',
    'kNN': 'deepskyblue',
    'ERT': 'peru'
}

for col, (prop, xlabel, ytrue, nn_raw, knn_raw, ert_raw, nn_smogn, knn_smogn, ert_smogn) in enumerate(properties):
    
    # Raw models (top row)
    ax_raw = axes[0, col]
    hist_true, bins = np.histogram(ytrue.flatten(), bins=50)
    ax_raw.hist(ytrue.flatten(), bins=bins, histtype='stepfilled', color='mediumpurple', alpha=0.65, label='True')
    
    for model_name, pred in zip(['NN', 'kNN', 'ERT'], [nn_raw, knn_raw, ert_raw]):
        hist_pred, _ = np.histogram(pred.flatten(), bins=bins)
        ax_raw.step(bins[:-1], hist_pred, where='post', color=model_colors[model_name], label=model_name, lw=2)
    
    ax_raw.set_yscale('log')
    ax_raw.set_ylim(1, hist_true.max() * 3)
    ax_raw.set_title(f'Raw models - {xlabel}', fontsize=12, fontweight='bold')
    ax_raw.set_xlabel(xlabel, fontsize=11)
    if col == 0:
        ax_raw.set_ylabel('# of central galaxies', fontsize=11)
    ax_raw.grid(True, alpha=0.3, linestyle='--', which='both')
    
    # SMOGN models (bottom row)
    ax_smogn = axes[1, col]
    hist_true, bins = np.histogram(ytrue.flatten(), bins=50)
    ax_smogn.hist(ytrue.flatten(), bins=bins, histtype='stepfilled', color='mediumpurple', alpha=0.65, label='True')
    
    for model_name, pred in zip(['NN', 'kNN', 'ERT'], [nn_smogn, knn_smogn, ert_smogn]):
        hist_pred, _ = np.histogram(pred.flatten(), bins=bins)
        ax_smogn.step(bins[:-1], hist_pred, where='post', color=model_colors[model_name], label=model_name, lw=2)
    
    ax_smogn.set_yscale('log')
    ax_smogn.set_ylim(1, hist_true.max() * 3)
    ax_smogn.set_title(f'SMOGN models - {xlabel}', fontsize=12, fontweight='bold')
    ax_smogn.set_xlabel(xlabel, fontsize=11)
    if col == 0:
        ax_smogn.set_ylabel('# of central galaxies', fontsize=11)
    ax_smogn.grid(True, alpha=0.3, linestyle='--', which='both')

# Add legends
axes[0, 0].legend(fontsize=10, loc='upper right')
axes[1, 0].legend(fontsize=10, loc='upper right')

plt.tight_layout()
plt.show()

In [None]:
# PLOTTING CODE FOR TRAINING AND TESTING LOSS CURVES

# Dictionary to store all loss histories
loss_histories = {
    'SM': {
        'NN': {'raw_train': None, 'raw_test': None, 'smogn_train': None, 'smogn_test': None},
        'KNN': {'raw_train': None, 'raw_test': None, 'smogn_train': None, 'smogn_test': None},
        'ERT': {'raw_train': None, 'raw_test': None, 'smogn_train': None, 'smogn_test': None}
    },
    'SFR': {
        'NN': {'raw_train': None, 'raw_test': None, 'smogn_train': None, 'smogn_test': None},
        'KNN': {'raw_train': None, 'raw_test': None, 'smogn_train': None, 'smogn_test': None},
        'ERT': {'raw_train': None, 'raw_test': None, 'smogn_train': None, 'smogn_test': None}
    },
    'Colour': {
        'NN': {'raw_train': None, 'raw_test': None, 'smogn_train': None, 'smogn_test': None},
        'KNN': {'raw_train': None, 'raw_test': None, 'smogn_train': None, 'smogn_test': None},
        'ERT': {'raw_train': None, 'raw_test': None, 'smogn_train': None, 'smogn_test': None}
    },
    'SR': {
        'NN': {'raw_train': None, 'raw_test': None, 'smogn_train': None, 'smogn_test': None},
        'KNN': {'raw_train': None, 'raw_test': None, 'smogn_train': None, 'smogn_test': None},
        'ERT': {'raw_train': None, 'raw_test': None, 'smogn_train': None, 'smogn_test': None}
    }
}

def plot_loss_curves(loss_histories):
    """
    Plot training and testing loss curves for all targets and models
    """
    targets = ['SM', 'SFR', 'Colour', 'SR']
    target_names = ['Stellar Mass', 'Star Formation Rate', 'r - i', 'Stellar Radius']
    models = ['NN', 'KNN', 'ERT']
    model_names = ['Neural Network', 'KNN', 'Extra Trees']
    
    fig, axes = plt.subplots(4, 3, figsize=(18, 16))
    fig.suptitle('Training and Testing Loss Curves: Raw vs SMOGN', fontsize=16, y=0.995)
    
    for row, (target, target_name) in enumerate(zip(targets, target_names)):
        for col, (model, model_name) in enumerate(zip(models, model_names)):
            ax = axes[row, col]
            
            # Get loss histories
            raw_train = loss_histories[target][model]['raw_train']
            raw_test = loss_histories[target][model]['raw_test']
            smogn_train = loss_histories[target][model]['smogn_train']
            smogn_test = loss_histories[target][model]['smogn_test']
            
            if raw_train is not None and raw_test is not None:
                # For NN, use epochs directly
                if model == 'NN':
                    x_raw = np.arange(len(raw_train))
                    x_smogn = np.arange(len(smogn_train)) if smogn_train is not None else None
                    xlabel = 'Epoch'
                # For ERT, use number of estimators
                elif model == 'ERT':
                    x_raw = [10, 20, 50, 100, 150, 200][:len(raw_train)]
                    x_smogn = [10, 20, 50, 100, 150, 200][:len(smogn_train)] if smogn_train is not None else None
                    xlabel = 'Number of Estimators'
                # For KNN, use k values
                else:  # KNN
                    x_raw = [3, 5, 7, 10, 15, 20, 30][:len(raw_train)]
                    x_smogn = [3, 5, 7, 10, 15, 20, 30][:len(smogn_train)] if smogn_train is not None else None
                    xlabel = 'k (Number of Neighbors)'
                
                # Plot raw model
                ax.plot(x_raw, raw_train, label='Raw Train', color='#1f77b4', linewidth=2, linestyle='-')
                ax.plot(x_raw, raw_test, label='Raw Test', color='#1f77b4', linewidth=2, linestyle='--')
                
                # Plot SMOGN model
                if smogn_train is not None and smogn_test is not None:
                    ax.plot(x_smogn, smogn_train, label='SMOGN Train', color='#ff7f0e', linewidth=2, linestyle='-')
                    ax.plot(x_smogn, smogn_test, label='SMOGN Test', color='#ff7f0e', linewidth=2, linestyle='--')
                
                ax.set_xlabel(xlabel, fontsize=11)
                ax.set_ylabel('MSE Loss', fontsize=11)
                
                # Set title
                if row == 0:
                    ax.set_title(f'{model_name}', fontsize=12, fontweight='bold')
                
                # Add target name on the left
                if col == 0:
                    ax.text(-0.35, 0.5, target_name, transform=ax.transAxes, 
                           fontsize=12, fontweight='bold', rotation=90, 
                           verticalalignment='center', horizontalalignment='center')
                
                ax.legend(loc='best', fontsize=9)
                ax.grid(True, alpha=0.3)
                
                # Use log scale for y-axis if losses vary widely
                if model == 'NN':
                    ax.set_yscale('log')
            else:
                ax.text(0.5, 0.5, 'No data available', 
                       transform=ax.transAxes, ha='center', va='center')
                ax.set_xlabel(xlabel if model == 'NN' else ('Number of Estimators' if model == 'ERT' else 'k'), fontsize=11)
                ax.set_ylabel('MSE Loss', fontsize=11)
    
    plt.tight_layout()
    plt.show()

loss_histories['SM']['NN']['raw_train'] = train_loss_raw_nn_sm
loss_histories['SM']['NN']['raw_test'] = test_loss_raw_nn_sm
loss_histories['SM']['NN']['smogn_train'] = train_loss_smogn_nn_sm
loss_histories['SM']['NN']['smogn_test'] = test_loss_smogn_nn_sm

loss_histories['SM']['KNN']['raw_train'] = train_loss_raw_knn_sm
loss_histories['SM']['KNN']['raw_test'] = test_loss_raw_knn_sm
loss_histories['SM']['KNN']['smogn_train'] = train_loss_smogn_knn_sm
loss_histories['SM']['KNN']['smogn_test'] = test_loss_smogn_knn_sm

loss_histories['SM']['ERT']['raw_train'] = train_loss_raw_et_sm
loss_histories['SM']['ERT']['raw_test'] = test_loss_raw_et_sm
loss_histories['SM']['ERT']['smogn_train'] = train_loss_smogn_et_sm
loss_histories['SM']['ERT']['smogn_test'] = test_loss_smogn_et_sm

loss_histories['SFR']['NN']['raw_train'] = train_loss_raw_nn_sfr
loss_histories['SFR']['NN']['raw_test'] = test_loss_raw_nn_sfr
loss_histories['SFR']['NN']['smogn_train'] = train_loss_smogn_nn_sfr
loss_histories['SFR']['NN']['smogn_test'] = test_loss_smogn_nn_sfr

loss_histories['SFR']['KNN']['raw_train'] = train_loss_raw_knn_sfr
loss_histories['SFR']['KNN']['raw_test'] = test_loss_raw_knn_sfr
loss_histories['SFR']['KNN']['smogn_train'] = train_loss_smogn_knn_sfr
loss_histories['SFR']['KNN']['smogn_test'] = test_loss_smogn_knn_sfr

loss_histories['SFR']['ERT']['raw_train'] = train_loss_raw_et_sfr
loss_histories['SFR']['ERT']['raw_test'] = test_loss_raw_et_sfr
loss_histories['SFR']['ERT']['smogn_train'] = train_loss_smogn_et_sfr
loss_histories['SFR']['ERT']['smogn_test'] = test_loss_smogn_et_sfr

loss_histories['Colour']['NN']['raw_train'] = train_loss_raw_nn_colour
loss_histories['Colour']['NN']['raw_test'] = test_loss_raw_nn_colour
loss_histories['Colour']['NN']['smogn_train'] = train_loss_smogn_nn_colour
loss_histories['Colour']['NN']['smogn_test'] = test_loss_smogn_nn_colour

loss_histories['Colour']['KNN']['raw_train'] = train_loss_raw_knn_colour
loss_histories['Colour']['KNN']['raw_test'] = test_loss_raw_knn_colour
loss_histories['Colour']['KNN']['smogn_train'] = train_loss_smogn_knn_colour
loss_histories['Colour']['KNN']['smogn_test'] = test_loss_smogn_knn_colour

loss_histories['Colour']['ERT']['raw_train'] = train_loss_raw_et_colour
loss_histories['Colour']['ERT']['raw_test'] = test_loss_raw_et_colour
loss_histories['Colour']['ERT']['smogn_train'] = train_loss_smogn_et_colour
loss_histories['Colour']['ERT']['smogn_test'] = test_loss_smogn_et_colour

loss_histories['SR']['NN']['raw_train'] = train_loss_raw_nn_sr
loss_histories['SR']['NN']['raw_test'] = test_loss_raw_nn_sr
loss_histories['SR']['NN']['smogn_train'] = train_loss_smogn_nn_sr
loss_histories['SR']['NN']['smogn_test'] = test_loss_smogn_nn_sr

loss_histories['SR']['KNN']['raw_train'] = train_loss_raw_knn_sr
loss_histories['SR']['KNN']['raw_test'] = test_loss_raw_knn_sr
loss_histories['SR']['KNN']['smogn_train'] = train_loss_smogn_knn_sr
loss_histories['SR']['KNN']['smogn_test'] = test_loss_smogn_knn_sr

loss_histories['SR']['ERT']['raw_train'] = train_loss_raw_et_sr
loss_histories['SR']['ERT']['raw_test'] = test_loss_raw_et_sr
loss_histories['SR']['ERT']['smogn_train'] = train_loss_smogn_et_sr
loss_histories['SR']['ERT']['smogn_test'] = test_loss_smogn_et_sr

plot_loss_curves(loss_histories)

In [None]:
property_names = ['Stellar Mass', 'SFR', 'Radius', 'Colour']
model_names = ['NN', 'kNN', 'ERT']
model_colors = {
    'NN': 'dodgerblue',
    'kNN': 'deepskyblue',
    'ERT': 'peru'
}

# Metrics dictionaries - these work with the new training functions
metrics_raw = {
    'NN':   [metrics_raw_nn_sm, metrics_raw_nn_sfr, metrics_raw_nn_sr, metrics_raw_nn_colour],
    'kNN':  [metrics_raw_knn_sm, metrics_raw_knn_sfr, metrics_raw_knn_sr, metrics_raw_knn_colour],
    'ERT':  [metrics_raw_et_sm, metrics_raw_et_sfr, metrics_raw_et_sr, metrics_raw_et_colour]
}
metrics_smogn = {
    'NN':   [metrics_smogn_nn_sm, metrics_smogn_nn_sfr, metrics_smogn_nn_sr, metrics_smogn_nn_colour],
    'kNN':  [metrics_smogn_knn_sm, metrics_smogn_knn_sfr, metrics_smogn_knn_sr, metrics_smogn_knn_colour],
    'ERT':  [metrics_smogn_et_sm, metrics_smogn_et_sfr, metrics_smogn_et_sr, metrics_smogn_et_colour]
}

# Extract Pearson correlations
pearson_raw = np.array([[metrics_raw[model][i]['target_0']['Pearson_r'] for model in model_names] for i in range(4)])
pearson_smogn = np.array([[metrics_smogn[model][i]['target_0']['Pearson_r'] for model in model_names] for i in range(4)])

# Plot Pearson correlations
fig, axs = plt.subplots(1, 2, figsize=(10, 4), sharex=True)
for ax, pearson, title in zip(
    axs, [pearson_raw, pearson_smogn], 
    ['Pearson Correlation: Raw Models', 'Pearson Correlation: SMOGN Models']
):
    for j, model in enumerate(model_names):
        ax.plot(property_names, pearson[:, j], marker='s', markersize=8, 
                color=model_colors[model], label=model, linewidth=2.5)
    ax.set_ylabel('Pearson Correlation', fontsize=11)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.legend(loc='best', fontsize=10)
    ax.grid(True, ls='--', alpha=0.4)
    ax.set_ylim([pearson.min() - 0.05, 1.0])
axs[0].set_xlabel('Property', fontsize=11)
axs[1].set_xlabel('Property', fontsize=11)
plt.tight_layout()
plt.show()

# Extract K-S statistics
ks_raw = np.array([[metrics_raw[model][i]['target_0']['KS_statistic'] for model in model_names] for i in range(4)])
ks_smogn = np.array([[metrics_smogn[model][i]['target_0']['KS_statistic'] for model in model_names] for i in range(4)])

# Plot K-S test statistics
fig, axs = plt.subplots(2, 2, figsize=(10, 7), sharey=False)
for idx, prop in enumerate(property_names):
    ax = axs[idx // 2, idx % 2]
    ax.plot(model_names, ks_raw[idx], marker='o', markersize=8, 
            label='Raw', color='dodgerblue', linewidth=2.5)
    ax.plot(model_names, ks_smogn[idx], marker='s', markersize=8, 
            label='SMOGN', color='coral', linewidth=2.5)
    ax.set_title(f'K-S Test: {prop}', fontsize=12, fontweight='bold')
    ax.set_ylabel('KS Statistic (D)', fontsize=11)
    ax.set_xlabel('Model', fontsize=11)
    ax.legend(fontsize=10, loc='best')
    ax.grid(True, alpha=0.4, ls='--')
plt.tight_layout()
plt.show()

In [None]:
property_names = ['Stellar Mass', 'sSFR', 'Radius', 'Colour']

true_test = [y_test_sm.flatten(), y_test_sfr.flatten(), y_test_sr.flatten(), y_test_colour.flatten()]
pred_raw  = [y_raw_nn_sm.flatten(), y_raw_nn_sfr.flatten(), y_raw_nn_sr.flatten(), y_raw_nn_colour.flatten()]
pred_smogn = [y_smogn_nn_sm.flatten(), y_smogn_nn_sfr.flatten(), y_smogn_nn_sr.flatten(), y_smogn_nn_colour.flatten()]

fig, axes = plt.subplots(4, 2, figsize=(10, 12), sharex=False, sharey=False)
vmin, vmax = 0., 1.
for i, pname in enumerate(property_names):

    x = true_test[i]
    y_raw = pred_raw[i]
    y_smogn = pred_smogn[i]
    for j, (y_pred, title) in enumerate(zip([y_raw, y_smogn], ['Raw NN', 'SMOGN NN'])):
        ax = axes[i, j]
        # Density scatter plot
        hb = ax.hexbin(x, y_pred, gridsize=60, cmap='turbo', bins='log')
        # Diagonal reference
        ax.plot([x.min(), x.max()], [x.min(), x.max()], 'k--', lw=2, label='True x True')
        ax.set_xlim(x.min(), x.max())
        ax.set_ylim(x.min(), x.max())
        ax.set_title(f'{title}: {pname}')
        ax.set_xlabel(f'True {pname}')
        ax.set_ylabel(f'Predicted {pname}')
        if j == 1:
            # Color bar for the right column (once per row)
            cb = fig.colorbar(hb, ax=ax)
        ax.legend()

plt.tight_layout()
plt.show()