In [None]:
# Feature Selection
#his notebook performs feature selection on `ztf_image_search_results_full_standardized.csv` using multiple strategies: missingness filtering, low-variance filtering, correlation-based removal, supervised SelectKBest (mutual information) and RandomForest importance, and an L1-based selector when applicable.

#If a labeled target column (e.g., `label`, `class`, `target`, `type`) is not present, the notebook creates KMeans cluster labels as a proxy target to enable supervised-like feature ranking. Astronomy domain knowledge is used to ensure important astrophysical features (RA/Dec, flux/mag, SNR, seeing, airmass, filter) are preserved where present.

#Outputs:
#- `ztf_selected_features.csv`: cleaned dataset containing only selected features (and target if present).
#- `selected_feature_list.txt`: newline list of selected features.

In [2]:
# Imports and load standardized dataset
import os
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold, SelectKBest, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

DATA_PATH = 'ztf_image_search_results_full_standardized.csv'
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f'{DATA_PATH} not found. Run the preprocessing/standardization notebook first.')
df = pd.read_csv(DATA_PATH)
print('Loaded dataset shape:', df.shape)
# helper to detect likely target names and astrophysical important columns
possible_targets = [c for c in df.columns if c.lower() in ['label','class','target','type']]
astro_priority = [c for c in df.columns if any(k in c.lower() for k in ['ra','dec','flux','mag','snr','seeing','airmass','filter','band','jd','obsdate','mjd','maglimit'])]
print('Detected possible targets:', possible_targets)
print('Astrophysical priority columns found:', astro_priority)
df.shape

Loaded dataset shape: (62368, 42)
Detected possible targets: []
Astrophysical priority columns found: ['ra', 'dec', 'filtercode', 'obsdate', 'obsjd', 'filefracday', 'seeing', 'airmass', 'maglimit', 'ra1', 'dec1', 'ra2', 'dec2', 'ra3', 'dec3', 'ra4', 'dec4']


(62368, 42)

In [3]:
# 1) Drop features with >90% missing values
thresh = 0.1  # keep columns with at least 10% non-missing
missing_frac = df.isnull().mean()
cols_keep = missing_frac[missing_frac <= (1 - thresh)].index.tolist()
dropped_missing = [c for c in df.columns if c not in cols_keep]
print(f'Dropping {len(dropped_missing)} columns with >90% missing: ', dropped_missing)
df = df[cols_keep].copy()
print('Shape after missingness drop:', df.shape)

Dropping 0 columns with >90% missing:  []
Shape after missingness drop: (62368, 42)


In [4]:
# 2) Basic imputation: numeric -> median, categorical -> mode
num_cols = df.select_dtypes(include=['number']).columns.tolist()
cat_cols = df.select_dtypes(include=['object','category']).columns.tolist()
for c in num_cols:
    if df[c].isnull().any():
        df[c] = df[c].fillna(df[c].median())
for c in cat_cols:
    if df[c].isnull().any():
        df[c] = df[c].fillna(df[c].mode().iloc[0] if not df[c].mode().empty else 'missing')
print('After imputation, missing per column (top 10):')
print(df.isnull().sum().sort_values(ascending=False).head(10))

After imputation, missing per column (top 10):
ra            0
dec           0
infobits      0
field         0
ccdid         0
qid           0
rcid          0
fid           0
filtercode    0
pid           0
dtype: int64


In [5]:
# 3) Low variance filter (remove near-constant features)
from sklearn.feature_selection import VarianceThreshold
num_df = df.select_dtypes(include=['number']).copy()
if num_df.shape[1] > 0:
    selector_var = VarianceThreshold(threshold=1e-5)
    selector_var.fit(num_df)
    keep_mask = selector_var.get_support()
    lowvar_removed = [col for i,col in enumerate(num_df.columns) if not keep_mask[i]]
    print('Low-variance removed:', lowvar_removed)
    num_df = num_df.loc[:, keep_mask]
    # rebuild df with remaining numeric cols + categorical cols
    df = pd.concat([num_df.reset_index(drop=True), df[cat_cols].reset_index(drop=True)], axis=1)
    print('Shape after low-variance filter:', df.shape)
else:
    print('No numeric columns for variance filtering')

Low-variance removed: ['field', 'itid', 'moonesb', 'crpix1', 'crpix2']
Shape after low-variance filter: (62368, 37)


In [6]:
# 4) Correlation-based removal: remove one of each highly-correlated pair (r>0.95)
num_df = df.select_dtypes(include=['number']).copy()
corr_matrix = num_df.corr().abs()
upper = corr_matrix.where(np.triu(np.ones_like(corr_matrix), k=1).astype(bool))
to_drop_corr = [column for column in upper.columns if any(upper[column] > 0.95)]
print('Correlation-based drop count:', len(to_drop_corr))
num_df = num_df.drop(columns=to_drop_corr)
df = pd.concat([num_df.reset_index(drop=True), df[cat_cols].reset_index(drop=True)], axis=1)
print('Shape after correlation pruning:', df.shape)

Correlation-based drop count: 18
Shape after correlation pruning: (62368, 19)


In [7]:
# 5) Prepare X, y. If no target exists, create KMeans cluster labels as proxy target
possible_targets = [c for c in df.columns if c.lower() in ['label','class','target','type']]
target_col = possible_targets[0] if possible_targets else None
if target_col and target_col in df.columns:
    y = df[target_col].copy()
    if y.dtype == 'object' or y.dtype.name == 'category':
        le = LabelEncoder()
        y = le.fit_transform(y.astype(str))
    X = df.drop(columns=[target_col])
    print('Using provided target column:', target_col)
else:
    print('No labeled target found; creating KMeans-based proxy labels')
    X = df.copy()
    X_num = X.select_dtypes(include=['number']).fillna(0)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_num)
    kmeans = KMeans(n_clusters=3, random_state=42)
    y = kmeans.fit_predict(X_scaled)
print('X shape, y length:', X.shape, len(y))

No labeled target found; creating KMeans-based proxy labels
X shape, y length: (62368, 19) 62368


In [8]:
# 6) Supervised/Proxy selection methods\n# 6a) SelectKBest with mutual_info_classif (works with discrete y)
num_cols = X.select_dtypes(include=['number']).columns.tolist()
k = min(20, max(1, len(num_cols)))
print('Running SelectKBest mutual_info (k=', k, ') on numeric features')
skb_selected = []
if len(num_cols) > 0:
    skb = SelectKBest(score_func=mutual_info_classif, k=k)
    X_num = X[num_cols].fillna(0)
    try:
        skb.fit(X_num, y)
        skb_selected = [f for f, s in zip(num_cols, skb.get_support()) if s]
        print('SelectKBest selected:', skb_selected)
    except Exception as e:
        print('SelectKBest failed:', e)
else:
    print('No numeric features for SelectKBest')

# 6b) RandomForest feature importance
rf_selected = []
try:
    if len(num_cols) > 0:
        rf = RandomForestClassifier(n_estimators=200, random_state=42, n_jobs=-1)
        rf.fit(X_num, y)
        importances = pd.Series(rf.feature_importances_, index=num_cols).sort_values(ascending=False)
        rf_selected = importances.head(k).index.tolist()
        print('RandomForest top features:', rf_selected)
    else:
        print('No numeric features for RandomForest')
except Exception as e:
    print('RandomForest failed:', e)

# 6c) L1-based selection (LogisticRegression with L1) - only for classification targets
l1_selected = []
try:
    if len(num_cols) > 0:
        lr = LogisticRegression(penalty='l1', solver='saga', max_iter=5000, random_state=42)
        lr.fit(X_num, y)
        coef = np.abs(lr.coef_).sum(axis=0) if lr.coef_.ndim > 1 else np.abs(lr.coef_)
        coef_series = pd.Series(coef, index=num_cols).sort_values(ascending=False)
        l1_selected = coef_series[coef_series > 1e-6].index.tolist()
        print('L1-selected features (non-zero):', l1_selected[:k])
    else:
        print('No numeric features for L1 selection')
except Exception as e:
    print('L1 selection failed:', e)

# 6d) PCA loadings: features with largest absolute loadings on first components
from sklearn.decomposition import PCA
pca_selected = []
try:
    if len(num_cols) > 0:
        pca = PCA(n_components=min(6, len(num_cols)))
        Xp = pca.fit_transform(X_num)
        loadings = np.abs(pca.components_).sum(axis=0)
        loadings_series = pd.Series(loadings, index=num_cols).sort_values(ascending=False)
        pca_selected = loadings_series.head(k).index.tolist()
        print('PCA top features:', pca_selected)
    else:
        print('No numeric features for PCA')
except Exception as e:
    print('PCA failed:', e)

# Consolidate selections into a ranking count
from collections import Counter
all_methods = [tuple(skb_selected), tuple(rf_selected), tuple(l1_selected), tuple(pca_selected)]
flat = [f for method in all_methods for f in method]
counts = Counter(flat)
ranked = [f for f, _ in counts.most_common()]
print('Ranked features by method votes (top 30):', ranked[:30])

Running SelectKBest mutual_info (k= 14 ) on numeric features
SelectKBest selected: ['ra', 'dec', 'infobits', 'qid', 'fid', 'pid', 'exptime', 'seeing', 'airmass', 'moonillf', 'maglimit', 'cd11', 'cd22', 'ipac_gid']
RandomForest top features: ['ipac_gid', 'cd11', 'cd22', 'pid', 'fid', 'airmass', 'maglimit', 'ra', 'moonillf', 'dec', 'seeing', 'exptime', 'infobits', 'qid']
L1-selected features (non-zero): []
PCA top features: ['infobits', 'qid', 'exptime', 'cd22', 'airmass', 'pid', 'maglimit', 'moonillf', 'cd11', 'seeing', 'fid', 'ipac_gid', 'ra', 'dec']
Ranked features by method votes (top 30): ['ra', 'dec', 'infobits', 'qid', 'fid', 'pid', 'exptime', 'seeing', 'airmass', 'moonillf', 'maglimit', 'cd11', 'cd22', 'ipac_gid']


In [9]:
# 7) Apply astronomy domain knowledge: ensure astrophysical features are kept if present
priority = [c for c in df.columns if any(k in c.lower() for k in ['ra','dec','flux','mag','snr','seeing','airmass','maglimit','filter','band'])]
print('Priority features to preserve (if present):', priority)
# Final selection strategy: take features selected by at least two methods OR in priority list. Limit to 30 features max.
selected_set = set()
for f, cnt in counts.items():
    if cnt >= 2:
        selected_set.add(f)
# add priority features
for p in priority:
    if p in df.columns:
        selected_set.add(p)
# If selection is empty (edge cases), fall back to top RF features or top PCA
if len(selected_set) == 0:
    selected_set.update(rf_selected[:min(20, len(rf_selected))])
selected_list = [f for f in ranked if f in selected_set]
# append any priority features not in ranked at the end
for p in priority:
    if p in df.columns and p not in selected_list:
        selected_list.append(p)
# limit to 30
selected_list = selected_list[:30]
print('Final selected features (count={}):'.format(len(selected_list)), selected_list)

Priority features to preserve (if present): ['ra', 'dec', 'seeing', 'airmass', 'maglimit', 'filtercode']
Final selected features (count=15): ['ra', 'dec', 'infobits', 'qid', 'fid', 'pid', 'exptime', 'seeing', 'airmass', 'moonillf', 'maglimit', 'cd11', 'cd22', 'ipac_gid', 'filtercode']


In [10]:
# 8) Save selected features to CSV and a feature list text file
out_csv = 'ztf_selected_features.csv'
out_list = 'selected_feature_list.txt'

# Make a copy of selected features
keep_cols = selected_list.copy()

# Remove unwanted columns
cols_to_remove = ['pid', 'filtercode']
keep_cols = [c for c in keep_cols if c not in cols_to_remove]

# Keep target if present
if target_col and target_col in df.columns:
    keep_cols = [target_col] + keep_cols

# Ensure columns exist in df
keep_cols = [c for c in keep_cols if c in df.columns]

if len(keep_cols) == 0:
    raise RuntimeError('No features selected â€” check earlier steps')

# Save CSV with only selected features (and target if present)
df[keep_cols].to_csv(out_csv, index=False)

# Save feature list (without target)
with open(out_list, 'w') as fh:
    for c in keep_cols:
        if c != target_col:        # avoid writing the target twice
            fh.write(c + '\n')

print('Saved selected features CSV ->', out_csv)
print('Saved feature list ->', out_list)
print('Example preview:')
df[keep_cols].head()


Saved selected features CSV -> ztf_selected_features.csv
Saved feature list -> selected_feature_list.txt
Example preview:


Unnamed: 0,ra,dec,infobits,qid,fid,exptime,seeing,airmass,moonillf,maglimit,cd11,cd22,ipac_gid
0,-1.149415,1.53096,3.271612,1,2,30,-0.116146,-0.220519,0.19519,-1.552305,1.141877,0.455582,2
1,-1.54184,1.124622,-0.292168,3,2,30,2.021751,-0.192858,-0.135013,0.080815,1.205201,1.795089,2
2,-1.550309,1.524426,-0.292168,2,2,30,-0.320594,-0.712885,-0.093118,0.428065,2.258968,1.853579,2
3,-1.542016,0.238707,-0.292168,3,2,30,-0.971814,-0.718418,-1.855228,-0.023688,0.736894,1.647814,3
4,-1.541997,0.237082,-0.292168,3,1,30,2.016056,-0.707353,-2.049663,-2.488164,0.626096,1.476595,1


**Notes & next steps:**
- Review `selected_feature_list.txt` and confirm domain relevance (I preserved RA/Dec/flux/SNR/seeing/airmass where available).
- For final modeling, re-run feature selection strictly on the training split (to avoid leakage), or save the pipeline that performs the same filtering steps.
- If you have a true labeled target, re-run the supervised parts (SelectKBest, RF, L1) using that target.