In [12]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_classif
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Feature-selection script: impute -> drop low-variance genes -> scale ->
# keep the K genes with the highest ANOVA F-score -> save the reduced table.

# Load preprocessed dataset
data_path = r"C:\Users\sanja\4.Leukemia Subtype Prediction Using Gene Expression\4.Leukemia_Subtype_Prediction\data\leukemia_golub99_preprocessed.csv"
df = pd.read_csv(data_path)

# Target and features: every column except the label is treated as a feature.
target = "Subtype"
X = df.drop(columns=[target])
y = df[target]

# 1. Handle missing values by replacing them with the column mean.
imputer = SimpleImputer(strategy="mean")
X_imputed = imputer.fit_transform(X)
# With default settings SimpleImputer discards columns that have no observed
# values (their fitted statistic is NaN), so the imputed matrix can be narrower
# than X. Take the surviving names from the fitted statistics rather than
# assuming they still line up with X.columns.
imputed_features = X.columns[~np.isnan(imputer.statistics_)]

# 2. Remove low-variance features BEFORE scaling.
# StandardScaler gives every non-constant column unit variance, so a variance
# threshold applied after scaling can only ever drop constant columns. Filtering
# on the imputed (unscaled) values lets the threshold act on the real spread.
var_thresh = VarianceThreshold(threshold=0.01)
X_var = var_thresh.fit_transform(X_imputed)
selected_features_var = imputed_features[var_thresh.get_support()]

print(f" Features after Variance Threshold: {len(selected_features_var)}")

# 3. Scale the retained features to zero mean / unit variance.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_var)

# 4. Select the top K features by ANOVA F-score.
# NOTE(review): the selector is fitted on every row and label in the file. If
# the reduced dataset is later split for model evaluation, the selection has
# already seen the held-out labels; consider fitting it on training data only.
K = 10
# SelectKBest raises if k exceeds the number of available features, so clamp it.
n_select = min(K, X_scaled.shape[1])
selector = SelectKBest(score_func=f_classif, k=n_select)
X_kbest = selector.fit_transform(X_scaled, y)

selected_features_kbest = selected_features_var[selector.get_support()]

print(f" Top {len(selected_features_kbest)} selected features: {list(selected_features_kbest)}")

# 5. Create reduced DataFrame (selected, scaled features plus the label column).
X_selected_df = pd.DataFrame(X_kbest, columns=selected_features_kbest)
# Reset the label index so it aligns row-for-row with the fresh RangeIndex above.
final_df = pd.concat([X_selected_df, y.reset_index(drop=True)], axis=1)

# 6. Save reduced dataset
output_path = r"C:\Users\sanja\4.Leukemia Subtype Prediction Using Gene Expression\4.Leukemia_Subtype_Prediction\data\leukemia_golub99_top10_features.csv"
final_df.to_csv(output_path, index=False)

print(f" Reduced dataset saved to: {output_path}")


 Features after Variance Threshold: 3051
 Top 10 selected features: ['D63391_at', 'J04027_at', 'L01087_at', 'S50223_at', 'U46499_at', 'X68688_rna1_s_at', 'M31551_s_at', 'M20203_s_at', 'M26692_s_at', 'X56687_s_at']
 Reduced dataset saved to: C:\Users\sanja\4.Leukemia Subtype Prediction Using Gene Expression\4.Leukemia_Subtype_Prediction\data\leukemia_golub99_top10_features.csv
