In [7]:
import sys, os
sys.path.append(os.path.abspath(".."))

import pandas as pd
from xgboost import XGBClassifier
from sklearn.feature_selection import RFE, chi2
from src.utils import N_selector
# Load processed data
X_train = pd.read_csv("../data/X_train_processed.csv")
X_test = pd.read_csv("../data/X_test_processed.csv")
y_train = pd.read_csv("../data/y_train.csv").squeeze()

# Binary target
y_train_bin = (y_train > 0).astype(int)
n_selec = N_selector(X_train, 0.95)
n_opt = n_selec.calc_n()

xgb_model = XGBClassifier(objective="binary:logistic", random_state=42 ) #objective is set for binary classification
xgb_model.fit(X_train, y_train)
rfe = RFE(xgb_model, n_features_to_select= n_opt)
rfe.fit(X_train, y_train)

feature_stats, p_vals = chi2(X_train, y_train)


rfe_selected_features = X_train.columns[rfe.support_]

chi2_results = pd.DataFrame({
    "Feature": X_train.columns,
    "Chi2_Stat": feature_stats,
    "p_value": p_vals
}).sort_values("p_value")

chi2_significant_features = chi2_results[chi2_results["p_value"] < 0.05]["Feature"]
print(rfe_selected_features)
print(chi2_significant_features)

#Finding the most relevant features according to bith rfe and chi2 
modelling_features = list(set(rfe_selected_features) & set(chi2_significant_features))

X_train_reduc = X_train[modelling_features]
X_test_reduc = X_test[modelling_features]

X_train_reduc.to_csv("../data/X_train_reduc.csv")
X_test_reduc.to_csv("../data/X_test_reduc.csv")


Index(['age', 'trestbps', 'chol', 'thalach', 'ca', 'sex_1', 'cp_3', 'cp_4',
       'restecg_2', 'exang_1', 'slope_2', 'thal_7.0'],
      dtype='object')
16    thal_7.0
12     exang_1
8         cp_4
4           ca
6         cp_2
13     slope_2
7         cp_3
5        sex_1
Name: Feature, dtype: object
