## Import packages

In [10]:
import pandas as pd 
from mrmr import mrmr_classif
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from collections import Counter

## M_6weeks

In [11]:
df = pd.read_csv("../data/dataset_for_training.csv")
y = df[["y"]]
X = df.drop(["y"], axis=1)
cats = list(X.select_dtypes(include=["object"]))
conts = list(X.select_dtypes(include=["float", "int"]))

In [12]:
scaler = RobustScaler()
imputer = SimpleImputer(strategy="median")
scalesteps = [("cont_imputer", imputer), ("continuous", scaler)]
scale_pipe = Pipeline(steps=scalesteps)
cat_pipe = Pipeline(
    steps=[
        (
            "categorical",
            OneHotEncoder(handle_unknown="ignore", drop="if_binary"),
        )
    ]
)
preprocess = ColumnTransformer(
            transformers=[
                ("scal", scale_pipe, conts),
                ("cat", cat_pipe, cats),
            ],
            remainder="passthrough",
        )

In [13]:
best_features = []
for i in range(100):
    X_tr, X_test, y_tr, y_test = train_test_split(X,y, test_size=0.2, random_state=i, stratify=y)
    X_tr = preprocess.fit_transform(X_tr)
    X_test = preprocess.transform(X_test)
    columns = preprocess.get_feature_names_out()
    indices = y_tr.index
    X_tr = pd.DataFrame(data=X_tr, columns=columns, index=indices)
    selected_features = mrmr_classif(X_tr,y_tr, K= 12)
    for feature in selected_features:
        best_features.append(feature)


100%|██████████| 12/12 [00:00<00:00, 17.98it/s]
100%|██████████| 12/12 [00:00<00:00, 16.66it/s]
100%|██████████| 12/12 [00:00<00:00, 17.30it/s]
100%|██████████| 12/12 [00:00<00:00, 19.68it/s]
100%|██████████| 12/12 [00:00<00:00, 28.11it/s]
100%|██████████| 12/12 [00:00<00:00, 29.74it/s]
100%|██████████| 12/12 [00:00<00:00, 33.80it/s]
100%|██████████| 12/12 [00:00<00:00, 24.12it/s]
100%|██████████| 12/12 [00:00<00:00, 17.51it/s]
100%|██████████| 12/12 [00:00<00:00, 23.05it/s]
100%|██████████| 12/12 [00:00<00:00, 26.79it/s]
100%|██████████| 12/12 [00:00<00:00, 18.94it/s]
100%|██████████| 12/12 [00:00<00:00, 24.69it/s]
100%|██████████| 12/12 [00:00<00:00, 21.98it/s]
100%|██████████| 12/12 [00:00<00:00, 26.19it/s]
100%|██████████| 12/12 [00:00<00:00, 27.07it/s]
100%|██████████| 12/12 [00:00<00:00, 16.38it/s]
100%|██████████| 12/12 [00:00<00:00, 15.25it/s]
100%|██████████| 12/12 [00:00<00:00, 18.23it/s]
100%|██████████| 12/12 [00:00<00:00, 21.51it/s]
100%|██████████| 12/12 [00:00<00:00, 23.

In [14]:
counter_dictionary = dict(Counter(best_features))
counter_dictionary

{'scal__who_score_at_diagnosis': 100,
 'cat__radio_6weeks_yes': 48,
 'scal__diff_diag_baseline_in_weeks': 100,
 'scal__bmi': 99,
 'scal__LDH': 100,
 'cat__immuno_6weeks_yes': 100,
 'scal__leukocytes': 100,
 'scal__calcium': 16,
 'cat__CKD_yes': 69,
 'scal__CRP': 98,
 'cat__chemo_6weeks_yes': 99,
 'scal__hemoglobin': 92,
 'cat__stage_IV': 33,
 'cat__CPD_yes': 19,
 'cat__mutation_yes': 17,
 'cat__smoking_status_Ex smoker': 12,
 'scal__CKD_EPI': 35,
 'scal__thrombocytes': 17,
 'cat__CVA_yes': 4,
 'cat__Heart_diseases_yes': 10,
 'scal__age': 4,
 'cat__Liver_yes': 5,
 'cat__Cancer_yes': 6,
 'cat__PVD_yes': 3,
 'cat__targ_6weeks_yes': 5,
 'cat__smoking_status_Smoker': 2,
 'cat__smoking_status_Unknown': 4,
 'cat__smoking_status_No smoker': 1,
 'cat__uses_strong_opioids_yes': 2}