In [1]:
import os
import random
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold


In [2]:
def set_seed(seed: int):
    """Seed the random sources this notebook relies on.

    Args:
        seed: Value handed to Python's ``random`` module, to NumPy's global
            generator, and written to the ``PYTHONHASHSEED`` environment
            variable (as a string).
    """
    # Python's built-in generator first, then NumPy's global generator
    # (the original author notes it is the generator scikit-learn uses).
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

    # Operating-system level setting.
    # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
    # presumably only influences processes launched afterwards — confirm.
    os.environ["PYTHONHASHSEED"] = str(seed)

set_seed(25)

In [3]:
# Read the hippocampus radiomics tables (train and test) from CSV.
train_csv_path = "../sbsppdaa24/train_radiomics_hipocamp.csv"
test_csv_path = "../sbsppdaa24/test_radiomics_hipocamp.csv"

radi = pd.read_csv(train_csv_path)
radi_test = pd.read_csv(test_csv_path)


In [4]:
# Step 1: remove the identifier columns.
identifier_columns = ["Mask", "ID", "Image"]
radi.drop(columns=identifier_columns, inplace=True)

# Step 2: remove every object-typed column except the 'Transition' target.
# `columns_to_drop` is reused later when the test set is prepared.
columns_to_drop = [
    name
    for name, dtype in radi.dtypes.items()
    if name != 'Transition' and dtype == 'object'
]
radi.drop(columns=columns_to_drop, inplace=True)
print(f"Dropped {len(columns_to_drop)} non-numeric columns.")

# Step 3: remove constant columns (exactly one distinct non-null value).
# `same_value_cols` is also reused for the test set.
distinct_counts = radi.nunique()
same_value_cols = distinct_counts.index[distinct_counts == 1].tolist()
radi.drop(columns=same_value_cols, inplace=True)
print(f"Dropped {len(same_value_cols)} columns with the same value for every entry.")

# Step 4: MinMax-scale the float and integer columns. The fitted `scaler` and
# the `float_cols` index are kept so the test set can be transformed identically.
float_cols = radi.select_dtypes(include=['float','int']).columns
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(radi[float_cols])
radi[float_cols] = scaled_values


Dropped 16 non-numeric columns.
Dropped 148 columns with the same value for every entry.


In [5]:
# Display the first rows of `radi` to inspect the preprocessed training frame.
radi.head()

Unnamed: 0,diagnostics_Image-original_Mean,diagnostics_Image-original_Maximum,diagnostics_Mask-original_VoxelNum,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,...,lbp-3D-k_glszm_ZonePercentage,lbp-3D-k_glszm_ZoneVariance,lbp-3D-k_ngtdm_Busyness,lbp-3D-k_ngtdm_Coarseness,lbp-3D-k_ngtdm_Complexity,lbp-3D-k_ngtdm_Contrast,lbp-3D-k_ngtdm_Strength,Sex,Age,Transition
0,0.468027,0.393701,0.598392,0.555087,0.455539,0.39272,0.28956,0.31062,0.25494,0.562431,...,0.184251,0.633393,0.646173,0.125643,0.735632,0.658619,0.127238,0.0,0.610644,CN-CN
1,0.281221,0.488189,0.713552,0.58897,0.397306,0.338081,0.311579,0.359443,0.335667,0.61962,...,0.651481,0.323882,0.458297,0.195642,0.492265,0.37379,0.198729,1.0,0.787115,CN-CN
2,0.76498,0.212598,0.530116,0.727218,0.876712,1.0,0.431089,0.622046,0.58396,0.932585,...,0.49244,0.258168,0.679235,0.134842,0.783349,0.776907,0.132002,1.0,0.728291,AD-AD
3,0.651078,0.338583,0.799452,0.547963,0.355153,0.372946,0.453707,0.55516,0.498358,0.760667,...,0.569706,0.433476,0.511718,0.154368,0.506896,0.375681,0.156912,1.0,0.680672,CN-MCI
4,0.382225,0.173228,0.406742,0.504096,0.080905,0.07806,0.545945,0.447539,0.424594,0.566272,...,0.450216,0.239867,0.441321,0.243251,0.680762,0.556234,0.249892,0.0,0.694678,CN-CN


In [None]:
# Split the training frame into the target ('Transition') and the predictors.
y = radi["Transition"]
X = radi.drop(columns=["Transition"])

# Recursive feature elimination with cross-validation:
#   - random forest as the ranking estimator,
#   - 5-fold stratified, shuffled CV with a fixed seed,
#   - one feature removed per step, scored by macro-averaged F1,
#   - never going below `min_features_to_select` features.
min_features_to_select = 10
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=25)
clf = RandomForestClassifier(random_state=25)

rfecv = RFECV(
    estimator=clf,
    min_features_to_select=min_features_to_select,
    step=1,
    scoring="f1_macro",
    cv=cv,
    n_jobs=-1,
).fit(X, y)

print(f"Optimal number of features: {rfecv.n_features_}")

# One row per feature with its RFECV rank, ordered by rank (1 = selected).
feature_rankings = (
    pd.DataFrame({"Feature": X.columns, "Rank": rfecv.ranking_})
    .sort_values(by="Rank", ascending=True)
)

print("Feature rankings (1 = selected):")
print(feature_rankings)

In [None]:
# Keep every feature whose RFECV rank is at most `max_rank`.
# NOTE(review): the earlier comments and messages in this cell said "top 3"
# while the code filtered on rank <= 50. The threshold is left at 50 and the
# text now reports what is actually kept — confirm 50 is the intended cut-off.
max_rank = 50

top_features = feature_rankings[feature_rankings["Rank"] <= max_rank]

print(f"Features with rank <= {max_rank} ({len(top_features)} in total):")
print(top_features)

# Restrict the training frame to the retained features plus the target column.
top_feature_names = top_features["Feature"].tolist()
radi_top = radi[top_feature_names + ["Transition"]]

print(f"DataFrame restricted to the {len(top_feature_names)} features with rank <= {max_rank}:")
radi_top.head()

In [None]:
# Reload the test set and strip, in order:
#   1. the identifier columns,
#   2. the non-numeric columns identified on the training data,
#   3. the constant columns identified on the training data.
# Labels missing from the test file are skipped (errors='ignore').
radi_test = (
    pd.read_csv("../sbsppdaa24/test_radiomics_hipocamp.csv")
    .drop(columns=["Mask", "ID", "Image"], errors='ignore')
    .drop(columns=columns_to_drop, errors='ignore')
    .drop(columns=same_value_cols, errors='ignore')
)

# Rescale with the scaler already fitted on the training data (no refit here).
test_scaled_values = scaler.transform(radi_test[float_cols])
radi_test[float_cols] = test_scaled_values

# Keep only the selected top-ranked features (no target column is selected here).
radi_test_top = radi_test[top_feature_names]

print(f"Processed test dataset with top features:")
radi_test_top.head()


In [None]:
# Display the first rows of `radi_top` (selected features plus 'Transition').
radi_top.head()

In [None]:
# Write the reduced train and test frames to CSV, without the index column.
export_targets = (
    (radi_top, "train_full_prep5.csv"),
    (radi_test_top, "test_processed_prep5.csv"),
)
for frame, csv_name in export_targets:
    frame.to_csv(csv_name, index=False)