In [1]:
import os
import random
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold


In [2]:
def set_seed(seed: int):
    """Seed the random number sources this notebook relies on.

    Args:
        seed: Integer used for Python's ``random`` module, NumPy's legacy
            global generator, and the ``PYTHONHASHSEED`` environment variable.
    """
    # Operating-system level setting.
    # NOTE(review): hash randomization is presumably fixed at interpreter
    # startup, so this likely only affects processes spawned later - confirm.
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Python's generator first, then NumPy's global generator (per the
    # original author's note, the one scikit-learn uses).
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


set_seed(25)

In [3]:
# Read the training and test hippocampus radiomics tables (train first, test second).
radi, radi_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../sbsppdaa24/train_radiomics_hipocamp.csv",
        "../sbsppdaa24/test_radiomics_hipocamp.csv",
    )
)


In [4]:
# Remove the per-row identifier columns.
radi = radi.drop(columns=["Mask", "ID", "Image"])

# Collect every object-typed column other than the 'Transition' label and remove it.
columns_to_drop = [
    name
    for name, dtype in radi.dtypes.items()
    if name != "Transition" and dtype == "object"
]
radi = radi.drop(columns=columns_to_drop)
print(f"Dropped {len(columns_to_drop)} non-numeric columns.")

# Remove columns holding a single distinct value (nunique ignores NaN by default).
same_value_cols = radi.columns[(radi.nunique() == 1).to_numpy()].tolist()
radi = radi.drop(columns=same_value_cols)
print(f"Dropped {len(same_value_cols)} columns with the same value for every entry.")

# Min-max scale the numeric columns. Despite its name, `float_cols` also
# includes integer columns. The fitted scaler and the column list are reused
# later to transform the test set.
float_cols = radi.select_dtypes(include=['float','int']).columns
scaler = MinMaxScaler().fit(radi[float_cols])
radi[float_cols] = scaler.transform(radi[float_cols])


Dropped 16 non-numeric columns.
Dropped 148 columns with the same value for every entry.


In [5]:
# Preview the first five rows of the preprocessed training frame.
radi.head(n=5)

Unnamed: 0,diagnostics_Image-original_Mean,diagnostics_Image-original_Maximum,diagnostics_Mask-original_VoxelNum,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,...,lbp-3D-k_glszm_ZonePercentage,lbp-3D-k_glszm_ZoneVariance,lbp-3D-k_ngtdm_Busyness,lbp-3D-k_ngtdm_Coarseness,lbp-3D-k_ngtdm_Complexity,lbp-3D-k_ngtdm_Contrast,lbp-3D-k_ngtdm_Strength,Sex,Age,Transition
0,0.468027,0.393701,0.598392,0.555087,0.455539,0.39272,0.28956,0.31062,0.25494,0.562431,...,0.184251,0.633393,0.646173,0.125643,0.735632,0.658619,0.127238,0.0,0.610644,CN-CN
1,0.281221,0.488189,0.713552,0.58897,0.397306,0.338081,0.311579,0.359443,0.335667,0.61962,...,0.651481,0.323882,0.458297,0.195642,0.492265,0.37379,0.198729,1.0,0.787115,CN-CN
2,0.76498,0.212598,0.530116,0.727218,0.876712,1.0,0.431089,0.622046,0.58396,0.932585,...,0.49244,0.258168,0.679235,0.134842,0.783349,0.776907,0.132002,1.0,0.728291,AD-AD
3,0.651078,0.338583,0.799452,0.547963,0.355153,0.372946,0.453707,0.55516,0.498358,0.760667,...,0.569706,0.433476,0.511718,0.154368,0.506896,0.375681,0.156912,1.0,0.680672,CN-MCI
4,0.382225,0.173228,0.406742,0.504096,0.080905,0.07806,0.545945,0.447539,0.424594,0.566272,...,0.450216,0.239867,0.441321,0.243251,0.680762,0.556234,0.249892,0.0,0.694678,CN-CN


In [6]:
# Split the training frame into the label and the predictor matrix.
y = radi["Transition"]
X = radi.drop(columns=["Transition"])

# Recursive feature elimination with cross-validation: a random forest is the
# ranking estimator, scored with macro F1 over 5 shuffled stratified folds.
min_features_to_select = 10
clf = RandomForestClassifier(random_state=25)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=25)

rfecv = RFECV(
    estimator=clf,
    step=1,
    min_features_to_select=min_features_to_select,
    cv=cv,
    scoring="f1_macro",
    n_jobs=-1,
).fit(X, y)

print(f"Optimal number of features: {rfecv.n_features_}")

# Pair each predictor with its rank and order by rank (1 = selected).
feature_rankings = pd.DataFrame(
    {"Feature": X.columns, "Rank": rfecv.ranking_}
).sort_values("Rank")

print("Feature rankings (1 = selected):")
print(feature_rankings)

Optimal number of features: 792
Feature rankings (1 = selected):
                                                Feature  Rank
2012                                                Age     1
709                         wavelet-HHH_glcm_SumSquares     1
710                  wavelet-HHH_gldm_DependenceEntropy     1
711            wavelet-HHH_gldm_DependenceNonUniformity     1
713                 wavelet-HHH_gldm_DependenceVariance     1
...                                                 ...   ...
1006  log-sigma-2-0-mm-3D_glrlm_HighGrayLevelRunEmph...  1218
864              log-sigma-1-0-mm-3D_firstorder_Minimum  1219
1057            log-sigma-3-0-mm-3D_firstorder_Variance  1220
1538                     logarithm_glcm_InverseVariance  1221
271                       wavelet-LHL_glrlm_RunVariance  1222

[2013 rows x 2 columns]


In [7]:
# Keep every feature whose RFECV rank is at most `max_rank`.
# All features selected by RFECV share rank 1, so this cutoff keeps the whole
# selected set plus the features eliminated last - it is NOT a "top N features"
# filter, and the resulting count is normally much larger than `max_rank`.
max_rank = 50
top_features = feature_rankings[feature_rankings["Rank"] <= max_rank]

print("Top ranked features:")
print(top_features)

# Restrict the training frame to those features plus the 'Transition' label.
top_feature_names = top_features["Feature"].tolist()
radi_top = radi[top_feature_names + ["Transition"]]

# Report the real cutoff and feature count instead of a hard-coded "top 3" label.
print(f"DataFrame with only the {len(top_feature_names)} features ranked <= {max_rank}:")
radi_top.head()

Top ranked features:
                                                Feature  Rank
2012                                                Age     1
709                         wavelet-HHH_glcm_SumSquares     1
710                  wavelet-HHH_gldm_DependenceEntropy     1
711            wavelet-HHH_gldm_DependenceNonUniformity     1
713                 wavelet-HHH_gldm_DependenceVariance     1
...                                                 ...   ...
470   wavelet-HLL_glszm_SizeZoneNonUniformityNormalized    46
797                       wavelet-LLL_glcm_JointEntropy    47
1368                square_gldm_LargeDependenceEmphasis    48
685                     wavelet-HHH_firstorder_Variance    49
1901        lbp-3D-m2_glrlm_LongRunLowGrayLevelEmphasis    50

[841 rows x 2 columns]
DataFrame with only top 3 ranked features:


Unnamed: 0,Age,wavelet-HHH_glcm_SumSquares,wavelet-HHH_gldm_DependenceEntropy,wavelet-HHH_gldm_DependenceNonUniformity,wavelet-HHH_gldm_DependenceVariance,wavelet-HHH_gldm_GrayLevelVariance,wavelet-HHH_gldm_HighGrayLevelEmphasis,wavelet-HHH_gldm_LargeDependenceEmphasis,wavelet-HHH_gldm_LowGrayLevelEmphasis,wavelet-HHH_gldm_SmallDependenceEmphasis,...,log-sigma-5-0-mm-3D_glcm_SumSquares,wavelet-HLH_glcm_Idn,logarithm_firstorder_Variance,wavelet-HLH_glrlm_LongRunHighGrayLevelEmphasis,wavelet-HLL_glszm_SizeZoneNonUniformityNormalized,wavelet-LLL_glcm_JointEntropy,square_gldm_LargeDependenceEmphasis,wavelet-HHH_firstorder_Variance,lbp-3D-m2_glrlm_LongRunLowGrayLevelEmphasis,Transition
0,0.610644,0.610901,0.480943,0.580378,0.500774,0.614496,0.802767,0.883296,0.197233,0.181651,...,0.463114,0.070429,0.098761,0.114143,0.234412,0.382456,0.401727,0.118693,0.588281,CN-CN
1,0.787115,0.997724,0.149889,0.735681,0.130872,0.994018,0.519986,0.731747,0.480014,0.099057,...,0.338657,0.021656,0.064365,0.083615,0.455309,0.221619,0.261642,0.165755,0.910061,CN-CN
2,0.728291,0.969676,0.615477,0.478993,0.526055,0.977648,0.401968,0.325272,0.598032,0.553046,...,0.939358,0.131325,1.0,0.063101,0.230525,0.950604,0.306903,0.371362,0.167578,AD-AD
3,0.680672,0.993088,0.209982,0.808283,0.224884,0.982898,0.411712,0.778498,0.588288,0.14226,...,0.611031,0.045084,0.052615,0.089171,0.370317,0.305336,0.811043,0.250621,0.701622,CN-MCI
4,0.694678,0.973374,0.260323,0.43002,0.229915,0.959592,0.37517,0.616315,0.62483,0.235611,...,0.616082,0.069653,0.015879,0.070012,0.6155,0.101869,0.948325,0.039205,0.431275,CN-CN


In [11]:
# Reload the test set and remove, in turn: the identifier columns, the
# non-numeric columns found in the training data, and the constant columns
# found in the training data. Names absent from the test frame are skipped.
radi_test = (
    pd.read_csv("../sbsppdaa24/test_radiomics_hipocamp.csv")
    .drop(columns=["Mask", "ID", "Image"], errors='ignore')
    .drop(columns=columns_to_drop, errors='ignore')
    .drop(columns=same_value_cols, errors='ignore')
)

# Scale with the scaler already fitted on the training data (no refit here).
scaled_test_values = scaler.transform(radi_test[float_cols])
radi_test[float_cols] = scaled_test_values

# Keep only the selected top-ranked features; no target column is selected for the test frame.
radi_test_top = radi_test[top_feature_names]

print("Processed test dataset with top features:")
radi_test_top.head()


Processed test dataset with top features:


Unnamed: 0,Age,wavelet-HHH_glcm_SumSquares,wavelet-HHH_gldm_DependenceEntropy,wavelet-HHH_gldm_DependenceNonUniformity,wavelet-HHH_gldm_DependenceVariance,wavelet-HHH_gldm_GrayLevelVariance,wavelet-HHH_gldm_HighGrayLevelEmphasis,wavelet-HHH_gldm_LargeDependenceEmphasis,wavelet-HHH_gldm_LowGrayLevelEmphasis,wavelet-HHH_gldm_SmallDependenceEmphasis,...,log-sigma-5-0-mm-3D_glszm_SizeZoneNonUniformityNormalized,log-sigma-5-0-mm-3D_glcm_SumSquares,wavelet-HLH_glcm_Idn,logarithm_firstorder_Variance,wavelet-HLH_glrlm_LongRunHighGrayLevelEmphasis,wavelet-HLL_glszm_SizeZoneNonUniformityNormalized,wavelet-LLL_glcm_JointEntropy,square_gldm_LargeDependenceEmphasis,wavelet-HHH_firstorder_Variance,lbp-3D-m2_glrlm_LongRunLowGrayLevelEmphasis
0,0.829132,0.987814,0.243898,0.556744,0.232205,0.999434,0.492122,0.605012,0.507878,0.191339,...,0.016754,0.573948,0.044963,0.057979,0.068132,0.581859,0.460535,0.540937,0.295796,0.470842
1,0.478992,0.85101,0.207867,0.77441,0.178449,0.759688,0.224713,0.717771,0.775287,0.112027,...,0.343457,0.334026,0.047994,0.068466,0.097288,0.170989,0.370086,0.363712,0.095451,0.854966
2,0.456583,0.894222,0.49318,0.672719,0.473686,0.992289,0.525433,0.856318,0.474567,0.292658,...,0.20642,0.617251,0.074171,0.094331,0.099454,0.379891,0.432248,0.697082,0.105147,0.602285
3,0.59944,0.895957,0.45403,0.783722,0.398621,0.988201,0.536259,0.765404,0.463741,0.270362,...,0.234556,0.628901,0.060898,0.069565,0.110873,0.287961,0.248202,0.810975,0.29116,0.670455
4,0.431373,0.61183,0.172753,0.646559,0.151513,0.870936,0.666652,0.666384,0.333348,0.171687,...,0.385135,0.724864,0.012155,0.072304,0.073087,0.573106,0.370051,0.676229,0.26448,0.625845


In [12]:
# Preview the first five rows of the reduced training frame.
radi_top.head(n=5)

Unnamed: 0,Age,wavelet-HHH_glcm_SumSquares,wavelet-HHH_gldm_DependenceEntropy,wavelet-HHH_gldm_DependenceNonUniformity,wavelet-HHH_gldm_DependenceVariance,wavelet-HHH_gldm_GrayLevelVariance,wavelet-HHH_gldm_HighGrayLevelEmphasis,wavelet-HHH_gldm_LargeDependenceEmphasis,wavelet-HHH_gldm_LowGrayLevelEmphasis,wavelet-HHH_gldm_SmallDependenceEmphasis,...,log-sigma-5-0-mm-3D_glcm_SumSquares,wavelet-HLH_glcm_Idn,logarithm_firstorder_Variance,wavelet-HLH_glrlm_LongRunHighGrayLevelEmphasis,wavelet-HLL_glszm_SizeZoneNonUniformityNormalized,wavelet-LLL_glcm_JointEntropy,square_gldm_LargeDependenceEmphasis,wavelet-HHH_firstorder_Variance,lbp-3D-m2_glrlm_LongRunLowGrayLevelEmphasis,Transition
0,0.610644,0.610901,0.480943,0.580378,0.500774,0.614496,0.802767,0.883296,0.197233,0.181651,...,0.463114,0.070429,0.098761,0.114143,0.234412,0.382456,0.401727,0.118693,0.588281,CN-CN
1,0.787115,0.997724,0.149889,0.735681,0.130872,0.994018,0.519986,0.731747,0.480014,0.099057,...,0.338657,0.021656,0.064365,0.083615,0.455309,0.221619,0.261642,0.165755,0.910061,CN-CN
2,0.728291,0.969676,0.615477,0.478993,0.526055,0.977648,0.401968,0.325272,0.598032,0.553046,...,0.939358,0.131325,1.0,0.063101,0.230525,0.950604,0.306903,0.371362,0.167578,AD-AD
3,0.680672,0.993088,0.209982,0.808283,0.224884,0.982898,0.411712,0.778498,0.588288,0.14226,...,0.611031,0.045084,0.052615,0.089171,0.370317,0.305336,0.811043,0.250621,0.701622,CN-MCI
4,0.694678,0.973374,0.260323,0.43002,0.229915,0.959592,0.37517,0.616315,0.62483,0.235611,...,0.616082,0.069653,0.015879,0.070012,0.6155,0.101869,0.948325,0.039205,0.431275,CN-CN


In [10]:
# Write the reduced training frame (features + 'Transition') without the index column.
radi_top.to_csv(path_or_buf="train_full_prep5.csv", index=False)

# Write the matching reduced test frame (features only) without the index column.
radi_test_top.to_csv(path_or_buf="test_processed_prep5.csv", index=False)