In [13]:
import os
import random
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [14]:
def set_seed(seed: int):
    """Seed the random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's global generator, then stores
    the seed (as a string) in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to every generator.
    """
    # Python's built-in generator first, then NumPy's global generator
    # (the one sklearn uses, per the original author's note).
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    # Operating-system level: expose the seed through the environment.
    os.environ["PYTHONHASHSEED"] = f"{seed}"


set_seed(25)

In [15]:
# Read the hippocampus radiomics tables: training set first, then test set.
radi, radi_test = (
    pd.read_csv("../sbsppdaa24/train_radiomics_hipocamp.csv"),
    pd.read_csv("../sbsppdaa24/test_radiomics_hipocamp.csv"),
)


In [4]:

# Preprocess the hippocampus radiomics tables: remove identifier, non-numeric
# and constant columns, min-max scale the numeric features, write the processed
# frames to CSV and carve a stratified validation split out of the training data.
# Expects the raw `radi` / `radi_test` frames produced by the loading cell; both
# names are rebound here, so reload the CSVs before running this cell again.

# Drop unique identifier columns from training data
radi = radi.drop(columns=["Mask", "ID", "Image"])
print("Dropped unique identifier columns from training dataset.")

# Drop non-numeric columns except for 'Transition' (the label used for stratification).
# select_dtypes(exclude="number") catches every non-numeric dtype, not only `object`.
columns_to_drop_train = [
    col for col in radi.select_dtypes(exclude="number").columns
    if col != 'Transition'
]
radi = radi.drop(columns=columns_to_drop_train)
print(f"Dropped {len(columns_to_drop_train)} non-numeric columns from training dataset.")

# Drop columns where all entries are the same
same_value_cols_train = [col for col in radi.columns if radi[col].nunique() == 1]
radi = radi.drop(columns=same_value_cols_train)
print(f"Dropped {len(same_value_cols_train)} constant-value columns from training dataset.")

# Apply MinMax scaling to every numeric column in the training data.
# "number" selects all integer and float widths regardless of platform.
# NOTE(review): the scaler is fitted on the full training frame *before* the
# train/validation split below, so validation rows contribute to the min/max.
float_cols = radi.select_dtypes(include="number").columns
scaler = MinMaxScaler()
radi[float_cols] = scaler.fit_transform(radi[float_cols])

# Save the full processed training dataset
radi.to_csv("train_full_prep3.csv", index=False)
print("Full processed training dataset saved as 'train_full_prep3.csv'.")

# Drop the same columns in the test dataset
radi_test = radi_test.drop(columns=["Mask", "ID", "Image"])
# errors="ignore" implements what the original comments promised: columns that
# are absent from the test table are skipped instead of raising KeyError.
radi_test = radi_test.drop(columns=columns_to_drop_train, errors="ignore")
radi_test = radi_test.drop(columns=same_value_cols_train, errors="ignore")

# Apply MinMax scaling to the test data with the scaler fitted on `radi`
radi_test[float_cols] = scaler.transform(radi_test[float_cols])

# Save the processed test dataset
radi_test.to_csv("test_processed_prep3.csv", index=False)
print("Processed test dataset saved as 'test_processed_prep3.csv'.")

# Split the training dataset into train and validation (80/20, stratified on 'Transition')
radi_train, radi_val = train_test_split(
    radi,
    test_size=0.2,
    random_state=25,
    stratify=radi['Transition'],
)

# Save the split datasets (the validation part goes to 'test_split_prep3.csv')
radi_train.to_csv("train_split_prep3.csv", index=False)
radi_val.to_csv("test_split_prep3.csv", index=False)
print("Split datasets saved as 'train_split_prep3.csv' and 'test_split_prep3.csv'.")


Dropped unique identifier columns from training dataset.
Dropped 16 non-numeric columns from training dataset.
Dropped 148 constant-value columns from training dataset.
Full processed training dataset saved as 'train_full.csv'.
Processed test dataset saved as 'test_processed.csv'.
Split datasets saved as 'train_split.csv' and 'test_split.csv'.


In [4]:
# Preview the first five rows of the training split.
# `radi_train` only exists after the train/validation split cell has run in this
# kernel; the recorded NameError presumably comes from running this cell first.
radi_train.head(n=5)

NameError: name 'radi_train' is not defined

In [6]:
# Preview the first five rows of the test frame.
radi_test.head(n=5)

Unnamed: 0,diagnostics_Image-original_Mean,diagnostics_Image-original_Maximum,diagnostics_Mask-original_VoxelNum,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,...,lbp-3D-k_glszm_ZoneEntropy,lbp-3D-k_glszm_ZonePercentage,lbp-3D-k_glszm_ZoneVariance,lbp-3D-k_ngtdm_Busyness,lbp-3D-k_ngtdm_Coarseness,lbp-3D-k_ngtdm_Complexity,lbp-3D-k_ngtdm_Contrast,lbp-3D-k_ngtdm_Strength,Sex,Age
0,0.495054,0.07874,0.53679,0.47528,0.316034,0.373173,0.537721,0.486557,0.469768,0.862239,...,0.594669,0.566023,0.280746,0.33531,0.296551,0.468272,0.325807,0.303302,1.0,0.829132
1,0.737522,0.401575,0.782512,0.634503,0.31288,0.307338,0.430676,0.514912,0.540905,0.875525,...,0.489953,0.517394,0.468663,0.476645,0.175973,0.478687,0.350439,0.181719,1.0,0.478992
2,0.756433,0.259843,0.711157,0.571241,0.474246,0.453088,0.349322,0.470691,0.629847,0.875525,...,0.424795,0.458065,0.458581,0.501734,0.17075,0.53937,0.419945,0.171175,1.0,0.456583
3,0.798346,0.291339,0.835729,0.50371,0.421881,0.488026,0.50594,0.672928,0.707844,1.03584,...,0.414088,0.31117,0.64863,0.88364,0.032452,0.780213,0.719552,0.033423,1.0,0.59944
4,0.291812,0.377953,0.62423,0.663701,0.354026,0.312766,0.354822,0.365214,0.250244,0.589231,...,0.599878,0.391127,0.449245,0.502824,0.178915,0.593427,0.470218,0.181981,1.0,0.431373


In [5]:
# Preview the first five rows of `radi`.
radi.head(n=5)

Unnamed: 0,ID,Image,Mask,diagnostics_Versions_PyRadiomics,diagnostics_Versions_Numpy,diagnostics_Versions_SimpleITK,diagnostics_Versions_PyWavelet,diagnostics_Versions_Python,diagnostics_Configuration_Settings,diagnostics_Configuration_EnabledImageTypes,...,lbp-3D-k_glszm_ZonePercentage,lbp-3D-k_glszm_ZoneVariance,lbp-3D-k_ngtdm_Busyness,lbp-3D-k_ngtdm_Coarseness,lbp-3D-k_ngtdm_Complexity,lbp-3D-k_ngtdm_Contrast,lbp-3D-k_ngtdm_Strength,Sex,Age,Transition
0,006_S_0681,/notebooks/disk2/DS2_FreeSurfer/ADNI_006_S_068...,/notebooks/disk2/DS2_FreeSurfer/ADNI_006_S_068...,2.2.0,1.18.5,1.2.4,1.1.1,3.7.7,"{'minimumROIDimensions': 2, 'minimumROISize': ...","{'Original': {}, 'Wavelet': {}, 'LoG': {'sigma...",...,0.007881,345733.167092,479.414935,0.001538,0.183049,0.017044,0.001581,0,77.1,CN-CN
1,941_S_1203,/notebooks/disk2/DS2_FreeSurfer/ADNI_941_S_120...,/notebooks/disk2/DS2_FreeSurfer/ADNI_941_S_120...,2.2.0,1.18.5,1.2.4,1.1.1,3.7.7,"{'minimumROIDimensions': 2, 'minimumROISize': ...","{'Original': {}, 'Wavelet': {}, 'LoG': {'sigma...",...,0.013112,244310.331892,378.001763,0.001761,0.146021,0.010782,0.001813,1,83.4,CN-CN
2,011_S_0003,/notebooks/disk2/DS2_FreeSurfer/ADNI_011_S_000...,/notebooks/disk2/DS2_FreeSurfer/ADNI_011_S_000...,2.2.0,1.18.5,1.2.4,1.1.1,3.7.7,"{'minimumROIDimensions': 2, 'minimumROISize': ...","{'Original': {}, 'Wavelet': {}, 'LoG': {'sigma...",...,0.011331,222776.529605,497.261162,0.001567,0.190309,0.019644,0.001596,1,81.3,AD-AD
3,057_S_0779,/notebooks/disk2/DS2_FreeSurfer/ADNI_057_S_077...,/notebooks/disk2/DS2_FreeSurfer/ADNI_057_S_077...,2.2.0,1.18.5,1.2.4,1.1.1,3.7.7,"{'minimumROIDimensions': 2, 'minimumROISize': ...","{'Original': {}, 'Wavelet': {}, 'LoG': {'sigma...",...,0.012197,280222.663268,406.837667,0.001629,0.148247,0.010824,0.001677,1,79.6,CN-MCI
4,033_S_0920,/notebooks/disk2/DS2_FreeSurfer/ADNI_033_S_092...,/notebooks/disk2/DS2_FreeSurfer/ADNI_033_S_092...,2.2.0,1.18.5,1.2.4,1.1.1,3.7.7,"{'minimumROIDimensions': 2, 'minimumROISize': ...","{'Original': {}, 'Wavelet': {}, 'LoG': {'sigma...",...,0.010859,216779.591479,368.838125,0.001912,0.174701,0.014793,0.001979,0,80.1,CN-CN


In [72]:
# Reload the raw hippocampus training table and read the occipital control table.
radi, radi_control = (
    pd.read_csv("../sbsppdaa24/train_radiomics_hipocamp.csv"),
    pd.read_csv("../sbsppdaa24/train_radiomics_occipital_CONTROL.csv"),
)

# Summarise both frames, control table first.
radi_control.info()
radi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305 entries, 0 to 304
Columns: 2181 entries, ID to Transition
dtypes: float64(2014), int64(147), object(20)
memory usage: 5.1+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305 entries, 0 to 304
Columns: 2181 entries, ID to Transition
dtypes: float64(2014), int64(147), object(20)
memory usage: 5.1+ MB


In [73]:
# Show the column index of the hippocampus frame, then of the control frame.
print(radi.columns, radi_control.columns, sep="\n")

Index(['ID', 'Image', 'Mask', 'diagnostics_Versions_PyRadiomics',
       'diagnostics_Versions_Numpy', 'diagnostics_Versions_SimpleITK',
       'diagnostics_Versions_PyWavelet', 'diagnostics_Versions_Python',
       'diagnostics_Configuration_Settings',
       'diagnostics_Configuration_EnabledImageTypes',
       ...
       'lbp-3D-k_glszm_ZonePercentage', 'lbp-3D-k_glszm_ZoneVariance',
       'lbp-3D-k_ngtdm_Busyness', 'lbp-3D-k_ngtdm_Coarseness',
       'lbp-3D-k_ngtdm_Complexity', 'lbp-3D-k_ngtdm_Contrast',
       'lbp-3D-k_ngtdm_Strength', 'Sex', 'Age', 'Transition'],
      dtype='object', length=2181)
Index(['ID', 'Image', 'Mask', 'diagnostics_Versions_PyRadiomics',
       'diagnostics_Versions_Numpy', 'diagnostics_Versions_SimpleITK',
       'diagnostics_Versions_PyWavelet', 'diagnostics_Versions_Python',
       'diagnostics_Configuration_Settings',
       'diagnostics_Configuration_EnabledImageTypes',
       ...
       'lbp-3D-k_glszm_ZonePercentage', 'lbp-3D-k_glszm_ZoneVarianc

In [74]:
# Preprocess the hippocampus training table (drop identifier, non-numeric and
# constant columns, min-max scale the numeric features), then apply the same
# column drops and the same fitted scaler to the occipital control table and
# write the control result to CSV.
# Expects raw `radi` / `radi_control` frames; both names are rebound here, so
# reload the CSVs before running this cell again.
# NOTE(review): the output recorded for this cell ends in NotFittedError even
# though the scaler is fitted below before it is used — presumably stale output
# from an earlier version of the cell; re-run on a fresh kernel to confirm.

# Drop unique identifier columns from training data
radi = radi.drop(columns=["Mask", "ID", "Image"])
print("Dropped unique identifier columns from training dataset.")

# Drop non-numeric columns except for 'Transition'.
# select_dtypes(exclude="number") catches every non-numeric dtype, not only `object`.
columns_to_drop_train = [
    col for col in radi.select_dtypes(exclude="number").columns
    if col != 'Transition'
]
radi = radi.drop(columns=columns_to_drop_train)
print(f"Dropped {len(columns_to_drop_train)} non-numeric columns from training dataset.")

# Drop columns where all entries are the same
same_value_cols_train = [col for col in radi.columns if radi[col].nunique() == 1]
radi = radi.drop(columns=same_value_cols_train)
print(f"Dropped {len(same_value_cols_train)} constant-value columns from training dataset.")

# Apply MinMax scaling to every numeric column in the training data.
# "number" selects all integer and float widths regardless of platform.
float_cols = radi.select_dtypes(include="number").columns
scaler = MinMaxScaler()
radi[float_cols] = scaler.fit_transform(radi[float_cols])

# The processed training frame is intentionally not written to disk in this cell.
print("Full processed training dataset")

# Drop the same columns in the control dataset
radi_control = radi_control.drop(columns=["Mask", "ID", "Image"])
# errors="ignore" implements what the original comments promised: columns that
# are absent from the control table are skipped instead of raising KeyError.
radi_control = radi_control.drop(columns=columns_to_drop_train, errors="ignore")
radi_control = radi_control.drop(columns=same_value_cols_train, errors="ignore")

# Apply MinMax scaling to the control data with the scaler fitted on `radi`.
# NOTE(review): the min/max come from the hippocampus table, so scaled control
# values are not guaranteed to stay inside [0, 1] — confirm this is intended.
radi_control[float_cols] = scaler.transform(radi_control[float_cols])

# Save the processed control dataset
radi_control.to_csv("control_processed_prep3.csv", index=False)
print("Processed control dataset saved as 'control_processed_prep3.csv'.")



Dropped unique identifier columns from training dataset.
Dropped 16 non-numeric columns from training dataset.
Dropped 148 constant-value columns from training dataset.
Full processed training dataset


NotFittedError: This MinMaxScaler instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

In [64]:
# Preview the first five rows of `radi`.
radi.head(n=5)

Unnamed: 0,diagnostics_Image-original_Mean,diagnostics_Image-original_Maximum,diagnostics_Mask-original_VoxelNum,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,...,lbp-3D-k_glszm_ZonePercentage,lbp-3D-k_glszm_ZoneVariance,lbp-3D-k_ngtdm_Busyness,lbp-3D-k_ngtdm_Coarseness,lbp-3D-k_ngtdm_Complexity,lbp-3D-k_ngtdm_Contrast,lbp-3D-k_ngtdm_Strength,Sex,Age,Transition
0,0.468027,0.393701,0.598392,0.555087,0.455539,0.39272,0.28956,0.31062,0.25494,0.562431,...,0.184251,0.633393,0.646173,0.125643,0.735632,0.658619,0.127238,0.0,0.610644,CN-CN
1,0.281221,0.488189,0.713552,0.58897,0.397306,0.338081,0.311579,0.359443,0.335667,0.61962,...,0.651481,0.323882,0.458297,0.195642,0.492265,0.37379,0.198729,1.0,0.787115,CN-CN
2,0.76498,0.212598,0.530116,0.727218,0.876712,1.0,0.431089,0.622046,0.58396,0.932585,...,0.49244,0.258168,0.679235,0.134842,0.783349,0.776907,0.132002,1.0,0.728291,AD-AD
3,0.651078,0.338583,0.799452,0.547963,0.355153,0.372946,0.453707,0.55516,0.498358,0.760667,...,0.569706,0.433476,0.511718,0.154368,0.506896,0.375681,0.156912,1.0,0.680672,CN-MCI
4,0.382225,0.173228,0.406742,0.504096,0.080905,0.07806,0.545945,0.447539,0.424594,0.566272,...,0.450216,0.239867,0.441321,0.243251,0.680762,0.556234,0.249892,0.0,0.694678,CN-CN


In [65]:
# Preview the first five rows of the control frame.
radi_control.head(n=5)

Unnamed: 0,diagnostics_Image-original_Mean,diagnostics_Image-original_Maximum,diagnostics_Mask-original_VoxelNum,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,...,lbp-3D-k_glszm_ZonePercentage,lbp-3D-k_glszm_ZoneVariance,lbp-3D-k_ngtdm_Busyness,lbp-3D-k_ngtdm_Coarseness,lbp-3D-k_ngtdm_Complexity,lbp-3D-k_ngtdm_Contrast,lbp-3D-k_ngtdm_Strength,Sex,Age,Transition
0,5.848123,178,17346,0.315029,0.267306,34.411652,128.734925,105.171289,105.385008,41.773197,...,0.004785,1727126.0,98.443622,0.005262,0.021912,0.000244,0.005402,0,77.1,CN-CN
1,5.238834,190,18010,0.361175,0.272491,32.329625,118.644809,97.082439,101.98039,44.598206,...,0.00844,1012217.0,190.711701,0.0028,0.039662,0.000799,0.002887,1,83.4,CN-CN
2,6.816667,155,22096,0.374464,0.288734,37.469777,129.772712,108.295891,111.758669,51.971146,...,0.006291,1646099.0,285.07863,0.001925,0.047025,0.001358,0.001915,1,81.3,AD-AD
3,6.445162,171,23859,0.355133,0.288648,37.694946,130.59162,111.157546,110.476242,52.952809,...,0.005281,2169425.0,172.000383,0.003052,0.027462,0.000426,0.003078,1,79.6,CN-MCI
4,5.568269,150,17637,0.320548,0.275708,36.716529,133.171648,108.295891,106.381389,44.28318,...,0.004082,2073170.0,75.795004,0.006768,0.016754,0.000135,0.007042,0,80.1,CN-CN


In [66]:
# Print a summary of `radi`: index range, column count, dtype counts and memory usage.
radi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305 entries, 0 to 304
Columns: 2014 entries, diagnostics_Image-original_Mean to Transition
dtypes: float64(2013), object(1)
memory usage: 4.7+ MB


In [67]:
# Print a summary of `radi_control`: index range, column count, dtype counts and memory usage.
radi_control.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 305 entries, 0 to 304
Columns: 2014 entries, diagnostics_Image-original_Mean to Transition
dtypes: float64(1992), int64(21), object(1)
memory usage: 4.7+ MB
