In [1]:
import os
import random
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
def set_seed(seed: int):
    random.seed(seed) # Python
    np.random.seed(seed)  # Numpy, é o gerador utilizado pelo sklearn
    os.environ["PYTHONHASHSEED"] = str(seed)  # sistema operativo

set_seed(25)

In [3]:
# Load datasets
radi = pd.read_csv("../sbsppdaa24/train_radiomics_hipocamp.csv")
radi_test = pd.read_csv("../sbsppdaa24/test_radiomics_hipocamp.csv")


In [4]:

# Drop unique identifier columns from training data
radi.drop(columns=["Mask", "ID", "Image"], inplace=True)
print("Dropped unique identifier columns from training dataset.")

# Drop non-numeric columns except for 'Transition'
columns_to_drop_train = [col for col in radi.columns if radi[col].dtype == 'object' and col != 'Transition']
radi.drop(columns=columns_to_drop_train, inplace=True)
print(f"Dropped {len(columns_to_drop_train)} non-numeric columns from training dataset.")

# Drop columns where all entries are the same
same_value_cols_train = [col for col in radi.columns if radi[col].nunique() == 1]
radi.drop(columns=same_value_cols_train, inplace=True)
print(f"Dropped {len(same_value_cols_train)} constant-value columns from training dataset.")

# Apply MinMax scaling to float columns in training data
float_cols = radi.select_dtypes(include=['float', 'int']).columns
scaler = MinMaxScaler()
radi[float_cols] = scaler.fit_transform(radi[float_cols])

# Save the full processed training dataset
radi.to_csv("train_full_prep3.csv", index=False)
print("Full processed training dataset saved as 'train_full.csv'.")

# Drop the same columns in the test dataset
radi_test.drop(columns=["Mask", "ID", "Image"], inplace=True)
radi_test.drop(columns=columns_to_drop_train, inplace=True)  # Ignore if columns don't exist in test
radi_test.drop(columns=same_value_cols_train, inplace=True)  # Ignore if columns don't exist in test

# Apply MinMax scaling to float columns in the test data
radi_test[float_cols] = scaler.transform(radi_test[float_cols])  # Use the same scaler fit on radi

# Save the processed test dataset
radi_test.to_csv("test_processed_prep3.csv", index=False)
print("Processed test dataset saved as 'test_processed.csv'.")

# Step 3: Split the training dataset into train and validation
radi_train, radi_val = train_test_split(radi, test_size=0.2, random_state=25, stratify=radi['Transition'])

# Save the split datasets
radi_train.to_csv("train_split_prep3.csv", index=False)
radi_val.to_csv("test_split_prep3.csv", index=False)
print("Split datasets saved as 'train_split.csv' and 'test_split.csv'.")


Dropped unique identifier columns from training dataset.
Dropped 16 non-numeric columns from training dataset.
Dropped 148 constant-value columns from training dataset.
Full processed training dataset saved as 'train_full.csv'.
Processed test dataset saved as 'test_processed.csv'.
Split datasets saved as 'train_split.csv' and 'test_split.csv'.


In [5]:
radi_train.head()

Unnamed: 0,diagnostics_Image-original_Mean,diagnostics_Image-original_Maximum,diagnostics_Mask-original_VoxelNum,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,...,lbp-3D-k_glszm_ZonePercentage,lbp-3D-k_glszm_ZoneVariance,lbp-3D-k_ngtdm_Busyness,lbp-3D-k_ngtdm_Coarseness,lbp-3D-k_ngtdm_Complexity,lbp-3D-k_ngtdm_Contrast,lbp-3D-k_ngtdm_Strength,Sex,Age,Transition
121,0.21362,0.314961,0.335558,0.508006,0.341425,0.340244,0.427038,0.436834,0.499171,0.675933,...,0.025589,0.601761,0.526909,0.221443,0.813066,0.758702,0.227083,0.0,0.560224,AD-AD
198,0.307972,0.433071,0.253251,0.848413,0.422372,0.28106,0.172539,0.232358,0.250244,0.566272,...,0.550955,0.128763,0.086476,0.731297,0.2773,0.175126,0.742073,0.0,0.481793,AD-AD
60,0.557907,0.637795,0.726899,0.266324,0.205972,0.222178,0.517104,0.619771,0.666109,0.574898,...,0.685323,0.332006,0.34512,0.267696,0.373086,0.246672,0.271671,1.0,0.577031,CN-CN
183,0.879452,0.031496,0.668891,0.726613,0.238604,0.165218,0.337588,0.363567,0.379887,0.821212,...,0.502064,0.396552,0.52326,0.165003,0.583905,0.464665,0.169135,1.0,0.210084,MCI-MCI
136,0.353786,0.519685,0.426762,0.638138,0.238391,0.180202,0.36545,0.371797,0.333667,0.469002,...,0.387723,0.343192,0.211267,0.453709,0.366279,0.240928,0.471903,1.0,0.708683,CN-CN


In [None]:
radi_test.head()