1. Data Import & Preprocessing

In [1]:
import pandas as pd
import numpy as np
import os

# Load the exported radiomics table (one row per image/mask pair).
radiomics_csv_path = 'C:/Reza Gonabadi/Polimi/Master Thesis/LIDC-IDRI_Simplified_low_asus_Output/all_radiomics_features.csv'
data = pd.read_csv(radiomics_csv_path)

# Positional column picks (0-based here; the MATLAB equivalents are 1-based).
mask_column = data.iloc[:, 2]            # 3rd column in MATLAB
image_column = data.iloc[:, 3]           # 4th column in MATLAB
radiomic_features = data.iloc[:, 42:59]  # columns 43..59 in MATLAB

# Keep both the column labels and a plain ndarray view of the feature block.
features_names = radiomic_features.columns
radiomic_data = radiomic_features.to_numpy()


2. Extracting Nodule Data & Initializing Variables

In [2]:
# Initialize variables
num_features = radiomic_features.shape[1]

nodule_annotation_count = {}  # Dictionary to store nodule counts (not filled in this cell)

mask_cell = mask_column.tolist()
interested_nodules = []
IDs = []          # "<patient>_nodule_<identifier>" labels
data_matrix = []  # one (5, num_features) array per matched mask row
n = 0             # number of mask rows that contain 'nodule_4'

for i, mask_path in enumerate(mask_cell):
    # Empty CSV cells come back as NaN (a float); they can never match.
    if not isinstance(mask_path, str) or 'nodule_4' not in mask_path:
        continue

    # The feature window below reads rows i-3 .. i. Without this guard a
    # negative start index would silently wrap around to the END of the table.
    if i < 3:
        raise ValueError(
            f"Row {i} matches 'nodule_4' but has fewer than 3 preceding rows; "
            "cannot build its feature window."
        )

    n += 1

    # Accept both Windows ('\\') and POSIX ('/') separators in the mask path.
    parts = mask_path.replace('\\', '/').split('/')
    patient_id = parts[-2]
    nodule_path = os.path.splitext(parts[-1])[0]  # Remove file extension

    # NOTE(review): an ID is appended only when this condition holds, while a
    # feature block is appended for every matched row, so IDs and data_matrix
    # can end up with different lengths. Confirm the intended pairing.
    if 'nodule_' in nodule_path and 'annotation_4' in nodule_path:
        nodule_identifier = nodule_path.split('nodule_')[1]
        IDs.append(f"{patient_id}_nodule_{nodule_identifier}")

    # NOTE(review): Z has 5 rows but only rows 0..3 are filled (table rows
    # i-3 .. i); row 4 stays all-zero and is later treated as an observation
    # by the ICC step. Confirm whether 4 or 5 observations are intended.
    Z = np.zeros((5, radiomic_features.shape[1]))
    Z[:4, :] = radiomic_data[i - 3:i + 1, :]
    data_matrix.append(Z)


3. 3D Matrix Construction for Feature Values

In [3]:
# Stack the per-nodule blocks into one array of shape
# (nodules, observations per nodule, features).
if len(data_matrix) == 0:
    raise ValueError('data_matrix is empty: no nodule blocks were collected, nothing to stack.')

features_values = np.stack(data_matrix, axis=0).astype(float, copy=False)

# num_N: number of nodule blocks
# num_M: rows per block (each block is allocated with 5 rows)
# num_Z: number of radiomic feature columns (width of each block)
num_N, num_M, num_Z = features_values.shape


4. ICC Computation

In [13]:
import numpy as np
from scipy import stats

def compute_icc(values):
    """
    Compute the single-measurement ICC of a one-way random effects model, ICC(1,1).

    The estimate is built from the one-way ANOVA mean squares:

        ICC = (MS_between - MS_within) / (MS_between + (k - 1) * MS_within)

    with MS_between = SS_between / (n - 1) and MS_within = SS_within / (n * (k - 1)).

    Parameters
    ----------
    values : array-like, shape (n, k)
        Rows are subjects, columns are sessions/raters.

    Returns
    -------
    float
        The ICC estimate (can be negative). ``nan`` is returned when the
        estimate is undefined: fewer than 2 subjects, fewer than 2 raters, or
        all values identical (no variance to partition).

    Raises
    ------
    ValueError
        If ``values`` is not two-dimensional.
    """
    values = np.asarray(values, dtype=float)
    if values.ndim != 2:
        raise ValueError(f'values must be 2D (subjects x raters), got {values.ndim}D')

    n, k = values.shape  # n = number of subjects, k = number of raters/sessions
    if n < 2 or k < 2:
        return np.nan  # degrees of freedom would be zero

    # A constant table has zero between- and within-subject variance: 0 / 0.
    if values.max() == values.min():
        return np.nan

    mean_subject = values.mean(axis=1)  # Mean per subject
    grand_mean = values.mean()          # Grand mean

    # Sums of squares of the one-way ANOVA decomposition
    ss_between = k * np.sum((mean_subject - grand_mean) ** 2)
    ss_within = np.sum((values - mean_subject[:, np.newaxis]) ** 2)

    # Mean squares: sums of squares divided by their degrees of freedom
    ms_between = ss_between / (n - 1)
    ms_within = ss_within / (n * (k - 1))

    return (ms_between - ms_within) / (ms_between + (k - 1) * ms_within)

# Score every radiomic feature: one ICC per slice along the last axis of
# features_values.
ICC_values = np.zeros(len(features_names))

for i in range(ICC_values.shape[0]):
    feature_values = features_values[..., i]  # all rows/columns for feature i
    ICC_values[i] = compute_icc(feature_values)

# Negative ICC estimates are floored at 0 (in place).
np.putmask(ICC_values, ICC_values < 0, 0)


5. Identifying Unstable Features

In [14]:
# Positions (and then names) of the features whose ICC falls below 0.75;
# these are treated as unstable.
instable_features_idx_1 = np.flatnonzero(ICC_values < 0.75)
instable_features_1 = features_names.take(instable_features_idx_1)

6. Handling Shifted Mask Features

In [15]:
# Rows of the original table whose mask path mentions 'consensus' or 'only_ann'.
# na=False keeps missing mask entries from breaking the boolean filter.
# NOTE(review): filtered_data is not used by the comparison below, which works
# on the unfiltered image_column / radiomic_data. Confirm which rows the
# shifted file is supposed to line up with.
filtered_data = data[data['Mask'].str.contains('consensus|only_ann', na=False)]
data_shifted = pd.read_csv('/Volumes/Liu_ExternalDisk/Full_LIDC-IDRI/python/ldsimu_denois_train_shifted_features.csv')

# Image / mask path columns of the shifted table
image_column_shifted = data_shifted.iloc[:, 0]
mask_column_shifted = data_shifted.iloc[:, 1]

# Select the shifted features BY NAME so both tables contribute exactly the
# same features in the same order. (A fixed positional slice such as 39:53
# yields 14 columns against the 17 original ones, which cannot be stacked.)
missing_columns = radiomic_features.columns.difference(data_shifted.columns)
if len(missing_columns) > 0:
    raise ValueError(
        f'Shifted data is missing {len(missing_columns)} radiomic feature column(s): '
        f'{list(missing_columns)}'
    )
radiomic_features_shifted = data_shifted.loc[:, radiomic_features.columns]
radiomic_data_shifted = radiomic_features_shifted.to_numpy()

# Check that both tables describe the same images, row for row
if not np.array_equal(image_column, image_column_shifted):
    raise ValueError('Patient IDs, images, or masks do not match between original and shifted data.')

if radiomic_data.shape != radiomic_data_shifted.shape:
    raise ValueError(
        f'Original and shifted feature tables differ in shape: '
        f'{radiomic_data.shape} vs {radiomic_data_shifted.shape}'
    )

# Combine original and shifted radiomic data: (rows, features, 2) ...
combined_radiomic_data = np.stack((radiomic_data, radiomic_data_shifted), axis=2)
# ... then reorder to (rows, 2, features) so that [:, :, i] is the
# rows x {original, shifted} table of feature i.
combined_radiomic_data_swapped = np.transpose(combined_radiomic_data, (0, 2, 1))

features_values = combined_radiomic_data_swapped


FileNotFoundError: [Errno 2] No such file or directory: '/Volumes/Liu_ExternalDisk/Full_LIDC-IDRI/python/ldsimu_denois_train_shifted_features.csv'

7. Second ICC Computation on Shifted Data

In [None]:
ICC_values_shifted = np.zeros(len(features_names))

# Compute ICC for each feature in shifted data.
# The notebook's own compute_icc is used here: `pg` (pingouin) is never
# imported in this notebook, and pingouin's intraclass_corr expects a
# long-format DataFrame rather than the 2D array sliced below.
print('ICCs computation for shifted data...')
for i in range(len(features_names)):
    feature_values = features_values[:, :, i]  # rows x observations for feature i
    ICC_values_shifted[i] = compute_icc(feature_values)

# Negative ICC estimates are floored at 0
ICC_values_shifted[ICC_values_shifted < 0] = 0


8. Feature Removal Based on ICC

In [None]:
# Features whose shifted-mask ICC exceeds 0.5 are marked for removal.
# NOTE(review): this removes features with HIGH agreement between original and
# shifted masks — presumably because they are insensitive to the segmentation;
# confirm the direction of the threshold.
remove_features_idx_2 = np.where(ICC_values_shifted > 0.5)[0]
remove_features_2 = features_names[remove_features_idx_2]

# Find common and unique indices between the two sets of features
common_indices = np.intersect1d(instable_features_idx_1, remove_features_idx_2)
total_unique_indices = np.union1d(instable_features_idx_1, remove_features_idx_2)

print(f'Common Indices: {common_indices}')
print(f'Total Unique Indices: {len(total_unique_indices)}')

# Remove features from the dataset.
# The indices are POSITIONS, so they must not be handed to DataFrame.drop /
# Index.drop (those expect labels and raise KeyError on string-named columns).
# The same set (union of both criteria) is removed from the table and from the
# name list so the two stay in sync.
if radiomic_features.shape[1] != len(features_names):
    raise ValueError(
        'radiomic_features and features_names are out of sync '
        f'({radiomic_features.shape[1]} columns vs {len(features_names)} names); '
        'was this cell already run?'
    )
keep_mask = np.ones(len(features_names), dtype=bool)
keep_mask[total_unique_indices] = False

radiomic_features = radiomic_features.iloc[:, np.flatnonzero(keep_mask)]
features_names = features_names[keep_mask]


9. Correlation Removal

In [None]:
# Function to remove highly correlated features
def remove_high_corr(data, threshold=0.85):
    """
    Drop columns that are highly correlated with an earlier column.

    For every pair of columns (i, j) with i < j whose absolute Pearson
    correlation exceeds ``threshold``, column j is removed. Absolute values
    are used so that strong negative correlation counts as redundancy too.
    Pairs whose correlation is NaN (e.g. a constant column) are never flagged.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric table, one feature per column. It is not modified.
    threshold : float, default 0.85
        Correlation magnitude above which the later column is dropped.

    Returns
    -------
    (pandas.DataFrame, numpy.ndarray)
        The table without the dropped columns, and the sorted, unique
        0-based positions of the dropped columns in ``data``.
    """
    n_columns = data.shape[1]
    if n_columns < 2:
        # Nothing to compare against.
        return data.copy(), np.array([], dtype=int)

    # rowvar=False: each column is a variable, each row an observation.
    corr_matrix = np.corrcoef(data.to_numpy(dtype=float), rowvar=False)

    # Strict upper triangle -> each pair once, diagonal excluded.
    upper_triangle = np.triu(np.abs(corr_matrix), 1)

    # Column index of each offending pair is the later feature; a feature can
    # be flagged by several partners, so de-duplicate.
    to_remove = np.unique(np.where(upper_triangle > threshold)[1])

    # Select survivors by POSITION: the indices are not column labels.
    keep = np.setdiff1d(np.arange(n_columns), to_remove)
    return data.iloc[:, keep], to_remove

# Remove highly correlated features
features_uncorr, rem_idx = remove_high_corr(radiomic_features, 0.85)

# pd.concat only accepts pandas objects, so the plain list of IDs is wrapped in
# a Series first. Rows are paired by position, so the counts must agree;
# otherwise concat would silently pad the shorter side with NaN.
ids_column = pd.Series(IDs, name='ID')
if len(ids_column) != len(features_uncorr):
    raise ValueError(
        f'Cannot attach IDs to features: {len(ids_column)} IDs vs '
        f'{len(features_uncorr)} feature rows.'
    )
table_complete = pd.concat([ids_column, features_uncorr.reset_index(drop=True)], axis=1)


10. Final Step: Saving Data

In [None]:
# Write the processed tables to CSV without the index column.
train_csv_path = '/Volumes/Liu_ExternalDisk/Full_LIDC-IDRI/python/train_features_hdct_for_train_onlySS.csv'
test_csv_path = '/Volumes/Liu_ExternalDisk/Full_LIDC-IDRI/python/test_features_ldct_for_test_onlySS.csv'

table_complete.to_csv(train_csv_path, index=False)
# NOTE(review): test_radiomic_features is not defined in any cell visible in
# this notebook — confirm where it is supposed to come from.
test_radiomic_features.to_csv(test_csv_path, index=False)


In [12]:
# Environment check: print the NumPy version of the running kernel.
import numpy as np
print(np.__version__)


2.1.3
