# Extracted features analysis

In [None]:
# %% [markdown]
# # Radiomic Features Data Debugging Pipeline
#
# In this notebook, we'll:
#
# 1. Load the dataset.
# 2. Examine basic information and summary statistics.
# 3. Check for missing values, infinite values, and constant (or near-constant) columns.
# 4. Identify rows with problematic values.
# 5. Visualize the distributions of each numeric column (to spot potential outliers).
#
# This will help us pinpoint issues before applying any standardization or clustering steps.

# %% 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Path to the input CSV; edit this to point to your own copy of the dataset.
file_path = "/home/jbetancur/Desktop/codes/clustering/feature_extraction/output/aggregated_lesion_features.csv"

# Read the CSV into a DataFrame, then report success and the (rows, columns) shape.
df = pd.read_csv(file_path)
dataset_shape = df.shape
print("Dataset loaded successfully!")
print("Dataset shape:", dataset_shape)

# %% [markdown]
# ## Basic Information and Summary Statistics

# %%
# Show column dtypes / non-null counts, a preview of the first rows, and summary statistics.
print("\n--- DataFrame Info ---")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print("\n--- First 5 Rows ---")
print(df.head())

print("\n--- Descriptive Statistics for Numeric Columns ---")
print(df.describe())

# %% [markdown]
# ## Missing and Infinite Values Check

# %%
# Count missing values per column and show only the columns that have any.
missing_counts = df.isnull().sum()
print("\n--- Missing Values per Column ---")
print(missing_counts[missing_counts > 0])

# Count infinite values per column. np.isinf raises TypeError on non-numeric
# input, so the check is restricted to the numeric columns of the frame.
inf_counts = np.isinf(df.select_dtypes(include=[np.number])).sum()
print("\n--- Infinite Values per Column ---")
print(inf_counts[inf_counts > 0])


# %% [markdown]
# ## Check for Constant or Near-Constant Columns
#
# These columns have zero (or almost zero) standard deviation, which might interfere with scaling.

# %%
# Only numeric columns are examined here.
numeric_cols = df.select_dtypes(include=[np.number]).columns
# Standard deviation of each numeric column, computed once and reused below.
std_by_col = {col: df[col].std() for col in numeric_cols}

print("\n--- Columns with Zero or Near-Zero Standard Deviation ---")
# Report columns whose std is exactly zero or is NaN.
for col, std in std_by_col.items():
    if np.isnan(std) or std == 0:
        print(f"{col}: std = {std}")

# Separately collect columns that do vary, but only negligibly (0 < std < 1e-8).
low_variation_cols = [
    col for col, col_std in std_by_col.items() if 0 < col_std < 1e-8
]
if low_variation_cols:
    print("\nColumns with extremely low variation (std < 1e-8):")
    print(low_variation_cols)

# %% [markdown]
# ## Unique Value Counts per Column
#
# Columns with very few unique values may be categorical or constant, and may need special handling.

# %%
print("\n--- Unique Value Counts per Column ---")
# DataFrame.nunique() gives the number of distinct non-null values for every column.
for col, unique_count in df.nunique().items():
    print(f"{col}: {unique_count} unique values")

# %% [markdown]
# ## Identify Rows with Problematic Values
#
# Let's locate any rows that contain missing or infinite values.

# %%
# Identify rows with at least one missing (NaN/None) or infinite (+/-inf) value.
has_missing = df.isnull().any(axis=1)
# np.isinf is only defined for numeric data, so only the numeric columns are tested.
has_infinite = np.isinf(df.select_dtypes(include=[np.number])).any(axis=1)
problematic_rows = df[has_missing | has_infinite]
print("\n--- Rows with Missing or Infinite Values ---")
print(problematic_rows)

# %% [markdown]
# ## Visualize Distributions for Each Numeric Column
#
# Histograms and boxplots help to spot outliers or unusual distributions that could affect standardization.

# %%
# One figure per numeric column: histogram with KDE overlay (left) and boxplot (right).
for col in numeric_cols:
    # Drop NaNs once; both panels are drawn from the same values.
    values = df[col].dropna()
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    sns.histplot(values, kde=True, ax=axes[0])
    axes[0].set_title(f"Distribution of {col}")
    sns.boxplot(x=values, ax=axes[1])
    axes[1].set_title(f"Boxplot of {col}")
    fig.tight_layout()
    plt.show()
    # Close the figure explicitly so the loop does not accumulate open figures.
    plt.close(fig)


Dataset loaded successfully!
Dataset shape: (3766, 87)

--- DataFrame Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3766 entries, 0 to 3765
Data columns (total 87 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   label_id                            3766 non-null   int64  
 1   num_voxels                          3766 non-null   int64  
 2   volume_physical                     3766 non-null   float64
 3   T1_mean                             3766 non-null   float64
 4   T1_std                              3766 non-null   float64
 5   T2_mean                             3766 non-null   float64
 6   T2_std                              3766 non-null   float64
 7   QSM_mean                            3766 non-null   float64
 8   QSM_std                             3766 non-null   float64
 9   T1_min                              3766 non-null   float64
 10  T1_max                       

TypeError: ufunc 'isinf' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''