# Studying the differences between real and synthetic radiomic features


## Using PCA
 * No conclusions could be drawn from this analysis

In [None]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [None]:
# Function to process each CSV file
def process_csv(df, file_name, features):
    """Project the selected columns of one dataset onto their first principal component.

    Args:
        df: DataFrame holding the radiomic features read from one CSV file.
        file_name: Identifier written to the ``File`` column of the result.
        features: Column names of ``df`` that are fed into the PCA.

    Returns:
        DataFrame with one row per row of ``df`` and two columns: ``PC1``
        (score on the first principal component) and ``File``.
    """
    # Standardise every selected column before running the PCA.
    standardized = StandardScaler().fit_transform(df[features])

    # Keep only the first principal component.
    first_component = PCA(n_components=1).fit_transform(standardized)

    projected = pd.DataFrame(data=first_component, columns=['PC1'])
    projected['File'] = file_name
    return projected

In [None]:
import os
import pandas as pd
import plotly.express as px

# Root folder holding one sub-folder of radiomics per dataset
directory_path = "./metrics/RADIOMICS"

# Columns that are never treated as features, even if their name mentions "wavelet"
columns_to_ignore = ['diagnostics_Versions_PyWavelet']

# One single-component PCA projection per dataset folder
results = []

for folder_name in os.listdir(directory_path):
    file_path = os.path.join(directory_path, folder_name, 'tumour_radiomics.csv')
    if not os.path.exists(file_path):
        # Folder without a radiomics export: nothing to project
        continue

    df = pd.read_csv(file_path)
    wavelet_features = [
        col
        for col in df.columns
        if col not in columns_to_ignore and 'wavelet' in col.lower()
    ]
    results.append(process_csv(df, folder_name, features=wavelet_features))

# Stack the per-dataset projections into one long table
combined_results = pd.concat(results, ignore_index=True)

# One box per dataset, showing the distribution of its PC1 scores
fig = px.box(
    combined_results,
    x='File',
    y='PC1',
    title='Side-by-Side Box Plot of PCA (1 Component) for Each CSV File',
)
fig.update_layout(
    xaxis_title='CSV File',
    yaxis_title='Principal Component 1',
    xaxis={'tickangle': 45},
)
fig.show()


## Using Random Forest

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

def get_combined_df_scaled(real_features_csv, synthetic_features_csv, norm=False, verbose=False):
    """Load real and synthetic radiomics, merge them and encode the origin as a label.

    Args:
        real_features_csv: Path to the CSV with the features of the real cases.
        synthetic_features_csv: Path to the CSV with the features of the
            synthetic cases.
        norm: When true, every feature column is standardised with
            ``StandardScaler``; otherwise the raw values are kept. Defaults
            to ``False`` so that callers which omit it get the unscaled
            features instead of a ``TypeError``.
        verbose: Accepted for signature compatibility; currently unused.

    Returns:
        DataFrame with the feature columns followed by an integer ``label``
        column produced by ``LabelEncoder`` (classes are ordered
        alphabetically, so ``real`` -> 0 and ``synthetic`` -> 1).

    Raises:
        KeyError: If any of the excluded diagnostic columns is absent from
            the merged table.
    """
    # Load the CSV files into DataFrames
    real_cases_df = pd.read_csv(real_features_csv)
    synthetic_cases_df = pd.read_csv(synthetic_features_csv)

    # Tag every row with its origin before merging
    real_cases_df['label'] = 'real'
    synthetic_cases_df['label'] = 'synthetic'

    # ignore_index gives the merged table a fresh 0..n-1 index, which the
    # label assignment further down relies on for alignment
    combined_df = pd.concat([real_cases_df, synthetic_cases_df], ignore_index=True)

    # Diagnostic / bookkeeping columns that must not be used as features
    exclude_columns = [
        "diagnostics_Versions_PyRadiomics",
        "diagnostics_Versions_Numpy",
        "diagnostics_Versions_SimpleITK",
        "diagnostics_Versions_PyWavelet",
        "diagnostics_Versions_Python",
        "diagnostics_Configuration_Settings",
        "diagnostics_Configuration_EnabledImageTypes",
        "diagnostics_Image-original_Hash",
        "diagnostics_Image-original_Dimensionality",
        "diagnostics_Image-original_Spacing",
        "diagnostics_Image-original_Size",
        'diagnostics_Mask-original_Hash',
        'diagnostics_Mask-original_Spacing',
        'diagnostics_Mask-original_Size',
        'diagnostics_Mask-original_BoundingBox',
        'diagnostics_Mask-original_VoxelNum',
        'diagnostics_Mask-original_VolumeNum',
        'diagnostics_Mask-original_CenterOfMassIndex',
        'diagnostics_Mask-original_CenterOfMass',
        "Filename",
    ]
    combined_df = combined_df.drop(columns=exclude_columns)

    # Fill missing values with the column mean, numeric columns only
    numeric_columns = combined_df.select_dtypes(include='number').columns
    combined_df[numeric_columns] = combined_df[numeric_columns].fillna(combined_df[numeric_columns].mean())

    # The feature matrix is the same in both modes; only the scaling differs
    features = combined_df.drop(columns=['label'])
    if norm:
        features_scaled = StandardScaler().fit_transform(features)
    else:
        features_scaled = features

    # Rebuild a DataFrame (fit_transform returns a bare array) and re-attach the label
    combined_df_scaled = pd.DataFrame(features_scaled, columns=features.columns)
    combined_df_scaled['label'] = combined_df['label']

    # Encode the label column to numeric values
    label_encoder = LabelEncoder()
    combined_df_scaled['label'] = label_encoder.fit_transform(combined_df_scaled['label'])

    return combined_df_scaled


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns

def show_correlation(combined_df_scaled, verbose=False):
    """Rank all columns by the absolute value of their correlation with ``label``.

    Args:
        combined_df_scaled: DataFrame containing the feature columns and a
            numeric ``label`` column.
        verbose: When true, print the ten highest-ranked entries.

    Returns:
        Series indexed by column name holding the absolute correlation with
        ``label``, sorted in descending order. ``label`` itself is part of
        the result.
    """
    correlation_matrix = combined_df_scaled.corr()
    label_correlation = correlation_matrix['label']
    ranked = label_correlation.abs().sort_values(ascending=False)

    if not verbose:
        return ranked

    print("Top correlated features with the target variable:\n", ranked.head(10))
    return ranked

In [None]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, confusion_matrix, roc_curve
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree, DecisionTreeClassifier

def get_best_RF_model(combined_df_scaled, selected_features):
    """Grid-search a Random Forest on the selected features and report on it.

    The search uses 5-fold cross-validation with accuracy as the score.
    The returned estimator is the one ``GridSearchCV`` refits on the
    complete data set, so the accuracy / ROC AUC printed after the search
    are measured on the data the model was trained on. Only the
    cross-validation score is an out-of-sample estimate.

    Args:
        combined_df_scaled: DataFrame with the feature columns and a
            numeric ``label`` column.
        selected_features: Names of the feature columns to train on.

    Returns:
        The best ``RandomForestClassifier`` found by the grid search.
    """
    X = combined_df_scaled.drop(columns=['label'])[selected_features]
    y = combined_df_scaled['label']

    # Define the parameter grid for hyperparameter tuning
    param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10]
    }

    # Initialize the Random Forest model
    rf = RandomForestClassifier(random_state=42)

    # Perform grid search with cross-validation
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X, y)

    # Best parameters and score
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    print(f"Best Parameters: {best_params}")
    print(f"Best Cross-Validation Score: {best_score:.2f}")

    # Re-score the refit model on the data it was trained on. No held-out
    # split exists here, so these are training-set figures, not test figures.
    best_model = grid_search.best_estimator_
    y_pred_best = best_model.predict(X)
    y_pred_proba_best = best_model.predict_proba(X)[:, 1]

    accuracy_best = accuracy_score(y, y_pred_best)
    roc_auc_best = roc_auc_score(y, y_pred_proba_best)

    print(f"Training Accuracy with Best Model: {accuracy_best:.2f}")
    print(f"Training ROC AUC with Best Model: {roc_auc_best:.2f}")

    # Get feature importances
    feature_importances = best_model.feature_importances_
    importance_df = pd.DataFrame({'Feature': selected_features, 'Importance': feature_importances})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)

    # Display the top important features
    print("\nTop important features according to the Random Forest model:\n", importance_df.head(10))

    # Visualize feature importances
    plt.figure(figsize=(12, 6))
    sns.barplot(x='Importance', y='Feature', data=importance_df.head(10))
    plt.title('Top 10 Important Features')
    plt.show()

    # Plot the first tree of the forest as an example of what was learned
    tree = best_model.estimators_[0]
    feature_names = X.columns.tolist()
    plt.figure(figsize=(100,100))
    plot_tree(tree, filled=True, feature_names=feature_names, class_names=["Class 0", "Class 1"])
    plt.show()

    return best_model

def get_best_DT_model(combined_df_scaled, selected_features, verbose=False):
    """Grid-search a depth-1 decision tree and rank the features it uses.

    ``max_depth`` is fixed to 1, so every candidate is a single-split tree.
    The search uses 5-fold cross-validation with accuracy as the score.
    The returned estimator is the one ``GridSearchCV`` refits on the
    complete data set, so the returned accuracy is measured on the data
    the model was trained on. Only the cross-validation score (printed
    when ``verbose``) is an out-of-sample estimate.

    Args:
        combined_df_scaled: DataFrame with the feature columns and a
            numeric ``label`` column.
        selected_features: Either the string ``"All"`` (use every column
            except ``label``) or the names of the columns to train on.
        verbose: When true, also print the search results and the ROC AUC,
            and draw the importance bar plot and the tree.

    Returns:
        Tuple ``(best_model, importance_df, accuracy_best)``: the refit
        ``DecisionTreeClassifier``, a DataFrame with columns ``Feature`` and
        ``Importance`` sorted by descending importance, and the accuracy of
        ``best_model`` on the data passed in.
    """
    # "All" is a sentinel string. Checking the type first keeps array-like
    # inputs from being compared element-wise against it.
    use_all_features = isinstance(selected_features, str) and selected_features == "All"

    feature_frame = combined_df_scaled.drop(columns=['label'])
    X = feature_frame if use_all_features else feature_frame[selected_features]
    y = combined_df_scaled['label']

    # Define the parameter grid for hyperparameter tuning
    param_grid = {
        'criterion': ['gini', 'entropy'],  # Criterion to measure the quality of a split
        'max_depth': [1],  # Single split only
        'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
        'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
        'max_features': [None, 'sqrt', 'log2'],  # Number of features to consider when looking for the best split
    }

    # Initialize the Decision Tree model
    dt = DecisionTreeClassifier(random_state=42)

    # Perform grid search with cross-validation
    grid_search = GridSearchCV(estimator=dt, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X, y)

    # Best parameters and score
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    if verbose:
        print(f"Best Parameters: {best_params}")
        print(f"Best Cross-Validation Score: {best_score:.2f}")

    # Re-score the refit model on the data it was trained on. No held-out
    # split exists here, so these are training-set figures, not test figures.
    best_model = grid_search.best_estimator_
    y_pred_best = best_model.predict(X)
    y_pred_proba_best = best_model.predict_proba(X)[:, 1]

    accuracy_best = accuracy_score(y, y_pred_best)
    roc_auc_best = roc_auc_score(y, y_pred_proba_best)

    print(f"Training Accuracy with Best Model: {accuracy_best:.6f}")
    if verbose:
        print(f"Training ROC AUC with Best Model: {roc_auc_best:.6f}")

    # Get feature importances
    feature_importances = best_model.feature_importances_
    if use_all_features:
        selected_features = X.columns.tolist()

    importance_df = pd.DataFrame({'Feature': selected_features, 'Importance': feature_importances})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)

    # Display the top important features
    print(f"Top feature: {importance_df.head(1)}")
    if verbose:
        print("\nTop important features according to the Decision Tree model:\n", importance_df.head(10))

        # Visualize feature importances
        plt.figure(figsize=(12, 6))
        sns.barplot(x='Importance', y='Feature', data=importance_df.head(10))
        plt.title('Top 10 Important Features')
        plt.show()

        # Get feature names
        feature_names = X.columns.tolist()

        # Plot the Decision Tree
        plt.figure(figsize=(5, 5))
        plot_tree(best_model, filled=True, feature_names=feature_names, class_names=["Real", "Synth"], rounded=True)
        plt.title("Decision Tree")
        plt.show()

    return best_model, importance_df, accuracy_best

In [None]:
# Single-dataset run: compare the real clip-200 features with one synthetic
# dataset, export the merged table and fit the decision tree on all features.
real_features_csv_200 = "./metrics/RADIOMICS/Dataset910_HnN/tumour_radiomics_clip_200.csv"
synthetic_features_csv = "./metrics/RADIOMICS/Dataset500_HnN/tumour_radiomics.csv"
combined_df_scaled = get_combined_df_scaled(real_features_csv_200, synthetic_features_csv, norm=False, verbose=False)

# NOTE(review): the output file name mentions Dataset532 while the synthetic
# CSV loaded above is Dataset500 - confirm which one is intended.
combined_df_scaled.to_excel('./radiomics/tumour_radiomics_clip_200_Dataset532_HnN.xlsx', index=False)

# Nine columns most correlated with the label ('label' itself is removed).
# NOTE(review): selected_features is not passed to the model below, which is
# called with selected_features="All".
correlations = show_correlation(combined_df_scaled, verbose=False)
selected_features = correlations.head(10).index.tolist()
selected_features.remove('label')

best_model, importance_df, accuracy_best = get_best_DT_model(combined_df_scaled, selected_features="All", verbose=True)
# First row of importance_df = [feature name, importance] of the top-ranked feature
first_row_list = importance_df.iloc[0].tolist()
print(first_row_list)

#### Check all datasets

In [None]:
from os import listdir
from os.path import join

# For every synthetic "HnN" dataset, fit the single-split decision tree
# against the matching real features and remember its top feature.
RADIOMICS = "./metrics/RADIOMICS"
real_features_csv_200 = "./metrics/RADIOMICS/Dataset910_HnN/tumour_radiomics_clip_200.csv"
real_features_csv_1000 = "./metrics/RADIOMICS/Dataset910_HnN/tumour_radiomics_clip_1000.csv"

# Synthetic datasets that are compared against the clip-1000 real features;
# every other dataset is compared against the clip-200 real features.
datasets_clip_1000 = {
    "Dataset540_HnN", "Dataset541_HnN", "Dataset542_HnN", "Dataset543_HnN", "Dataset544_HnN",
    "Dataset570_HnN", "Dataset571_HnN", "Dataset572_HnN", "Dataset573_HnN", "Dataset574_HnN",
    "Dataset580_HnN", "Dataset581_HnN", "Dataset582_HnN", "Dataset583_HnN", "Dataset584_HnN",
}

# {dataset id: [name of the most important feature, accuracy of the tree]}
save_best_feature = {}
# Dataset ids that could not be processed
missing = []

for nnunet_id in listdir(RADIOMICS):
    if "HnN" not in nnunet_id:
        continue
    print(nnunet_id)
    synthetic_features_csv = join(RADIOMICS, nnunet_id, "tumour_radiomics.csv")
    if nnunet_id in datasets_clip_1000:
        real_features_csv = real_features_csv_1000
    else:
        real_features_csv = real_features_csv_200

    try:
        combined_df_scaled = get_combined_df_scaled(real_features_csv, synthetic_features_csv, norm=False, verbose=False)
        best_model, importance_df, accuracy_best = get_best_DT_model(combined_df_scaled, selected_features="All", verbose=False)
    except Exception as exc:
        # Stay best-effort (one broken dataset must not stop the sweep), but
        # record and report the failure instead of hiding it.
        missing.append(nnunet_id)
        print(f"Skipping {nnunet_id}: {type(exc).__name__}: {exc}")
        continue

    save_best_feature[nnunet_id] = [importance_df.iloc[0]['Feature'], accuracy_best]
    print("##################")

print(f"missing: {missing}")



In [None]:
from os import listdir
from os.path import join

# Same sweep as above, but visiting the datasets in ascending numeric order.
RADIOMICS = "./metrics/RADIOMICS"
real_features_csv_200 = "./metrics/RADIOMICS/Dataset910_HnN/tumour_radiomics_clip_200.csv"
real_features_csv_1000 = "./metrics/RADIOMICS/Dataset910_HnN/tumour_radiomics_clip_1000.csv"

# Synthetic datasets that are compared against the clip-1000 real features;
# every other dataset is compared against the clip-200 real features.
datasets_clip_1000 = {
    "Dataset540_HnN", "Dataset541_HnN", "Dataset542_HnN", "Dataset543_HnN", "Dataset544_HnN",
    "Dataset570_HnN", "Dataset571_HnN", "Dataset572_HnN", "Dataset573_HnN", "Dataset574_HnN",
    "Dataset580_HnN", "Dataset581_HnN", "Dataset582_HnN", "Dataset583_HnN", "Dataset584_HnN",
}

# {dataset id: [name of the most important feature, accuracy of the tree]}
save_glcm_Imc_features = {}
# Dataset ids that could not be processed
missing = []

# Order the folders by the number that follows "Dataset" in their name
nnunet_id_list = listdir(RADIOMICS)
sorted_list_with_condition = sorted(
    [x for x in nnunet_id_list if 'Dataset' in x],
    key=lambda x: int(x.split('Dataset')[1].split('_')[0])
)

for nnunet_id in sorted_list_with_condition:
    if "HnN" not in nnunet_id:
        continue
    print(nnunet_id)
    synthetic_features_csv = join(RADIOMICS, nnunet_id, "tumour_radiomics.csv")
    if nnunet_id in datasets_clip_1000:
        real_features_csv = real_features_csv_1000
    else:
        real_features_csv = real_features_csv_200

    try:
        combined_df_scaled = get_combined_df_scaled(real_features_csv, synthetic_features_csv, norm=False, verbose=False)
        best_model, importance_df, accuracy_best = get_best_DT_model(combined_df_scaled, selected_features="All", verbose=False)
    except Exception as exc:
        # Stay best-effort (one broken dataset must not stop the sweep), but
        # record and report the failure instead of hiding it.
        missing.append(nnunet_id)
        print(f"Skipping {nnunet_id}: {type(exc).__name__}: {exc}")
        continue

    save_glcm_Imc_features[nnunet_id] = [importance_df.iloc[0]['Feature'], accuracy_best]
    print("##################")

print(f"missing: {missing}")

In [None]:
# Display the {dataset id: [top feature, accuracy]} mapping filled by the sorted sweep
save_glcm_Imc_features

In [None]:
# Print every dataset whose top-ranked feature is wavelet-HHL_glcm_Imc1
for key in save_best_feature:
    top_feature = save_glcm_Imc_features[key][0]
    if top_feature != "wavelet-HHL_glcm_Imc1":
        continue
    print(f"{key}")



In [None]:
from collections import Counter

# Top-ranked feature of every dataset, in dictionary order
features_list_from_all = [entry[0] for entry in save_best_feature.values()]

# Number of datasets per top-ranked feature (shown as the cell output)
Counter(features_list_from_all)

#### Check all correlated features

In [None]:
from os import listdir
from os.path import join

# For every synthetic "HnN" dataset, collect the nine columns that correlate
# most strongly with the real/synthetic label.
RADIOMICS = "./metrics/RADIOMICS"
real_features_csv_200 = "./metrics/RADIOMICS/Dataset910_HnN/tumour_radiomics_clip_200.csv"
real_features_csv_1000 = "./metrics/RADIOMICS/Dataset910_HnN/tumour_radiomics_clip_1000.csv"

# Synthetic datasets that are compared against the clip-1000 real features;
# every other dataset is compared against the clip-200 real features.
datasets_clip_1000 = {
    "Dataset540_HnN", "Dataset541_HnN", "Dataset542_HnN", "Dataset543_HnN", "Dataset544_HnN",
    "Dataset570_HnN", "Dataset571_HnN", "Dataset572_HnN", "Dataset573_HnN", "Dataset574_HnN",
    "Dataset580_HnN", "Dataset581_HnN", "Dataset582_HnN", "Dataset583_HnN", "Dataset584_HnN",
}

# One list of feature names per successfully processed dataset
save_all_corr_features = []
# Dataset ids that could not be processed
missing = []

for nnunet_id in listdir(RADIOMICS):
    if "HnN" not in nnunet_id:
        continue
    print(nnunet_id)
    synthetic_features_csv = join(RADIOMICS, nnunet_id, "tumour_radiomics.csv")
    if nnunet_id in datasets_clip_1000:
        real_features_csv = real_features_csv_1000
    else:
        real_features_csv = real_features_csv_200

    try:
        # `norm` is passed explicitly: get_combined_df_scaled declares it as a
        # required parameter, and omitting it raised a TypeError for every
        # dataset, which the former bare `except` filed under "missing".
        combined_df_scaled = get_combined_df_scaled(real_features_csv, synthetic_features_csv, norm=False, verbose=False)
        correlations = show_correlation(combined_df_scaled)
        selected_features = correlations.head(10).index.tolist()
        # 'label' correlates perfectly with itself; drop it from the ranking
        selected_features.remove('label')
    except Exception as exc:
        # Stay best-effort, but report why a dataset was skipped
        missing.append(nnunet_id)
        print(f"Skipping {nnunet_id}: {type(exc).__name__}: {exc}")
        continue

    save_all_corr_features.append(selected_features)

print(f"missing: {missing}")


In [None]:
from collections import Counter

# Flatten the per-dataset feature lists into one long list
correct_save_all_corr_features = [
    element for list_ in save_all_corr_features for element in list_
]

print(f"Total of cases: {len(save_all_corr_features)}")

# Number of occurrences of each feature across all lists
frequency = Counter(correct_save_all_corr_features)

# Shown as the cell output
frequency

## Using only the most important feature

In [None]:
from os import listdir
from os.path import join

# For every synthetic "HnN" dataset, fit the single-split decision tree
# against the matching real features and remember its top feature.
RADIOMICS = "./metrics/RADIOMICS"
real_features_csv_200 = "./metrics/RADIOMICS/Dataset910_HnN/tumour_radiomics_clip_200.csv"
real_features_csv_1000 = "./metrics/RADIOMICS/Dataset910_HnN/tumour_radiomics_clip_1000.csv"

# Synthetic datasets that are compared against the clip-1000 real features;
# every other dataset is compared against the clip-200 real features.
datasets_clip_1000 = {
    "Dataset540_HnN", "Dataset541_HnN", "Dataset542_HnN", "Dataset543_HnN", "Dataset544_HnN",
    "Dataset570_HnN", "Dataset571_HnN", "Dataset572_HnN", "Dataset573_HnN", "Dataset574_HnN",
    "Dataset580_HnN", "Dataset581_HnN", "Dataset582_HnN", "Dataset583_HnN", "Dataset584_HnN",
}

# {dataset id: [name of the most important feature, accuracy of the tree]}
save_best_feature = {}
# Dataset ids that could not be processed
missing = []

for nnunet_id in listdir(RADIOMICS):
    if "HnN" not in nnunet_id:
        continue
    print(nnunet_id)
    synthetic_features_csv = join(RADIOMICS, nnunet_id, "tumour_radiomics.csv")
    if nnunet_id in datasets_clip_1000:
        real_features_csv = real_features_csv_1000
    else:
        real_features_csv = real_features_csv_200

    try:
        combined_df_scaled = get_combined_df_scaled(real_features_csv, synthetic_features_csv, norm=False, verbose=False)
        best_model, importance_df, accuracy_best = get_best_DT_model(combined_df_scaled, selected_features="All", verbose=False)
    except Exception as exc:
        # Stay best-effort (one broken dataset must not stop the sweep), but
        # record and report the failure instead of hiding it.
        missing.append(nnunet_id)
        print(f"Skipping {nnunet_id}: {type(exc).__name__}: {exc}")
        continue

    save_best_feature[nnunet_id] = [importance_df.iloc[0]['Feature'], accuracy_best]
    print("##################")

print(f"missing: {missing}")



In [None]:
# Top-ranked feature of every dataset, in dictionary order
all_features = [value[0] for value in save_best_feature.values()]

# Group the dataset ids by their top-ranked feature in a single pass.
# A dict keeps first-seen order, so the features (and the plots built from
# feature_keys) come out in the same order on every run; going through a
# set of strings made that order vary between interpreter sessions.
feature_keys = {}
for key, value in save_best_feature.items():
    feature_keys.setdefault(value[0], []).append(key)

# Distinct features, in the same order as feature_keys
features_to_search = list(feature_keys)

# Print the results
for feature, keys in feature_keys.items():
    print(f"Keys with feature '{feature}': {keys}")


In [None]:
import os
import pandas as pd
import plotly.express as px
import plotly.io as pio
def _load_feature_column(csv_path, feature, label):
    """Read ``feature`` from ``csv_path`` and tag every row with ``label`` in a ``File`` column."""
    column = pd.read_csv(csv_path)[[feature]].copy()
    column['File'] = label
    return column


figs = []

# Root folder holding one sub-folder of radiomics per dataset
directory_path = "./metrics/RADIOMICS"

# One box plot per top feature: real clip-200 first, then every synthetic
# dataset that shares the feature, then real clip-1000.
for feature, keys in feature_keys.items():
    results = [
        _load_feature_column(
            "./metrics/RADIOMICS/Dataset910_HnN/tumour_radiomics_clip_200.csv",
            feature,
            "Real_clip_200",
        )
    ]

    for folder_name in keys:
        file_path = os.path.join(directory_path, folder_name, 'tumour_radiomics.csv')
        if os.path.exists(file_path):
            results.append(_load_feature_column(file_path, feature, folder_name))

    results.append(
        _load_feature_column(
            "./metrics/RADIOMICS/Dataset910_HnN/tumour_radiomics_clip_1000.csv",
            feature,
            "Real_clip_1000",
        )
    )

    # Stack everything into one long table
    combined_results = pd.concat(results, ignore_index=True)

    # One box per dataset for the current feature
    fig = px.box(combined_results, x='File', y=feature, title=feature)
    fig.update_layout(
        xaxis_title="Dataset",
        yaxis_title=feature,
        xaxis={'tickangle': 45},
    )
    fig.show()



In [None]:
import os
import pandas as pd
import plotly.express as px
import plotly.io as pio

# Maps a dataset folder name to an HTML-formatted display label
# ("<model description> <br> <sampler name>").
# NOTE(review): Dataset550-554 carry the same labels as Dataset560-564, and
# Dataset570-574 the same as Dataset580-584 - confirm this is intended.
name_conversor = {
    "Dataset500_HnN": "M<sub>all_conv</sub><sup>WDM<sub>200</sub></sup> <br> DPM++ 2M",
    "Dataset501_HnN": "M<sub>all_conv</sub><sup>WDM<sub>200</sub></sup> <br> DPM++ 2M Karras",
    "Dataset502_HnN": "M<sub>all_conv</sub><sup>WDM<sub>200</sub></sup> <br> DPM++ 2M SDE",
    "Dataset503_HnN": "M<sub>all_conv</sub><sup>WDM<sub>200</sub></sup> <br> DPM++ 2M Karras SDE",
    "Dataset504_HnN": "M<sub>all_conv</sub><sup>WDM<sub>200</sub></sup> <br> Linear (1000)",
    "Dataset510_HnN": "M<sub>all_d</sub><sup>WDM<sub>200</sub></sup> <br> DPM++ 2M",
    "Dataset511_HnN": "M<sub>all_d</sub><sup>WDM<sub>200</sub></sup> <br> DPM++ 2M Karras",
    "Dataset512_HnN": "M<sub>all_d</sub><sup>WDM<sub>200</sub></sup> <br> DPM++ 2M SDE",
    "Dataset513_HnN": "M<sub>all_d</sub><sup>WDM<sub>200</sub></sup> <br> DPM++ 2M Karras SDE",
    "Dataset514_HnN": "M<sub>all_d</sub><sup>WDM<sub>200</sub></sup> <br> Linear (1000)",
    "Dataset520_HnN": "M<sub>all_w</sub><sup>WDM<sub>200</sub></sup> <br> DPM++ 2M",
    "Dataset521_HnN": "M<sub>all_w</sub><sup>WDM<sub>200</sub></sup> <br> DPM++ 2M Karras",
    "Dataset522_HnN": "M<sub>all_w</sub><sup>WDM<sub>200</sub></sup> <br> DPM++ 2M SDE",
    "Dataset523_HnN": "M<sub>all_w</sub><sup>WDM<sub>200</sub></sup> <br> DPM++ 2M Karras SDE",
    "Dataset524_HnN": "M<sub>all_w</sub><sup>WDM<sub>200</sub></sup> <br> Linear (1000)",
    "Dataset530_HnN": "M<sub>all_cat</sub><sup>DDPM<sub>200</sub></sup> <br> DPM++ 2M",
    "Dataset531_HnN": "M<sub>all_cat</sub><sup>DDPM<sub>200</sub></sup> <br> DPM++ 2M Karras",
    "Dataset532_HnN": "M<sub>all_cat</sub><sup>DDPM<sub>200</sub></sup> <br> DPM++ 2M SDE",
    "Dataset533_HnN": "M<sub>all_cat</sub><sup>DDPM<sub>200</sub></sup> <br> DPM++ 2M Karras SDE",
    "Dataset534_HnN": "M<sub>all_cat</sub><sup>DDPM<sub>200</sub></sup> <br> Linear (1000)",
    "Dataset540_HnN": "M<sub>all_cat</sub><sup>DDPM<sub>1000</sub></sup> <br> DPM++ 2M",
    "Dataset541_HnN": "M<sub>all_cat</sub><sup>DDPM<sub>1000</sub></sup> <br> DPM++ 2M Karras",
    "Dataset542_HnN": "M<sub>all_cat</sub><sup>DDPM<sub>1000</sub></sup> <br> DPM++ 2M SDE",
    "Dataset543_HnN": "M<sub>all_cat</sub><sup>DDPM<sub>1000</sub></sup> <br> DPM++ 2M Karras SDE",
    "Dataset544_HnN": "M<sub>all_cat</sub><sup>DDPM<sub>1000</sub></sup> <br> Linear (1000)",
    "Dataset550_HnN": "M<sub>ROI_d</sub><sup>WDM<sub>200</sub></sup> + M<sub>all_cat</sub><sup>DDPM<sub>200</sub></sup> <br> DPM++ 2M",
    "Dataset551_HnN": "M<sub>ROI_d</sub><sup>WDM<sub>200</sub></sup> + M<sub>all_cat</sub><sup>DDPM<sub>200</sub></sup> <br> DPM++ 2M Karras",
    "Dataset552_HnN": "M<sub>ROI_d</sub><sup>WDM<sub>200</sub></sup> + M<sub>all_cat</sub><sup>DDPM<sub>200</sub></sup> <br> DPM++ 2M SDE",
    "Dataset553_HnN": "M<sub>ROI_d</sub><sup>WDM<sub>200</sub></sup> + M<sub>all_cat</sub><sup>DDPM<sub>200</sub></sup> <br> DPM++ 2M Karras SDE",
    "Dataset554_HnN": "M<sub>ROI_d</sub><sup>WDM<sub>200</sub></sup> + M<sub>all_cat</sub><sup>DDPM<sub>200</sub></sup> <br> Linear (1000)",
    "Dataset560_HnN": "M<sub>ROI_d</sub><sup>WDM<sub>200</sub></sup> + M<sub>all_cat</sub><sup>DDPM<sub>200</sub></sup> <br> DPM++ 2M",
    "Dataset561_HnN": "M<sub>ROI_d</sub><sup>WDM<sub>200</sub></sup> + M<sub>all_cat</sub><sup>DDPM<sub>200</sub></sup> <br> DPM++ 2M Karras",
    "Dataset562_HnN": "M<sub>ROI_d</sub><sup>WDM<sub>200</sub></sup> + M<sub>all_cat</sub><sup>DDPM<sub>200</sub></sup> <br> DPM++ 2M SDE",
    "Dataset563_HnN": "M<sub>ROI_d</sub><sup>WDM<sub>200</sub></sup> + M<sub>all_cat</sub><sup>DDPM<sub>200</sub></sup> <br> DPM++ 2M Karras SDE",
    "Dataset564_HnN": "M<sub>ROI_d</sub><sup>WDM<sub>200</sub></sup> + M<sub>all_cat</sub><sup>DDPM<sub>200</sub></sup> <br> Linear (1000)",
    "Dataset570_HnN": "M<sub>ROI_d</sub><sup>WDM<sub>1000</sub></sup> + M<sub>all_cat</sub><sup>DDPM<sub>1000</sub></sup> <br> DPM++ 2M",
    "Dataset571_HnN": "M<sub>ROI_d</sub><sup>WDM<sub>1000</sub></sup> + M<sub>all_cat</sub><sup>DDPM<sub>1000</sub></sup> <br> DPM++ 2M Karras",
    "Dataset572_HnN": "M<sub>ROI_d</sub><sup>WDM<sub>1000</sub></sup> + M<sub>all_cat</sub><sup>DDPM<sub>1000</sub></sup> <br> DPM++ 2M SDE",
    "Dataset573_HnN": "M<sub>ROI_d</sub><sup>WDM<sub>1000</sub></sup> + M<sub>all_cat</sub><sup>DDPM<sub>1000</sub></sup> <br> DPM++ 2M Karras SDE",
    "Dataset574_HnN": "M<sub>ROI_d</sub><sup>WDM<sub>1000</sub></sup> + M<sub>all_cat</sub><sup>DDPM<sub>1000</sub></sup> <br> Linear (1000)",
    "Dataset580_HnN": "M<sub>ROI_d</sub><sup>WDM<sub>1000</sub></sup> + M<sub>all_cat</sub><sup>DDPM<sub>1000</sub></sup> <br> DPM++ 2M",
    "Dataset581_HnN": "M<sub>ROI_d</sub><sup>WDM<sub>1000</sub></sup> + M<sub>all_cat</sub><sup>DDPM<sub>1000</sub></sup> <br> DPM++ 2M Karras",
    "Dataset582_HnN": "M<sub>ROI_d</sub><sup>WDM<sub>1000</sub></sup> + M<sub>all_cat</sub><sup>DDPM<sub>1000</sub></sup> <br> DPM++ 2M SDE",
    "Dataset583_HnN": "M<sub>ROI_d</sub><sup>WDM<sub>1000</sub></sup> + M<sub>all_cat</sub><sup>DDPM<sub>1000</sub></sup> <br> DPM++ 2M Karras SDE",
    "Dataset584_HnN": "M<sub>ROI_d</sub><sup>WDM<sub>1000</sub></sup> + M<sub>all_cat</sub><sup>DDPM<sub>1000</sub></sup> <br> Linear (1000)"
}



# Define the directory path
directory_path = "./metrics/RADIOMICS"

# Entries of the 'File' column that belong to the clip-1000 group and are
# highlighted in red: the real clip-1000 reference plus the synthetic
# datasets derived from it.
nnunet_id_1000_origin = {"Real_clip_1000", "Dataset570_HnN", "Dataset571_HnN", "Dataset572_HnN", "Dataset573_HnN", "Dataset574_HnN",
                   "Dataset580_HnN", "Dataset581_HnN", "Dataset582_HnN", "Dataset583_HnN", "Dataset584_HnN",
                   "Dataset540_HnN", "Dataset541_HnN", "Dataset542_HnN", "Dataset543_HnN", "Dataset544_HnN"}

# The plotting loop compares these entries with the raw folder names stored
# in the 'File' column, so the list must hold folder names as well. It was
# previously built from display labels through `keys_list`, a name that is
# not defined in this notebook, so it raised a NameError and could never
# match a 'File' value.
nnunet_id_1000 = sorted(nnunet_id_1000_origin)

figs = []

# Legend entries that stay visible, with the text shown for each
legend_names = {"Real_clip_1000": "Clip 1000", "Real_clip_200": "Clip 200"}

for feature, keys in feature_keys.items():
    # Feature column of every dataset that shares this top feature
    results = []
    for folder_name in keys:
        file_path = os.path.join(directory_path, folder_name, 'tumour_radiomics.csv')
        if not os.path.exists(file_path):
            continue
        df = pd.read_csv(file_path)
        data = df[[feature]].copy()
        data.loc[:, 'File'] = folder_name
        results.append(data)

    if results:
        combined_results = pd.concat(results, ignore_index=True)
    else:
        combined_results = pd.DataFrame(columns=['File', feature])

    # Names that actually made it into the table
    actual_files = set(combined_results['File'])

    # Entries of nnunet_id_1000 that are present (the real reference is handled below)
    existing_nnunet_id_1000 = sorted(
        f for f in nnunet_id_1000 if f in actual_files and f != "Real_clip_1000"
    )
    nnunet_present = bool(existing_nnunet_id_1000)

    # Real_clip_200 is only shown next to at least one dataset outside nnunet_id_1000
    non_nnunet_cases = actual_files - set(nnunet_id_1000)
    real_clip_200_added = bool(non_nnunet_cases)
    if real_clip_200_added:
        file_path = "./metrics/RADIOMICS/Dataset910_HnN/tumour_radiomics_clip_200.csv"
        df = pd.read_csv(file_path)
        data = df[[feature]].copy()
        data.loc[:, 'File'] = "Real_clip_200"
        results.insert(0, data)  # keep the clip-200 reference in first position

    # Real_clip_1000 is only shown next to at least one dataset from nnunet_id_1000
    if nnunet_present:
        file_path = "./metrics/RADIOMICS/Dataset910_HnN/tumour_radiomics_clip_1000.csv"
        df = pd.read_csv(file_path)
        data = df[[feature]].copy()
        data.loc[:, 'File'] = "Real_clip_1000"
        results.append(data)
        existing_nnunet_id_1000.append("Real_clip_1000")

    # Rebuild the table now that the real references were added
    if results:
        combined_results = pd.concat(results, ignore_index=True)
    else:
        combined_results = pd.DataFrame(columns=['File', feature])
    actual_files = set(combined_results['File'])

    # Box order: Real_clip_200 (if present), other datasets, then the nnunet_id_1000 group
    sorted_files = []
    if real_clip_200_added:
        sorted_files.append("Real_clip_200")
    sorted_files += sorted(non_nnunet_cases)
    sorted_files += existing_nnunet_id_1000

    # Red for members of nnunet_id_1000, blue for everything else
    color_map = {}
    for folder in actual_files:
        color_map[folder] = 'red' if folder in nnunet_id_1000 else 'blue'

    # Create the box plot
    fig = px.box(
        combined_results,
        x='File',
        y=feature,
        title=feature,
        color='File',
        color_discrete_map=color_map,
        category_orders={"File": sorted_files},
    )

    # Keep only the two real references in the legend, under shorter names
    for trace in fig.data:
        if trace.name in legend_names:
            trace.name = legend_names[trace.name]
        else:
            trace.showlegend = False

    # Customize layout
    fig.update_layout(
        xaxis_title="Dataset",
        yaxis_title=feature,
        xaxis={'tickangle': 0},
        legend_title_text="Legend",
    )

    fig.show()
    # Keep the figure for the HTML export
    figs.append(fig)

## Save all figures to an HTML file
#pio.write_html(figs, file="./metrics/RADIOMICS/uitls/most_important_feature.html", auto_open=True)


In [None]:
import plotly.io as pio


# Write every figure, one after another, into a single HTML file.
# The file is opened once in "w" mode, so any previous content is replaced.
output_html = "./metrics/RADIOMICS/uitls/most_important_feature.html"

with open(output_html, "w", encoding="utf-8") as f:
    for index, fig in enumerate(figs):
        # Embed the plotly.js bundle with the first figure only; the later
        # fragments reuse it, which keeps the file from growing by one copy
        # of the library per figure.
        f.write(fig.to_html(full_html=False, include_plotlyjs=(index == 0)))


## Clustering plot

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

def get_combined_df_scaled(real_features_csv, synthetic_features_csv, norm, verbose=False):
    """Load real and synthetic radiomics CSVs into one labelled feature table.

    Args:
        real_features_csv: Path to the CSV holding features of the real cases.
        synthetic_features_csv: Path to the CSV holding features of the
            synthetic cases.
        norm: When truthy, the feature columns are standardised with
            ``StandardScaler``; otherwise they are passed through unchanged.
        verbose: Accepted for call compatibility; not used by this function.

    Returns:
        A DataFrame with the feature columns plus an integer ``label`` column
        produced by ``LabelEncoder`` (classes are sorted, so ``'real'`` -> 0
        and ``'synthetic'`` -> 1).

    Raises:
        KeyError: If any of the excluded diagnostic columns is missing.
    """
    # Read both sources (real first) and tag every row with its origin.
    tagged_frames = []
    for csv_path, origin in ((real_features_csv, 'real'), (synthetic_features_csv, 'synthetic')):
        frame = pd.read_csv(csv_path)
        frame['label'] = origin
        tagged_frames.append(frame)

    merged = pd.concat(tagged_frames, ignore_index=True)

    # Diagnostic / bookkeeping columns that are removed before modelling.
    exclude_columns = [
        "diagnostics_Versions_PyRadiomics",
        "diagnostics_Versions_Numpy",
        "diagnostics_Versions_SimpleITK",
        "diagnostics_Versions_PyWavelet",
        "diagnostics_Versions_Python",
        "diagnostics_Configuration_Settings",
        "diagnostics_Configuration_EnabledImageTypes",
        "diagnostics_Image-original_Hash",
        "diagnostics_Image-original_Dimensionality",
        "diagnostics_Image-original_Spacing",
        "diagnostics_Image-original_Size",
        'diagnostics_Mask-original_Hash',
        'diagnostics_Mask-original_Spacing',
        'diagnostics_Mask-original_Size',
        'diagnostics_Mask-original_BoundingBox',
        'diagnostics_Mask-original_VoxelNum',
        'diagnostics_Mask-original_VolumeNum',
        'diagnostics_Mask-original_CenterOfMassIndex',
        'diagnostics_Mask-original_CenterOfMass',
        "Filename"
    ]
    merged = merged.drop(columns=exclude_columns)

    # Missing numeric values are replaced by the mean of their column,
    # computed over real and synthetic rows together.
    numeric_cols = merged.select_dtypes(include='number').columns
    numeric_part = merged[numeric_cols]
    merged[numeric_cols] = numeric_part.fillna(numeric_part.mean())

    # Everything except the origin tag is a feature.
    feature_frame = merged.drop(columns=['label'])
    if norm:
        feature_values = StandardScaler().fit_transform(feature_frame)
    else:
        feature_values = feature_frame

    # Rebuild the table from the (optionally scaled) features and attach the
    # numerically encoded origin tag.
    result = pd.DataFrame(feature_values, columns=feature_frame.columns)
    result['label'] = merged['label']
    result['label'] = LabelEncoder().fit_transform(result['label'])

    return result


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from os import listdir
from os.path import join
import plotly.express as px
import plotly.subplots as sp
import numpy as np
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import accuracy_score 
from sklearn.tree import plot_tree, DecisionTreeClassifier

def plot_clustering(data):
    """Pick the two most discriminative features with a shallow tree and plot them.

    The function
      1. grid-searches a depth-limited ``DecisionTreeClassifier`` on all features,
      2. keeps the two features with the highest importance in the best tree,
      3. grid-searches again using only those two features and draws that tree,
      4. projects the two features with a 2-component PCA and builds three
         Plotly scatter plots (real + synthetic, real only, synthetic only)
         that share the same axis ranges,
      5. prints the accuracy of the second tree on the data it was fitted on.

    Args:
        data: DataFrame of feature columns plus a ``label`` column whose
            values are 0 (shown as "Real") and 1 (shown as "Synthetic").

    Returns:
        Tuple ``(fig1, fig2, fig3, top_2_features)``: the combined scatter
        (as returned by ``full_figure_for_development``), the real-only
        scatter, the synthetic-only scatter, and the pandas Index holding the
        two selected feature names in ascending order of importance.
    """
    # Separate features and labels
    X = data.drop('label', axis=1)
    y = data['label']

    # The tree depth is fixed; only the remaining hyperparameters are searched.
    max_depth = 2
    param_grid = {
        'criterion': ['gini', 'entropy'],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': [None, 'sqrt', 'log2'],
    }

    # First decision tree: used only to rank the features.
    tree_classifier = DecisionTreeClassifier(random_state=42, max_depth=max_depth)
    grid_search = GridSearchCV(tree_classifier, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X, y)
    best_params = grid_search.best_params_
    print(f"Best Hyperparameters: {best_params}")
    best_tree_classifier = grid_search.best_estimator_

    # argsort is ascending, so the last two positions are the two most
    # important features (second best first, best last).
    feature_importances = best_tree_classifier.feature_importances_
    top_2_features = X.columns[feature_importances.argsort()[-2:]]
    print("Selected Features:", top_2_features)

    # Second decision tree: trained on the two selected features only.
    X_selected = X[top_2_features]
    grid_search = GridSearchCV(tree_classifier, param_grid, cv=5, scoring='accuracy')
    grid_search.fit(X_selected, y)
    best_tree_classifier = grid_search.best_estimator_
    feature_importances = best_tree_classifier.feature_importances_

    # Draw the fitted two-feature tree.
    plt.figure(figsize=(12, 8))
    plot_tree(best_tree_classifier,
              feature_names=X_selected.columns,
              class_names=['Class 0', 'Class 1'],
              filled=True,
              rounded=True,
              fontsize=10)
    plt.title("Decision Tree Visualization")
    plt.show()

    # Report how the second tree weighs the two selected features.
    importance_df = pd.DataFrame({'Feature': X_selected.columns, 'Importance': feature_importances})
    importance_df = importance_df.sort_values(by='Importance', ascending=False)
    print(importance_df)

    # Project the two selected features onto two principal components.
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_selected)

    pca_df = pd.DataFrame(data=X_pca, columns=['PC1', 'PC2'])
    # Attach the labels by position: pca_df has a fresh 0..n-1 index, so an
    # index-aligned assignment would yield NaN labels whenever `data` carries
    # any other index.
    pca_df['label'] = y.map({0: 'Real', 1: 'Synthetic'}).to_numpy()

    # Scatter plot with both real and synthetic data
    fig1 = px.scatter(pca_df, x='PC1', y='PC2', color='label',
                      title='PCA of Synthetic vs. Real Data',
                      labels={'PC1': 'Principal Component 1', 'PC2': 'Principal Component 2'},
                      opacity=0.7)
    fig1.update_layout(
        title={'font': {'size': 72}},
        xaxis={'title': {'font': {'size': 64}}, 'tickfont': {'size': 64}},
        yaxis={'title': {'font': {'size': 64}}, 'tickfont': {'size': 64}},
        legend={'font': {'size': 72}},
    )

    # Real points are drawn more opaque than synthetic ones so both stay visible.
    fig1.for_each_trace(lambda trace: trace.update(marker={'size': 8, 'opacity': 0.9}
                                                   if trace.name == 'Real'
                                                   else {'size': 8, 'opacity': 0.4}))

    # Force layout computation so the auto-computed axis ranges can be read
    # back and reused for the two single-class plots.
    fig1 = fig1.full_figure_for_development()

    x_range = [fig1.layout.xaxis.range[0], fig1.layout.xaxis.range[1]]
    y_range = [fig1.layout.yaxis.range[0], fig1.layout.yaxis.range[1]]

    # Scatter plot with only real data, using the same scale
    fig2 = px.scatter(pca_df[pca_df['label'] == 'Real'], x='PC1', y='PC2',
                      title='PCA of Real Data',
                      labels={'PC1': 'Principal Component 1', 'PC2': 'Principal Component 2'})
    for trace in fig2.data:
        trace.update(marker={'size': 8, 'opacity': 0.7})
    fig2.update_xaxes(range=x_range)
    fig2.update_yaxes(range=y_range)

    # Scatter plot with only synthetic data, using the same scale
    fig3 = px.scatter(pca_df[pca_df['label'] == 'Synthetic'], x='PC1', y='PC2',
                      title='PCA of Synthetic Data',
                      labels={'PC1': 'Principal Component 1', 'PC2': 'Principal Component 2'})
    for trace in fig3.data:
        trace.update(marker={'size': 8, 'opacity': 0.7})
    fig3.update_xaxes(range=x_range)
    fig3.update_yaxes(range=y_range)

    # Accuracy of the two-feature tree on the same data it was fitted on
    # (a training score, not a held-out estimate).
    y_pred = best_tree_classifier.predict(X_selected)
    accuracy = accuracy_score(y, y_pred)
    print(f"Training Accuracy: {accuracy}")

    return fig1, fig2, fig3, top_2_features

##### All datasets together

In [None]:
# Start a fresh list; the "all datasets together" cells below append their figures to it.
figs = []

In [None]:
# Pool the tumour-label-1 radiomics of every selected synthetic dataset with the
# real reference and run the decision-tree / PCA analysis once on the pooled table.
RADIOMICS = "./metrics/RADIOMICS"
real_features_csv = "./metrics/RADIOMICS/Dataset960_BraTS/tumour_radiomics_1.csv"

nnunet_id_list = listdir(RADIOMICS)
# Order the dataset folders by the number that follows "Dataset" in their name.
sorted_list_with_condition = sorted(
    [x for x in nnunet_id_list if 'Dataset' in x],
    key=lambda x: int(x.split('Dataset')[1].split('_')[0])
)

# Collect the per-dataset tables and concatenate once at the end, so the result
# does not depend on which loop position happens to be the first non-skipped one.
pooled_frames = []

# NOTE(review): the [5:] slice skips the first five sorted folders; the reason
# is not visible in this cell - confirm it is still intended.
for nnunet_id in sorted_list_with_condition[5:]:
    # Keep BraTS datasets only, leaving out the real reference and the GAN dataset.
    if "BraTS" not in nnunet_id or nnunet_id == "Dataset960_BraTS" or nnunet_id == "Dataset996_BraTS_GAN":
        continue
    print(nnunet_id)
    synthetic_features_csv = join(RADIOMICS, nnunet_id, "tumour_radiomics_1.csv")

    # NOTE(review): every call re-loads the real cases, so the real rows are
    # repeated once per synthetic dataset in the pooled table - confirm intended.
    combined_df_scaled = get_combined_df_scaled(real_features_csv, synthetic_features_csv, norm=False, verbose=False)
    pooled_frames.append(combined_df_scaled)

    print("##################")

save_combined_df_scaled = pd.concat(pooled_frames).reset_index(drop=True)

# plot_clustering returns four values; the selected feature names are not needed here.
fig1, fig2, fig3, _ = plot_clustering(data=save_combined_df_scaled)
figs.extend([fig1, fig2, fig3])

In [None]:
# Pool the tumour-label-2 radiomics of every selected synthetic dataset with the
# real reference and run the decision-tree / PCA analysis once on the pooled table.
RADIOMICS = "./metrics/RADIOMICS"
# The real reference uses the same tumour label as the synthetic files read
# below (the per-dataset label-2 cell pairs the files the same way).
real_features_csv = "./metrics/RADIOMICS/Dataset960_BraTS/tumour_radiomics_2.csv"

nnunet_id_list = listdir(RADIOMICS)
# Order the dataset folders by the number that follows "Dataset" in their name.
sorted_list_with_condition = sorted(
    [x for x in nnunet_id_list if 'Dataset' in x],
    key=lambda x: int(x.split('Dataset')[1].split('_')[0])
)

# Collect the per-dataset tables and concatenate once at the end, so the result
# does not depend on which loop position happens to be the first non-skipped one.
pooled_frames = []

# NOTE(review): the [5:] slice skips the first five sorted folders; the reason
# is not visible in this cell - confirm it is still intended.
for nnunet_id in sorted_list_with_condition[5:]:
    # Keep BraTS datasets only, leaving out the real reference and the GAN dataset.
    if "BraTS" not in nnunet_id or nnunet_id == "Dataset960_BraTS" or nnunet_id == "Dataset996_BraTS_GAN":
        continue
    print(nnunet_id)
    synthetic_features_csv = join(RADIOMICS, nnunet_id, "tumour_radiomics_2.csv")

    # NOTE(review): every call re-loads the real cases, so the real rows are
    # repeated once per synthetic dataset in the pooled table - confirm intended.
    combined_df_scaled = get_combined_df_scaled(real_features_csv, synthetic_features_csv, norm=False, verbose=False)
    pooled_frames.append(combined_df_scaled)

    print("##################")

save_combined_df_scaled = pd.concat(pooled_frames).reset_index(drop=True)

# plot_clustering returns four values; the selected feature names are not needed here.
fig1, fig2, fig3, _ = plot_clustering(data=save_combined_df_scaled)
figs.extend([fig1, fig2, fig3])

In [None]:
# Pool the tumour-label-3 radiomics of every selected synthetic dataset with the
# real reference and run the decision-tree / PCA analysis once on the pooled table.
RADIOMICS = "./metrics/RADIOMICS"
# The real reference uses the same tumour label as the synthetic files read
# below (the per-dataset label-3 cell pairs the files the same way).
real_features_csv = "./metrics/RADIOMICS/Dataset960_BraTS/tumour_radiomics_3.csv"

nnunet_id_list = listdir(RADIOMICS)
# Order the dataset folders by the number that follows "Dataset" in their name.
sorted_list_with_condition = sorted(
    [x for x in nnunet_id_list if 'Dataset' in x],
    key=lambda x: int(x.split('Dataset')[1].split('_')[0])
)

# Collect the per-dataset tables and concatenate once at the end, so the result
# does not depend on which loop position happens to be the first non-skipped one.
pooled_frames = []

# NOTE(review): the [5:] slice skips the first five sorted folders; the reason
# is not visible in this cell - confirm it is still intended.
for nnunet_id in sorted_list_with_condition[5:]:
    # Keep BraTS datasets only, leaving out the real reference and the GAN dataset.
    if "BraTS" not in nnunet_id or nnunet_id == "Dataset960_BraTS" or nnunet_id == "Dataset996_BraTS_GAN":
        continue
    print(nnunet_id)
    synthetic_features_csv = join(RADIOMICS, nnunet_id, "tumour_radiomics_3.csv")

    # NOTE(review): every call re-loads the real cases, so the real rows are
    # repeated once per synthetic dataset in the pooled table - confirm intended.
    combined_df_scaled = get_combined_df_scaled(real_features_csv, synthetic_features_csv, norm=False, verbose=False)
    pooled_frames.append(combined_df_scaled)

    print("##################")

save_combined_df_scaled = pd.concat(pooled_frames).reset_index(drop=True)

# plot_clustering returns four values; the selected feature names are not needed here.
fig1, fig2, fig3, _ = plot_clustering(data=save_combined_df_scaled)
figs.extend([fig1, fig2, fig3])

##### Each dataset individually.

In [None]:
def contains_imaginary(value):
    """Tell whether a cell value looks like a complex (imaginary) number.

    Args:
        value: Any cell value (number, string, ``None``, ...).

    Returns:
        ``True`` for complex-typed values and for values that cannot be
        converted to ``float`` and whose text contains ``'i'`` or ``'j'``
        (case-insensitive); ``False`` otherwise.
    """
    # Complex-typed values are imaginary by definition; checking first avoids
    # relying on how float() treats them.
    if isinstance(value, complex):
        return True
    try:
        float(value)
    except (TypeError, ValueError):
        # TypeError covers None and other objects float() rejects outright;
        # ValueError covers strings that are not plain real numbers.
        text = str(value).lower()
        return 'j' in text or 'i' in text
    return False

In [None]:
# Per-dataset analysis for tumour label 1: each listed synthetic dataset is
# compared against the real reference on its own.
RADIOMICS = "./metrics/RADIOMICS"
real_features_csv = "./metrics/RADIOMICS/Dataset960_BraTS/tumour_radiomics_1.csv"

individual_figs = []
save_top_features_1 = {}  # dataset name -> the two features selected by plot_clustering

# Hand-picked datasets to analyse.
sorted_list_with_condition = ["Dataset602_BraTS", "Dataset613_BraTS", "Dataset996_BraTS_GAN"]
for nnunet_id in sorted_list_with_condition:
    # Guard kept in case the list above is edited: never compare the real
    # reference with itself and ignore non-BraTS folders.
    if "BraTS" not in nnunet_id or nnunet_id == "Dataset960_BraTS":
        continue
    print(nnunet_id)
    synthetic_features_csv = join(RADIOMICS, nnunet_id, "tumour_radiomics_1.csv")
    combined_df_scaled = get_combined_df_scaled(real_features_csv, synthetic_features_csv, norm=False, verbose=False)

    # Drop columns with imaginary values
    columns_with_imaginary = combined_df_scaled.applymap(contains_imaginary).any()
    combined_df_scaled = combined_df_scaled.drop(columns=columns_with_imaginary[columns_with_imaginary].index)

    fig1, fig2, fig3, top_2_features = plot_clustering(data=combined_df_scaled)
    save_top_features_1[nnunet_id] = top_2_features
    individual_figs.append([fig1, fig2, fig3])

    fig1.write_image(f"./metrics/RADIOMICS/uitls/MRI_label_1_clusterplot_{nnunet_id}.png", scale=0.8, width=2000, height=1200)

    print("##################")

import plotly.io as pio

# Only the combined (real + synthetic) figure of each dataset is exported.
individual_figs_to_save = [x[0] for x in individual_figs]

# Write all exported figures into one HTML file, one after another. Mode "w"
# truncates the target, so the file is produced in this single pass; plotly.js
# is embedded with the first fragment only instead of once per figure.
with open("./metrics/RADIOMICS/uitls/MRI_cluster_most_important_feature_tumour_1.html", "w", encoding="utf-8") as f:
    for position, fig in enumerate(individual_figs_to_save):
        f.write(fig.to_html(full_html=False, include_plotlyjs=(position == 0)))


In [None]:
# Per-dataset analysis for tumour label 2: each listed synthetic dataset is
# compared against the real reference on its own.
RADIOMICS = "./metrics/RADIOMICS"
real_features_csv = "./metrics/RADIOMICS/Dataset960_BraTS/tumour_radiomics_2.csv"

individual_figs = []
save_top_features_2 = {}  # dataset name -> the two features selected by plot_clustering

# Hand-picked datasets to analyse.
sorted_list_with_condition = ["Dataset602_BraTS", "Dataset613_BraTS", "Dataset996_BraTS_GAN"]
for nnunet_id in sorted_list_with_condition:
    # Guard kept in case the list above is edited: never compare the real
    # reference with itself and ignore non-BraTS folders.
    if "BraTS" not in nnunet_id or nnunet_id == "Dataset960_BraTS":
        continue
    print(nnunet_id)
    synthetic_features_csv = join(RADIOMICS, nnunet_id, "tumour_radiomics_2.csv")
    combined_df_scaled = get_combined_df_scaled(real_features_csv, synthetic_features_csv, norm=False, verbose=False)

    # Drop columns with imaginary values
    columns_with_imaginary = combined_df_scaled.applymap(contains_imaginary).any()
    combined_df_scaled = combined_df_scaled.drop(columns=columns_with_imaginary[columns_with_imaginary].index)

    fig1, fig2, fig3, top_2_features = plot_clustering(data=combined_df_scaled)
    save_top_features_2[nnunet_id] = top_2_features
    individual_figs.append([fig1, fig2, fig3])

    fig1.write_image(f"./metrics/RADIOMICS/uitls/MRI_label_2_clusterplot_{nnunet_id}.png", scale=0.8, width=2000, height=1200)

    print("##################")

import plotly.io as pio

# Only the combined (real + synthetic) figure of each dataset is exported.
individual_figs_to_save = [x[0] for x in individual_figs]

# Write all exported figures into one HTML file, one after another. Mode "w"
# truncates the target, so the file is produced in this single pass; plotly.js
# is embedded with the first fragment only instead of once per figure.
with open("./metrics/RADIOMICS/uitls/MRI_cluster_most_important_feature_tumour_2.html", "w", encoding="utf-8") as f:
    for position, fig in enumerate(individual_figs_to_save):
        f.write(fig.to_html(full_html=False, include_plotlyjs=(position == 0)))


In [None]:
# Per-dataset analysis for tumour label 3: each listed synthetic dataset is
# compared against the real reference on its own.
RADIOMICS = "./metrics/RADIOMICS"
real_features_csv = "./metrics/RADIOMICS/Dataset960_BraTS/tumour_radiomics_3.csv"

individual_figs = []
save_top_features_3 = {}  # dataset name -> the two features selected by plot_clustering

# Hand-picked datasets to analyse.
sorted_list_with_condition = ["Dataset602_BraTS", "Dataset613_BraTS", "Dataset996_BraTS_GAN"]
for nnunet_id in sorted_list_with_condition:
    # Guard kept in case the list above is edited: never compare the real
    # reference with itself and ignore non-BraTS folders.
    if "BraTS" not in nnunet_id or nnunet_id == "Dataset960_BraTS":
        continue
    print(nnunet_id)
    synthetic_features_csv = join(RADIOMICS, nnunet_id, "tumour_radiomics_3.csv")
    combined_df_scaled = get_combined_df_scaled(real_features_csv, synthetic_features_csv, norm=False, verbose=False)

    # Drop columns with imaginary values, as the label-1 and label-2 cells do,
    # so all three labels are preprocessed the same way before fitting.
    columns_with_imaginary = combined_df_scaled.applymap(contains_imaginary).any()
    combined_df_scaled = combined_df_scaled.drop(columns=columns_with_imaginary[columns_with_imaginary].index)

    fig1, fig2, fig3, top_2_features = plot_clustering(data=combined_df_scaled)
    save_top_features_3[nnunet_id] = top_2_features
    individual_figs.append([fig1, fig2, fig3])

    fig1.write_image(f"./metrics/RADIOMICS/uitls/MRI_label_3_clusterplot_{nnunet_id}.png", scale=0.8, width=2000, height=1200)

    print("##################")

import plotly.io as pio

# Only the combined (real + synthetic) figure of each dataset is exported.
individual_figs_to_save = [x[0] for x in individual_figs]

# Write all exported figures into one HTML file, one after another. Mode "w"
# truncates the target, so the file is produced in this single pass; plotly.js
# is embedded with the first fragment only instead of once per figure.
with open("./metrics/RADIOMICS/uitls/MRI_cluster_most_important_feature_tumour_3.html", "w", encoding="utf-8") as f:
    for position, fig in enumerate(individual_figs_to_save):
        f.write(fig.to_html(full_html=False, include_plotlyjs=(position == 0)))


In [None]:
from collections import Counter

def show_feat_freq(save_top_features):
    """Print how often each feature occurs across all entries of a mapping.

    Args:
        save_top_features: Mapping whose values are iterables of feature
            names (one iterable per key).

    Prints the total number of collected feature names (duplicates
    included), followed by one ``name: count`` line per distinct feature,
    most frequent first.
    """
    # Flatten the per-key feature collections into one list, keeping order.
    flattened = [
        feature
        for selected in save_top_features.values()
        for feature in selected
    ]

    print("Number of individual elements:", len(flattened))

    print("Frequency of each element, sorted by frequency:")
    # most_common() yields (element, count) pairs in descending count order.
    for name, occurrences in Counter(flattened).most_common():
        print(f"{name}: {occurrences}")
# Print the feature-frequency report for each of the three per-label
# dictionaries, with a separator line after every report.
show_feat_freq(save_top_features=save_top_features_1)
print("#################")
show_feat_freq(save_top_features=save_top_features_2)
print("#################")
show_feat_freq(save_top_features=save_top_features_3)
print("#################")

In [None]:
# Re-run the per-dataset analysis for the GAN dataset only, once per tumour
# label (1, 2, 3), writing one PNG per label.
RADIOMICS = "./metrics/RADIOMICS"

individual_figs = []
save_top_features_1 = {}

sorted_list_with_condition = ["Dataset602_BraTS", "Dataset613_BraTS", "Dataset996_BraTS_GAN"]
for nnunet_id in sorted_list_with_condition:
    # Only the GAN dataset is processed in this cell.
    if nnunet_id != "Dataset996_BraTS_GAN":
        continue
    print(nnunet_id)

    for tumour_label in (1, 2, 3):
        # Compare like with like: the real reference is taken from the same
        # tumour label as the synthetic file, matching the per-label cells
        # that write PNGs under the same file names.
        real_features_csv = f"./metrics/RADIOMICS/Dataset960_BraTS/tumour_radiomics_{tumour_label}.csv"
        synthetic_features_csv = join(RADIOMICS, nnunet_id, f"tumour_radiomics_{tumour_label}.csv")
        combined_df_scaled = get_combined_df_scaled(real_features_csv, synthetic_features_csv, norm=False, verbose=False)

        # Drop columns with imaginary values
        columns_with_imaginary = combined_df_scaled.applymap(contains_imaginary).any()
        combined_df_scaled = combined_df_scaled.drop(columns=columns_with_imaginary[columns_with_imaginary].index)

        fig1, fig2, fig3, top_2_features = plot_clustering(data=combined_df_scaled)
        # NOTE(review): the entry is keyed by dataset name only, so each label
        # overwrites the previous one and only the label-3 features remain
        # after the loop - confirm whether per-label storage was intended.
        save_top_features_1[nnunet_id] = top_2_features
        individual_figs.append([fig1, fig2, fig3])

        fig1.write_image(f"./metrics/RADIOMICS/uitls/MRI_label_{tumour_label}_clusterplot_{nnunet_id}.png", scale=0.8, width=2000, height=1200)

    print("##################")
