In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import SegmentationGuideline as SG

In [None]:
# Input file locations. Each "LOCAL PATH" placeholder must be replaced with a real path before running.
PATH_PANCANCER_RADIOMICS_CSV = "LOCAL PATH"  # CSV loaded into nki_radiomics_source
PATH_ERODED_CSV = "LOCAL PATH"  # CSV loaded into eroded_df
PATH_PANCANCER_STEPHAN_XLSX = "LOCAL PATH"  # Excel file loaded into new_naming_df (updated segment names)
PATH_PANCANCER_CLINICAL_XLSX = "LOCAL PATH"  # Excel file loaded into nki_clinical_source (source of tumtype)
PATH_MEGASET_CSV = "LOCAL PATH"  # CSV loaded into validation_source

# Discovery datasets

## Load radiomic datasets

In [None]:
# Load the discovery radiomics table and preview it.
nki_radiomics_source = pd.read_csv(PATH_PANCANCER_RADIOMICS_CSV)
nki_radiomics_source

In [None]:
# Load the eroded radiomics table and preview it.
eroded_df = pd.read_csv(PATH_ERODED_CSV)
eroded_df

In [None]:
# Load the table whose "Segment Name" column is used further down to overwrite the segment names.
new_naming_df = pd.read_excel(PATH_PANCANCER_STEPHAN_XLSX)
new_naming_df

## Data harmonization across versions + Adaptation to the new naming convention

### Index the datasets

In [None]:
# Key every row by (Anon_Name, Scan_Name, Segment ID) so the three tables can be compared on that key.
nki_radiomics_source = nki_radiomics_source.set_index(['Anon_Name', 'Scan_Name', 'Segment ID'])
nki_radiomics_source

In [None]:
# Key every row by (Anon_Name, Scan_Name, Segment ID) so the three tables can be compared on that key.
eroded_df = eroded_df.set_index(['Anon_Name', 'Scan_Name', 'Segment ID'])
eroded_df

In [None]:
# Key every row by (Anon_Name, Scan_Name, Segment ID) so the three tables can be compared on that key.
new_naming_df = new_naming_df.set_index(['Anon_Name', 'Scan_Name', 'Segment ID'])
new_naming_df

### Drop the extra rows (missing lesions)

In [None]:
# Rows of the discovery table whose key is absent from the naming table or from the eroded table
# (i.e. not present in both of them).
missing_df = nki_radiomics_source.loc[
    ~(
        nki_radiomics_source.index.isin(new_naming_df.index)
        & nki_radiomics_source.index.isin(eroded_df.index)
    )
]
missing_df

In [None]:
# Remove the rows collected in missing_df from the discovery table.
nki_radiomics_source = nki_radiomics_source.drop(index=missing_df.index)
nki_radiomics_source

In [None]:
# Keep only the eroded rows whose key is still present in the discovery table.
# The explicit .copy() makes the result an independent frame: it is modified in place in later cells
# (segment-name update, new columns), which on a filtered slice raises SettingWithCopyWarning.
eroded_df = eroded_df.loc[eroded_df.index.isin(nki_radiomics_source.index)].copy()
eroded_df

### Update the segment names

In [None]:
# Overwrite "Segment Name" with the non-missing values from new_naming_df, aligned on the shared row index.
# DataFrame.update writes into the frame itself; calling .update() on the Series returned by
# nki_radiomics_source["Segment Name"] is a chained in-place operation on a temporary object and
# does not reach the frame when pandas Copy-on-Write is active.
nki_radiomics_source.update(new_naming_df[["Segment Name"]])
nki_radiomics_source

In [None]:
# Overwrite "Segment Name" with the non-missing values from new_naming_df, aligned on the shared row index.
# DataFrame.update writes into the frame itself; calling .update() on the Series returned by
# eroded_df["Segment Name"] is a chained in-place operation on a temporary object and
# does not reach the frame when pandas Copy-on-Write is active.
eroded_df.update(new_naming_df[["Segment Name"]])
eroded_df

### Reset the dataset indexing

In [None]:
# Turn the three index levels back into ordinary columns.
nki_radiomics_source = nki_radiomics_source.reset_index()
nki_radiomics_source

In [None]:
# Turn the three index levels back into ordinary columns.
eroded_df = eroded_df.reset_index()
eroded_df

## Load NKI clinical dataset

This is to get the tumor type data

In [None]:
# Load the clinical table; only its Anon_Name and tumtype columns are used below.
nki_clinical_source = pd.read_excel(PATH_PANCANCER_CLINICAL_XLSX)
nki_clinical_source

## Add tumor type to nki_radiomics_source and eroded_df

In [None]:
# Lookup Series Anon_Name -> tumtype; when an Anon_Name occurs several times its first row is used.
Anon_Name_to_tumtype = (
    nki_clinical_source
    .drop_duplicates(subset="Anon_Name", keep="first")
    .set_index("Anon_Name")
    .loc[:, "tumtype"]
)
Anon_Name_to_tumtype

In [None]:
# Attach the tumor type of each row via its Anon_Name; names without a clinical entry become NaN.
nki_radiomics_source = nki_radiomics_source.assign(
    tumtype=nki_radiomics_source["Anon_Name"].map(Anon_Name_to_tumtype)
)
nki_radiomics_source

In [None]:
# Attach the tumor type of each row via its Anon_Name; names without a clinical entry become NaN.
eroded_df = eroded_df.assign(
    tumtype=eroded_df["Anon_Name"].map(Anon_Name_to_tumtype)
)
eroded_df

In [None]:
# Number of discovery rows that received no tumor type.
nki_radiomics_source['tumtype'].isna().sum()

In [None]:
# Number of eroded rows that received no tumor type.
eroded_df['tumtype'].isna().sum()

In [None]:
# Discard the rows that have no tumor type.
nki_radiomics_source = nki_radiomics_source.dropna(axis=0, how='any', subset=['tumtype'])
nki_radiomics_source

In [None]:
# Discard the rows that have no tumor type.
eroded_df = eroded_df.dropna(axis=0, how='any', subset=['tumtype'])
eroded_df

## Drop irrelevant tumor type categories

In [None]:
# Distinct tumor types present before filtering.
pd.unique(nki_radiomics_source["tumtype"])

In [None]:
# Distinct tumor types present before filtering.
pd.unique(eroded_df["tumtype"])

We drop the rows whose tumor type is 'other ' (matched with a trailing space, exactly as written in the filters below) or 'CUP'

In [None]:
# Rows that are about to be excluded because of their tumor type.
irrelevant_tumtypes_df = nki_radiomics_source[nki_radiomics_source["tumtype"].isin(['other ', 'CUP'])]
irrelevant_tumtypes_df

In [None]:
# Keep every row except those with tumor type 'other ' or 'CUP'.
# The explicit .copy() makes the result an independent frame, so the columns assigned to it in later
# cells are not writes onto a filtered slice (which pandas reports as SettingWithCopyWarning).
nki_radiomics_source = nki_radiomics_source[~nki_radiomics_source["tumtype"].isin(['other ', 'CUP'])].copy()
nki_radiomics_source

In [None]:
# Keep every row except those with tumor type 'other ' or 'CUP'.
# The explicit .copy() makes the result an independent frame, so the columns assigned to it in later
# cells are not writes onto a filtered slice (which pandas reports as SettingWithCopyWarning).
eroded_df = eroded_df[~eroded_df["tumtype"].isin(['other ', 'CUP'])].copy()
eroded_df

In [None]:
# Distinct tumor types remaining after filtering.
pd.unique(nki_radiomics_source["tumtype"])

In [None]:
# Distinct tumor types remaining after filtering.
pd.unique(eroded_df["tumtype"])

## Screen the segment names for errors (deviating from the guideline)

In [None]:
# Show the pattern that the segment names are matched against in the following cells.
SG.naming_pattern()

In [None]:
# Flag whether each segment name matches the guideline pattern.
# na=False records a missing segment name as "does not match"; without it the column would hold NaN
# for such rows, and the `~` negation used to list the non-matching rows fails on non-boolean values.
nki_radiomics_source['matches_structure'] = nki_radiomics_source['Segment Name'].str.match(SG.naming_pattern(), na=False)
nki_radiomics_source

In [None]:
# Flag whether each segment name matches the guideline pattern.
# na=False records a missing segment name as "does not match"; without it the column would hold NaN
# for such rows, and the `~` negation used to list the non-matching rows fails on non-boolean values.
eroded_df['matches_structure'] = eroded_df['Segment Name'].str.match(SG.naming_pattern(), na=False)
eroded_df

If correct, the output of the following 2 cells will be an empty dataframe

In [None]:
# Rows whose segment name does not follow the naming pattern.
non_matching_rows = nki_radiomics_source.loc[~nki_radiomics_source['matches_structure']]
non_matching_rows

In [None]:
# Rows whose segment name does not follow the naming pattern.
non_matching_rows_eroded = eroded_df.loc[~eroded_df['matches_structure']]
non_matching_rows_eroded

## Add segment name interpretation columns

In [None]:
def append_loc_details(df):
    """Add four lesion-description columns to ``df`` in place, derived from 'Segment Name'.

    Each segment name is passed to ``SG.interpret_segment``. When the result is a dict, its
    'prim_met_status', 'lesion_type', 'location' and 'lesion_num' entries fill the columns of the
    same name; for any other result the four columns hold None for that row.

    The frame is modified in place and nothing is returned.
    """
    parsed_column = 'Segment Parsed'

    def _field_getter(field):
        # Build the per-row extractor for one field of the parsed segment.
        def _extract(parsed):
            if isinstance(parsed, dict):
                return parsed.get(field)
            return None
        return _extract

    # Temporary column holding the raw interpretation of every segment name.
    df[parsed_column] = df['Segment Name'].apply(SG.interpret_segment)

    for field in ('prim_met_status', 'lesion_type', 'location', 'lesion_num'):
        df[field] = df[parsed_column].apply(_field_getter(field))

    # The raw interpretation is only an intermediate; remove it again.
    df.drop(columns=[parsed_column], inplace=True)

In [None]:
# Adds prim_met_status, lesion_type, location and lesion_num to the discovery table in place.
append_loc_details(nki_radiomics_source)
nki_radiomics_source

In [None]:
# Adds prim_met_status, lesion_type, location and lesion_num to the eroded table in place.
append_loc_details(eroded_df)
eroded_df

If correct, the output of each of the following 2 cells has to be False

In [None]:
# True if any of the four parsed columns still contains a missing value.
nki_radiomics_source.loc[:, ['prim_met_status', 'lesion_type', 'location', 'lesion_num']].isna().any().any()

In [None]:
# True if any of the four parsed columns still contains a missing value.
eroded_df.loc[:, ['prim_met_status', 'lesion_type', 'location', 'lesion_num']].isna().any().any()

# Validation dataset

## Load the dataset and drop the missing values in tumor type column

In [None]:
# Load the validation table and preview it.
validation_source = pd.read_csv(PATH_MEGASET_CSV)
validation_source

In [None]:
# Number of validation rows without a tumor type.
validation_source['tumtype'].isna().sum()

In [None]:
# Discard the rows that have no tumor type.
validation_source = validation_source.dropna(axis=0, how='any', subset=['tumtype'])
validation_source

## Screen the segment names for errors (deviating from the guideline)

In [None]:
# Flag whether each segment name matches the guideline pattern.
# na=False records a missing segment name as "does not match"; without it the column would hold NaN
# for such rows, and both the `~` negation and the boolean row filter in the next cells fail on NaN.
validation_source['matches_structure'] = validation_source['Segment Name'].str.match(SG.naming_pattern(), na=False)
validation_source

In [None]:
# Validation rows whose segment name does not follow the naming pattern.
non_matching_rows_v = validation_source.loc[~validation_source['matches_structure']]
non_matching_rows_v

In [None]:
# Keep only the rows whose segment name follows the naming pattern.
# The explicit .copy() makes the result an independent frame: append_loc_details adds columns to it
# in place in a later cell, which on a filtered slice raises SettingWithCopyWarning.
validation_source = validation_source[validation_source['matches_structure']].copy()
validation_source

## Add segment name interpretation columns

In [None]:
# Adds prim_met_status, lesion_type, location and lesion_num to the validation table in place.
append_loc_details(validation_source)
validation_source

In [None]:
# Number of validation rows for which no location could be derived.
validation_source['location'].isna().sum()

In [None]:
# Validation rows for which no location could be derived.
seg_error_V = validation_source.loc[validation_source['location'].isna()]
seg_error_V

In [None]:
# Remove the rows collected in seg_error_V (matched by row label).
validation_source = validation_source.loc[~validation_source.index.isin(seg_error_V.index)]
validation_source

If correct, the output of the following cell has to be False

In [None]:
# True if any of the four parsed columns still contains a missing value.
validation_source.loc[:, ['prim_met_status', 'lesion_type', 'location', 'lesion_num']].isna().any().any()

# Get overall Study participants' characteristics data

In [None]:
# Number of distinct Anon_Name values in the discovery table (missing values are not counted).
nki_study_size = nki_radiomics_source["Anon_Name"].nunique(dropna=True)
nki_study_size

In [None]:
# Number of distinct Case values in the validation table (missing values are not counted).
validation_study_size = validation_source["Case"].nunique(dropna=True)
validation_study_size

In [None]:
# Number of distinct Project values in the validation table (missing values are not counted).
validation_num_datasets = validation_source["Project"].nunique(dropna=True)
validation_num_datasets

In [None]:
# The distinct Project values themselves.
pd.unique(validation_source["Project"])

In [None]:
# Row count of the discovery table.
nki_total_num_lesion = nki_radiomics_source.shape[0]
nki_total_num_lesion

In [None]:
# Row count of the validation table.
validation_total_num_lesion = validation_source.shape[0]
validation_total_num_lesion

In [None]:
# Number of distinct tumor types in the discovery table.
nki_radiomics_source["tumtype"].nunique(dropna=True)

In [None]:
# The distinct tumor types in the discovery table.
pd.unique(nki_radiomics_source["tumtype"])

In [None]:
# Number of distinct tumor types in the validation table.
validation_source["tumtype"].nunique(dropna=True)

In [None]:
# The distinct tumor types in the validation table.
pd.unique(validation_source["tumtype"])

# Manage missing feature values + Standardize feature values

In [None]:
def radiomics_standardizer(df):
    """Return a copy of ``df`` with imputed and standardized radiomic feature columns.

    The feature block is every column located, by position, from 'original_shape_Elongation'
    through 'lbp-3D-k_ngtdm_Strength' (both included). Missing values in each of these columns
    are replaced by that column's median, then the block is scaled with a freshly fitted
    ``StandardScaler``.

    The returned frame lists the non-feature columns first, followed by the standardized feature
    columns, and has a new 0..n-1 row index. ``df`` itself is not modified.
    """

    def _fill_with_median(column):
        # Replace the gaps of one feature column with its own median.
        return column.fillna(column.median())

    first_pos = df.columns.get_loc('original_shape_Elongation')
    last_pos = df.columns.get_loc('lbp-3D-k_ngtdm_Strength')

    # iloc's upper bound is exclusive, so go one past the last feature column.
    imputed_features = df.iloc[:, first_pos:last_pos + 1].apply(_fill_with_median, axis=0)

    # The scaler returns a bare array; re-attach the feature names.
    scaled_features = pd.DataFrame(
        StandardScaler().fit_transform(imputed_features),
        columns=imputed_features.columns,
    )

    remaining_columns = df.drop(columns=imputed_features.columns)

    # Both parts are re-indexed from 0 so that they line up row by row when placed side by side.
    return pd.concat(
        [remaining_columns.reset_index(drop=True), scaled_features.reset_index(drop=True)],
        axis=1,
    )

In [None]:
# Preview of the discovery table before standardization.
nki_radiomics_source

In [None]:
# Median-impute and standardize the feature columns; the result is a new frame with a fresh row index.
nki_radiomics_source_st = radiomics_standardizer(nki_radiomics_source)
nki_radiomics_source_st

In [None]:
# Preview of the eroded table before standardization.
eroded_df

In [None]:
# Median-impute and standardize the feature columns; the result is a new frame with a fresh row index.
eroded_df_st = radiomics_standardizer(eroded_df)
eroded_df_st

In [None]:
# Preview of the validation table before standardization.
validation_source

In [None]:
# Median-impute and standardize the feature columns; the result is a new frame with a fresh row index.
# NOTE(review): radiomics_standardizer fits a new scaler on each call, so the validation set is scaled
# with its own statistics rather than those of the discovery set — confirm this is intended.
validation_source_st = radiomics_standardizer(validation_source)
validation_source_st

To confirm that there are no missing feature values, the output of the three cells checking the `_st` datasets has to be False

In [None]:
# Positional bounds of the feature block; the end bound is exclusive, hence the extra 1.
feature_start = nki_radiomics_source.columns.get_loc('original_shape_Elongation')
feature_end = 1 + nki_radiomics_source.columns.get_loc('lbp-3D-k_ngtdm_Strength')

# True if the un-standardized discovery table has any missing feature value.
nki_radiomics_source.iloc[:, feature_start:feature_end].isna().any().any()

In [None]:
# Positional bounds of the feature block; the end bound is exclusive, hence the extra 1.
feature_start = nki_radiomics_source_st.columns.get_loc('original_shape_Elongation')
feature_end = 1 + nki_radiomics_source_st.columns.get_loc('lbp-3D-k_ngtdm_Strength')

# True if the standardized discovery table has any missing feature value.
nki_radiomics_source_st.iloc[:, feature_start:feature_end].isna().any().any()

In [None]:
# Positional bounds of the feature block; the end bound is exclusive, hence the extra 1.
feature_start = validation_source.columns.get_loc('original_shape_Elongation')
feature_end = 1 + validation_source.columns.get_loc('lbp-3D-k_ngtdm_Strength')

# True if the un-standardized validation table has any missing feature value.
validation_source.iloc[:, feature_start:feature_end].isna().any().any()

In [None]:
# Positional bounds of the feature block; the end bound is exclusive, hence the extra 1.
feature_start = validation_source_st.columns.get_loc('original_shape_Elongation')
feature_end = 1 + validation_source_st.columns.get_loc('lbp-3D-k_ngtdm_Strength')

# True if the standardized validation table has any missing feature value.
validation_source_st.iloc[:, feature_start:feature_end].isna().any().any()

In [None]:
# Positional bounds of the feature block; the end bound is exclusive, hence the extra 1.
feature_start = eroded_df.columns.get_loc('original_shape_Elongation')
feature_end = 1 + eroded_df.columns.get_loc('lbp-3D-k_ngtdm_Strength')

# True if the un-standardized eroded table has any missing feature value.
eroded_df.iloc[:, feature_start:feature_end].isna().any().any()

In [None]:
# Positional bounds of the feature block; the end bound is exclusive, hence the extra 1.
feature_start = eroded_df_st.columns.get_loc('original_shape_Elongation')
feature_end = 1 + eroded_df_st.columns.get_loc('lbp-3D-k_ngtdm_Strength')

# True if the standardized eroded table has any missing feature value.
eroded_df_st.iloc[:, feature_start:feature_end].isna().any().any()

# Export the datasets to csv

In [None]:
# Destination folder for the processed datasets. Like the input paths at the top of the notebook,
# this is a placeholder that must be replaced with a real directory before running; the personal
# absolute path that used to be repeated in every call is no longer embedded in the notebook.
PATH_PROCESSED_DIR = "LOCAL PATH"

# Each standardized table is written to its own CSV file (the row index is written as the first column).
nki_radiomics_source_st.to_csv(f"{PATH_PROCESSED_DIR}/discovery_all_st.csv")
eroded_df_st.to_csv(f"{PATH_PROCESSED_DIR}/eroded_discovery_all_st.csv")
validation_source_st.to_csv(f"{PATH_PROCESSED_DIR}/validation_all_st.csv")