In [2]:
import pandas as pd

# Load clinical and omics data
clinical_data = pd.read_csv('data_clinical_patient.csv')
mrna_data = pd.read_csv('data_mrna_seq_v2_rsem.csv')
methylation_data = pd.read_csv('data_methylation_hm27_hm450_merged.csv')
cna_data = pd.read_csv('data_log2_cna.csv')


Transpose files

In [3]:
import pandas as pd

# Load the CNA data file
cna_data = pd.read_csv('data_log2_cna.csv')

# Separate the metadata columns from sample data
metadata = cna_data[['Hugo_Symbol', 'Entrez_Gene_Id', 'Cytoband']]
sample_data = cna_data.drop(columns=['Hugo_Symbol', 'Entrez_Gene_Id', 'Cytoband'])

# Transpose the sample data
sample_data_transposed = sample_data.T

# Set sample IDs as a column (originally column headers), and reset the index
sample_data_transposed.columns = metadata['Hugo_Symbol']  # Use Hugo_Symbol as column headers
sample_data_transposed.index.name = 'Sample_ID'
sample_data_transposed.reset_index(inplace=True)

# Print the total number of missing values before removing columns
total_missing_before = sample_data_transposed.isnull().sum().sum()
print(f"\nTotal number of missing values before removing columns in the data_cna DataFrame: {total_missing_before}")

# Remove columns that have ANY missing data
sample_data_transposed = sample_data_transposed.loc[:, sample_data_transposed.isnull().sum() == 0]

# Print the total number of missing values after dropping columns
total_missing_after = sample_data_transposed.isnull().sum().sum()
print(f"\nTotal number of missing values after removing columns in the data_cna DataFrame: {total_missing_after}")

# Save the transposed CNA data if needed
sample_data_transposed.to_csv('data_log2_cna.csv', index=False)
print("Transposed data (columns with no missing values) saved as 'data_log2_cna.csv'")



Total number of missing values before removing columns in the data_cna DataFrame: 0

Total number of missing values after removing columns in the data_cna DataFrame: 0
Transposed data (columns with no missing values) saved as 'data_log2_cna.csv'


In [4]:
# Load the mrna data file
mrna_data = pd.read_csv('data_mrna_seq_v2_rsem.csv')

# Separate the metadata columns from sample data
metadata = mrna_data[['Hugo_Symbol', 'Entrez_Gene_Id']]
sample_data = mrna_data.drop(columns=['Hugo_Symbol', 'Entrez_Gene_Id'])

# Transpose the sample data
sample_data_transposed = sample_data.T

# Set sample IDs as a column (originally column headers), and reset the index
sample_data_transposed.columns = metadata['Hugo_Symbol']  # Use Hugo_Symbol as column headers
sample_data_transposed.index.name = 'Sample_ID'
sample_data_transposed.reset_index(inplace=True)

# Save the transposed mrna data if needed
sample_data_transposed.to_csv('data_mrna_seq_v2_rsem.csv', index=False)
print("Transposed data saved as 'data_mrna_seq_v2_rsem.csv'")

Transposed data saved as 'data_mrna_seq_v2_rsem.csv'


In [5]:
# Load the DNA methylation data
methylation_data = pd.read_csv('data_methylation_hm27_hm450_merged.csv')

# Drop metadata columns
methylation_data = methylation_data.drop(columns=['ENTITY_STABLE_ID', 'DESCRIPTION', 'TRANSCRIPT_ID'])

# Transpose the data so that samples become rows and CpG sites become columns
# Set ENTITY_STABLE_ID as the index before transposing
methylation_data = methylation_data.set_index('NAME').T

# Rename the index to Sample_ID and reset the index to make Sample_ID a column
methylation_data.index.name = 'Sample_ID'
methylation_data.reset_index(inplace=True)

# Save the transposed DataFrame back to the original file
methylation_data.to_csv('data_methylation_hm27_hm450_merged.csv', index=False)
print("Transposed and saved the methylation data file.")

Transposed and saved the methylation data file.


In [6]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# List of omics data files that have already been transposed
omics_files = [
    'data_mrna_seq_v2_rsem.csv', 
    'data_methylation_hm27_hm450_merged.csv',
    'data_log2_cna.csv'
]

# Load, remove columns with missing values, normalize, and save each file
for file in omics_files:
    # Load the transposed data
    df = pd.read_csv(file)
    
    # Print the total missing values before dropping columns
    total_missing_before = df.isnull().sum().sum()
    print(f"\nTotal number of missing values before dropping columns in the {file} DataFrame: {total_missing_before}")
    
    # Remove columns that have ANY missing data
    df = df.loc[:, df.isnull().sum() == 0]
    
    # Print the total missing values after dropping columns
    total_missing_after_drop = df.isnull().sum().sum()
    print(f"Total number of missing values after dropping columns in the {file} DataFrame: {total_missing_after_drop}")
    
    # Apply StandardScaler for normalization on all columns except 'Sample_ID'
    scaler = StandardScaler()
    df.iloc[:, 1:] = scaler.fit_transform(df.iloc[:, 1:])  # Skip the 'Sample_ID' column
    
    # Save the normalized DataFrame back to the original file
    df.to_csv(file, index=False)
    print(f"Normalized and saved file: {file}")
    
    # Print the total missing values after normalization
    total_missing_after_norm = df.isnull().sum().sum()
    print(f"Total number of missing values after normalization in the {file} DataFrame: {total_missing_after_norm}")



Total number of missing values before dropping columns in the data_mrna_seq_v2_rsem.csv DataFrame: 1070496
Total number of missing values after dropping columns in the data_mrna_seq_v2_rsem.csv DataFrame: 0
Normalized and saved file: data_mrna_seq_v2_rsem.csv
Total number of missing values after normalization in the data_mrna_seq_v2_rsem.csv DataFrame: 0

Total number of missing values before dropping columns in the data_methylation_hm27_hm450_merged.csv DataFrame: 3247
Total number of missing values after dropping columns in the data_methylation_hm27_hm450_merged.csv DataFrame: 0
Normalized and saved file: data_methylation_hm27_hm450_merged.csv
Total number of missing values after normalization in the data_methylation_hm27_hm450_merged.csv DataFrame: 0

Total number of missing values before dropping columns in the data_log2_cna.csv DataFrame: 0
Total number of missing values after dropping columns in the data_log2_cna.csv DataFrame: 0
Normalized and saved file: data_log2_cna.csv
Tota

In [7]:
import pandas as pd

patient_data = pd.read_csv('data_clinical_patient.csv')
sample_data = pd.read_csv('data_clinical_sample.csv')

# Merge data_clinical_patient with the cleaned clinical_sample_data based on PATIENT_ID
merged_data = pd.merge(patient_data, sample_data, on='PATIENT_ID', how='inner')

# Display the merged DataFrame
merged_data.head()

# Optionally, save the merged data to a CSV file
merged_data.to_csv('data_clinical_patient.csv', index=False)
print("Merged data saved to 'data_clinical_patient.csv'")


Merged data saved to 'data_clinical_patient.csv'


In [8]:
import pandas as pd
# Drop empty columns
# Load the data_clinical_patient file
clinical_data = pd.read_csv('data_clinical_patient.csv')

# Drop columns that are entirely NaN
clinical_data = clinical_data.dropna(axis=1, how='all')

# Save the cleaned DataFrame back to the same file
clinical_data.to_csv('data_clinical_patient.csv', index=False)
print("Empty columns dropped and data saved to 'data_clinical_patient.csv'")

Empty columns dropped and data saved to 'data_clinical_patient.csv'


In [9]:
import pandas as pd

# ------------------ 1. Load Data Files ------------------ #
# Load clinical data (contains at least Patient_ID and Sample_ID)
clinical_data = pd.read_csv('data_clinical_patient.csv')

# Load sample acquisition data (has Sample_ID, METHOD_OF_SAMPLE_PROCUREMENT, and START_DATE as numeric surgery date)
sample_acq = pd.read_csv('data_timeline_sample_acquisition.csv')

# Load treatment timeline data (has Patient_ID, START_DATE as numeric treatment start date, TREATMENT_TYPE, and RADIATION_TYPE)
treatment = pd.read_csv('data_timeline_treatment.csv')

# ------------------ 2. Merge Surgery Information ------------------ #
# Merge sample acquisition info (surgery type and numeric surgery date) into clinical_data using Sample_ID.
clinical_data = clinical_data.merge(
    sample_acq[['Sample_ID', 'METHOD_OF_SAMPLE_PROCUREMENT', 'START_DATE']],
    on='Sample_ID',
    how='left'
)

# Rename columns for clarity: rename METHOD_OF_SAMPLE_PROCUREMENT to Surgery_Type and START_DATE to Surgery_Date.
clinical_data.rename(columns={
    'METHOD_OF_SAMPLE_PROCUREMENT': 'Surgery_Type',
    'START_DATE': 'Surgery_Date'
}, inplace=True)

# If Surgery_Type is missing, fill it with "Unknown"
clinical_data['Surgery_Type'].fillna('Unknown', inplace=True)

# ------------------ 3. Define Function to Derive Treatment Flags ------------------ #
def get_treatment_flags(row, treatment_df):
    """
    For a given clinical record (row), compute six flags based on treatment data:
      - Chemo_Given: 1 if any chemotherapy event is recorded.
      - Chemo_within_6wks: 1 if any chemotherapy start date is within 42 days of surgery.
      - Radiation_Internal_Given: 1 if any internal radiation event is recorded.
      - Radiation_Internal_within_6wks: 1 if internal radiation started within 42 days of surgery.
      - Radiation_External_Given: 1 if any external radiation event is recorded.
      - Radiation_External_within_6wks: 1 if external radiation started within 42 days of surgery.
    
    Assumes both Surgery_Date and treatment START_DATE are numeric values (e.g., days from a reference).
    """
    patient_id = row['PATIENT_ID']
    surgery_date = row['Surgery_Date']  # Numeric surgery date
    
    # If surgery date is missing, return zeros.
    if pd.isna(surgery_date):
        return pd.Series({
            'Chemo_Given': 0,
            'Chemo_within_6wks': 0,
            'Radiation_Internal_Given': 0,
            'Radiation_Internal_within_6wks': 0,
            'Radiation_External_Given': 0,
            'Radiation_External_within_6wks': 0
        })
    
    # Filter treatment events for this patient.
    patient_treatments = treatment_df[treatment_df['PATIENT_ID'] == patient_id]
    
    # Initialize flags.
    chemo_given = 0
    chemo_within = 0
    rad_internal_given = 0
    rad_internal_within = 0
    rad_external_given = 0
    rad_external_within = 0
    
    # Define the threshold for "within 6 weeks" as 42 days.
    threshold = 42
    
    for _, tr in patient_treatments.iterrows():
        treat_type = str(tr['TREATMENT_TYPE']).lower()  # Normalize treatment type text.
        treat_date = tr['START_DATE']  # Numeric treatment start date.
        
        # Calculate the time difference.
        if pd.notna(treat_date) and pd.notna(surgery_date):
            diff = treat_date - surgery_date
        else:
            diff = None
        
        # Check for chemotherapy.
        if 'chemo' in treat_type:
            chemo_given = 1
            if diff is not None and diff >= 0 and diff <= threshold:
                chemo_within = 1
        
        # Check for radiation treatment.
        elif 'radiation' in treat_type:
            rad_type = str(tr['RADIATION_TYPE']).lower() if pd.notna(tr['RADIATION_TYPE']) else ""
            if 'internal' in rad_type:
                rad_internal_given = 1
                if diff is not None and diff >= 0 and diff <= threshold:
                    rad_internal_within = 1
            elif 'external' in rad_type:
                rad_external_given = 1
                if diff is not None and diff >= 0 and diff <= threshold:
                    rad_external_within = 1
    
    return pd.Series({
        'Chemo_Given': chemo_given,
        'Chemo_within_6wks': chemo_within,
        'Radiation_Internal_Given': rad_internal_given,
        'Radiation_Internal_within_6wks': rad_internal_within,
        'Radiation_External_Given': rad_external_given,
        'Radiation_External_within_6wks': rad_external_within
    })

# ------------------ 4. Apply the Treatment Flag Function ------------------ #
# Apply the function row by row to compute the treatment flags for each clinical record.
treatment_flags = clinical_data.apply(lambda row: get_treatment_flags(row, treatment), axis=1)
clinical_data = pd.concat([clinical_data, treatment_flags], axis=1)

# ------------------ 5. Remove the Surgery_Date Column ------------------ #
# Drop the Surgery_Date column since you don't want it in the final clinical data.
clinical_data.drop(columns=['Surgery_Date'], inplace=True)

# ------------------ 6. Save the Final Clinical Data ------------------ #
# Save the final clinical dataset that now includes:
# - Surgery_Type (from sample acquisition, with missing filled as "Unknown")
# - Treatment flags (Chemo_Given, Chemo_within_6wks, Radiation_Internal_Given, Radiation_Internal_within_6wks, 
#   Radiation_External_Given, Radiation_External_within_6wks)
clinical_data.to_csv('data_clinical_patient.csv', index=False)
print("Final clinical data with treatment flags saved to 'data_clinical_patient.csv'")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  clinical_data['Surgery_Type'].fillna('Unknown', inplace=True)


Final clinical data with treatment flags saved to 'data_clinical_patient.csv'


In [10]:
import pandas as pd

# Load the clinical data
clinical_data = pd.read_csv('data_clinical_patient.csv')

# Identify columns that need to be split (columns with values like '0:DiseaseFree')
columns_to_split = ['PFS_STATUS']  # Add other relevant columns

# Split the values in the specified columns and keep only the numeric/categorical part
for col in columns_to_split:
    clinical_data[col] = clinical_data[col].astype(str).str.split(':').str[0]

# Convert columns to appropriate types (e.g., integer if applicable)
for col in columns_to_split:
    clinical_data[col] = pd.to_numeric(clinical_data[col], errors='coerce')

# Save the cleaned DataFrame
clinical_data.to_csv('data_clinical_patient.csv', index=False)
print("Values with descriptions have been split, and data saved to 'data_clinical_patient.csv'.")


Values with descriptions have been split, and data saved to 'data_clinical_patient.csv'.


In [11]:
import pandas as pd

# Load the clinical data
clinical_data = pd.read_csv('data_clinical_patient.csv')

# Fill PERSON_NEOPLASM_CANCER_STATUS based on DFS_STATUS and PFS_STATUS
clinical_data['PERSON_NEOPLASM_CANCER_STATUS'] = clinical_data['PERSON_NEOPLASM_CANCER_STATUS'].fillna(
    clinical_data.apply(
        lambda row: 'With Tumor' if row['DFS_STATUS'] == 1 or row['PFS_STATUS'] == 1 else row['PERSON_NEOPLASM_CANCER_STATUS'],
        axis=1
    )
)

# Fill DAYS_TO_BIRTH based on AGE
clinical_data['DAYS_TO_BIRTH'] = clinical_data['DAYS_TO_BIRTH'].fillna(
    clinical_data['AGE'].apply(lambda x: -x * 365.25 if pd.notna(x) else pd.NA)
)

# Fill NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT based on DFS_STATUS and PFS_STATUS
clinical_data['NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT'] = clinical_data['NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT'].fillna(
    clinical_data.apply(
        lambda row: 'YES' if row['DFS_STATUS'] == 1 or row['PFS_STATUS'] == 1 else row['NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT'],
        axis=1
    )
)

# Save the updated DataFrame
clinical_data.to_csv('data_clinical_patient.csv', index=False)
print("Imputed missing values and saved to 'data_clinical_patient.csv'.")


Imputed missing values and saved to 'data_clinical_patient.csv'.


In [12]:
import pandas as pd

# Load the data
clinical_data = pd.read_csv('data_clinical_patient.csv')

# List of metadata or irrelevant columns
columns_to_drop = [
    'OTHER_PATIENT_ID', 'FORM_COMPLETION_DATE', 'ICD_10', 
    'ICD_O_3_HISTOLOGY', 'ICD_O_3_SITE', 'INFORMED_CONSENT_VERIFIED', 'IN_PANCANPATHWAYS_FREEZE',
    'TISSUE_PROSPECTIVE_COLLECTION_INDICATOR',
    'AJCC_STAGING_EDITION',
    'TISSUE_RETROSPECTIVE_COLLECTION_INDICATOR',
    'TISSUE_SOURCE_SITE_CODE',
    'TISSUE_SOURCE_SITE',
    'SAMPLE_TYPE',
    'CANCER_TYPE', 'CANCER_TYPE_ACRONYM', # We have cancer type detailed
    'SOMATIC_STATUS',
    'PATIENT_ID',
    'DAYS_LAST_FOLLOWUP',
    'DAYS_TO_INITIAL_PATHOLOGIC_DIAGNOSIS',
    'OS_STATUS',
    'OS_MONTHS',
    'DSS_STATUS',
    'DSS_MONTHS',
    'DFS_MONTHS',
    'DFS_STATUS',
    'SEX',
    'NEW_TUMOR_EVENT_AFTER_INITIAL_TREATMENT'
]

# Drop the irrelevant columns
clinical_data_cleaned = clinical_data.drop(columns=columns_to_drop)

# Save the cleaned DataFrame back to the same file
clinical_data_cleaned.to_csv('data_clinical_patient.csv', index=False)
print("Cleaned clinical data saved, with irrelevant columns removed.")


Cleaned clinical data saved, with irrelevant columns removed.


In [13]:
import pandas as pd 
# Load and preprocess the data
clinical_data = pd.read_csv('data_clinical_patient.csv')

# Convert Disease Free Months to numeric
clinical_data['PFS_MONTHS'] = pd.to_numeric(clinical_data['PFS_MONTHS'], errors='coerce')

# Create a new column for clear categorization
def categorize_recurrence(row):
    if pd.isna(row['PFS_STATUS']) or pd.isna(row['PFS_MONTHS']):
        return 'Unknown/Missing'
    elif row['PFS_STATUS'] == 1 and row['PFS_MONTHS'] <= 60:
        return 'Recurred/Progressed within 60 months'
    elif row['PFS_STATUS'] == 1 and row['PFS_MONTHS'] > 60:
        return 'Recurred/Progressed after 60 months'
    elif row['PFS_STATUS'] == 0 and row['PFS_MONTHS'] <= 60:
        return 'No recurrence yet, but follow-up ≤ 60 months'
    elif row['PFS_STATUS'] == 0 and row['PFS_MONTHS'] > 60:
        return 'No recurrence with > 60 months follow-up'
    else:
        return 'Other'

clinical_data['Recurrence_Category'] = clinical_data.apply(categorize_recurrence, axis=1)

# For binary classification (what you'll likely want for modeling)
clinical_data['Recurrence_Binary'] = clinical_data.apply(
    lambda row: 1 if (row['PFS_STATUS'] == 1 and row['PFS_MONTHS'] <= 60) else 0, 
    axis=1
)

# Display the distributions
print("Detailed Categories:")
print(clinical_data['Recurrence_Category'].value_counts())
print("\nBinary Classification:")
print(clinical_data['Recurrence_Binary'].value_counts())

# Save categorized data if needed
clinical_data.to_csv('categorized_clinical_data.csv', index=False)

Detailed Categories:
Recurrence_Category
No recurrence yet, but follow-up ≤ 60 months    320
Recurred/Progressed within 60 months            114
No recurrence with > 60 months follow-up         89
Recurred/Progressed after 60 months               5
Unknown/Missing                                   1
Name: count, dtype: int64

Binary Classification:
Recurrence_Binary
0    415
1    114
Name: count, dtype: int64


In [14]:
# 1. Load the clinical data
clinical_data = pd.read_csv('data_clinical_patient.csv')

# Make sure PFS_MONTHS is numeric if needed
clinical_data['PFS_MONTHS'] = pd.to_numeric(clinical_data['PFS_MONTHS'], errors='coerce')

# 2. Apply categorization function
clinical_data['Recurrence_Category'] = clinical_data.apply(categorize_recurrence, axis=1)

# 3. Define the categories you want to remove
categories_to_remove = [
    "No recurrence yet, but follow-up ≤ 60 months",
    "Recurred/Progressed after 60 months",
    "Unknown/Missing"
]

# 4. Build a condition to KEEP rows NOT in those categories
keep_condition = ~clinical_data['Recurrence_Category'].isin(categories_to_remove)

clinical_data_filtered = clinical_data[keep_condition].copy()

num_kept = len(clinical_data_filtered)
num_removed = len(clinical_data) - num_kept
print(f"Kept {num_kept} rows, removed {num_removed} rows (categories: {categories_to_remove}).")

# 5. Extract valid Sample IDs
valid_sample_ids = clinical_data_filtered['Sample_ID'].unique()
print(f"Number of valid sample IDs: {len(valid_sample_ids)}")

# Remove the Recurrence_Category column after filtering
clinical_data_filtered.drop(columns=['Recurrence_Category'], inplace=True)

# 6. Save the filtered clinical data
clinical_data_filtered.to_csv('data_clinical_patient.csv', index=False)
print("Filtered clinical data saved to 'data_clinical_patient.csv'")

# 7. Filter each omics dataset
omics_files = [
    'data_mrna_seq_v2_rsem.csv', 
    'data_methylation_hm27_hm450_merged.csv',
    'data_log2_cna.csv'
]

for file in omics_files:
    omics_df = pd.read_csv(file)
    before_count = len(omics_df)
    
    # Keep only rows whose Sample_ID is in the filtered clinical data
    omics_df = omics_df[omics_df['Sample_ID'].isin(valid_sample_ids)].copy()
    after_count = len(omics_df)
    
    print(f"{file}: Removed {before_count - after_count} rows; kept {after_count} rows.")
    
    # Save the filtered omics data
    omics_df.to_csv(file, index=False)
    print(f"Filtered omics data saved to '{file}'")

print("All datasets have been filtered to remove 'No recurrence yet, but follow-up ≤ 60 months' and 'Unknown/Missing'.")

Kept 203 rows, removed 326 rows (categories: ['No recurrence yet, but follow-up ≤ 60 months', 'Recurred/Progressed after 60 months', 'Unknown/Missing']).
Number of valid sample IDs: 203
Filtered clinical data saved to 'data_clinical_patient.csv'
data_mrna_seq_v2_rsem.csv: Removed 325 rows; kept 202 rows.
Filtered omics data saved to 'data_mrna_seq_v2_rsem.csv'
data_methylation_hm27_hm450_merged.csv: Removed 326 rows; kept 203 rows.
Filtered omics data saved to 'data_methylation_hm27_hm450_merged.csv'
data_log2_cna.csv: Removed 323 rows; kept 200 rows.
Filtered omics data saved to 'data_log2_cna.csv'
All datasets have been filtered to remove 'No recurrence yet, but follow-up ≤ 60 months' and 'Unknown/Missing'.


In [16]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

# -------------------- 1. Load & Filter Clinical Data -------------------- #
clinical_data = pd.read_csv('data_clinical_patient.csv')

# If PFS_MONTHS is critical, remove rows missing it
if clinical_data['PFS_MONTHS'].isnull().any():
    print("Warning: 'PFS_MONTHS' contains missing values. These rows will be excluded.")
    clinical_data = clinical_data.dropna(subset=['PFS_MONTHS'])

# Ensure PFS_MONTHS is numeric
clinical_data['PFS_MONTHS'] = pd.to_numeric(clinical_data['PFS_MONTHS'], errors='coerce')

# -------------------- 2. Create Survival_Label from PFS -------------------- #
# If patient progresses by 60 months => label = 1, otherwise 0
clinical_data['Survival_Label'] = clinical_data['PFS_MONTHS'].apply(
    lambda x: 1 if x <= 60 else 0
)

# -------------------- 3. (Optional) Drop Unused Columns -------------------- #
# If you do not want to keep DFS_MONTHS, OS_MONTHS, or OS_STATUS as features:
# clinical_data.drop(columns=['DFS_MONTHS', 'OS_MONTHS', 'OS_STATUS'], inplace=True)

# -------------------- 4. Identify Categorical vs Numerical -------------------- #
categorical_columns_numeric = ['Chemo_Given', 
                               'Chemo_within_6wks', 'Radiation_Internal_Given',
                               'Radiation_Internal_within_6wks', 'Radiation_External_Given',
                               'Radiation_External_within_6wks']
categorical_columns = [
    'SUBTYPE', 'ETHNICITY', 'HISTORY_NEOADJUVANT_TRTYN', 'PERSON_NEOPLASM_CANCER_STATUS',
    'PRIOR_DX', 'RACE', 'RADIATION_THERAPY', 'GENETIC_ANCESTRY_LABEL', 'ONCOTREE_CODE',
    'CANCER_TYPE_DETAILED', 'TUMOR_TYPE', 'GRADE', 'TUMOR_TISSUE_SITE', 'Surgery_Type'
] + categorical_columns_numeric

# Exclude the Survival_Label from one-hot encoding
categorical_columns = [col for col in categorical_columns if col != 'Survival_Label']

# -------------------- 5. Exclude Purely Binary Columns from One-Hot Encoding -------------------- #
binary_columns = []
for col in categorical_columns:
    unique_vals = clinical_data[col].dropna().unique()
    if len(unique_vals) == 2 and set(unique_vals).issubset({0,1}):
        binary_columns.append(col)

multi_category_columns = list(set(categorical_columns) - set(binary_columns))

# -------------------- 6. One-Hot Encode Only Multi-Category Columns -------------------- #
clinical_data = pd.get_dummies(clinical_data, columns=multi_category_columns)

# -------------------- 7. KNN Imputation for Numerical Columns -------------------- #
# Identify numeric columns that might need KNN Imputation
numerical_columns = clinical_data.select_dtypes(include=['number']).columns.difference(categorical_columns_numeric)

# Ensure Survival_Label is NOT included in numerical columns for imputation or normalization
numerical_columns = numerical_columns.difference(['Survival_Label'])

imputer = KNNImputer(n_neighbors=5)
clinical_data[numerical_columns] = imputer.fit_transform(clinical_data[numerical_columns])

# Ensure one-hot-encoded columns are integers (0 or 1)
one_hot_encoded_columns = [
    col for col in clinical_data.columns
    if clinical_data[col].dropna().isin([0, 1]).all()
]

valid_binary_cols = []
for col in one_hot_encoded_columns:
    # Check if column has any NA/inf
    if clinical_data[col].isna().any() or not clinical_data[col].isin([0, 1]).all():
        print(f"Dropping {col} because it has NA or non-0/1 values.")
    else:
        valid_binary_cols.append(col)

# Now cast only the valid binary columns
clinical_data[valid_binary_cols] = clinical_data[valid_binary_cols].astype(int)

# -------------------- 8. Normalize Numerical Data (EXCLUDING Survival_Label) -------------------- #
scaler = StandardScaler()
clinical_data[numerical_columns] = scaler.fit_transform(clinical_data[numerical_columns])

# -------------------- 9. Save the Preprocessed Data -------------------- #
clinical_data.to_csv('data_clinical_patient_preprocessed.csv', index=False)
print("Preprocessed data saved to 'data_clinical_patient_preprocessed.csv'")


Preprocessed data saved to 'data_clinical_patient_preprocessed.csv'


In [17]:
import pandas as pd

# Load the data
clinical_data = pd.read_csv('data_clinical_patient_preprocessed.csv')

# List of metadata or irrelevant columns
columns_to_drop = [
    'PFS_MONTHS',
    'PFS_STATUS'    
]

# Drop the irrelevant columns
clinical_data_cleaned = clinical_data.drop(columns=columns_to_drop)

# Save the cleaned DataFrame back to the same file
clinical_data_cleaned.to_csv('data_clinical_patient_preprocessed.csv', index=False)
print("Cleaned clinical data saved, with irrelevant columns removed.")

Cleaned clinical data saved, with irrelevant columns removed.


In [18]:
import pandas as pd

# Define file paths for the filtered files (assuming these were saved previously)
files = {
    "Clinical Data": "data_clinical_patient_preprocessed.csv",
    "mRNA Data": "data_mrna_seq_v2_rsem.csv",
    "Methylation Data": "data_methylation_hm27_hm450_merged.csv",
    "CNA Data": "data_log2_cna.csv"
}

for desc, file_path in files.items():
    # Load the file
    df = pd.read_csv(file_path)
    
    # Count total missing values in the file
    total_missing_before = df.isnull().sum().sum()
    print(f"{desc} before cleaning: {total_missing_before} missing values.")
    
    # Drop rows that contain any missing values
    df_clean = df.dropna(how='any')
    
    # Count missing values after cleaning (should be zero)
    total_missing_after = df_clean.isnull().sum().sum()
    print(f"{desc} after cleaning: {total_missing_after} missing values.")
    
    # Optionally, print the number of unique Sample_IDs
    if 'Sample_ID' in df_clean.columns:
        unique_samples = df_clean['Sample_ID'].nunique()
        print(f"{desc} contains {unique_samples} unique Sample_IDs after cleaning.")
    
    # Save the cleaned file (overwrite the original filtered file)
    df_clean.to_csv(file_path, index=False)
    
print("All files have been cleaned to remove any missing values.")


Clinical Data before cleaning: 0 missing values.
Clinical Data after cleaning: 0 missing values.
Clinical Data contains 203 unique Sample_IDs after cleaning.
mRNA Data before cleaning: 0 missing values.
mRNA Data after cleaning: 0 missing values.
mRNA Data contains 202 unique Sample_IDs after cleaning.
Methylation Data before cleaning: 0 missing values.
Methylation Data after cleaning: 0 missing values.
Methylation Data contains 203 unique Sample_IDs after cleaning.
CNA Data before cleaning: 0 missing values.
CNA Data after cleaning: 0 missing values.
CNA Data contains 200 unique Sample_IDs after cleaning.
All files have been cleaned to remove any missing values.


In [19]:
import pandas as pd

# Define the file paths with descriptive names
files = {
    "Clinical Data": "data_clinical_patient_preprocessed.csv",
    "mRNA Data": "data_mrna_seq_v2_rsem.csv",
    "Methylation Data": "data_methylation_hm27_hm450_merged.csv",
    "CNA Data": "data_log2_cna.csv"
}

# Dictionary to store unique Sample_IDs for each file
sample_id_sets = {}

# Load each file and extract unique Sample_IDs
for name, file_path in files.items():
    df = pd.read_csv(file_path)
    if 'Sample_ID' in df.columns:
        sample_id_sets[name] = set(df['Sample_ID'].unique())
    else:
        print(f"{name} does not have a 'Sample_ID' column.")

# Compute the intersection of Sample_IDs across all files
common_sample_ids = set.intersection(*sample_id_sets.values())
print("Number of common Sample_IDs across all files:", len(common_sample_ids))

# Optionally, filter each file to retain only the common Sample_IDs
for name, file_path in files.items():
    df = pd.read_csv(file_path)
    before_count = df['Sample_ID'].nunique()
    df_filtered = df[df['Sample_ID'].isin(common_sample_ids)].copy()
    after_count = df_filtered['Sample_ID'].nunique()
    print(f"{name}: Before filtering = {before_count} unique Sample_IDs, After filtering = {after_count} unique Sample_IDs.")
    
    # Save the filtered file (this will create new files with '_filtered' appended)
    df_filtered.to_csv(file_path, index=False)


Number of common Sample_IDs across all files: 199
Clinical Data: Before filtering = 203 unique Sample_IDs, After filtering = 199 unique Sample_IDs.
mRNA Data: Before filtering = 202 unique Sample_IDs, After filtering = 199 unique Sample_IDs.
Methylation Data: Before filtering = 203 unique Sample_IDs, After filtering = 199 unique Sample_IDs.
CNA Data: Before filtering = 200 unique Sample_IDs, After filtering = 199 unique Sample_IDs.


In [20]:
import pandas as pd

# Define file paths for the filtered files (assuming these were saved previously)
files = {
    "Clinical Data": "data_clinical_patient_preprocessed.csv",
    "mRNA Data": "data_mrna_seq_v2_rsem.csv",
    "Methylation Data": "data_methylation_hm27_hm450_merged.csv",
    "CNA Data": "data_log2_cna.csv"
}

for desc, file_path in files.items():
    # Load the file
    df = pd.read_csv(file_path)
    
    # Count total missing values in the file
    total_missing_before = df.isnull().sum().sum()
    print(f"{desc} before cleaning: {total_missing_before} missing values.")
    
    # Drop rows that contain any missing values
    df_clean = df.dropna(how='any')
    
    # Count missing values after cleaning (should be zero)
    total_missing_after = df_clean.isnull().sum().sum()
    print(f"{desc} after cleaning: {total_missing_after} missing values.")
    
    # Optionally, print the number of unique Sample_IDs
    if 'Sample_ID' in df_clean.columns:
        unique_samples = df_clean['Sample_ID'].nunique()
        print(f"{desc} contains {unique_samples} unique Sample_IDs after cleaning.")
    
    # Save the cleaned file (overwrite the original filtered file)
    df_clean.to_csv(file_path, index=False)
    
print("All files have been cleaned to remove any missing values.")


Clinical Data before cleaning: 0 missing values.
Clinical Data after cleaning: 0 missing values.
Clinical Data contains 199 unique Sample_IDs after cleaning.
mRNA Data before cleaning: 0 missing values.
mRNA Data after cleaning: 0 missing values.
mRNA Data contains 199 unique Sample_IDs after cleaning.
Methylation Data before cleaning: 0 missing values.
Methylation Data after cleaning: 0 missing values.
Methylation Data contains 199 unique Sample_IDs after cleaning.
CNA Data before cleaning: 0 missing values.
CNA Data after cleaning: 0 missing values.
CNA Data contains 199 unique Sample_IDs after cleaning.
All files have been cleaned to remove any missing values.


In [21]:
import pandas as pd

# Path to clinical data
clinical_file = 'data_clinical_patient_preprocessed.csv'

# Load clinical data
clinical_data = pd.read_csv(clinical_file)

# Extract only Sample_ID and Survival_Label
survival_data = clinical_data[['Sample_ID', 'Survival_Label']]

# Save it as an Excel file for verification
survival_data.to_csv('survival_labels.csv', index=False)

print("Survival labels saved to 'survival_labels.csv'. Verify the file to ensure correctness.")


Survival labels saved to 'survival_labels.csv'. Verify the file to ensure correctness.


In [22]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

def perform_feature_selection(data_path, sample_id_column, survival_file, output_path, n_estimators=1000):
    """
    Perform feature selection using Boruta with Survival_Label.

    Parameters:
    - data_path: Path to the omics dataset (CSV file).
    - sample_id_column: The column containing Sample IDs.
    - survival_file: Path to the survival labels CSV.
    - output_path: Path to save the reduced dataset.
    - n_estimators: Number of trees in the random forest.
    """
    # Load omics data
    print(f"Loading {data_path}...")
    data = pd.read_csv(data_path)

    # Load survival labels
    survival_data = pd.read_csv(survival_file)

    # Merge with survival labels
    merged_data = pd.merge(data, survival_data, on=sample_id_column, how='inner')

    # Ensure Survival_Label exists
    if 'Survival_Label' not in merged_data.columns:
        raise ValueError("Survival_Label not found after merging!")

    # Separate features, target, and sample IDs
    sample_ids = merged_data[sample_id_column]
    target = merged_data['Survival_Label']
    features = merged_data.drop(columns=[sample_id_column, 'Survival_Label'])

    # Convert target to binary if necessary
    if target.dtype == 'object':
        target = np.where(target == "High-Risk", 1, 0)  # Convert categorical labels to 0/1 if needed

    # Initialize Random Forest model
    rf = RandomForestClassifier(n_estimators=n_estimators, n_jobs=-1, random_state=42)

    # Initialize Boruta
    boruta = BorutaPy(rf, n_estimators='auto', random_state=42, max_iter=1000)

    # Fit Boruta on data
    print(f"Performing feature selection on {data_path}...")
    boruta.fit(features.values, target)

    # Get selected features
    selected_features = features.columns[boruta.support_]
    print(f"Selected {len(selected_features)} features out of {features.shape[1]}.")

    # Reduce dataset to selected features 
    reduced_data = merged_data[selected_features].copy()

    # Now safely assign 'Sample_ID' and 'Survival_Label'
    reduced_data.insert(0, sample_id_column, sample_ids.values)  # Ensures Sample_ID is the first column
    reduced_data["Survival_Label"] = target.values  # Ensure labels are properly assigned

    # Save reduced dataset
    reduced_data.to_csv(output_path, index=False)
    print(f"Reduced dataset saved to {output_path}.\n")

    return reduced_data

# Paths to omics datasets and survival labels
omics_files = {
    'mRNA': 'data_mrna_seq_v2_rsem.csv',
    'Methylation': 'data_methylation_hm27_hm450_merged.csv',
    'CNA': 'data_log2_cna.csv'
}
survival_file = 'survival_labels.csv'

# Perform feature selection for each dataset
for omics_type, file_path in omics_files.items():
    output_file = f"{omics_type.lower()}_selected.csv"
    try:
        perform_feature_selection(file_path, 'Sample_ID', survival_file, output_file)
    except Exception as e:
        print(f"Error processing {omics_type}: {e}")


Loading data_mrna_seq_v2_rsem.csv...
Performing feature selection on data_mrna_seq_v2_rsem.csv...
Selected 36 features out of 17507.
Reduced dataset saved to mrna_selected.csv.

Loading data_methylation_hm27_hm450_merged.csv...
Performing feature selection on data_methylation_hm27_hm450_merged.csv...
Selected 29 features out of 21710.
Reduced dataset saved to methylation_selected.csv.

Loading data_log2_cna.csv...
Performing feature selection on data_log2_cna.csv...
Selected 3 features out of 25128.
Reduced dataset saved to cna_selected.csv.

