In [None]:
# Jupyter Notebook: Preparing Combined Metadata for PyDESeq2 (B-ALL and GTEx)

# ## 1. Introduction and Setup
# This notebook guides you through:
# 1. Loading clinical data for your B-ALL (unhealthy) samples.
# 2. Loading the JSON manifest that links B-ALL data files to cases/samples.
# 3. Extracting sample identifiers and relevant clinical information (e.g., B-ALL subtype).
# 4. Creating a metadata DataFrame for B-ALL samples with a 'condition' column.
# 5. Loading the previously processed healthy GTEx metadata.
# 6. Combining the B-ALL and healthy GTEx metadata into a single file.
#
# This final combined metadata file will be used with your combined raw count matrix
# for normalization/transformation tools like PyDESeq2.

import pandas as pd
import json # For loading the JSON manifest
import os

# ## 2. Configuration and File Paths
# --- ADJUST THESE PATHS AND PARAMETERS AS NEEDED ---
# Every tunable value used by the later cells is defined here; nothing below this
# cell hard-codes a path or a column name.

# --- B-ALL (Unhealthy) Data Files ---
# Tab-separated clinical tables for the B-ALL cohort.
B_ALL_CLINICAL_FILE = 'metadata/clinical.tsv' # Primary clinical data
B_ALL_PATHOLOGY_DETAIL_FILE = 'metadata/pathology_detail.tsv' # Additional pathology details

# JSON manifest for the B-ALL cohort.
# It links data files (like RNA-Seq counts) to case/sample identifiers.
B_ALL_JSON_MANIFEST_FILE = 'metadata/metadata.cohort.2025-06-03.json' # UPDATE THIS PATH

# --- Healthy GTEx Data File ---
# Processed healthy GTEx metadata (output from gtex_metadata_prep_notebook).
HEALTHY_GTEX_METADATA_FILE = 'metadata/processed/gtex_healthy_metadata_for_pydeseq2.csv'

# --- Output File ---
# Destination of the final combined metadata file.
COMBINED_METADATA_OUTPUT_FILE = 'metadata/processed/combined_metadata_for_pydeseq2.csv'

# --- Column Names & Identifiers (CRITICAL - VERIFY THESE) ---
# Column holding the patient/case identifier in the clinical files
# (e.g., 'case_id', 'submitter_id', 'patient_id').
# It MUST hold the same identifier as the JSON manifest's 'case_id', because the
# manifest-derived sample table is merged with the clinical table on this column.
B_ALL_CLINICAL_CASE_ID_COL = 'cases.case_id' # VERIFY THIS in clinical.tsv

# Column in clinical.tsv or pathology_detail.tsv that contains B-ALL subtype or primary diagnosis.
# Its values become the 'condition' of the B-ALL samples.
# Examples: 'primary_diagnosis', 'ajcc_pathologic_stage', 'site_of_resection_or_biopsy', 'disease_type', 'subtype_column_name'
B_ALL_SUBTYPE_COL = 'diagnoses.morphology'

# Fallback 'condition' label for B-ALL samples whose subtype is missing or when the
# subtype column is not available at all.
# You might create more specific labels like 'B_ALL_Ph_positive', 'B_ALL_KMT2Ar' later.
B_ALL_DEFAULT_CONDITION_LABEL = 'B_ALL_unhealthy'

# Key inside each manifest 'associated_entities' item whose value identifies the sample
# in the B-ALL RNA-Seq count matrix. Its values become the index of the B-ALL metadata.
B_ALL_RNASEQ_SAMPLE_ID_JSON_KEY = 'entity_submitter_id'


# Create the directories this notebook relies on if they don't exist yet.
# 'data' is still created so the notebook's original side effect is preserved.
os.makedirs('data', exist_ok=True)
# The combined metadata is destined for COMBINED_METADATA_OUTPUT_FILE, whose parent
# folder is not 'data', so that folder has to be created as well.
# os.path.dirname() returns '' for a bare filename and os.makedirs('') raises, so the
# call is skipped in that case.
combined_output_dir = os.path.dirname(COMBINED_METADATA_OUTPUT_FILE)
if combined_output_dir:
    os.makedirs(combined_output_dir, exist_ok=True)


In [None]:

# ## 3. Load B-ALL (Unhealthy) Clinical Data
# Extra tokens to read as missing values, on top of pandas' defaults.
# NOTE(review): the configured column names ('cases.case_id', 'diagnoses.morphology') look
# like a GDC portal export, and those TSVs write the literal '-- into empty cells -- confirm
# against your files. Without this, '-- is loaded as an ordinary string: it shows up as its
# own category in value_counts() and is never replaced by a later fillna().
GDC_MISSING_VALUE_TOKENS = ["'--"]

print(f"Loading B-ALL clinical data from: {B_ALL_CLINICAL_FILE}")
b_all_clinical_df = pd.DataFrame()  # stays empty if the file is missing or unreadable
if os.path.exists(B_ALL_CLINICAL_FILE):
    try:
        b_all_clinical_df = pd.read_csv(
            B_ALL_CLINICAL_FILE,
            sep='\t',
            header=0,  # first row holds the column names
            na_values=GDC_MISSING_VALUE_TOKENS,
        )
        # Keep one row per case (the first one encountered) so the later merge on the
        # case ID column cannot multiply RNA-Seq samples.
        if B_ALL_CLINICAL_CASE_ID_COL in b_all_clinical_df.columns:
            b_all_clinical_df = b_all_clinical_df.drop_duplicates(subset=[B_ALL_CLINICAL_CASE_ID_COL], keep='first')
            print(f"B-ALL clinical data loaded. Shape after dropping duplicates: {b_all_clinical_df.shape}")
            print("Clinical data columns:", b_all_clinical_df.columns.tolist())
            print("Clinical data head:\n", b_all_clinical_df.head())
        else:
            # The frame is kept as loaded (not de-duplicated) so it can still be inspected.
            print(f"WARNING: Case ID column '{B_ALL_CLINICAL_CASE_ID_COL}' not found in {B_ALL_CLINICAL_FILE}. Check column name.")
    except Exception as e:
        # Best effort: report the problem and carry on with whatever was loaded.
        print(f"Error loading B-ALL clinical data: {e}")
else:
    print(f"ERROR: B-ALL clinical file not found at {B_ALL_CLINICAL_FILE}")

# Optionally, load pathology detail if it contains more specific subtype info and merge
b_all_pathology_df = pd.DataFrame()  # stays empty if the file is missing or unreadable
if os.path.exists(B_ALL_PATHOLOGY_DETAIL_FILE):
    print(f"\nLoading B-ALL pathology detail from: {B_ALL_PATHOLOGY_DETAIL_FILE}")
    try:
        b_all_pathology_df = pd.read_csv(
            B_ALL_PATHOLOGY_DETAIL_FILE,
            sep='\t',
            header=0,
            na_values=GDC_MISSING_VALUE_TOKENS,
        )
        if B_ALL_CLINICAL_CASE_ID_COL in b_all_pathology_df.columns:
            b_all_pathology_df = b_all_pathology_df.drop_duplicates(subset=[B_ALL_CLINICAL_CASE_ID_COL], keep='first')
            print(f"B-ALL pathology detail loaded. Shape after dropping duplicates: {b_all_pathology_df.shape}")
            # If you want to merge it with clinical_df:
            # if not b_all_clinical_df.empty and B_ALL_CLINICAL_CASE_ID_COL in b_all_clinical_df.columns:
            #     b_all_clinical_df = pd.merge(b_all_clinical_df, b_all_pathology_df, on=B_ALL_CLINICAL_CASE_ID_COL, how='left', suffixes=('', '_pathology'))
            #     print(f"Clinical data shape after merging with pathology: {b_all_clinical_df.shape}")
        else:
            print(f"WARNING: Case ID column '{B_ALL_CLINICAL_CASE_ID_COL}' not found in {B_ALL_PATHOLOGY_DETAIL_FILE}.")
    except Exception as e:
        print(f"Error loading B-ALL pathology detail: {e}")
else:
    print(f"INFO: B-ALL pathology detail file not found at {B_ALL_PATHOLOGY_DETAIL_FILE} (this might be optional).")



In [None]:

# Sanity check of the two clinical tables loaded above: shape, columns, a preview,
# the case-ID column, and (for the main clinical table only) the subtype column.
print("\n--- Initial Inspection of Loaded B-ALL Clinical Data ---")

# (source path, loaded frame, whether the configured subtype column is reported too)
inspection_targets = [
    (B_ALL_CLINICAL_FILE, b_all_clinical_df, True),
    (B_ALL_PATHOLOGY_DETAIL_FILE, b_all_pathology_df, False),
]

for source_path, loaded_df, report_subtype in inspection_targets:
    if loaded_df.empty:
        print(f"\n{source_path} is empty or was not loaded.")
        continue

    available_columns = loaded_df.columns
    print(f"\n--- {source_path} ---")
    print(f"Shape: {loaded_df.shape}")
    print("Columns:", available_columns.tolist())
    print("Head:\n", loaded_df.head())

    if B_ALL_CLINICAL_CASE_ID_COL in available_columns:
        case_ids = loaded_df[B_ALL_CLINICAL_CASE_ID_COL]
        print(f"\nUnique values in '{B_ALL_CLINICAL_CASE_ID_COL}' (first 5): {case_ids.unique()[:5]}")
        print(f"Number of unique '{B_ALL_CLINICAL_CASE_ID_COL}': {case_ids.nunique()}")

    if not report_subtype:
        # The pathology table has no configured subtype column. To inspect a candidate
        # column by hand, print loaded_df['<column>'].value_counts(dropna=False).
        continue

    if B_ALL_SUBTYPE_COL in available_columns:
        print(f"\nUnique values in '{B_ALL_SUBTYPE_COL}':")
        # dropna=False so samples without a subtype are counted as well
        print(loaded_df[B_ALL_SUBTYPE_COL].value_counts(dropna=False))
    else:
        print(f"\nWARNING: Configured subtype column '{B_ALL_SUBTYPE_COL}' not found in {source_path}.")

print("--- End of Initial Clinical Data Inspection ---")



In [None]:

# ## 4. Load and Process B-ALL JSON Manifest
# The manifest is read as a list of file entries; each entry carries
# 'associated_entities' items that pair a case_id with a sample identifier.
print(f"\nLoading B-ALL JSON manifest from: {B_ALL_JSON_MANIFEST_FILE}")
b_all_manifest_data = []
if not os.path.exists(B_ALL_JSON_MANIFEST_FILE):
    print(f"ERROR: B-ALL JSON manifest file not found at {B_ALL_JSON_MANIFEST_FILE}")
else:
    try:
        with open(B_ALL_JSON_MANIFEST_FILE, 'r') as manifest_handle:
            b_all_manifest_data = json.load(manifest_handle)
        print(f"JSON manifest loaded. Number of file entries: {len(b_all_manifest_data)}")
    except Exception as manifest_error:
        print(f"Error loading or parsing JSON manifest: {manifest_error}")

# Collect one record per (sample, case) pair found on RNA-Seq gene quantification files.
b_all_rnaseq_samples_info = []
if not b_all_manifest_data:
    # Nothing was loaded: leave an empty frame for the downstream cells.
    b_all_rnaseq_meta_df = pd.DataFrame()
else:
    # Entities attached to files that are both RNA-Seq and gene expression quantification.
    # Tighten the filter here if needed, e.g. by also checking 'file_name' for
    # 'augmented_star_gene_counts.tsv'.
    rnaseq_entities = (
        entity
        for file_entry in b_all_manifest_data
        if file_entry.get('experimental_strategy') == 'RNA-Seq'
        and file_entry.get('data_type') == 'Gene Expression Quantification'
        for entity in file_entry.get('associated_entities', [])
    )
    # (sample id as used in the count matrix, case id) for every such entity
    id_pairs = (
        (entity.get(B_ALL_RNASEQ_SAMPLE_ID_JSON_KEY), entity.get('case_id'))
        for entity in rnaseq_entities
    )
    # Entities missing either identifier are dropped.
    b_all_rnaseq_samples_info = [
        {
            'SAMPID': sample_id,  # becomes the metadata index later on
            B_ALL_CLINICAL_CASE_ID_COL: case_id,  # merge key against the clinical table
        }
        for sample_id, case_id in id_pairs
        if case_id and sample_id
    ]

    b_all_rnaseq_meta_df = pd.DataFrame(b_all_rnaseq_samples_info)
    if b_all_rnaseq_meta_df.empty:
        print("\nWARNING: No RNA-Seq gene quantification samples found in the JSON manifest matching criteria.")
    else:
        # A sample can be attached to several files; keep its first occurrence only.
        b_all_rnaseq_meta_df = b_all_rnaseq_meta_df.drop_duplicates(subset=['SAMPID'], keep='first')
        print(f"\nExtracted info for {len(b_all_rnaseq_meta_df)} unique B-ALL RNA-Seq samples from manifest.")
        print("B-ALL RNA-Seq samples info head:\n", b_all_rnaseq_meta_df.head())



In [None]:

# ## 5. Merge B-ALL RNA-Seq Sample Info with Clinical Data
# Builds `final_b_all_metadata_df`: one row per RNA-Seq sample (indexed by SAMPID) with a
# 'condition' column. Three outcomes are possible:
#   * no RNA-Seq samples           -> empty frame
#   * samples + usable clinical    -> left merge, condition taken from the subtype column
#   * samples, no usable clinical  -> condition set to the default label for every sample
clinical_is_mergeable = (
    not b_all_clinical_df.empty
    and B_ALL_CLINICAL_CASE_ID_COL in b_all_clinical_df.columns
    and B_ALL_CLINICAL_CASE_ID_COL in b_all_rnaseq_meta_df.columns
)

if b_all_rnaseq_meta_df.empty:
    print("Cannot merge: B-ALL RNA-Seq sample info is empty.")
    final_b_all_metadata_df = pd.DataFrame()
elif clinical_is_mergeable:
    print(f"\nMerging B-ALL RNA-Seq sample info with clinical data on '{B_ALL_CLINICAL_CASE_ID_COL}'...")
    # Left merge: every RNA-Seq sample is kept, clinical columns are added where a case matches.
    final_b_all_metadata_df = pd.merge(
        b_all_rnaseq_meta_df,
        b_all_clinical_df, # Use the potentially merged clinical_df + pathology_df
        on=B_ALL_CLINICAL_CASE_ID_COL,
        how='left'
    )
    print(f"Merged B-ALL metadata shape: {final_b_all_metadata_df.shape}")

    if B_ALL_SUBTYPE_COL not in final_b_all_metadata_df.columns:
        print(f"WARNING: Specified subtype column '{B_ALL_SUBTYPE_COL}' not found after merge. Using default label.")
        final_b_all_metadata_df['condition'] = B_ALL_DEFAULT_CONDITION_LABEL
    else:
        # The subtype value is used directly as the condition; samples without one
        # (including samples whose case had no clinical row) get the default label.
        # You might want more sophisticated logic here to map various subtype descriptions
        # to standardized condition labels.
        final_b_all_metadata_df['condition'] = final_b_all_metadata_df[B_ALL_SUBTYPE_COL].fillna(B_ALL_DEFAULT_CONDITION_LABEL)
        # Example of more specific labeling (conceptual):
        # final_b_all_metadata_df['condition'] = final_b_all_metadata_df[B_ALL_SUBTYPE_COL].apply(
        #     lambda x: 'B_ALL_Ph_positive' if 'BCR-ABL1' in str(x) else B_ALL_DEFAULT_CONDITION_LABEL
        # )

    # Set SAMPID as index
    if 'SAMPID' in final_b_all_metadata_df.columns:
        final_b_all_metadata_df = final_b_all_metadata_df.set_index('SAMPID', drop=True)
        preview_columns = ['condition'] + [
            col for col in [B_ALL_SUBTYPE_COL, B_ALL_CLINICAL_CASE_ID_COL]
            if col in final_b_all_metadata_df.columns
        ]
        print("\nFinal B-ALL metadata (head):\n", final_b_all_metadata_df[preview_columns].head())
    else:
        print("WARNING: 'SAMPID' column not found in merged B-ALL metadata. Index not set.")
else:
    # RNA-Seq samples exist (the empty case was handled above) but there is no clinical
    # table to merge with, so every sample receives the default condition label.
    print(f"Cannot merge: Clinical data is empty or '{B_ALL_CLINICAL_CASE_ID_COL}' missing. Using default labels for B-ALL samples if RNA-Seq samples were found.")
    final_b_all_metadata_df = b_all_rnaseq_meta_df.copy()
    final_b_all_metadata_df['condition'] = B_ALL_DEFAULT_CONDITION_LABEL
    if 'SAMPID' in final_b_all_metadata_df.columns:
        final_b_all_metadata_df = final_b_all_metadata_df.set_index('SAMPID', drop=True)
    print("Created B-ALL metadata with default condition labels.")


# List unique conditions in the final B-ALL metadata.
# The frame is an empty DataFrame() with no 'condition' column when no RNA-Seq samples
# were found; indexing it unconditionally would raise KeyError and abort the cell.
if 'condition' in final_b_all_metadata_df.columns:
    print("\nUnique conditions in final B-ALL metadata:")
    print(final_b_all_metadata_df['condition'].value_counts(dropna=False))
else:
    print("\nNo 'condition' column in final B-ALL metadata (no B-ALL samples were processed); skipping condition summary.")

# list unique primary diagnoses if available
if B_ALL_SUBTYPE_COL in final_b_all_metadata_df.columns:
    print(f"\nUnique values in '{B_ALL_SUBTYPE_COL}' column:")
    print(final_b_all_metadata_df[B_ALL_SUBTYPE_COL].value_counts(dropna=False))
else:
    print(f"\nWARNING: Subtype column '{B_ALL_SUBTYPE_COL}' not found in final B-ALL metadata.")


In [None]:


# ## 6. Load Processed Healthy GTEx Metadata
# Reads the CSV at HEALTHY_GTEX_METADATA_FILE with its 'SAMPID' column as the index.
# `healthy_gtex_metadata_df` stays an empty frame when the file is absent or unreadable.
print(f"\nLoading processed healthy GTEx metadata from: {HEALTHY_GTEX_METADATA_FILE}")
healthy_gtex_metadata_df = pd.DataFrame()
if not os.path.exists(HEALTHY_GTEX_METADATA_FILE):
    print(f"ERROR: Healthy GTEx metadata file not found at {HEALTHY_GTEX_METADATA_FILE}")
else:
    try:
        healthy_gtex_metadata_df = pd.read_csv(
            HEALTHY_GTEX_METADATA_FILE,
            index_col='SAMPID',
        )
        gtex_shape = healthy_gtex_metadata_df.shape
        print(f"Healthy GTEx metadata loaded. Shape: {gtex_shape}")
        print("Healthy GTEx metadata head:\n", healthy_gtex_metadata_df.head())
    except Exception as gtex_error:
        # Best effort: report and continue with the empty frame.
        print(f"Error loading healthy GTEx metadata: {gtex_error}")



In [None]:

# ## 7. Combine B-ALL and Healthy GTEx Metadata
# if not final_b_all_metadata_df.empty and not healthy_gtex_metadata_df.empty:
#     print("\nCombining B-ALL and healthy GTEx metadata...")
#
#     # Ensure only common columns are kept, or handle missing columns appropriately
#     # For PyDESeq2, the crucial column is 'condition'. Other columns are for user info.
#     # We'll prioritize keeping 'condition' and any other columns present in both,
#     # or just concatenate and fill NaNs if columns differ significantly beyond 'condition'.
#
#     # Select essential columns for the final combined metadata
#     # For B-ALL, we need 'condition'. We might want to keep B_ALL_SUBTYPE_COL for reference.
#     b_all_cols_to_keep = ['condition']
#     if B_ALL_SUBTYPE_COL in final_b_all_metadata_df.columns:
#         b_all_cols_to_keep.append(B_ALL_SUBTYPE_COL)
#     # Add B_ALL_CLINICAL_CASE_ID_COL if it exists and you want to keep it
#     if B_ALL_CLINICAL_CASE_ID_COL in final_b_all_metadata_df.columns: # This column might have been removed if SAMPID was the same
#          b_all_cols_to_keep.append(B_ALL_CLINICAL_CASE_ID_COL)
#
#     b_all_cols_to_keep = list(set(b_all_cols_to_keep)) # Unique columns
#     b_all_cols_present = [col for col in b_all_cols_to_keep if col in final_b_all_metadata_df.columns]
#
#     final_b_all_metadata_subset_df = final_b_all_metadata_df[b_all_cols_present]
#
#     # For healthy GTEx, we need 'condition' and can keep 'SMTSD', 'SMRIN' for reference.
#     healthy_cols_to_keep = ['condition', 'SMTSD', 'SMRIN'] # SMTS was also kept in gtex_metadata_prep
#     healthy_cols_present = [col for col in healthy_cols_to_keep if col in healthy_gtex_metadata_df.columns]
#     healthy_gtex_metadata_subset_df = healthy_gtex_metadata_df[healthy_cols_present]
#
#     # Concatenate the two metadata DataFrames
#     combined_metadata_df = pd.concat([healthy_gtex_metadata_subset_df, final_b_all_metadata_subset_df], axis=0, sort=False)
#     # `sort=False` maintains order, `join='outer'` (default for concat) keeps all columns and fills NaNs.
#
#     print(f"Combined metadata DataFrame shape: {combined_metadata_df.shape}")
#     print("Combined metadata (head):\n", combined_metadata_df.head())
#     print("Combined metadata (tail):\n", combined_metadata_df.tail())
#     print("\nValue counts for 'condition' column in combined data:")
#     print(combined_metadata_df['condition'].value_counts(dropna=False))
#
# elif final_b_all_metadata_df.empty:
#     print("Cannot combine: Processed B-ALL metadata is empty.")
#     combined_metadata_df = pd.DataFrame()
# else: # healthy_gtex_metadata_df is empty
#     print("Cannot combine: Processed healthy GTEx metadata is empty.")
#     combined_metadata_df = pd.DataFrame()
#
#
# # ## 8. Save Combined Metadata
# if not combined_metadata_df.empty:
#     print(f"\nSaving combined metadata to: {COMBINED_METADATA_OUTPUT_FILE}")
#     try:
#         combined_metadata_df.to_csv(COMBINED_METADATA_OUTPUT_FILE, index=True) # index=True saves SAMPID
#         print(f"Successfully saved combined metadata for {len(combined_metadata_df)} samples.")
#         print("This file is ready to be used as the 'METADATA_FILE' in the PyDESeq2 normalization notebook.")
#     except Exception as e:
#         print(f"Error saving combined metadata: {e}")
# else:
#     print("\nNo combined metadata to save.")
#
# print("\n--- Combined Metadata Preparation Notebook execution finished ---")