In [21]:
# Jupyter Notebook: Preparing GTEx Sample Metadata for PyDESeq2

# ## 1. Introduction and Setup
# This notebook takes the full GTEx Sample Attributes file
# (GTEx_Analysis_v10_Annotations_SampleAttributesDS.txt) and processes it to create a
# metadata file suitable for use with PyDESeq2.
#
# This involves:
# - Selecting samples from specific tissue types to act as your "healthy" controls.
# - Filtering samples based on RNA quality (SMRIN score).
# - Creating a 'condition' column to label these samples.
# - Ensuring sample IDs are set up correctly as the index.

import pandas as pd
import os

# ## 2. Configuration and File Paths
# ADJUST THESE PATHS AND PARAMETERS AS NEEDED

# Input: Path to the full GTEx Sample Attributes file
# Download from GTEx Portal: GTEx_Analysis_v10_Annotations_SampleAttributesDS.txt
GTEX_FULL_SAMPLE_ATTRIBUTES_FILE = 'metadata/GTEx_Analysis_v10_Annotations_SampleAttributesDS.txt' # USER MODIFIED THIS

# Output: Path to save your processed metadata CSV for healthy samples
PROCESSED_HEALTHY_METADATA_FILE = 'metadata/processed/gtex_healthy_metadata_for_pydeseq2.csv'

# --- Filtering Parameters ---
# Define the tissue types (from SMTSD or SMTS columns) you want to consider "healthy"
# CRITICAL: After running Cell 3.1, update this list with the *exact* string for
# actual healthy bone marrow tissue (e.g., 'Bone Marrow - Tibia') or 'Whole Blood' from SMTSD.
# Avoid terms that indicate cell lines (e.g., "Cells - ...").
SELECTED_TISSUE_TYPES_SMTSD = ['Whole Blood'] # Example: ['Bone Marrow - Tibia', 'Whole Blood']
                                 # MODIFY THIS LIST BASED ON OUTPUT FROM CELL 3.1

# Optional: If filtering by the broader SMTS column is necessary, define here.
# Be cautious as this might include unwanted subtypes like cell lines.
# SELECTED_TISSUE_TYPES_SMTS = []

# Define terms in SMTSD to EXCLUDE if they get picked up by a broader SMTS filter
# This is important to remove cell lines or other non-healthy tissue samples.
EXCLUDE_SMTSD_TERMS = ['Cells - Leukemia cell line (CML)', 'Cells - EBV-transformed lymphocytes'] # Add other terms as needed

# Define the minimum SMRIN score for RNA quality
MIN_SMRIN_SCORE = 7.0 # Common cutoff, adjust as needed

# Define the label for your healthy condition in the output metadata
HEALTHY_CONDITION_LABEL = 'healthy'

# Create data directory if it doesn't exist
os.makedirs('data', exist_ok=True)

In [22]:
# ## 3. Load Full GTEx Sample Attributes Data
print(f"Loading full GTEx sample attributes from: {GTEX_FULL_SAMPLE_ATTRIBUTES_FILE}")
if os.path.exists(GTEX_FULL_SAMPLE_ATTRIBUTES_FILE):
    try:
        # GTEx attribute files are typically tab-separated
        gtex_metadata_full_df = pd.read_csv(GTEX_FULL_SAMPLE_ATTRIBUTES_FILE, sep='\t', low_memory=False) # Added low_memory=False for robust type inference
        print(f"Full GTEx metadata loaded. Shape: {gtex_metadata_full_df.shape}")

        required_cols = ['SAMPID', 'SMTSD', 'SMRIN', 'SMTS']
        missing_cols = [col for col in required_cols if col not in gtex_metadata_full_df.columns]
        if missing_cols:
            print(f"\nWARNING: The following required columns are missing: {missing_cols}")
    except Exception as e:
        print(f"Error loading GTEx attributes file: {e}")
        gtex_metadata_full_df = pd.DataFrame()
else:
    print(f"ERROR: Full GTEx sample attributes file not found at {GTEX_FULL_SAMPLE_ATTRIBUTES_FILE}")
    gtex_metadata_full_df = pd.DataFrame()

# ## 3.1. Inspect Available Tissue Types (CRITICAL STEP)
# This cell helps you find the exact names of tissues available in your metadata file.
# Run this cell, then update `SELECTED_TISSUE_TYPES_SMTSD` and potentially `SELECTED_TISSUE_TYPES_SMTS`
# and `EXCLUDE_SMTSD_TERMS` lists in Section 2 above.
if not gtex_metadata_full_df.empty:
    print("\n--- Inspecting Available Tissue Types ---")
    if 'SMTSD' in gtex_metadata_full_df.columns:
        print("\nUnique tissue types in 'SMTSD' column (detailed tissue type):")
        unique_smtsd = sorted(list(gtex_metadata_full_df['SMTSD'].astype(str).unique()))
        for tissue_type in unique_smtsd:
            print(f"- {tissue_type}")
    else:
        print("\nWARNING: 'SMTSD' column not found in the metadata.")

    if 'SMTS' in gtex_metadata_full_df.columns:
        print("\nUnique tissue types in 'SMTS' column (general tissue type):")
        unique_smts = sorted(list(gtex_metadata_full_df['SMTS'].astype(str).unique()))
        for tissue_type in unique_smts:
            print(f"- {tissue_type}")
    else:
        print("\nWARNING: 'SMTS' column not found in the metadata.")
    print("--- End of Tissue Type Inspection ---")
else:
    print("\nSkipping tissue type inspection as metadata was not loaded.")


Loading full GTEx sample attributes from: metadata/GTEx_Analysis_v10_Annotations_SampleAttributesDS.txt
Full GTEx metadata loaded. Shape: (48231, 119)

--- Inspecting Available Tissue Types ---

Unique tissue types in 'SMTSD' column (detailed tissue type):
- Adipose - Subcutaneous
- Adipose - Visceral (Omentum)
- Adrenal Gland
- Artery - Aorta
- Artery - Coronary
- Artery - Pulmonary
- Artery - Tibial
- Bladder
- Brain - Amygdala
- Brain - Anterior cingulate cortex (BA24)
- Brain - Caudate (basal ganglia)
- Brain - Cerebellar Hemisphere
- Brain - Cerebellum
- Brain - Cortex
- Brain - Frontal Cortex (BA9)
- Brain - Hippocampus
- Brain - Hypothalamus
- Brain - Nucleus accumbens (basal ganglia)
- Brain - Putamen (basal ganglia)
- Brain - Spinal cord (cervical c-1)
- Brain - Substantia nigra
- Breast - Mammary Tissue
- Cells - Cultured fibroblasts
- Cells - EBV-transformed lymphocytes
- Cells - Leukemia cell line (CML)
- Cervix - Ectocervix
- Cervix - Endocervix
- Colon - Sigmoid
- Colon -

In [25]:
if not gtex_metadata_full_df.empty:
    print("\nFiltering samples...")
    processed_df = gtex_metadata_full_df.copy()
    initial_total_samples = len(processed_df)

    # --- Primary Filter by DETAILED Tissue Type (SMTSD) ---
    # This is preferred for selecting specific healthy tissues.
    if 'SMTSD' in processed_df.columns and SELECTED_TISSUE_TYPES_SMTSD:
        print(f"Filtering for specific tissue types in 'SMTSD': {SELECTED_TISSUE_TYPES_SMTSD}")
        processed_df = processed_df[processed_df['SMTSD'].isin(SELECTED_TISSUE_TYPES_SMTSD)]
        print(f"Samples remaining after SMTSD filtering: {len(processed_df)}")
    elif SELECTED_TISSUE_TYPES_SMTSD: # If list is defined but column missing
         print("WARNING: 'SMTSD' column not found for detailed tissue filtering, but SELECTED_TISSUE_TYPES_SMTSD was specified.")
    else:
        print("No specific 'SMTSD' tissue types selected for primary filtering. Consider using SELECTED_TISSUE_TYPES_SMTS or refining SMTSD selection.")

    # --- Optional: Filter by GENERAL Tissue Type (SMTS) if SMTSD list is empty ---
    # Use this cautiously. If SMTSD is used, this might be redundant or conflict.
    if not SELECTED_TISSUE_TYPES_SMTSD and 'SMTS' in processed_df.columns and SELECTED_TISSUE_TYPES_SMTS:
        print(f"Filtering for general tissue types in 'SMTS': {SELECTED_TISSUE_TYPES_SMTS} (as SMTSD selection was empty)")
        # If we already filtered by SMTSD, we might not want to do this, or do it on the full set.
        # For simplicity, this example assumes if SMTSD list is empty, we try SMTS on the current 'processed_df'.
        # A more robust logic might apply SMTS filter first if SMTSD is not the primary target.
        current_count_before_smts_filter = len(processed_df)
        if current_count_before_smts_filter == initial_total_samples: # only apply SMTS if no SMTSD filter was effectively applied
            processed_df = processed_df[processed_df['SMTS'].isin(SELECTED_TISSUE_TYPES_SMTS)]
            print(f"Samples remaining after SMTS filtering: {len(processed_df)}")
        else:
            print("Skipping SMTS filter as SMTSD filter was already applied or SELECTED_TISSUE_TYPES_SMTSD is not empty.")


    # --- EXCLUDE specific SMTSD terms (e.g., cell lines) ---
    if 'SMTSD' in processed_df.columns and EXCLUDE_SMTSD_TERMS:
        print(f"Excluding SMTSD terms: {EXCLUDE_SMTSD_TERMS}")
        count_before_exclusion = len(processed_df)
        # Using ~ .str.contains() for partial matches, or .isin() for exact matches
        # For exact matches in the list:
        processed_df = processed_df[~processed_df['SMTSD'].isin(EXCLUDE_SMTSD_TERMS)]
        # If you need partial string matching for exclusion (e.g., exclude if 'Cells -' is present):
        # for term_to_exclude in EXCLUDE_SMTSD_TERMS:
        #    processed_df = processed_df[~processed_df['SMTSD'].astype(str).str.contains(term_to_exclude, case=False, na=False)]
        print(f"Samples remaining after SMTSD exclusion: {len(processed_df)} (removed {count_before_exclusion - len(processed_df)})")

    print(f"Total samples removed by all tissue filters: {initial_total_samples - len(processed_df)}")

    # --- Filter by SMRIN score ---
    if 'SMRIN' in processed_df.columns:
        print(f"Filtering for SMRIN score >= {MIN_SMRIN_SCORE}")
        processed_df['SMRIN'] = pd.to_numeric(processed_df['SMRIN'], errors='coerce')
        processed_df = processed_df.dropna(subset=['SMRIN'])

        initial_sample_count_rin = len(processed_df)
        processed_df = processed_df[processed_df['SMRIN'] >= MIN_SMRIN_SCORE]
        print(f"Samples remaining after SMRIN filtering: {len(processed_df)} (removed {initial_sample_count_rin - len(processed_df)})")
    else:
        print("WARNING: 'SMRIN' column not found for quality filtering.")

    if processed_df.empty:
        print("\nNo samples remaining after all filtering. CRITICAL: Review your filtering criteria in Section 2 (especially tissue selection and exclusion lists) based on Cell 3.1 output.")
    else:
        print(f"\nHead of filtered metadata ({len(processed_df)} samples):\n", processed_df[['SAMPID', 'SMTS', 'SMTSD', 'SMRIN']].head())
else:
    print("Skipping filtering as full GTEx metadata was not loaded.")
    processed_df = pd.DataFrame()


Filtering samples...
Filtering for specific tissue types in 'SMTSD': ['Whole Blood']
Samples remaining after SMTSD filtering: 4369
Excluding SMTSD terms: ['Cells - Leukemia cell line (CML)', 'Cells - EBV-transformed lymphocytes']
Samples remaining after SMTSD exclusion: 4369 (removed 0)
Total samples removed by all tissue filters: 43862
Filtering for SMRIN score >= 7.0
Samples remaining after SMRIN filtering: 1731 (removed 164)

Head of filtered metadata (1731 samples):
                         SAMPID   SMTS        SMTSD  SMRIN
1634  GTEX-1117F-0005-SM-GMJZF  Blood  Whole Blood    8.3
1635  GTEX-1117F-0005-SM-HL9SH  Blood  Whole Blood    8.3
1686  GTEX-111CU-0005-SM-GJ3PH  Blood  Whole Blood    8.8
1730  GTEX-111FC-0006-SM-H65Z1  Blood  Whole Blood    7.6
1819  GTEX-111YS-0006-SM-5NQBE  Blood  Whole Blood    8.2


In [26]:
# ## 5. Create 'condition' Column
if not processed_df.empty:
    print(f"\nCreating 'condition' column with label: '{HEALTHY_CONDITION_LABEL}'")
    processed_df['condition'] = HEALTHY_CONDITION_LABEL
    # print("Added 'condition' column. Head:\n", processed_df[['SAMPID', 'SMTSD', 'SMRIN', 'condition']].head())
else:
    print("Skipping creation of 'condition' column as no data was processed.")




Creating 'condition' column with label: 'healthy'


In [27]:
# ## 6. Set Index to SAMPID
# This is crucial for PyDESeq2 to match metadata to count data samples.
if not processed_df.empty and 'SAMPID' in processed_df.columns:
    print("\nSetting 'SAMPID' as index...")
    processed_df = processed_df.set_index('SAMPID', drop=True) # drop=True removes SAMPID from columns
    # print("Index set. Head of final processed metadata for healthy samples:\n", processed_df.head())
elif 'SAMPID' not in processed_df.columns and not processed_df.empty:
    print("WARNING: 'SAMPID' column not found. Cannot set index. The output file will not be suitable for PyDESeq2 without manual adjustment.")
else:
    print("Skipping index setting as no data was processed.")




Setting 'SAMPID' as index...


In [28]:
# ## 7. Select Relevant Columns (Optional, but good for clean metadata)
# Keep only the 'condition' column and any other columns you might find useful
# (e.g., for verifying or if you plan a more complex DESeq2 design).
# For a simple '~condition' design, only 'condition' is strictly needed after indexing by SAMPID.
if not processed_df.empty:
    final_columns = ['condition', 'SMTSD', 'SMRIN'] # Example columns to keep
    # Ensure these columns actually exist after previous steps
    final_columns = [col for col in final_columns if col in processed_df.columns]

    if final_columns:
        processed_df_final = processed_df[final_columns].copy() # Use .copy() to avoid SettingWithCopyWarning
        print(f"\nSelected final columns: {final_columns}")
        print("Final metadata structure for healthy samples (head):\n", processed_df_final.head())
    else:
        print("\nNo specified final columns found, keeping all processed columns.")
        processed_df_final = processed_df.copy()





Selected final columns: ['condition', 'SMTSD', 'SMRIN']
Final metadata structure for healthy samples (head):
                          condition        SMTSD  SMRIN
SAMPID                                                
GTEX-1117F-0005-SM-GMJZF   healthy  Whole Blood    8.3
GTEX-1117F-0005-SM-HL9SH   healthy  Whole Blood    8.3
GTEX-111CU-0005-SM-GJ3PH   healthy  Whole Blood    8.8
GTEX-111FC-0006-SM-H65Z1   healthy  Whole Blood    7.6
GTEX-111YS-0006-SM-5NQBE   healthy  Whole Blood    8.2


In [29]:
# ## 8. Save Processed Metadata
if not processed_df_final.empty: # Check the final dataframe
    print(f"\nSaving processed healthy metadata to: {PROCESSED_HEALTHY_METADATA_FILE}")
    try:
        processed_df_final.to_csv(PROCESSED_HEALTHY_METADATA_FILE, index=True) # index=True to save the SAMPID index
        print(f"Successfully saved processed metadata for {len(processed_df_final)} healthy samples.")
        print("This file can now be used as input (or part of the input if you merge it with unhealthy sample metadata) for the PyDESeq2 notebook.")
    except Exception as e:
        print(f"Error saving processed metadata: {e}")
else:
    print("\nNo processed metadata to save (processed_df_final is empty).")

print("\n--- GTEx Metadata Preparation Notebook execution finished ---")


Saving processed healthy metadata to: data/gtex_healthy_metadata_for_pydeseq2.csv
Successfully saved processed metadata for 1731 healthy samples.
This file can now be used as input (or part of the input if you merge it with unhealthy sample metadata) for the PyDESeq2 notebook.

--- GTEx Metadata Preparation Notebook execution finished ---


### Viasualise processed data

In [31]:

processed_df_final

Unnamed: 0_level_0,condition,SMTSD,SMRIN
SAMPID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
GTEX-1117F-0005-SM-GMJZF,healthy,Whole Blood,8.3
GTEX-1117F-0005-SM-HL9SH,healthy,Whole Blood,8.3
GTEX-111CU-0005-SM-GJ3PH,healthy,Whole Blood,8.8
GTEX-111FC-0006-SM-H65Z1,healthy,Whole Blood,7.6
GTEX-111YS-0006-SM-5NQBE,healthy,Whole Blood,8.2
...,...,...,...
GTEX-ZYVF-0005-SM-GMJZL,healthy,Whole Blood,8.2
GTEX-ZYVF-0005-SM-HL9SN,healthy,Whole Blood,8.2
GTEX-ZYW4-0006-SM-HAUXL,healthy,Whole Blood,9.2
GTEX-ZZPU-0006-SM-GMJZM,healthy,Whole Blood,8.7
