In [4]:
# Jupyter Notebook: Final Data Normalization with PyDESeq2

# ## 1. Introduction and Setup
# This notebook takes the correctly merged raw count data and its corresponding
# simple metadata to perform Variance Stabilizing Transformation (VST).
# The final output will be a single CSV file with the condition labels included.

import pandas as pd
import numpy as np
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats
import os

# ## 2. Configuration and File Paths
# --- UPDATE THESE PATHS ---

# Input file for your CORRECTLY MERGED RAW gene counts.
RAW_COUNTS_FILE = 'metadata/merged dataset/merged_dataset.csv'

# Input file for your SIMPLE METADATA.
METADATA_FILE = 'data/simple_metadata_for_pydeseq2.csv'

# --- Output File ---
# Path to save the final VST transformed data WITH the condition column
VST_TRANSFORMED_FILE = 'data/final_vst_normalised_data.csv'

# --- Design Configuration ---
# The text after '~' is looked up as a column name in the metadata table.
DESIGN_FORMULA = '~condition'

# Create the output directory if it doesn't exist. The directory is derived
# from VST_TRANSFORMED_FILE (rather than hard-coded) so that editing the output
# path above cannot leave the save step writing into a missing directory.
OUTPUT_DIR = os.path.dirname(VST_TRANSFORMED_FILE)
if OUTPUT_DIR:  # empty string means "current directory": nothing to create
    os.makedirs(OUTPUT_DIR, exist_ok=True)


# ## 3. Load Datasets
# Each table starts out as an empty DataFrame; later sections treat an empty
# frame as "loading failed" and skip their work instead of raising.
print("--- Loading Final Datasets ---")

# ### 3.1 Load Combined Raw Count Data
print(f"Loading combined raw count data from: {RAW_COUNTS_FILE}")
raw_counts_df = pd.DataFrame()
if not os.path.exists(RAW_COUNTS_FILE):
    print(f"❌ ERROR: Combined raw counts file not found at {RAW_COUNTS_FILE}.")
else:
    # The first CSV column (SAMPID) becomes the row index.
    raw_counts_df = pd.read_csv(RAW_COUNTS_FILE, index_col=0, low_memory=False)
    print(f"✓ Raw counts data loaded. Shape: {raw_counts_df.shape}")

# ### 3.2 Load Combined Metadata
print(f"\nLoading combined metadata from: {METADATA_FILE}")
metadata_df = pd.DataFrame()
if not os.path.exists(METADATA_FILE):
    print(f"❌ ERROR: Combined metadata file not found at {METADATA_FILE}.")
else:
    # Same layout as the counts file: the first column (SAMPID) is the index.
    metadata_df = pd.read_csv(METADATA_FILE, index_col=0, low_memory=False)
    print(f"✓ Metadata loaded. Shape: {metadata_df.shape}")


# ## 4. Prepare and Verify Data for PyDESeq2
# Runs only when both tables were loaded; an empty frame on either side means
# an earlier step failed and there is nothing to align.
if raw_counts_df.empty or metadata_df.empty:
    print("\nSkipping PyDESeq2 processing due to missing data or metadata.")
else:
    print("\n--- Verifying and Aligning Data ---")

    # Safety check: a label column must not be treated as a gene, so remove a
    # stray 'condition' column from the count matrix before normalization.
    if 'condition' in raw_counts_df.columns:
        print("WARNING: Found a 'condition' column in the raw count data. This is unexpected and will be dropped before normalization.")
        raw_counts_df = raw_counts_df.drop(columns=['condition'])
        print(f"Shape of raw counts after dropping 'condition' column: {raw_counts_df.shape}")

    # Sample IDs present in both the count matrix and the metadata.
    common_samples = raw_counts_df.index.intersection(metadata_df.index)

    print(f"Found {len(common_samples)} common samples between counts and metadata.")

    every_sample_matched = (
        len(common_samples) >= len(raw_counts_df)
        and len(common_samples) >= len(metadata_df)
    )
    if every_sample_matched:
        print("✓ All samples in counts and metadata match.")
    else:
        print(f"WARNING: Not all samples were common. Filtering to keep only the {len(common_samples)} matching samples.")

    # Restrict both tables to the shared samples, in the same row order.
    raw_counts_df, metadata_df = (
        raw_counts_df.loc[common_samples],
        metadata_df.loc[common_samples],
    )

    print(f"Final aligned counts data shape: {raw_counts_df.shape}")
    print(f"Final aligned metadata shape: {metadata_df.shape}")

    # The design formula names the metadata column that holds the labels.
    condition_col_name = DESIGN_FORMULA.strip('~')
    if condition_col_name in metadata_df.columns:
        print(f"\nCondition column ('{condition_col_name}') looks good. Value counts:")
        print(metadata_df[condition_col_name].value_counts())
    else:
        print(f"❌ ERROR: The design column '{condition_col_name}' is NOT in your metadata.")
        # Emptying the metadata is the signal that makes later sections skip.
        metadata_df = pd.DataFrame()


# ## 5. Run PyDESeq2 and Apply VST
# Builds a DeseqDataSet from the aligned counts and metadata, runs the fit and
# stores the variance-stabilized matrix (rows = dds.obs, columns = dds.var) in
# vst_transformed_counts_df. The frame stays empty when any prerequisite fails;
# the save step uses that as its "nothing to write" signal.
vst_transformed_counts_df = pd.DataFrame() # Initialize empty dataframe
design_col_name = DESIGN_FORMULA.strip('~')  # metadata column named by the design
if not raw_counts_df.empty and not metadata_df.empty and design_col_name in metadata_df.columns:
    print("\n--- Running PyDESeq2 ---")

    # Ensure count data is integers.
    # NOTE: astype(int) truncates fractional values toward zero; it does not round.
    try:
        raw_counts_df = raw_counts_df.astype(int)
        print("✓ Count data converted to integer type.")
    except Exception as e:
        print(f"❌ ERROR: Could not convert count data to integers. {e}")
        raw_counts_df = pd.DataFrame() # Stop processing

    counts_usable = not raw_counts_df.empty
    n_condition_levels = metadata_df[design_col_name].nunique()

    if counts_usable and n_condition_levels < 2:
        # Previously this path was skipped without any message, leaving the
        # reader to guess why no VST output was produced.
        print(
            f"❌ ERROR: Design column '{design_col_name}' has only "
            f"{n_condition_levels} distinct value(s); at least 2 are required. "
            "Skipping PyDESeq2."
        )
    elif counts_usable:
        print("\nInitializing DeseqDataSet...")
        try:
            dds = DeseqDataSet(
                counts=raw_counts_df,
                metadata=metadata_df,
                design=DESIGN_FORMULA
            )
            print("✓ DeseqDataSet initialized.")

            # NOTE(review): the recorded run log shows vst() fitting dispersions
            # on its own, so the full deseq2() fit may not be needed when only
            # the VST matrix is wanted. Confirm against the PyDESeq2 docs
            # before removing it.
            print("\nRunning DESeq2 analysis (this may take a few minutes)...")
            dds.deseq2()
            print("✓ DESeq2 analysis complete.")

            # --- VST Transformation ---
            print("\nApplying Variance Stabilizing Transformation (VST)...")
            dds.vst()
            # vst() leaves its result in the 'vst_counts' layer; wrap it in a
            # DataFrame labelled with the dataset's own sample and gene indices.
            vst_transformed_counts_df = pd.DataFrame(dds.layers['vst_counts'], index=dds.obs.index, columns=dds.var.index)
            print("✓ VST complete.")
            print("  - Shape of transformed data:", vst_transformed_counts_df.shape)
            print("  - VST transformed data head (without condition):\n", vst_transformed_counts_df.head())

        except Exception as e:
            # Best-effort: report and continue so the notebook finishes; the
            # empty vst_transformed_counts_df makes the save step skip.
            print(f"❌ An error occurred during PyDESeq2 processing: {e}")
else:
    print("\nSkipping PyDESeq2 run due to issues in previous steps.")


# ## 6. Add Condition Column and Save Final File
# This is the final step to create a machine-learning-ready file: the VST
# matrix plus the label column, written to VST_TRANSFORMED_FILE as CSV.
if not vst_transformed_counts_df.empty and not metadata_df.empty:
    print("\n--- Creating Final ML-Ready Dataset ---")

    # Take the label column from the design formula, the same way the earlier
    # sections do, instead of hard-coding 'condition'. For the default design
    # ('~condition') this is the same column as before.
    label_col_name = DESIGN_FORMULA.strip('~')

    # Join the VST data with the label column from the metadata (index-aligned).
    final_ml_df = vst_transformed_counts_df.join(metadata_df[label_col_name])

    # Reorder columns to have the label first for clarity.
    if label_col_name in final_ml_df.columns:
        cols = [label_col_name] + [col for col in final_ml_df.columns if col != label_col_name]
        final_ml_df = final_ml_df[cols]

    print("✓ Merged condition column into final dataset.")
    print("  - Final data head:\n", final_ml_df.head())

    # Save the final file (the sample index is written as the first column).
    print(f"\nSaving final VST data with condition to: {VST_TRANSFORMED_FILE}")
    try:
        final_ml_df.to_csv(VST_TRANSFORMED_FILE, index=True)
        print("✓ Successfully saved final file.")
        print("This file is now ready for feature selection and machine learning.")
    except Exception as e:
        # Report the failure instead of raising so the notebook still finishes.
        print(f"❌ Error saving final file: {e}")

else:
    print("\nSkipping final save because VST transformation did not produce data.")


print("\n--- Normalization Notebook execution finished ---")


--- Loading Final Datasets ---
Loading combined raw count data from: metadata/merged dataset/merged_dataset.csv
✓ Raw counts data loaded. Shape: (1895, 58086)

Loading combined metadata from: data/simple_metadata_for_pydeseq2.csv
✓ Metadata loaded. Shape: (1895, 1)

--- Verifying and Aligning Data ---
Shape of raw counts after dropping 'condition' column: (1895, 58085)
Found 1895 common samples between counts and metadata.
✓ All samples in counts and metadata match.
Final aligned counts data shape: (1895, 58085)
Final aligned metadata shape: (1895, 1)

Condition column ('condition') looks good. Value counts:
condition
healthy      1000
unhealthy     895
Name: count, dtype: int64

--- Running PyDESeq2 ---
✓ Count data converted to integer type.

Initializing DeseqDataSet...
✓ DeseqDataSet initialized.

Running DESeq2 analysis (this may take a few minutes)...
Using None as control genes, passed at DeseqDataSet initialization


Fitting size factors...
... done in 1.34 seconds.

Fitting dispersions...
... done in 11.56 seconds.

Fitting dispersion trend curve...
... done in 0.52 seconds.

Fitting MAP dispersions...
... done in 13.07 seconds.

Fitting LFCs...
... done in 16.76 seconds.

Calculating cook's distance...
... done in 5.49 seconds.

Replacing 10010 outlier genes.

Fitting dispersions...
... done in 2.60 seconds.

Fitting MAP dispersions...
... done in 2.24 seconds.

Fitting LFCs...
... done in 3.59 seconds.



✓ DESeq2 analysis complete.

Applying Variance Stabilizing Transformation (VST)...
Fit type used for VST : parametric


Fitting dispersions...
... done in 11.12 seconds.



✓ VST complete.
  - Shape of transformed data: (1895, 58085)
  - VST transformed data head (without condition):
                                ENSG00000000003  ENSG00000000005  \
GTEX-1117F-0011-R11b-SM-GIN8R         8.965071         1.994612   
GTEX-1117F-1426-SM-H65ZH             10.190289         4.259777   
GTEX-1117F-3226-SM-5N9CT             10.730460         2.181853   
GTEX-111CU-0826-SM-5EGIJ              9.609595         3.315978   
GTEX-111CU-1226-SM-5EGIN              9.781490         4.709775   

                               ENSG00000000419  ENSG00000000457  \
GTEX-1117F-0011-R11b-SM-GIN8R        10.047195         9.138564   
GTEX-1117F-1426-SM-H65ZH             10.161156        10.048578   
GTEX-1117F-3226-SM-5N9CT             10.026630         8.526861   
GTEX-111CU-0826-SM-5EGIJ             10.262449         9.471320   
GTEX-111CU-1226-SM-5EGIN             10.294166         9.655512   

                               ENSG00000000460  ENSG00000000938  \
GTEX-1117F-001