In [13]:
# Jupyter Notebook: Final Data Normalization with PyDESeq2

# ## 1. Introduction and Setup
# This notebook takes the correctly merged raw count data and its corresponding
# simple metadata to perform Variance Stabilizing Transformation (VST).
# The input files should have perfectly matching sample IDs.

import pandas as pd
import numpy as np
from pydeseq2.dds import DeseqDataSet
from pydeseq2.ds import DeseqStats
import os

In [5]:
# ## 2. Configuration and File Paths
# Everything a reader may need to tune lives in this cell.
# --- UPDATE THESE PATHS ---

# Input file for your CORRECTLY MERGED RAW gene counts.
# This is the output from your 'corrected_data_merging_notebook'.
RAW_COUNTS_FILE = 'metadata/merged dataset/merged_dataset.csv'

# Input file for your SIMPLE METADATA.
# This is the output from your 'simple_metadata_creation_notebook'.
# NOTE(review): the saved output of the loading cell reports
# 'data/simple_metadata_for_pydeseq2.csv' instead of this path — confirm which
# location is correct and re-run the notebook top to bottom so that the code
# and the recorded outputs agree.
METADATA_FILE = 'metadata/simple_metadata_for_pydeseq2.csv'

# --- Output File ---
# Path to save the final VST transformed data
VST_TRANSFORMED_FILE = 'data/final_vst_transformed_data.csv'

# --- Design Configuration ---
DESIGN_FORMULA = '~condition'

# Create the output directory if it doesn't exist. The directory is derived
# from VST_TRANSFORMED_FILE so that editing the output path above is enough;
# a bare filename (empty dirname) means "current directory" and needs nothing.
OUTPUT_DIR = os.path.dirname(VST_TRANSFORMED_FILE)
if OUTPUT_DIR:
    os.makedirs(OUTPUT_DIR, exist_ok=True)


In [12]:
# ## 3. Load Datasets
def _load_sample_table(path, loaded_label, missing_label):
    """Read a CSV whose first column is the sample ID, or report it missing.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    loaded_label : str
        Name used in the success message ("✓ <loaded_label> loaded. ...").
    missing_label : str
        Name used in the error message ("... <missing_label> file not found ...").

    Returns
    -------
    pandas.DataFrame
        The table indexed by its first column, or an empty DataFrame when
        `path` is not an existing file. Later cells test `.empty` to decide
        whether to proceed, so a missing file is reported rather than raised.
    """
    # isfile (not exists): a directory at `path` is treated as "not found"
    # instead of failing inside read_csv.
    if not os.path.isfile(path):
        print(f"❌ ERROR: {missing_label} file not found at {path}.")
        return pd.DataFrame()
    table = pd.read_csv(path, index_col=0, low_memory=False)  # First column is SAMPID
    print(f"✓ {loaded_label} loaded. Shape: {table.shape}")
    return table


print("--- Loading Final Datasets ---")
# ### 3.1 Load Combined Raw Count Data
print(f"Loading combined raw count data from: {RAW_COUNTS_FILE}")
raw_counts_df = _load_sample_table(RAW_COUNTS_FILE, "Raw counts data", "Combined raw counts")

# ### 3.2 Load Combined Metadata
print(f"\nLoading combined metadata from: {METADATA_FILE}")
metadata_df = _load_sample_table(METADATA_FILE, "Metadata", "Combined metadata")


--- Loading Final Datasets ---
Loading combined raw count data from: metadata/merged dataset/merged_dataset.csv
✓ Raw counts data loaded. Shape: (1895, 58086)

Loading combined metadata from: data/simple_metadata_for_pydeseq2.csv
✓ Metadata loaded. Shape: (1895, 1)


In [11]:
# ## 4. Prepare and Verify Data for PyDESeq2
# Restricts the count table and the metadata to the sample IDs (row index)
# they share, in the same order, and checks that the design column exists.
# On a fatal problem `metadata_df` is replaced by an empty DataFrame, which is
# the condition this cell itself uses to skip processing.
if not raw_counts_df.empty and not metadata_df.empty:
    print("\n--- Verifying and Aligning Data ---")

    # Column named by the design formula. Whitespace is stripped as well as
    # the leading '~' so that '~ condition' resolves to 'condition'.
    # Only a single-factor formula is supported by this check.
    condition_col_name = DESIGN_FORMULA.strip().lstrip('~').strip()

    # Align samples between count data and metadata
    common_samples = raw_counts_df.index.intersection(metadata_df.index)

    print(f"Found {len(common_samples)} common samples between counts and metadata.")

    if raw_counts_df.index.has_duplicates or metadata_df.index.has_duplicates:
        # Selecting by label on a non-unique index returns every matching row,
        # so the two tables could end up with different lengths or row order.
        print("❌ ERROR: Duplicate sample IDs found in counts or metadata; cannot align them unambiguously.")
        metadata_df = pd.DataFrame() # Make empty to stop execution
    elif len(common_samples) == 0:
        # Nothing to align (e.g. the two files use different ID formats).
        print("❌ ERROR: No common samples between counts and metadata. Check the sample IDs in both files.")
        metadata_df = pd.DataFrame() # Make empty to stop execution
    else:
        if len(common_samples) < len(raw_counts_df) or len(common_samples) < len(metadata_df):
            print(f"WARNING: Not all samples were common. Filtering to keep only the {len(common_samples)} matching samples.")
        else:
            print("✓ All samples in counts and metadata match.")

        # Filter to keep only the common samples and ensure same order
        raw_counts_df = raw_counts_df.loc[common_samples]
        metadata_df = metadata_df.loc[common_samples]

        print(f"Final aligned counts data shape: {raw_counts_df.shape}")
        print(f"Final aligned metadata shape: {metadata_df.shape}")

        # Verify the condition column
        if condition_col_name not in metadata_df.columns:
            print(f"❌ ERROR: The design column '{condition_col_name}' is NOT in your metadata.")
            metadata_df = pd.DataFrame() # Make empty to stop execution
        else:
            print(f"\nCondition column ('{condition_col_name}') looks good. Value counts:")
            print(metadata_df[condition_col_name].value_counts())

else:
    print("\nSkipping PyDESeq2 processing due to missing data or metadata.")



--- Verifying and Aligning Data ---
Found 1895 common samples between counts and metadata.
✓ All samples in counts and metadata match.
Final aligned counts data shape: (1895, 58086)
Final aligned metadata shape: (1895, 1)

Condition column ('condition') looks good. Value counts:
condition
healthy      1000
unhealthy     895
Name: count, dtype: int64
