In [None]:
# Jupyter Notebook: Create Basic Metadata from an Existing Merged Count File

In [None]:
# ## 1. Introduction and Goal

In [1]:
# This notebook creates a simple metadata file directly from a pre-merged
# raw count matrix. It's designed to solve sample ID mismatch issues when the
# original mapping information is unavailable or inconsistent.
#
# It works by making a simple assumption:
# - Sample IDs starting with "GTEX-" are labeled 'healthy'.
# - All other sample IDs are labeled 'unhealthy'.

import pandas as pd
import os

In [None]:
# ## 2. Configuration

In [2]:
# --- ADJUST THESE PATHS ---

# Input: Path to the pre-merged raw count file from your colleague.
# This file should have samples as rows and genes as columns.
MERGED_COUNTS_FILE = 'metadata/merged dataset/merged_dataset.csv'

# Output: Path where the new, simple metadata file will be saved.
SIMPLE_METADATA_OUTPUT_FILE = 'metadata/simple_metadata_for_pydeseq2.csv'

# --- Labels ---
# Values written to the 'condition' column of the generated metadata.
HEALTHY_LABEL = 'healthy'
UNHEALTHY_LABEL = 'unhealthy'  # You can change this to 'B_ALL' if you prefer

# Make sure the folder that will receive the output file exists. The folder is
# derived from SIMPLE_METADATA_OUTPUT_FILE so it stays correct when the path
# above is edited; `or '.'` covers a bare filename with no directory part.
os.makedirs(os.path.dirname(SIMPLE_METADATA_OUTPUT_FILE) or '.', exist_ok=True)

In [None]:
# ## 3. Load Merged Count Data and Create Metadata

In [3]:
print(f"--- Creating Basic Metadata from {MERGED_COUNTS_FILE} ---")
# Start from an empty frame: the save step checks `.empty` to decide whether
# metadata creation succeeded.
basic_metadata_df = pd.DataFrame()

if os.path.exists(MERGED_COUNTS_FILE):
    try:
        # Load only the first column (the sample IDs) of the count file to save memory.
        print(f"Reading sample IDs from: {MERGED_COUNTS_FILE}")
        # engine='python' is kept to handle potential parsing issues.
        sample_ids = pd.read_csv(MERGED_COUNTS_FILE, index_col=0, usecols=[0], engine='python').index
        print(f"✓ Found {len(sample_ids)} sample IDs.")

        # Duplicated IDs make sample-to-row matching ambiguous, so surface them
        # early. This is a warning only; the metadata is still created.
        duplicate_count = int(sample_ids.duplicated().sum())
        if duplicate_count:
            print(f"⚠ WARNING: {duplicate_count} duplicated sample ID(s) found in the count file.")

        # Create a new DataFrame using these sample IDs as the index
        basic_metadata_df = pd.DataFrame(index=sample_ids)
        basic_metadata_df.index.name = 'SAMPID'

        # Create the 'condition' column based on the heuristic. Both labels are
        # interpolated so the message stays truthful if either constant is changed.
        print(f"Assigning conditions: '{HEALTHY_LABEL}' if ID starts with 'GTEX-', otherwise '{UNHEALTHY_LABEL}'.")

        # str() guards against IDs that pandas parsed as non-string values.
        conditions = [HEALTHY_LABEL if str(sid).startswith('GTEX-') else UNHEALTHY_LABEL for sid in basic_metadata_df.index]
        basic_metadata_df['condition'] = conditions

        print("\n--- Metadata Creation Summary ---")
        print(f"Shape of new metadata file: {basic_metadata_df.shape}")
        print("Value counts for 'condition' column:")
        print(basic_metadata_df['condition'].value_counts())

        print("\nMetadata Head (first 5 rows):")
        print(basic_metadata_df.head())
        print("\nMetadata Tail (last 5 rows):")
        print(basic_metadata_df.tail())

    except Exception as e:
        # Best-effort notebook step: report the problem and leave an empty frame
        # so the save step is skipped instead of writing partial metadata.
        print(f"❌ ERROR: Failed to load or process the merged count file: {e}")
        basic_metadata_df = pd.DataFrame() # Ensure it's empty on failure

else:
    print(f"❌ ERROR: Merged count file not found at {MERGED_COUNTS_FILE}")

# ## 4. Save the New Basic Metadata File
# An empty frame means the previous step failed (or never ran), so there is
# nothing worth writing; otherwise export the table with its SAMPID index.
if basic_metadata_df.empty:
    print("\nSkipping save because metadata was not created due to previous errors.")
else:
    print(f"\nSaving new basic metadata to: {SIMPLE_METADATA_OUTPUT_FILE}")
    try:
        # index=True writes the sample IDs as the first CSV column.
        basic_metadata_df.to_csv(SIMPLE_METADATA_OUTPUT_FILE, index=True)
        # Success banner followed by a hint on how to use the file downstream.
        print(
            "✓ Successfully saved the new metadata file.",
            "\n--- Next Steps ---",
            "You can now use this file as the 'METADATA_FILE' in your 'pydeseq2_normalization_final' notebook.",
            f"And use '{MERGED_COUNTS_FILE}' as the 'RAW_COUNTS_FILE'.",
            sep="\n",
        )
    except Exception as save_error:
        # Report the failure without aborting the rest of the notebook.
        print(f"❌ ERROR: Could not save the metadata file: {save_error}")

print("\n--- Notebook execution finished ---")


--- Creating Basic Metadata from metadata/merged dataset/merged_dataset.csv ---
Reading sample IDs from: metadata/merged dataset/merged_dataset.csv
✓ Found 1895 sample IDs.
Assigning conditions: 'healthy' if ID starts with 'GTEX-', otherwise 'unhealthy'.

--- Metadata Creation Summary ---
Shape of new metadata file: (1895, 1)
Value counts for 'condition' column:
condition
healthy      1000
unhealthy     895
Name: count, dtype: int64

Metadata Head (first 5 rows):
                              condition
SAMPID                                 
GTEX-1117F-0011-R11b-SM-GIN8R   healthy
GTEX-1117F-1426-SM-H65ZH        healthy
GTEX-1117F-3226-SM-5N9CT        healthy
GTEX-111CU-0826-SM-5EGIJ        healthy
GTEX-111CU-1226-SM-5EGIN        healthy

Metadata Tail (last 5 rows):
                                      condition
SAMPID                                         
fe9ca1c7-58be-4123-8742-68843ff9881b  unhealthy
fff4dfba-2d68-4c05-9d1d-70f410afe6d4  unhealthy
c2f59459-f7e1-48ae-b1e0-772008