In [None]:
# GTEx Blood Sample Exploration Notebook

In [None]:
## Cell 1: Import Libraries and Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import requests
import os

# Plotting defaults shared by every figure in this notebook
plt.style.use('default')
sns.set_palette("husl")

# Show every column of the wide metadata tables, but cap the number of printed rows
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

print("Libraries imported successfully!")

In [None]:
## Cell 2: Download GTEx Metadata Files

In [None]:
# Expected GTEx v10 annotation files.
# NOTE(review): base_url is kept for reference only -- this cell does not fetch
# anything; confirm the exact URLs on the GTEx portal before scripting a download.
base_url = "https://storage.googleapis.com/adult-gtex/annotations/"

files_to_download = {
    'sample_attributes': 'GTEx_Analysis_v10_Annotations_SampleAttributesDS.txt',
    'sample_dictionary': 'GTEx_Analysis_v10_Annotations_SampleAttributesDD.xlsx'
}

# Create data directory
metadata_dir = 'data/gtex_metadata'
os.makedirs(metadata_dir, exist_ok=True)

# Check which files are already in place and remember the ones that are not
missing_files = []
for file_key, filename in files_to_download.items():
    local_path = f'{metadata_dir}/{filename}'

    if os.path.exists(local_path):
        print(f"✓ {filename} already exists")
    else:
        missing_files.append(filename)
        print(f"Please download {filename} from GTEx portal to {local_path}")

# Only show the manual steps when something is actually missing
if missing_files:
    print("\nManual download required:")
    print("1. Go to https://gtexportal.org/home/datasets")
    for step, filename in enumerate(missing_files, start=2):
        print(f"{step}. Download {filename}")
    print(f"{len(missing_files) + 2}. Place them in {metadata_dir}/ folder")
else:
    print("\n✓ All metadata files are present - continue with Cell 3")

In [None]:
## Cell 3: Load and Explore Sample Attributes

In [None]:
# Load the main sample attributes file.
# Cell 2 asks for the file to be placed under data/gtex_metadata/, so that
# location is tried first; the older metadata/ folder is kept as a fallback so
# existing setups keep working.
sample_attrs_filename = 'GTEx_Analysis_v10_Annotations_SampleAttributesDS.txt'
sample_attrs_candidates = [
    f'data/gtex_metadata/{sample_attrs_filename}',
    f'metadata/{sample_attrs_filename}',
]
# First existing candidate wins; default to the documented location so the
# FileNotFoundError path below is still reached when nothing is found.
sample_attrs_path = next(
    (path for path in sample_attrs_candidates if os.path.exists(path)),
    sample_attrs_candidates[0],
)

try:
    sample_attrs = pd.read_csv(sample_attrs_path, sep='\t', low_memory=False)
    print("✓ Sample attributes loaded successfully!")
    print(f"Shape: {sample_attrs.shape}")
    print(f"Columns: {len(sample_attrs.columns)}")

except FileNotFoundError:
    print("❌ Please download the metadata files first (see Cell 2)")
    # Downstream cells test for None before touching the frame
    sample_attrs = None

if sample_attrs is not None:
    # Display basic info
    print("\nFirst few columns:")
    print(sample_attrs.columns[:10].tolist())

    # Only a preview: rendering the full table would bury the rest of the notebook
    print("\nFirst few rows:")
    display(sample_attrs.head())

In [None]:
## View unique samples in SAMPID column
# Printing every ID would flood the cell output, so only a short preview is listed.
SAMPLE_ID_PREVIEW = 20

if sample_attrs is None:
    print("❌ Sample attributes not loaded - please check previous cells")
elif 'SAMPID' not in sample_attrs.columns:
    print("❌ 'SAMPID' column not found in sample attributes")
else:
    print("\nUnique sample IDs (SAMPID):")
    unique_samples = sample_attrs['SAMPID'].unique()
    print(f"Total unique samples: {len(unique_samples)}")
    for i, sample_id in enumerate(unique_samples[:SAMPLE_ID_PREVIEW]):
        print(f"{i+1}. {sample_id}")
    remaining = len(unique_samples) - SAMPLE_ID_PREVIEW
    if remaining > 0:
        print(f"... and {remaining} more")

In [None]:
## Cell 4: Analyze Tissue Types and Sample Sources

In [None]:
if sample_attrs is not None:
    # Summarise the two tissue columns (SMTS = primary, SMTSD = detailed) and
    # highlight any entries whose name looks blood/immune related.
    print("=== TISSUE TYPE ANALYSIS ===")

    # Primary tissue types
    if 'SMTS' in sample_attrs.columns:
        print("\n1. Primary Tissue Types (SMTS):")
        tissue_counts = sample_attrs['SMTS'].value_counts()
        print(tissue_counts)

        # Look for blood-related tissues (case-insensitive match on the tissue name)
        blood_tissues = tissue_counts[tissue_counts.index.str.contains('Blood|blood', case=False, na=False)]
        print(f"\n🩸 Blood-related tissues found: {len(blood_tissues)}")
        if len(blood_tissues) > 0:
            print(blood_tissues)

    # Detailed tissue types
    if 'SMTSD' in sample_attrs.columns:
        print("\n2. Detailed Tissue Types (SMTSD):")
        detailed_tissues = sample_attrs['SMTSD'].value_counts()

        # Look for blood/immune related tissues; keywords are OR-ed into one regex
        blood_keywords = ['Blood', 'blood', 'Whole', 'whole', 'Cells', 'cells', 'immune', 'lymph']
        blood_related = detailed_tissues[detailed_tissues.index.str.contains('|'.join(blood_keywords), case=False, na=False)]

        print(f"\n🩸 Blood-related detailed tissues ({len(blood_related)}):")
        for tissue, count in blood_related.items():
            print(f"  {tissue}: {count} samples")

        # Show top 20 tissues overall (value_counts is already sorted descending).
        # head must be *called*; printing the bare attribute shows the method repr.
        print("\n📊 Top 20 tissue types:")
        print(detailed_tissues.head(20))

In [None]:
## Cell 5: Focus on Blood Samples

In [None]:
if sample_attrs is not None:
    # Filter for blood samples: a row qualifies when either tissue column
    # mentions "blood" (case-insensitive); missing values never match.
    blood_samples = sample_attrs[
        sample_attrs['SMTS'].str.contains('Blood', case=False, na=False) |
        sample_attrs['SMTSD'].str.contains('Blood', case=False, na=False)
    ].copy()

    print(f"🩸 BLOOD SAMPLES FOUND: {len(blood_samples)}")

    if len(blood_samples) > 0:
        print("\nBlood sample breakdown:")
        print(blood_samples['SMTSD'].value_counts())

        # Check sample quality metrics
        quality_cols = ['SMRIN', 'SMTSISCH', 'SMCENTER']  # RIN, ischemic time, center

        print("\n=== QUALITY METRICS ===")
        for col in quality_cols:
            if col in blood_samples.columns:
                print(f"\n{col} statistics:")
                print(blood_samples[col].describe())

        # Save blood sample IDs. The frame keeps pandas' default positional index,
        # so the real identifiers live in the SAMPID column; fall back to the
        # index only if that column is absent.
        if 'SAMPID' in blood_samples.columns:
            blood_sample_ids = blood_samples['SAMPID'].tolist()
        else:
            blood_sample_ids = blood_samples.index.tolist()

        # Save to file for later use (make sure the output folder exists first)
        os.makedirs('data', exist_ok=True)
        with open('data/gtex_blood_sample_ids.txt', 'w', encoding='utf-8') as f:
            for sample_id in blood_sample_ids:
                f.write(f"{sample_id}\n")

        print(f"\n✓ {len(blood_sample_ids)} blood sample IDs saved to 'data/gtex_blood_sample_ids.txt'")

        # Display first 10 sample IDs
        print("\nFirst 10 blood sample IDs:")
        for i, sample_id in enumerate(blood_sample_ids[:10]):
            print(f"  {i+1}. {sample_id}")

    else:
        print("❌ No blood samples found in GTEx data")
        print("Available tissue types:")
        print(sample_attrs['SMTS'].value_counts().head(10))

In [None]:
## Cell 6: Alternative Tissue Exploration

In [None]:

if sample_attrs is not None:
    print("=== ALTERNATIVE BLOOD-COMPATIBLE TISSUES ===")

    # Search terms for tissues that might stand in for blood
    alternative_keywords = [
        'Spleen',  # Immune organ
        'Lymph',   # Lymphatic system
        'Bone',    # Bone marrow (unlikely but worth checking)
        'Marrow',  # Bone marrow
        'immune',  # Any immune-related
        'cell',    # Cell types
        'EBV'      # EBV-transformed lymphocytes (common in GTEx)
    ]

    print("Looking for alternative blood-compatible tissues...")

    primary_tissue = sample_attrs['SMTS']
    detailed_tissue = sample_attrs['SMTSD']

    for keyword in alternative_keywords:
        # A sample matches when the keyword occurs in either tissue column
        in_primary = primary_tissue.str.contains(keyword, case=False, na=False)
        in_detailed = detailed_tissue.str.contains(keyword, case=False, na=False)
        matches = sample_attrs[in_primary | in_detailed]

        if len(matches) == 0:
            continue

        print(f"\n🔍 '{keyword}' matches ({len(matches)} samples):")
        print(matches['SMTSD'].value_counts())

    # Check for any cell line or transformed cell samples
    is_transformed = detailed_tissue.str.contains(
        'transform|cell line|EBV|lymphocyte', case=False, na=False
    )
    transformed = sample_attrs[is_transformed]

    if len(transformed) > 0:
        print(f"\n🧬 Transformed/cell line samples ({len(transformed)}):")
        print(transformed['SMTSD'].value_counts())

In [None]:
## Cell 7: Sample Quality Assessment

In [None]:
if sample_attrs is not None and len(blood_samples) > 0:
    print("=== BLOOD SAMPLE QUALITY ASSESSMENT ===")

    available_cols = blood_samples.columns
    n_blood = len(blood_samples)

    # 2x2 grid: RIN | ischemic time on top, collection center | batch below
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    (ax_rin, ax_ischemic), (ax_center, ax_batch) = axes

    # RIN scores (RNA quality)
    if 'SMRIN' in available_cols:
        rin_values = blood_samples['SMRIN'].dropna()
        ax_rin.hist(rin_values, bins=20, alpha=0.7, color='red')
        ax_rin.set_title('RNA Integrity Number (RIN) Distribution')
        ax_rin.set_xlabel('RIN Score')
        ax_rin.set_ylabel('Count')
        ax_rin.axvline(7.0, color='green', linestyle='--', label='Good Quality (RIN ≥ 7)')
        ax_rin.legend()

    # Ischemic time (sample preservation); non-numeric entries become NaN
    if 'SMTSISCH' in available_cols:
        ischemic_time = pd.to_numeric(blood_samples['SMTSISCH'], errors='coerce')
        ax_ischemic.hist(ischemic_time.dropna(), bins=20, alpha=0.7, color='blue')
        ax_ischemic.set_title('Ischemic Time Distribution')
        ax_ischemic.set_xlabel('Ischemic Time (minutes)')
        ax_ischemic.set_ylabel('Count')

    # Collection center
    if 'SMCENTER' in available_cols:
        center_counts = blood_samples['SMCENTER'].value_counts()
        center_positions = range(len(center_counts))
        ax_center.bar(center_positions, center_counts.values, color='green', alpha=0.7)
        ax_center.set_title('Samples by Collection Center')
        ax_center.set_xlabel('Collection Center')
        ax_center.set_ylabel('Count')
        ax_center.set_xticks(center_positions)
        ax_center.set_xticklabels(center_counts.index, rotation=45)

    # Sample processing batch: batches are ranked by size and binned by rank
    if 'SMNABTCH' in available_cols:
        batch_counts = blood_samples['SMNABTCH'].value_counts()
        n_batches = len(batch_counts)
        ax_batch.hist(range(n_batches), weights=batch_counts.values,
                      bins=min(20, n_batches), alpha=0.7, color='orange')
        ax_batch.set_title('Processing Batch Distribution')
        ax_batch.set_xlabel('Batch')
        ax_batch.set_ylabel('Sample Count')

    plt.tight_layout()
    plt.savefig('data/gtex_blood_quality_assessment.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Quality filtering recommendations
    print("\n=== QUALITY FILTERING RECOMMENDATIONS ===")

    if 'SMRIN' in available_cols:
        high_quality_rin = blood_samples[blood_samples['SMRIN'] >= 7.0]
        rin_pct = len(high_quality_rin)/n_blood*100
        print(f"Samples with RIN ≥ 7.0: {len(high_quality_rin)} / {n_blood} ({rin_pct:.1f}%)")

    if 'SMTSISCH' in available_cols:
        # ischemic_time was derived above under the same column check
        low_ischemic = blood_samples[ischemic_time <= 1440]  # ≤ 24 hours
        ischemic_pct = len(low_ischemic)/n_blood*100
        print(f"Samples with ischemic time ≤ 24h: {len(low_ischemic)} / {n_blood} ({ischemic_pct:.1f}%)")

In [None]:
## Cell 8: Compatibility Assessment with GDC Data

In [None]:
if sample_attrs is not None and len(blood_samples) > 0:
    print("=== COMPATIBILITY ASSESSMENT ===")

    # Compare with your GDC metadata
    print("GTEx Blood Samples:")
    print(f"  Total count: {len(blood_samples)}")
    print(f"  Tissue types: {blood_samples['SMTSD'].unique()}")

    print("\nYour GDC Unhealthy Samples:")
    print("  Total count: 871 tumor + 871 normal")
    print("  Tissue types: peripheral whole blood, whole bone marrow")

    # Compatibility analysis on lower-cased detailed tissue names.
    # Missing SMTSD values are dropped so the substring test never sees NaN.
    smtsd_lower = blood_samples['SMTSD'].str.lower()
    gtex_tissues = set(smtsd_lower.dropna())
    gdc_tissues = {'peripheral whole blood', 'whole bone marrow'}

    # A GTEx tissue is compatible when both it and some GDC tissue mention "blood";
    # sorted so the report order is stable between runs.
    gdc_has_blood = any('blood' in gdc_tissue for gdc_tissue in gdc_tissues)
    compatible_tissues = sorted(
        gtex_tissue for gtex_tissue in gtex_tissues
        if gdc_has_blood and 'blood' in gtex_tissue
    )

    print("\n🎯 COMPATIBILITY VERDICT:")
    if len(compatible_tissues) > 0:
        print(f"✅ Found {len(compatible_tissues)} compatible tissue type(s):")
        for tissue in compatible_tissues:
            count = int((smtsd_lower == tissue).sum())
            print(f"  - {tissue}: {count} samples")

        # Count each sample once; summing the isin() total per tissue would
        # multiply the result by the number of compatible tissues.
        total_compatible = int(smtsd_lower.isin(compatible_tissues).sum())
        print(f"\n📊 Total compatible samples: {total_compatible}")

        if total_compatible >= 100:
            print("✅ SUFFICIENT for binary classification!")
        elif total_compatible >= 50:
            print("⚠️  MARGINAL - might work but underpowered")
        else:
            print("❌ INSUFFICIENT - too few samples")
    else:
        print("❌ No directly compatible tissues found")
        print("Recommendation: Use TARGET dataset instead")

In [None]:
## Cell 9: Generate Final Recommendations

In [None]:
if sample_attrs is not None:
    print("=== FINAL RECOMMENDATIONS ===")

    # Number of blood samples selected in Cell 5 drives every branch below
    compatible_count = len(blood_samples)

    if compatible_count > 0:
        print(f"\n🩸 GTEx Blood Samples Available: {compatible_count}")

        if compatible_count >= 100:
            print("\n✅ RECOMMENDATION: Use GTEx blood samples")
            print("Next steps:")
            print("1. Filter GTEx gene expression data to blood samples only")
            print("2. Use sample IDs from 'data/gtex_blood_sample_ids.txt'")
            print("3. Proceed with your current ML pipeline")
            print("4. Expected result: Realistic AUC (0.7-0.9), not perfect separation")

        elif compatible_count >= 50:
            print("\n⚠️  RECOMMENDATION: GTEx blood samples might work")
            print("Considerations:")
            print("- Sample size is marginal for robust ML")
            print("- Consider combining with TARGET dataset")
            print("- Expect higher variance in results")

        else:
            print("\n❌ RECOMMENDATION: GTEx blood samples insufficient")
            print("Alternative:")
            print("- Apply for TARGET controlled access (436 samples)")
            print("- Much more robust for your research")

    else:
        print("\n❌ RECOMMENDATION: No suitable GTEx samples found")
        print("Best alternatives:")
        print("1. TARGET-AML controlled access (436 blood/bone marrow controls)")
        print("2. Beat AML dataset (21 bone marrow controls)")
        print("3. St. Jude Cloud (tissue-matched controls)")

    # The summary follows the same thresholds as the recommendation above so the
    # two never contradict each other.
    print("\n" + "="*50)
    print("SUMMARY:")
    print("- GTEx primarily contains solid tissue samples")
    if compatible_count >= 100:
        print(f"- GTEx provides {compatible_count} blood samples - enough for this analysis")
        print("- TARGET controlled access is an optional complement, not a requirement")
    else:
        print("- Blood samples in GTEx are limited")
        print("- TARGET dataset remains your best option")
        print("- Apply for controlled access - it's worth it!")

In [None]:
## Cell 10: Export Results and Sample Lists

In [None]:
if sample_attrs is not None and len(blood_samples) > 0:
    print("=== EXPORTING RESULTS ===")

    # Real sample identifiers live in SAMPID; the frame's index is only the
    # positional row number, so use it only when the column is missing.
    if 'SAMPID' in blood_samples.columns:
        exported_sample_ids = blood_samples['SAMPID'].tolist()
    else:
        exported_sample_ids = blood_samples.index.tolist()

    tissue_breakdown = blood_samples['SMTSD'].value_counts()

    # Create comprehensive results file
    results = {
        'total_gtex_samples': len(sample_attrs),
        'blood_samples_found': len(blood_samples),
        'blood_sample_ids': exported_sample_ids,
        'tissue_breakdown': tissue_breakdown.to_dict(),
        'recommendation': 'Use TARGET instead' if len(blood_samples) < 100 else 'GTEx blood samples viable'
    }

    os.makedirs('data', exist_ok=True)

    # Save detailed blood sample metadata (the outer guard already ensures rows exist)
    blood_samples.to_csv('data/gtex_blood_samples_metadata.csv')
    print(f"✓ Saved detailed metadata for {len(blood_samples)} blood samples")

    # Save summary report
    with open('data/gtex_exploration_summary.txt', 'w', encoding='utf-8') as f:
        f.write("GTEx Blood Sample Exploration Summary\n")
        f.write("="*40 + "\n\n")
        f.write(f"Total GTEx samples analyzed: {len(sample_attrs)}\n")
        f.write(f"Blood samples identified: {len(blood_samples)}\n\n")

        f.write("Blood sample tissue breakdown:\n")
        for tissue, count in tissue_breakdown.items():
            f.write(f"  {tissue}: {count}\n")

        f.write(f"\nRecommendation: {results['recommendation']}\n")

    print("✓ Saved exploration summary to 'data/gtex_exploration_summary.txt'")

    # Only report outputs that are really on disk; the ID list and the plot come
    # from earlier cells that may not have been run.
    expected_outputs = [
        'data/gtex_blood_sample_ids.txt',
        'data/gtex_blood_samples_metadata.csv',
        'data/gtex_exploration_summary.txt',
        'data/gtex_blood_quality_assessment.png',
    ]
    print("\nFiles created:")
    for output_path in expected_outputs:
        if os.path.exists(output_path):
            print(f"- {output_path}")

else:
    print("❌ No results to export - please check metadata files")

## Instructions for Use:

1. **Download the metadata files** from GTEx Portal
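2. **Run cells 1-2** to set up the environment and confirm the files are in `data/gtex_metadata/` (cell 2 only checks for them; it does not download anything)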
3. **Run cells 3-10** to explore blood samples
4. **Check the final recommendations** in cell 9

This notebook will tell you exactly how many blood samples are available in GTEx and whether they're sufficient for your ML analysis!