In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import VarianceThreshold
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and numerical warnings; consider
# narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

# Fix the NumPy and PyTorch random number generators to the same seed so that
# repeated runs draw the same random numbers.
np.random.seed(42)
torch.manual_seed(42)

# Work out the device label first, then print the environment banner.
if torch.cuda.is_available():
    device_label = 'CUDA'
else:
    device_label = 'CPU'

print("Libraries imported successfully!")
print(f"PyTorch version: {torch.__version__}")
print(f"Device available: {device_label}")

Libraries imported successfully!
PyTorch version: 2.8.0+cu129
Device available: CUDA


## 2. Data Loading
Load raw TCGA-SARC multi-omics datasets

In [2]:
print("Loading TCGA-SARC multi-omics datasets...")
print("=" * 50)

# All raw TSV files sit in the directory above the notebook; keep that
# location in one place so it can be changed without touching every call.
DATA_DIR = '..'

# Load multi-omics data (features as rows, samples as columns)
expression_data = pd.read_csv(f'{DATA_DIR}/TCGA-SARC.star_tpm.tsv', sep='\t', index_col=0)  # Gene expression (TPM)
methylation_data = pd.read_csv(f'{DATA_DIR}/TCGA-SARC.methylation450.tsv', sep='\t', index_col=0)  # DNA methylation
copy_number_data = pd.read_csv(f'{DATA_DIR}/TCGA-SARC.gene-level_absolute.tsv', sep='\t', index_col=0)  # Copy number variations (absolute)

# Load clinical data (samples as rows). Only a malformed-row parse failure is
# retried: skipping bad lines cannot help with a missing file or a decoding
# problem, so those errors are allowed to propagate with their real message
# instead of being reported as a recoverable warning.
clinical_path = f'{DATA_DIR}/TCGA-SARC.clinical.tsv'
try:
    phenotype_data = pd.read_csv(clinical_path, sep='\t', index_col=0)
except pd.errors.ParserError as e:
    print(f"Warning: Initial load failed ({e}), attempting with error handling...")
    # on_bad_lines='skip' silently drops the rows that failed to parse.
    phenotype_data = pd.read_csv(clinical_path, sep='\t', index_col=0, on_bad_lines='skip')

print("Raw data shapes:")
print(f"📊 Expression data: {expression_data.shape} (genes x samples)")
print(f"🧬 Methylation data: {methylation_data.shape} (CpG sites x samples)")
print(f"📈 Copy number data: {copy_number_data.shape} (genes x samples)")
print(f"🏥 Clinical data: {phenotype_data.shape} (samples x features)")

print("\n✅ Data loading completed!")

Loading TCGA-SARC multi-omics datasets...
Raw data shapes:
📊 Expression data: (60660, 265) (genes x samples)
🧬 Methylation data: (486427, 269) (CpG sites x samples)
📈 Copy number data: (60623, 248) (genes x samples)
🏥 Clinical data: (272, 78) (samples x features)

✅ Data loading completed!


## 3. Sample Matching & Quality Control
Identify common samples across all omics platforms

In [3]:
print("Sample matching and quality assessment...")
print("=" * 50)

# Collect the sample identifiers of each modality. The omics matrices carry
# samples as columns, the clinical table carries them as the row index.
samples_expression = set(expression_data.columns)
samples_methylation = set(methylation_data.columns)
samples_cnv = set(copy_number_data.columns)
samples_clinical = set(phenotype_data.index)

print("Sample counts per modality:")
print(f"🧬 Expression samples: {len(samples_expression)}")
print(f"🔬 Methylation samples: {len(samples_methylation)}")
print(f"📊 CNV samples: {len(samples_cnv)}")
print(f"🏥 Clinical samples: {len(samples_clinical)}")

# Find common samples across all omics.
# The result is sorted because the iteration order of a set of strings depends
# on per-process hash randomisation: without sorting, the column order of every
# downstream matrix could change from one kernel session to the next even
# though the random seeds are fixed.
common_samples = sorted(
    samples_expression & samples_methylation & samples_cnv & samples_clinical
)
print(f"\n🎯 Common samples across all omics: {len(common_samples)}")

# Filter data to keep only common samples, in the same order for every table
expression_data = expression_data[common_samples]
methylation_data = methylation_data[common_samples]
copy_number_data = copy_number_data[common_samples]
phenotype_data = phenotype_data.loc[common_samples]

print(f"\n📏 Filtered data shapes:")
print(f"Expression: {expression_data.shape}")
print(f"Methylation: {methylation_data.shape}")
print(f"Copy Number: {copy_number_data.shape}")
print(f"Clinical: {phenotype_data.shape}")

print("\n✅ Sample matching completed!")

Sample matching and quality assessment...
Sample counts per modality:
🧬 Expression samples: 265
🔬 Methylation samples: 269
📊 CNV samples: 248
🏥 Clinical samples: 272

🎯 Common samples across all omics: 246

📏 Filtered data shapes:
Expression: (60660, 246)
Methylation: (486427, 246)
Copy Number: (60623, 246)
Clinical: (246, 78)

✅ Sample matching completed!


## 4. Missing Value Assessment
Comprehensive analysis of missing values across all omics

In [4]:
print("Missing value assessment...")
print("=" * 50)

# Check for null values in each omics modality
def assess_missing_values(data, name):
    """Print and return the amount of missing data in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table to inspect; every cell counts towards the total.
    name : str
        Label used as the heading of the printed report.

    Returns
    -------
    tuple[int, float]
        ``(missing_count, missing_percentage)`` where the percentage is
        relative to the total number of cells. An empty table yields
        ``(0, 0.0)`` instead of dividing by zero.
    """
    total_values = data.size
    # Cast to a plain int so the result formats and compares consistently
    # regardless of the numpy scalar type pandas returns.
    missing_count = int(data.isnull().sum().sum())
    # Guard the empty-table case: there is nothing to be missing from.
    if total_values:
        missing_percentage = float(missing_count / total_values * 100)
    else:
        missing_percentage = 0.0

    print(f"\n📊 {name}:")
    print(f"   Total values: {total_values:,}")
    print(f"   Missing values: {missing_count:,}")
    print(f"   Missing percentage: {missing_percentage:.2f}%")

    if missing_count > 0:
        print(f"   ⚠️  Contains missing values - preprocessing required")
    else:
        print(f"   ✅ No missing values found")

    return missing_count, missing_percentage

# Run the assessment once per modality, in report order, and unpack each
# (count, percentage) pair into the names used by the summary below.
(
    (expr_missing, expr_pct),
    (meth_missing, meth_pct),
    (cnv_missing, cnv_pct),
    (pheno_missing, pheno_pct),
) = [
    assess_missing_values(table, label)
    for table, label in (
        (expression_data, "Expression Data"),
        (methylation_data, "Methylation Data"),
        (copy_number_data, "Copy Number Data"),
        (phenotype_data, "Phenotype Data"),
    )
]

# Value range of the copy-number matrix (column-wise extreme, then overall).
max_value = copy_number_data.max().max()
min_value = copy_number_data.min().min()

# Summary
# NOTE(review): the CNV value range is reported between the Copy Number and
# Phenotype lines of the missing-value summary; the order is kept as is so the
# printed report does not change.
print(f"\n📋 MISSING VALUE SUMMARY:")
print(f"Expression: {expr_missing:,} ({expr_pct:.2f}%)")
print(f"Methylation: {meth_missing:,} ({meth_pct:.2f}%)")
print(f"Copy Number: {cnv_missing:,} ({cnv_pct:.2f}%)")

print(f"Maximum CNV value: {max_value}")
print(f"Minimum CNV value: {min_value}")
print()
print(f"Phenotype: {pheno_missing:,} ({pheno_pct:.2f}%)")

print("\n✅ Missing value assessment completed!")

Missing value assessment...

📊 Expression Data:
   Total values: 14,922,360
   Missing values: 0
   Missing percentage: 0.00%
   ✅ No missing values found

📊 Methylation Data:
   Total values: 119,661,042
   Missing values: 19,622,910
   Missing percentage: 16.40%
   ⚠️  Contains missing values - preprocessing required

📊 Copy Number Data:
   Total values: 14,913,258
   Missing values: 994,360
   Missing percentage: 6.67%
   ⚠️  Contains missing values - preprocessing required

📊 Phenotype Data:
   Total values: 19,188
   Missing values: 3,960
   Missing percentage: 20.64%
   ⚠️  Contains missing values - preprocessing required

📋 MISSING VALUE SUMMARY:
Expression: 0 (0.00%)
Methylation: 19,622,910 (16.40%)
Copy Number: 994,360 (6.67%)
Maximum CNV value: 7.0
Minimum CNV value: 0.0

Phenotype: 3,960 (20.64%)

✅ Missing value assessment completed!


In [None]:
# Expression preprocessing ------------------------------------------------------->

# Step 1: log2(x + 1) transform of the expression matrix.
# NOTE(review): confirm the input TSV holds raw TPM values; if the file is
# already log-transformed upstream, this applies a second log.
expression_data_log = np.log2(expression_data.add(1))

# Step 2: per-gene z-score. StandardScaler standardises columns, so the matrix
# is flipped to samples x genes for fitting and flipped back afterwards.
# NOTE(review): the scaler is fitted on every sample currently in
# expression_data, i.e. before any later subtype filtering or train/test split.
scaler_expr = StandardScaler()
expr_samples_by_genes = expression_data_log.T
expr_zscores = scaler_expr.fit_transform(expr_samples_by_genes)
expression_data_scaled = pd.DataFrame(
    expr_zscores.T,
    index=expression_data_log.index,
    columns=expression_data_log.columns,
)

# Methylation data preprocessing -------------------------------------------------->

# Drop probes with more than 20% missing values, i.e. keep probes observed in
# at least 80% of the samples. dropna documents `thresh` as an integer count of
# required non-missing values, so the fractional bound is rounded up explicitly
# (for integer counts, count >= x and count >= ceil(x) are the same test).
min_non_missing = int(np.ceil(0.8 * methylation_data.shape[1]))
methylation_data = methylation_data.dropna(thresh=min_non_missing, axis=0)

# Fill NA with probe-wise median.
# Vectorised: one median per row, then a masked fill aligned on the row index
# (axis=0). This gives the same values as a per-row apply/fillna but avoids a
# Python-level call for every probe.
probe_medians = methylation_data.median(axis=1)
methylation_data = methylation_data.where(methylation_data.notna(), probe_medians, axis=0)

# Clip beta values to avoid log(0) or log(inf)
# Add small offset to prevent numerical issues
epsilon = 1e-6
methylation_clipped = methylation_data.clip(lower=epsilon, upper=1-epsilon)

# Convert to M-values: M = log2(Beta / (1 - Beta))
methylation_m_values = np.log2(methylation_clipped / (1 - methylation_clipped))

# Remove low-variance probes. VarianceThreshold works on columns, so the matrix
# is transposed to samples x probes and transposed back afterwards.
# NOTE(review): despite its name, methylation_scaled is only variance-filtered;
# unlike expression and copy number, no StandardScaler is applied here.
# Confirm that leaving M-values unscaled is intended.
selector = VarianceThreshold(threshold=0.01)
methylation_scaled = pd.DataFrame(
    selector.fit_transform(methylation_m_values.T).T,
    index=methylation_m_values.index[selector.get_support()],
    columns=methylation_m_values.columns
)


# Copy number data preprocessing -------------------------------------------------->

# Handle missing values: keep genes whose fraction of missing samples is
# strictly below the threshold.
gene_missing_threshold = 0.2
cnv_filtered = copy_number_data.loc[
    copy_number_data.isnull().mean(axis=1) < gene_missing_threshold
]

# Impute the remaining gaps with the gene-wise (row) median.
# Vectorised: one median per row, then a masked fill aligned on the row index
# (axis=0). Same values as a per-row apply/fillna, without a Python-level call
# for every gene.
gene_medians = cnv_filtered.median(axis=1)
cnv_imputed = cnv_filtered.where(cnv_filtered.notna(), gene_medians, axis=0)

# Remove low-variance regions (RECOMMENDED for PCA). VarianceThreshold works on
# columns, so the matrix is transposed to samples x genes and back.
selector = VarianceThreshold(threshold=0.01)
cnv_variable = pd.DataFrame(
    selector.fit_transform(cnv_imputed.T).T,
    index=cnv_imputed.index[selector.get_support()],
    columns=cnv_imputed.columns
)
# Number of genes discarded by the variance filter; nothing in this cell reads
# it, it is kept available for inspection.
removed = cnv_imputed.shape[0] - cnv_variable.shape[0]

# Per-gene z-score (StandardScaler standardises columns, hence the transposes).
# NOTE(review): fitted on every sample currently in copy_number_data, i.e.
# before any later subtype filtering or train/test split.
scaler_cnv = StandardScaler()
copy_number_scaled = pd.DataFrame(
    scaler_cnv.fit_transform(cnv_variable.T).T,
    index=cnv_variable.index,
    columns=cnv_variable.columns
)

In [12]:
print("Processing Phenotype Data...")
print("=" * 50)

# Define subtype column and selected subtypes
subtype_column = 'primary_diagnosis.diagnoses'
selected_subtypes = [
    'Leiomyosarcoma, NOS',
    'Dedifferentiated liposarcoma',
    'Undifferentiated sarcoma',
    'Fibromyxosarcoma'
]

print(f"🎯 Target column: '{subtype_column}'")
print(f"📊 Original subtype distribution:")
subtype_counts = phenotype_data[subtype_column].value_counts()
for subtype, count in subtype_counts.items():
    marker = "✅" if subtype in selected_subtypes else "❌"
    print(f"   {marker} {subtype}: {count}")

# Count missing labels BEFORE filtering. isin() never matches a missing value
# against this list of names, so counting after the filter would always give 0
# and hide how many samples had no diagnosis.
missing_subtypes = int(phenotype_data[subtype_column].isnull().sum())

# Filter to selected subtypes only (this also drops samples with a missing subtype).
# NOTE(review): phenotype_data is overwritten here, so running this cell a
# second time reports "Removed 0 samples"; restart and run all for true counts.
print(f"\n🔄 Filtering to selected subtypes...")
before_filter = len(phenotype_data)
phenotype_data = phenotype_data[phenotype_data[subtype_column].isin(selected_subtypes)]
after_filter = len(phenotype_data)
removed_samples = before_filter - after_filter
# Guard against an empty input table so the percentage never divides by zero.
removed_pct = removed_samples / before_filter * 100 if before_filter else 0.0
print(f"📊 Removed {removed_samples} samples ({removed_pct:.1f}%)")
print(f"📊 Remaining samples: {after_filter}")

# Report missing subtypes (already excluded by the filter above)
print(f"\n🔍 Missing values in subtype column: {missing_subtypes}")

if missing_subtypes > 0:
    print(f"📊 {missing_subtypes} of the removed samples had a missing subtype")
else:
    print("✅ No missing subtypes found")

# Independent copy so later edits to the clean table do not touch phenotype_data.
phenotype_data_clean = phenotype_data.copy()

print(f"📊 Clean phenotype data shape: {phenotype_data_clean.shape}")

# Encode subtypes as numeric labels
print("\n🔄 Encoding subtypes as numeric labels...")
subtypes = phenotype_data_clean[subtype_column]
label_encoder = LabelEncoder()
subtype_encoded = label_encoder.fit_transform(subtypes)

# Create and display encoding mapping (subtype name -> integer code)
subtype_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(f"📋 Subtype encoding mapping:")
for subtype, encoded in subtype_mapping.items():
    print(f"   {encoded}: {subtype}")

# Convert to pandas Series, indexed by sample, for easy handling
subtype_encoded = pd.Series(subtype_encoded, index=subtypes.index, name='subtype_encoded')

print(f"\n📊 Encoded subtype distribution:")
encoded_counts = subtype_encoded.value_counts().sort_index()
for label, count in encoded_counts.items():
    subtype_name = label_encoder.classes_[label]
    print(f"   Class {label}: {count} samples ({subtype_name})")

print("\n✅ Phenotype data processing completed!")

Processing Phenotype Data...
🎯 Target column: 'primary_diagnosis.diagnoses'
📊 Original subtype distribution:
   ✅ Leiomyosarcoma, NOS: 96
   ✅ Dedifferentiated liposarcoma: 53
   ✅ Undifferentiated sarcoma: 34
   ✅ Fibromyxosarcoma: 22

🔄 Filtering to selected subtypes...
📊 Removed 0 samples (0.0%)
📊 Remaining samples: 205

🔍 Missing values in subtype column: 0
✅ No missing subtypes found
📊 Clean phenotype data shape: (205, 78)

🔄 Encoding subtypes as numeric labels...
📋 Subtype encoding mapping:
   0: Dedifferentiated liposarcoma
   1: Fibromyxosarcoma
   2: Leiomyosarcoma, NOS
   3: Undifferentiated sarcoma

📊 Encoded subtype distribution:
   Class 0: 53 samples (Dedifferentiated liposarcoma)
   Class 1: 22 samples (Fibromyxosarcoma)
   Class 2: 96 samples (Leiomyosarcoma, NOS)
   Class 3: 34 samples (Undifferentiated sarcoma)

✅ Phenotype data processing completed!


In [13]:
print("Final Sample Alignment...")
print("=" * 50)

# Update common samples with available subtypes.
# Sorted so the final sample order is deterministic: iterating a set of strings
# depends on per-process hash randomisation, which would otherwise reorder the
# columns of every aligned matrix between kernel sessions.
valid_samples = sorted(set(common_samples).intersection(phenotype_data_clean.index))
print(f"🔄 Updating common samples: {len(common_samples)} → {len(valid_samples)}")
removed_samples = len(common_samples) - len(valid_samples)
print(f"📊 Removed {removed_samples} samples (missing subtypes or not in selected subtypes)")

# Align all datasets to valid samples (same samples, same order everywhere)
print("\n🔄 Aligning all datasets to valid samples...")
expression_data_scaled = expression_data_scaled[valid_samples]
methylation_scaled = methylation_scaled[valid_samples]
copy_number_scaled = copy_number_scaled[valid_samples]
subtype_encoded = subtype_encoded.loc[valid_samples]
phenotype_data_clean = phenotype_data_clean.loc[valid_samples]
common_samples = valid_samples

# Final shape verification
print(f"\n📏 Final aligned data shapes:")
print(f"   Expression: {expression_data_scaled.shape} (genes x samples)")
print(f"   Methylation: {methylation_scaled.shape} (probes x samples)")
print(f"   Copy Number: {copy_number_scaled.shape} (regions x samples)")
print(f"   Phenotype: {phenotype_data_clean.shape} (samples x features)")
print(f"   Subtypes: {len(subtype_encoded)} (samples)")
print(f"   Common samples: {len(common_samples)}")

# Verify sample consistency: every table must hold the same samples ...
sample_sets = [
    set(expression_data_scaled.columns),
    set(methylation_scaled.columns),
    set(copy_number_scaled.columns),
    set(subtype_encoded.index),
    set(phenotype_data_clean.index)
]
# ... and hold them in the same order, because downstream code pairs features
# with labels by position; a set comparison alone cannot detect a misordering.
sample_orders = [
    list(expression_data_scaled.columns),
    list(methylation_scaled.columns),
    list(copy_number_scaled.columns),
    list(subtype_encoded.index),
    list(phenotype_data_clean.index)
]

all_consistent = (
    all(s == sample_sets[0] for s in sample_sets)
    and all(order == valid_samples for order in sample_orders)
)
if all_consistent:
    print("\n✅ All datasets have consistent sample alignment!")
else:
    print("\n❌ Warning: Sample alignment inconsistency detected!")

print("\n✅ Final sample alignment completed!")

Final Sample Alignment...
🔄 Updating common samples: 205 → 205
📊 Removed 0 samples (missing subtypes or not in selected subtypes)

🔄 Aligning all datasets to valid samples...

📏 Final aligned data shapes:
   Expression: (60660, 205) (genes x samples)
   Methylation: (396650, 205) (probes x samples)
   Copy Number: (56756, 205) (regions x samples)
   Phenotype: (205, 78) (samples x features)
   Subtypes: 205 (samples)
   Common samples: 205

✅ All datasets have consistent sample alignment!

✅ Final sample alignment completed!


## 11. Data Export
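Save processed datasets for downstream analysis. The export code in the cell below is currently commented out; uncomment it to write the CSV files.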

In [None]:
# print("Exporting Processed Data...")
# print("=" * 50)

# Define output directory
# output_dir = "../Updated_model_nd_dataset/"
# # Save main processed files (features as rows, samples as columns)
# expression_data_scaled.to_csv(f"{output_dir}processed_expression_FXS_OG.csv", mode='w')
# methylation_scaled.to_csv(f"{output_dir}processed_methylation_FXS_OG.csv", mode='w')
# copy_number_scaled.to_csv(f"{output_dir}processed_cnv_FXS_OG.csv", mode='w')
# phenotype_data_clean.to_csv(f"{output_dir}processed_phenotype_FXS_OG.csv", mode='w')
# subtype_encoded.to_csv(f"{output_dir}processed_labels_3Omics_FXS_OG.csv", header=True, mode='w')