In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import VarianceThreshold
import warnings
warnings.filterwarnings('ignore')

## 2. Data Loading
Load raw TCGA-SARC multi-omics datasets

In [2]:
print("Loading TCGA-SARC multi-omics datasets...")
print("=" * 50)

# Load multi-omics data
expression_data = pd.read_csv('../TCGA-SARC.star_tpm.tsv', sep='\t', index_col=0)  # Gene expression (TPM)
methylation_data = pd.read_csv('../TCGA-SARC.methylation450.tsv', sep='\t', index_col=0)  # DNA methylation
copy_number_data = pd.read_csv('../TCGA-SARC.gene-level_absolute.tsv', sep='\t', index_col=0) # Copy number variations (absolute)

# Load clinical data with error handling
try:
    phenotype_data = pd.read_csv('../TCGA-SARC.clinical.tsv', sep='\t', index_col=0)
except Exception as e:
    print(f"Warning: Initial load failed ({e}), attempting with error handling...")
    phenotype_data = pd.read_csv('../TCGA-SARC.clinical.tsv', sep='\t', index_col=0, on_bad_lines='skip')

print("Raw data shapes:")
print(f"📊 Expression data: {expression_data.shape} (genes x samples)")
print(f"🧬 Methylation data: {methylation_data.shape} (CpG sites x samples)")
print(f"📈 Copy number data: {copy_number_data.shape} (genes x samples)")
print(f"🏥 Clinical data: {phenotype_data.shape} (samples x features)")

print("\n✅ Data loading completed!")

Loading TCGA-SARC multi-omics datasets...
Raw data shapes:
📊 Expression data: (60660, 265) (genes x samples)
🧬 Methylation data: (486427, 269) (CpG sites x samples)
📈 Copy number data: (60623, 248) (genes x samples)
🏥 Clinical data: (272, 78) (samples x features)

✅ Data loading completed!


In [3]:
print("RAW DATASETS - First 5 rows and basic info")
print("="*80)


print("✅ Raw dataset preview completed.")
print(phenotype_data.dtypes[:10])
print("Data types (first 10 clinical features):")
print(phenotype_data.iloc[:5, :5])
print("Top 5 rows:")
print("Columns (first 5 clinical features):", list(phenotype_data.columns[:5]))
print("Index (first 5 sample IDs):", list(phenotype_data.index[:5]))
print(f"Shape: {phenotype_data.shape} (rows=samples, cols=features)")
print(copy_number_data.dtypes[:5])
print("Data types (first 5 columns):")
print(copy_number_data.iloc[:5, :5])
print("Top 5 rows:")
print("Columns (first 5 sample IDs):", list(copy_number_data.columns[:5]))
print("Index (first 5 feature IDs):", list(copy_number_data.index[:5]))
print(f"Shape: {copy_number_data.shape} (rows=features, cols=samples)")
print(methylation_data.dtypes[:5])
print("Data types (first 5 columns):")
print(methylation_data.iloc[:5, :5])
print("Top 5 rows:")
print("Columns (first 5 sample IDs):", list(methylation_data.columns[:5]))
print("Index (first 5 probe IDs):", list(methylation_data.index[:5]))
print(f"Shape: {methylation_data.shape} (rows=probes, cols=samples)")
print(expression_data.dtypes[:5])
print("Data types (first 5 columns):")
print(expression_data.iloc[:5, :5])
print("Top 5 rows:")
print("Columns (first 5 sample IDs):", list(expression_data.columns[:5]))
print("Index (first 5 gene IDs):", list(expression_data.index[:5]))
print(f"Shape: {expression_data.shape}")

RAW DATASETS - First 5 rows and basic info
✅ Raw dataset preview completed.
id                           object
disease_type                 object
case_id                      object
submitter_id                 object
primary_site                 object
alcohol_history.exposures    object
race.demographic             object
gender.demographic           object
ethnicity.demographic        object
vital_status.demographic     object
dtype: object
Data types (first 10 clinical features):
                                                    id         disease_type  \
sample                                                                        
TCGA-PC-A5DP-01A  f0cbc2a4-c1f7-4b32-bce1-57eb59d351cc  Myomatous Neoplasms   
TCGA-QQ-A8VB-01A  4c323272-2e94-4910-a894-7836570852cd  Nerve Sheath Tumors   
TCGA-IS-A3K8-01A  dcd4d76e-4e41-4a84-b87c-6c94586e73f4  Myomatous Neoplasms   
TCGA-KF-A41W-01A  ca0f032b-a34c-4a92-a403-bc7dc6eca8e9  Myomatous Neoplasms   
TCGA-KF-A41W-11A  ca0f032b-a34c-4a9

## 3. Sample Matching & Quality Control
Identify common samples across all omics platforms

In [4]:
print("Sample matching and quality assessment...")
print("=" * 50)

# Check sample overlap between different omics data
samples_expression = set(expression_data.columns)
samples_methylation = set(methylation_data.columns)
samples_cnv = set(copy_number_data.columns)
samples_clinical = set(phenotype_data.index)

print("Sample counts per modality:")
print(f"🧬 Expression samples: {len(samples_expression)}")
print(f"🔬 Methylation samples: {len(samples_methylation)}")
print(f"📊 CNV samples: {len(samples_cnv)}")
print(f"🏥 Clinical samples: {len(samples_clinical)}")

# Find common samples across all omics
common_samples = list(samples_expression.intersection(samples_methylation, samples_cnv, samples_clinical))
print(f"\n🎯 Common samples across all omics: {len(common_samples)}")

# Filter data to keep only common samples
expression_data = expression_data[common_samples]
methylation_data = methylation_data[common_samples]
copy_number_data = copy_number_data[common_samples]
phenotype_data = phenotype_data.loc[common_samples]

print(f"\n📏 Filtered data shapes:")
print(f"Expression: {expression_data.shape}")
print(f"Methylation: {methylation_data.shape}")
print(f"Copy Number: {copy_number_data.shape}")
print(f"Clinical: {phenotype_data.shape}")

print("\n✅ Sample matching completed!")

Sample matching and quality assessment...
Sample counts per modality:
🧬 Expression samples: 265
🔬 Methylation samples: 269
📊 CNV samples: 248
🏥 Clinical samples: 272

🎯 Common samples across all omics: 246

📏 Filtered data shapes:
Expression: (60660, 246)
Methylation: (486427, 246)
Copy Number: (60623, 246)
Clinical: (246, 78)

✅ Sample matching completed!


## 4. Missing Value Assessment
Comprehensive analysis of missing values across all omics

In [5]:
print("Missing value assessment...")
print("=" * 50)

# Check for null values in each omics modality
def assess_missing_values(data, name):
    total_values = data.size
    missing_count = data.isnull().sum().sum()
    missing_percentage = (missing_count / total_values) * 100
    
    print(f"\n📊 {name}:")
    print(f"   Total values: {total_values:,}")
    print(f"   Missing values: {missing_count:,}")
    print(f"   Missing percentage: {missing_percentage:.2f}%")
    
    if missing_count > 0:
        print(f"   ⚠️  Contains missing values - preprocessing required")
    else:
        print(f"   ✅ No missing values found")
    
    return missing_count, missing_percentage

# Assess each omics modality
expr_missing, expr_pct = assess_missing_values(expression_data, "Expression Data")
meth_missing, meth_pct = assess_missing_values(methylation_data, "Methylation Data")
cnv_missing, cnv_pct = assess_missing_values(copy_number_data, "Copy Number Data")
pheno_missing, pheno_pct = assess_missing_values(phenotype_data, "Phenotype Data")

# Summary
print(f"\n📋 MISSING VALUE SUMMARY:")
print(f"Expression: {expr_missing:,} ({expr_pct:.2f}%)")
max_value = expression_data.max().max()
min_value = expression_data.min().min()
print(f"Maximum Expression value: {max_value}")
print(f"Minimum Expression value: {min_value}")
print(f"Methylation: {meth_missing:,} ({meth_pct:.2f}%)")
max_value = methylation_data.max().max()
min_value = methylation_data.min().min()
print(f"Maximum Methylation value: {max_value}")
print(f"Minimum Methylation value: {min_value}")
print(f"Copy Number: {cnv_missing:,} ({cnv_pct:.2f}%)")
max_value = copy_number_data.max().max()
min_value = copy_number_data.min().min()

print(f"Maximum CNV value: {max_value}")
print(f"Minimum CNV value: {min_value}")
print()
print(f"Phenotype: {pheno_missing:,} ({pheno_pct:.2f}%)")

print("\n✅ Missing value assessment completed!")

Missing value assessment...

📊 Expression Data:
   Total values: 14,922,360
   Missing values: 0
   Missing percentage: 0.00%
   ✅ No missing values found

📊 Methylation Data:
   Total values: 119,661,042
   Missing values: 19,622,910
   Missing percentage: 16.40%
   ⚠️  Contains missing values - preprocessing required

📊 Copy Number Data:
   Total values: 14,913,258
   Missing values: 994,360
   Missing percentage: 6.67%
   ⚠️  Contains missing values - preprocessing required

📊 Phenotype Data:
   Total values: 19,188
   Missing values: 3,960
   Missing percentage: 20.64%
   ⚠️  Contains missing values - preprocessing required

📋 MISSING VALUE SUMMARY:
Expression: 0 (0.00%)
Maximum Expression value: 17.262945359182932
Minimum Expression value: 0.0
Methylation: 19,622,910 (16.40%)
Maximum Methylation value: 0.996083421132424
Minimum Methylation value: 0.0038287026893228
Copy Number: 994,360 (6.67%)
Maximum CNV value: 7.0
Minimum CNV value: 0.0

Phenotype: 3,960 (20.64%)

✅ Missing valu

In [14]:
from sklearn.feature_selection import VarianceThreshold
import numpy as np
import pandas as pd

# --- Expression preprocessing ---
expr_log = np.log2(expression_data + 1)
expr_selector = VarianceThreshold(threshold=0.01)
expression_data_scaled = pd.DataFrame(
    expr_selector.fit_transform(expr_log.T).T,
    index=expr_log.index[expr_selector.get_support()],
    columns=expr_log.columns
)
expr_mask = expr_selector.get_support()

# --- Methylation preprocessing ---
methylation_filtered = methylation_data.dropna(thresh=0.20 * methylation_data.shape[1], axis=0)
methylation_imputed = methylation_filtered.apply(lambda r: r.fillna(r.median()), axis=1)
meth_m = np.log2(methylation_imputed / (1 - methylation_imputed))
meth_selector = VarianceThreshold(threshold=0.01)
methylation_scaled = pd.DataFrame(
    meth_selector.fit_transform(meth_m.T).T,
    index=meth_m.index[meth_selector.get_support()],
    columns=meth_m.columns
)
meth_mask = meth_selector.get_support()

# --- CNV preprocessing ---
cnv_filtered = copy_number_data.loc[copy_number_data.isnull().mean(axis=1) < 0.2]
cnv_imputed = cnv_filtered.apply(lambda r: r.fillna(r.median()), axis=1)
cnv_clipped = cnv_imputed.clip(lower=0.05, upper=6)
log_cnv = np.log2(cnv_clipped / 2)
cnv_selector = VarianceThreshold(threshold=0.01)
copy_number_scaled = pd.DataFrame(
    cnv_selector.fit_transform(log_cnv.T).T,
    index=log_cnv.index[cnv_selector.get_support()],
    columns=log_cnv.columns
)
cnv_mask = cnv_selector.get_support()
print("Feature counts after filtering:")
print("Expression:", expression_data_scaled.shape)
print("Methylation:", methylation_scaled.shape)
print("CNV:", copy_number_scaled.shape)

Feature counts after filtering:
Expression: (44663, 246)
Methylation: (419542, 246)
CNV: (56756, 246)


In [15]:
print("Processing Phenotype Data...")
print("=" * 50)

# Define subtype column and selected subtypes
subtype_column = 'primary_diagnosis.diagnoses'
selected_subtypes = [
    'Leiomyosarcoma, NOS',
    'Dedifferentiated liposarcoma',
    'Undifferentiated sarcoma',
    'Fibromyxosarcoma'
]

print(f"🎯 Target column: '{subtype_column}'")
print(f"📊 Original subtype distribution:")
subtype_counts = phenotype_data[subtype_column].value_counts()
for subtype, count in subtype_counts.items():
    marker = "✅" if subtype in selected_subtypes else "❌"
    print(f"   {marker} {subtype}: {count}")

# Filter to selected subtypes only
print(f"\n🔄 Filtering to selected subtypes...")
before_filter = len(phenotype_data)
phenotype_data = phenotype_data[phenotype_data[subtype_column].isin(selected_subtypes)]
after_filter = len(phenotype_data)
removed_samples = before_filter - after_filter
print(f"📊 Removed {removed_samples} samples ({removed_samples/before_filter*100:.1f}%)")
print(f"📊 Remaining samples: {after_filter}")

# Check for missing subtypes
missing_subtypes = phenotype_data[subtype_column].isnull().sum()
print(f"\n🔍 Missing values in subtype column: {missing_subtypes}")

if missing_subtypes > 0:
    print("🔄 Removing samples with missing subtypes...")
    phenotype_data_clean = phenotype_data.dropna(subset=[subtype_column])
    print(f"📊 Removed {missing_subtypes} samples with missing subtypes")
else:
    phenotype_data_clean = phenotype_data.copy()
    print("✅ No missing subtypes found")

print(f"📊 Clean phenotype data shape: {phenotype_data_clean.shape}")

# Encode subtypes as numeric labels
print("\n🔄 Encoding subtypes as numeric labels...")
subtypes = phenotype_data_clean[subtype_column]
label_encoder = LabelEncoder()
subtype_encoded = label_encoder.fit_transform(subtypes)

# Create and display encoding mapping
subtype_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(f"📋 Subtype encoding mapping:")
for subtype, encoded in subtype_mapping.items():
    print(f"   {encoded}: {subtype}")

# Convert to pandas Series for easy handling
subtype_encoded = pd.Series(subtype_encoded, index=subtypes.index, name='subtype_encoded')

print(f"\n📊 Encoded subtype distribution:")
encoded_counts = subtype_encoded.value_counts().sort_index()
for label, count in encoded_counts.items():
    subtype_name = label_encoder.classes_[label]
    print(f"   Class {label}: {count} samples ({subtype_name})")

print("\n✅ Phenotype data processing completed!")

Processing Phenotype Data...
🎯 Target column: 'primary_diagnosis.diagnoses'
📊 Original subtype distribution:
   ✅ Leiomyosarcoma, NOS: 96
   ✅ Dedifferentiated liposarcoma: 53
   ✅ Undifferentiated sarcoma: 34
   ✅ Fibromyxosarcoma: 22

🔄 Filtering to selected subtypes...
📊 Removed 0 samples (0.0%)
📊 Remaining samples: 205

🔍 Missing values in subtype column: 0
✅ No missing subtypes found
📊 Clean phenotype data shape: (205, 78)

🔄 Encoding subtypes as numeric labels...
📋 Subtype encoding mapping:
   0: Dedifferentiated liposarcoma
   1: Fibromyxosarcoma
   2: Leiomyosarcoma, NOS
   3: Undifferentiated sarcoma

📊 Encoded subtype distribution:
   Class 0: 53 samples (Dedifferentiated liposarcoma)
   Class 1: 22 samples (Fibromyxosarcoma)
   Class 2: 96 samples (Leiomyosarcoma, NOS)
   Class 3: 34 samples (Undifferentiated sarcoma)

✅ Phenotype data processing completed!


In [16]:
print("Final Sample Alignment...")
print("=" * 50)

# Update common samples with available subtypes
valid_samples = list(set(common_samples).intersection(set(phenotype_data_clean.index)))
print(f"🔄 Updating common samples: {len(common_samples)} → {len(valid_samples)}")
removed_samples = len(common_samples) - len(valid_samples)
print(f"📊 Removed {removed_samples} samples (missing subtypes or not in selected subtypes)")

# Align all datasets to valid samples
print("\n🔄 Aligning all datasets to valid samples...")
expression_data_scaled = expression_data_scaled[valid_samples]
methylation_scaled = methylation_scaled[valid_samples]
copy_number_scaled = copy_number_scaled[valid_samples]
subtype_encoded = subtype_encoded.loc[valid_samples]
phenotype_data_clean = phenotype_data_clean.loc[valid_samples]
common_samples = valid_samples

# Final shape verification
print(f"\n📏 Final aligned data shapes:")
print(f"   Expression: {expression_data_scaled.shape} (genes x samples)")
print(f"   Methylation: {methylation_scaled.shape} (probes x samples)")
print(f"   Copy Number: {copy_number_scaled.shape} (regions x samples)")

# Check for infinite values in each dataset
for name, df in [
    ("expression_data_scaled", expression_data_scaled),
    ("methylation_scaled", methylation_scaled),
    ("copy_number_scaled", copy_number_scaled)
]:
    has_inf = np.isinf(df.select_dtypes(include=[np.number])).any().any()
    print(f"   Any inf values in {name}? {has_inf}")
print(f"   Phenotype: {phenotype_data_clean.shape} (samples x features)")
print(f"   Subtypes: {len(subtype_encoded)} (samples)")
print(f"   Common samples: {len(common_samples)}")

# Verify sample consistency
sample_sets = [
    set(expression_data_scaled.columns),
    set(methylation_scaled.columns),
    set(copy_number_scaled.columns),
    set(subtype_encoded.index),
    set(phenotype_data_clean.index)
]

all_consistent = all(s == sample_sets[0] for s in sample_sets)
if all_consistent:
    print("\n✅ All datasets have consistent sample alignment!")
else:
    print("\n❌ Warning: Sample alignment inconsistency detected!")

print("\n✅ Final sample alignment completed!")

Final Sample Alignment...
🔄 Updating common samples: 205 → 205
📊 Removed 0 samples (missing subtypes or not in selected subtypes)

🔄 Aligning all datasets to valid samples...

📏 Final aligned data shapes:
   Expression: (44663, 205) (genes x samples)
   Methylation: (419542, 205) (probes x samples)
   Copy Number: (56756, 205) (regions x samples)
   Any inf values in expression_data_scaled? False
   Any inf values in methylation_scaled? False
   Any inf values in copy_number_scaled? False
   Phenotype: (205, 78) (samples x features)
   Subtypes: 205 (samples)
   Common samples: 205

✅ All datasets have consistent sample alignment!

✅ Final sample alignment completed!


In [17]:
import numpy as np

for name, df in [
    ("expression_data_scaled", expression_data_scaled),
    ("methylation_scaled", methylation_scaled),
    ("copy_number_scaled", copy_number_scaled),
    ("phenotype_data_clean", phenotype_data_clean)
]:
    has_inf = np.isinf(df.select_dtypes(include=[np.number])).any().any()
    print(f"Any inf values in {name}? {has_inf}")

Any inf values in expression_data_scaled? False
Any inf values in methylation_scaled? False
Any inf values in copy_number_scaled? False
Any inf values in phenotype_data_clean? False


## 10. Dataset Preview
Display first 5 rows of all processed datasets

In [18]:
print("DATASET PREVIEW: First 5 rows of all processed datasets")
print("=" * 80)

# Expression Data Preview
print("\n📊 EXPRESSION DATA (Log2-transformed, variance filtered)")
print("-" * 60)
print(f"Shape: {expression_data_scaled.shape} (genes x samples)")
print("First 5 genes, first 5 samples:")
print(expression_data_scaled.iloc[:5, :5])
print(f"Data type: {expression_data_scaled.dtypes.iloc[0]}")
print(f"Value range: [{expression_data_scaled.min().min():.3f}, {expression_data_scaled.max().max():.3f}]")

# Methylation Data Preview  
print("\n🧬 METHYLATION DATA (M-values, variance filtered)")
print("-" * 60)
print(f"Shape: {methylation_scaled.shape} (CpG sites x samples)")
print("First 5 CpG sites, first 5 samples:")
print(methylation_scaled.iloc[:5, :5])
print(f"Data type: {methylation_scaled.dtypes.iloc[0]}")
print(f"Value range: [{methylation_scaled.min().min():.3f}, {methylation_scaled.max().max():.3f}]")

# Copy Number Data Preview
print("\n📈 COPY NUMBER DATA (Log2 ratio, variance filtered)")
print("-" * 60)
print(f"Shape: {copy_number_scaled.shape} (regions x samples)")
print("First 5 regions, first 5 samples:")
print(copy_number_scaled.iloc[:5, :5])
print(f"Data type: {copy_number_scaled.dtypes.iloc[0]}")
print(f"Value range: [{copy_number_scaled.min().min():.3f}, {copy_number_scaled.max().max():.3f}]")

# Phenotype Data Preview
print("\n🏥 PHENOTYPE DATA (Clinical features)")
print("-" * 60)
print(f"Shape: {phenotype_data_clean.shape} (samples x features)")
print("First 5 samples, first 5 clinical features:")
print(phenotype_data_clean.iloc[:5, :5])
print(f"Total clinical features: {phenotype_data_clean.shape[1]}")

# Subtype Labels Preview
print("\n🎯 SUBTYPE LABELS (Encoded)")
print("-" * 60)
print(f"Shape: {subtype_encoded.shape} (samples)")
print("First 10 samples with their encoded subtypes:")
for i in range(min(10, len(subtype_encoded))):
    sample_id = subtype_encoded.index[i]
    encoded_label = subtype_encoded.iloc[i]
    original_subtype = label_encoder.inverse_transform([encoded_label])[0]
    print(f"   {sample_id}: {encoded_label} ({original_subtype})")

print(f"\nLabel distribution:")
for label in sorted(subtype_encoded.unique()):
    count = (subtype_encoded == label).sum()
    subtype_name = label_encoder.inverse_transform([label])[0]
    print(f"   Class {label}: {count} samples ({subtype_name})")

print("\n✅ Dataset preview completed!")

DATASET PREVIEW: First 5 rows of all processed datasets

📊 EXPRESSION DATA (Log2-transformed, variance filtered)
------------------------------------------------------------
Shape: (44663, 205) (genes x samples)
First 5 genes, first 5 samples:
                    TCGA-DX-A6BF-01A  TCGA-K1-A3PO-01A  TCGA-MJ-A68J-01A  \
Ensembl_ID                                                                 
ENSG00000000003.15          2.054906          2.454944          2.399346   
ENSG00000000005.6           1.038691          0.392627          0.405410   
ENSG00000000419.13          2.902904          2.911035          2.860862   
ENSG00000000457.14          1.178986          1.929639          1.668313   
ENSG00000000460.17          1.086512          1.363362          1.362192   

                    TCGA-3B-A9HZ-01A  TCGA-DX-A1L0-01A  
Ensembl_ID                                              
ENSG00000000003.15          2.076216          2.568289  
ENSG00000000005.6           1.088846          2.0732

## 11. Data Export
Save processed datasets for downstream analysis

In [None]:
# print("Exporting Processed Data...")
# print("=" * 50)

# # Define output directory
# output_dir = "../NewDatasets/"
# # Save main processed files (features as rows, samples as columns)
# expression_data_scaled.to_csv(f"{output_dir}processed_expression_FXS_OG.csv", mode='w')
# methylation_scaled.to_csv(f"{output_dir}processed_methylation_FXS_OG.csv", mode='w')
# copy_number_scaled.to_csv(f"{output_dir}processed_cnv_FXS_OG.csv", mode='w')
# phenotype_data_clean.to_csv(f"{output_dir}processed_phenotype_FXS_OG.csv", mode='w')
# subtype_encoded.to_csv(f"{output_dir}processed_labels_3Omics_FXS_OG.csv", header=True, mode='w')