# Import Tools

In [16]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
import torch.nn.functional as F
# torch.manual_seed(42)
# np.random.seed(42)
from torch_geometric.nn import GATConv
from torch_geometric.data import Data
from torch_geometric.nn import knn_graph
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import VarianceThreshold
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [17]:
# Load the previously processed tables from CSV files in the working directory.
# (This cell only reads data; nothing is written back to disk.)
expression_data_scaled = pd.read_csv("processed_expression.csv")
methylation_scaled = pd.read_csv("processed_methylation.csv")
copy_number_scaled = pd.read_csv("processed_cnv.csv")
protein_scaled = pd.read_csv("processed_protein.csv")
phenotype_data_cleaned = pd.read_csv("processed_phenotype.csv")
subtype_mapping = pd.read_csv("subtype_mapping_with_protein.csv")
subtype_labels = pd.read_csv("subtype_labels.csv")

# Report the (rows, columns) shape of each table as loaded, i.e. before the
# first column is promoted to the index.
print("Expression:", expression_data_scaled.shape)
print("Methylation:", methylation_scaled.shape)
print("Copy Number:", copy_number_scaled.shape)
print("Protein:", protein_scaled.shape)
print("Phenotype:", phenotype_data_cleaned.shape)
# Previously labelled "Phenotype:" as well, which made the two lines
# indistinguishable in the output.
print("Subtype mapping:", subtype_mapping.shape)

Expression: (60660, 247)
Methylation: (212382, 247)
Copy Number: (56756, 247)
Protein: (457, 211)
Phenotype: (210, 79)
Phenotype: (13, 2)


In [18]:
# Preview the top-left 5x5 corner (first 5 rows/columns) of each omics and
# clinical table, followed by the first rows of the subtype mapping.
preview_frames = (
    expression_data_scaled,
    methylation_scaled,
    copy_number_scaled,
    protein_scaled,
    phenotype_data_cleaned,
)
for preview_frame in preview_frames:
    print(preview_frame.iloc[:5, :5])
print(subtype_mapping.head())


           Ensembl_ID  TCGA-DX-AB2O-01A  TCGA-IS-A3K7-01A  TCGA-DX-A7EQ-01A  \
0  ENSG00000000003.15         -1.152830         -0.219623         -0.884881   
1   ENSG00000000005.6          0.330547         -1.014041         -0.763520   
2  ENSG00000000419.13         -0.213944         -0.593653         -4.090048   
3  ENSG00000000457.14          0.822442         -1.529460         -2.880470   
4  ENSG00000000460.17          1.047877         -1.004992         -4.226427   

   TCGA-FX-A48G-01A  
0         -0.526559  
1          0.378398  
2         -3.689964  
3         -2.294075  
4         -3.340772  
  Composite Element REF  TCGA-DX-AB2O-01A  TCGA-IS-A3K7-01A  TCGA-DX-A7EQ-01A  \
0            cg00000165          1.499566         -0.235202         -1.133312   
1            cg00000292          0.303039          0.485554         -1.261731   
2            cg00000321         -1.374888         -0.928886         -0.113112   
3            cg00000363         -0.540672          0.649121         -

In [19]:
# Fix data indexing first - set the first column as index for each omics data.
# NOTE: these statements rebind the loaded frames, so this cell must be run
# exactly once after the loading cell. Running it a second time would promote
# the first remaining (sample) column to the index.
expression_data_scaled = expression_data_scaled.set_index(expression_data_scaled.columns[0])
methylation_scaled = methylation_scaled.set_index(methylation_scaled.columns[0])
copy_number_scaled = copy_number_scaled.set_index(copy_number_scaled.columns[0])
protein_scaled = protein_scaled.set_index(protein_scaled.columns[0])
phenotype_data_cleaned = phenotype_data_cleaned.set_index(phenotype_data_cleaned.columns[0])

print("Data shapes after setting index:")
print(f"  Expression: {expression_data_scaled.shape}")
print(f"  Methylation: {methylation_scaled.shape}")
print(f"  Copy number: {copy_number_scaled.shape}")
print(f"  Protein: {protein_scaled.shape}")
print(f"  Phenotype: {phenotype_data_cleaned.shape}")

# Sample matching
subtype_column = 'primary_diagnosis.diagnoses'

# Get sample sets from each omics type.
# Omics frames carry sample IDs as column labels; the phenotype frame carries
# them as its index.
samples_expression = set(expression_data_scaled.columns)
samples_methylation = set(methylation_scaled.columns)
samples_cnv = set(copy_number_scaled.columns)
samples_protein = set(protein_scaled.columns)
samples_clinical = set(phenotype_data_cleaned.index)

print("Sample overlap:")
print(f"Expression samples: {len(samples_expression)}")
print(f"Methylation samples: {len(samples_methylation)}")
print(f"CNV samples: {len(samples_cnv)}")
print(f"Protein samples: {len(samples_protein)}")
print(f"Clinical samples: {len(samples_clinical)}")

# Find common samples across all omics.
# The intersection is a set, and the iteration order of a set of strings can
# change from one interpreter session to the next (string hash randomisation).
# Sorting gives a reproducible sample order, which every filtered matrix and
# the label Series below inherit.
common_samples = sorted(
    samples_expression.intersection(samples_methylation, samples_cnv, samples_protein, samples_clinical)
)
print(f"Common samples across all omics: {len(common_samples)}")

# Extract subtypes for the common samples (rows come back in common_samples order)
subtypes = phenotype_data_cleaned.loc[common_samples, subtype_column]

# Remove any samples with missing subtypes
subtypes_clean = subtypes.dropna()
final_samples = list(subtypes_clean.index)

print(f"Samples after removing missing subtypes: {len(final_samples)}")

# Encode subtypes as numeric labels
label_encoder = LabelEncoder()
subtype_encoded = label_encoder.fit_transform(subtypes_clean)

# Create mapping to encode subtype classes (class name -> integer code).
# This rebinds `subtype_mapping`, replacing the DataFrame loaded from CSV.
subtype_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(f"\nSubtype encoding mapping:")
for subtype, encoded in subtype_mapping.items():
    print(f"  {encoded}: {subtype}")

# Convert to pandas Series indexed by sample ID so labels stay aligned with samples
subtype_encoded = pd.Series(subtype_encoded, index=subtypes_clean.index, name='subtype_encoded')

print(f"\nEncoded subtype distribution:")
print(subtype_encoded.value_counts().sort_index())

# Filter all omics data to final common samples (column order == final_samples order)
expression_data_scaled = expression_data_scaled[final_samples]
methylation_scaled = methylation_scaled[final_samples]
copy_number_scaled = copy_number_scaled[final_samples]
protein_scaled = protein_scaled[final_samples]

# From here on `common_samples` means the samples that also have a subtype label.
common_samples = final_samples
print()
print(f"Final data shapes after phenotype preprocessing:")
print(f"  Expression: {expression_data_scaled.shape}")
print(f"  Methylation: {methylation_scaled.shape}")
print(f"  Copy number: {copy_number_scaled.shape}")
print(f"  Protein: {protein_scaled.shape}")
# The phenotype frame itself is not filtered in this cell; its shape is shown as-is.
print(f"  Phenotype: {phenotype_data_cleaned.shape}")
print(f"  Subtypes: {len(subtype_encoded)}")
print(f"  Common samples: {len(common_samples)}")

Data shapes after setting index:
  Expression: (60660, 246)
  Methylation: (212382, 246)
  Copy number: (56756, 246)
  Protein: (457, 210)
  Phenotype: (210, 78)
Sample overlap:
Expression samples: 246
Methylation samples: 246
CNV samples: 246
Protein samples: 210
Clinical samples: 210
Common samples across all omics: 210
Samples after removing missing subtypes: 210

Subtype encoding mapping:
  0: Dedifferentiated liposarcoma
  1: Fibromyxosarcoma
  2: Giant cell sarcoma
  3: Leiomyosarcoma, NOS
  4: Liposarcoma, well differentiated
  5: Malignant fibrous histiocytoma
  6: Malignant peripheral nerve sheath tumor
  7: Myxoid leiomyosarcoma
  8: Pleomorphic liposarcoma
  9: Synovial sarcoma, NOS
  10: Synovial sarcoma, biphasic
  11: Synovial sarcoma, spindle cell
  12: Undifferentiated sarcoma

Encoded subtype distribution:
subtype_encoded
0     49
1     20
2      3
3     76
4      1
5     11
6      9
7      1
8      2
9      1
10     1
11     4
12    32
Name: count, dtype: int64

Final

# Reduce Dimension

### Autoencoder

In [None]:
class Autoencoder(nn.Module):
    """Fully connected autoencoder with one hidden layer on each side.

    The encoder maps ``input_dim -> hidden_dim -> latent_dim`` with a ReLU
    after each linear layer; the decoder maps
    ``latent_dim -> hidden_dim -> input_dim`` with a ReLU after the first
    linear layer only, so the reconstruction is unbounded.
    """

    def __init__(self, input_dim, hidden_dim=256, latent_dim=64):
        super().__init__()

        # Layers are instantiated encoder-first so parameter initialisation
        # consumes the random number generator in a fixed order.
        encoder_layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_dim, latent_dim),
            nn.ReLU(inplace=True),
        ]
        decoder_layers = [
            nn.Linear(latent_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_dim, input_dim),
        ]

        self.encoder = nn.Sequential(*encoder_layers)
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Return ``(reconstruction, latent)`` for the input batch ``x``."""
        latent = self.encoder(x)
        reconstruction = self.decoder(latent)
        return reconstruction, latent

    def encode(self, x):
        """Map inputs to the bottleneck (latent) representation."""
        latent = self.encoder(x)
        return latent

    def decode(self, z):
        """Map a bottleneck representation back to input space."""
        reconstruction = self.decoder(z)
        return reconstruction

def extract_representations(model, tensor_data):
    """Extract bottleneck and decoder representations from an autoencoder.

    Calls ``model.encode`` on ``tensor_data`` and ``model.decode`` on the
    result, with gradient tracking disabled and the model switched to
    evaluation mode for the duration of the call. The model's previous
    train/eval mode is restored before returning, so calling this helper in
    the middle of training does not silently leave the model in eval mode.

    Args:
        model: A module exposing ``encode`` and ``decode`` methods.
        tensor_data: Input tensor passed to ``model.encode``.

    Returns:
        Tuple ``(bottleneck, decoder_repr)`` of NumPy arrays: the output of
        ``encode`` and the output of ``decode`` applied to it.
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # Get bottleneck (latent) representation
            bottleneck = model.encode(tensor_data)
            # Get decoder representation (reconstructed features)
            decoder_repr = model.decode(bottleneck)
    finally:
        # Put the model back in whichever mode the caller had it in.
        model.train(was_training)
    return bottleneck.cpu().numpy(), decoder_repr.cpu().numpy()

# Initialize autoencoders for each omics type (without training)
omics_scaled = {
    'expr': expression_data_scaled,
    'meth': methylation_scaled,
    'cnv': copy_number_scaled,
    'prot': protein_scaled
}

# Store autoencoder models (to be trained later) and the representations
# extracted from them, keyed by modality name.
autoencoder_models = {}
bottleneck_embeds = {}
decoder_embeds = {}

for name, df in omics_scaled.items():
    print(f">> Initializing autoencoder for {name} modality")

    # Each frame is laid out as (features, samples); transpose so that every
    # row of X is one sample and every column is one feature.
    X = torch.tensor(df.values.T, dtype=torch.float32, device=device)
    input_dim = X.shape[1]
    sample_ids = df.columns

    print(f"Data shape: {df.shape} -> After transpose: {X.shape}")
    print(f"Input dimension (features): {input_dim}")

    # Initialize autoencoder
    model = Autoencoder(
        input_dim=input_dim,
        hidden_dim=128,
        latent_dim=64
    ).to(device)
    autoencoder_models[name] = model

    # NOTE: no training has happened at this point, so these representations
    # come from freshly initialised weights. Re-extract them after training.
    bottleneck, decoder_repr = extract_representations(model, X)

    # Printed once per modality (was emitted twice, with a misspelled label).
    print(f"Index column shape: {df.columns.shape}")
    bottleneck_embeds[name] = pd.DataFrame(
        bottleneck,
        index=sample_ids,  # Sample IDs
        columns=[f"{name}_bottleneck_{i}" for i in range(bottleneck.shape[1])]
    )

    decoder_embeds[name] = pd.DataFrame(
        decoder_repr,
        index=sample_ids,  # Sample IDs
        columns=[f"{name}_decoder_{i}" for i in range(decoder_repr.shape[1])]
    )

    print(f"   Bottleneck shape: {bottleneck_embeds[name].shape}")
    print(f"   Decoder shape: {decoder_embeds[name].shape}")
    print()

Clearing existing models...
INITIALIZING AUTOENCODERS

>> Processing expr modality
   Data shape: (60660, 210) -> After transpose: torch.Size([210, 60660])
   Input dimension (features): 60660
   ✓ Bottleneck shape: (210, 64)
   ✓ Decoder shape: (210, 60660)

>> Processing meth modality
   Data shape: (212382, 210) -> After transpose: torch.Size([210, 212382])
   Input dimension (features): 212382
   Data shape: (212382, 210) -> After transpose: torch.Size([210, 212382])
   Input dimension (features): 212382
   ✓ Bottleneck shape: (210, 64)
   ✓ Decoder shape: (210, 212382)

>> Processing cnv modality
   Data shape: (56756, 210) -> After transpose: torch.Size([210, 56756])
   Input dimension (features): 56756
   ✓ Bottleneck shape: (210, 64)
   ✓ Decoder shape: (210, 56756)

>> Processing prot modality
   Data shape: (457, 210) -> After transpose: torch.Size([210, 457])
   Input dimension (features): 457
   ✓ Bottleneck shape: (210, 64)
   ✓ Decoder shape: (210, 457)

AUTOENCODER INITI

In [21]:
# # Create fusion models
# print("\n" + "="*50)
# print("Creating Fusion Models")
# print("="*50)

# # Fusion Model 1: Using bottleneck representations
# bottleneck_fusion = pd.concat(
#     [bottleneck_embeds['expr'], bottleneck_embeds['meth'], 
#      bottleneck_embeds['cnv'], bottleneck_embeds['prot']],
#     axis=1
# )

# # Fusion Model 2: Using decoder representations
# decoder_fusion = pd.concat(
#     [decoder_embeds['expr'], decoder_embeds['meth'], 
#      decoder_embeds['cnv'], decoder_embeds['prot']],
#     axis=1
# )

# print(f"Bottleneck fusion shape: {bottleneck_fusion.shape}")
# print(f"Decoder fusion shape: {decoder_fusion.shape}")

# print("\nFusion models created successfully!")
# print("Note: These use untrained autoencoders. Train the models first, then re-extract representations.")

# # Display sample data
# print(f"\nBottleneck fusion sample:")
# print(bottleneck_fusion.head())

# print(f"\nDecoder fusion sample:")
# print(decoder_fusion.head())