# Import Tools

In [6]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GATConv
from torch_geometric.data import Data
from torch_geometric.nn import knn_graph
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import VarianceThreshold
import torch.nn as nn
import numpy as np
import pandas as pd
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Data load

In [7]:
expression_data = pd.read_csv('TCGA-SARC.star_tpm.tsv', sep='\t', index_col=0)  # Gene expression (TPM)
methylation_data = pd.read_csv('TCGA-SARC.methylation450.tsv', sep='\t', index_col=0)  # DNA methylation
copy_number_data = pd.read_csv('TCGA-SARC.gene-level_absolute.tsv', sep='\t', index_col=0)  # Gene-level copy number (ABSOLUTE)
protein_data = pd.read_csv('TCGA-SARC.protein.tsv', sep='\t', index_col=0)  # Protein expression

try:
    phenotype_data = pd.read_csv('TCGA-SARC.clinical.tsv', sep='\t', index_col=0)
except Exception as e:
    print("Error while loading phenotype_data:", e)
    with open('TCGA-SARC.clinical.tsv', 'r') as f:
        for i, line in enumerate(f):
            try:
                pd.read_csv(pd.compat.StringIO(line), sep='\t')
            except Exception as row_e:
                print(f"Error in line {i}: {row_e}")
    phenotype_data = pd.read_csv('TCGA-SARC.clinical.tsv', sep='\t', index_col=0, on_bad_lines='skip')

print("Data shapes:")
print(f"Expression data: {expression_data.shape}")
print(f"Methylation data: {methylation_data.shape}")
print(f"Copy number data: {copy_number_data.shape}")
print(f"Protein data: {protein_data.shape}")
print(f"Phenotype data: {phenotype_data.shape}")

Data shapes:
Expression data: (60660, 265)
Methylation data: (486427, 269)
Copy number data: (60623, 248)
Protein data: (487, 226)
Phenotype data: (272, 78)


# Sample Matching

In [8]:
print("Checking for missing values in phenotype data:")
missing_values = phenotype_data.isnull().sum()
print(missing_values[missing_values > 0])

print()
print("Checking sample overlap between different omics data:")

samples_expression = set(expression_data.columns)
samples_methylation = set(methylation_data.columns)
samples_cnv = set(copy_number_data.columns)
samples_clinical = set(phenotype_data.index)
samples_protein = set(protein_data.columns)
print("Sample overlap:")
print(f"Expression samples: {len(samples_expression)}")
print(f"Methylation samples: {len(samples_methylation)}")
print(f"CNV samples: {len(samples_cnv)}")
print(f"Clinical samples: {len(samples_clinical)}")
print(f"Protein samples: {len(samples_protein)}")

# Find common samples across all omics
common_samples = list(samples_expression.intersection(samples_methylation, samples_cnv, samples_clinical, samples_protein))
print(f"Common samples across all omics: {len(common_samples)}")

# Filter data to keep only common samples
expression_data = expression_data[common_samples]
methylation_data = methylation_data[common_samples]
copy_number_data = copy_number_data[common_samples]
protein_data = protein_data[common_samples]
phenotype_data = phenotype_data.loc[common_samples]

Checking for missing values in phenotype data:
days_to_birth.demographic                                      1
year_of_birth.demographic                                      4
days_to_death.demographic                                    170
year_of_death.demographic                                    195
entity_submitter_id.annotations                              254
notes.annotations                                            254
submitter_id.annotations                                     254
classification.annotations                                   254
entity_id.annotations                                        254
created_datetime.annotations                                 254
annotation_id.annotations                                    254
entity_type.annotations                                      254
updated_datetime.annotations                                 254
case_id.annotations                                          254
state.annotations                          

# Data preprocessing

### Check Null value

In [9]:
#Check for null values of expression data
print("Checking for null values in expression data:")
null_expression = expression_data.isnull().sum().sum()
if null_expression > 0:
    print(f"Expression data contains {null_expression} null values.")
else:   
    print("No null values found in expression data.")
print()


# Check for null values in methylation data
print("Checking for null values in methylation data:")
null_methylation = methylation_data.isnull().sum().sum()
if null_methylation > 0:
    print(f"Methylation data contains {null_methylation} null values.")
else:
    print("No null values found in methylation data.")
print()


# Check for null values in copy number data
print("Checking for null values in copy number data:")
null_copy_number = copy_number_data.isnull().sum().sum()
if null_copy_number > 0:
    print(f"Copy number data contains {null_copy_number} null values.")
else:
    print("No null values found in copy number data.")
max_value = copy_number_data.max().max()
min_value = copy_number_data.min().min()

print(f"Maximum CNV value: {max_value}")
print(f"Minimum CNV value: {min_value}")
print()


# Check for null values in protein data
print("Checking for null values in protein data:")
null_protein = protein_data.isnull().sum().sum()
if null_protein > 0:
    print(f"Protein data contains {null_protein} null values.")
else:
    print("No null values found in protein data.")
print()


# Check for null values in phenotype data
print("Checking for null values in phenotype data:")
null_phenotype = phenotype_data.isnull().sum().sum()
if null_phenotype > 0:
    print(f"Phenotype data contains {null_phenotype} null values.")
else:
    print("No null values found in phenotype data.")

Checking for null values in expression data:
No null values found in expression data.

Checking for null values in methylation data:
Methylation data contains 16814049 null values.

Checking for null values in copy number data:
Copy number data contains 855388 null values.
Maximum CNV value: 7.0
Minimum CNV value: 0.0

Checking for null values in protein data:
Protein data contains 6300 null values.

Checking for null values in phenotype data:
Phenotype data contains 3389 null values.


### Process data

In [10]:
# expression data preprocessing -------------------------------------------------->

# Log2 transform
expression_data_log = np.log2(expression_data + 1)

# Z-score standardization
scaler_expr = StandardScaler()
expression_data_scaled = pd.DataFrame(
    scaler_expr.fit_transform(expression_data_log.T).T,
    index=expression_data_log.index,
    columns=expression_data_log.columns
)

from sklearn.feature_selection import VarianceThreshold


# Methylation data preprocessing -------------------------------------------------->

# Droping probes with more than 10% missing values 
methylation_data = methylation_data.dropna(thresh=0.9 * methylation_data.shape[1], axis=0)

# fill na with probe wise median
methylation_data = methylation_data.apply(lambda x: x.fillna(x.median()), axis=1)


# Remove low-variance methylation probes
selector = VarianceThreshold(threshold=0.01)
methylation_filtered = pd.DataFrame(
    selector.fit_transform(methylation_data.T).T,
    index=methylation_data.index[selector.get_support()],
    columns=methylation_data.columns
)

# Z-score standardization
scaler_meth = StandardScaler()
methylation_scaled = pd.DataFrame(
    scaler_meth.fit_transform(methylation_filtered.T).T,
    index=methylation_filtered.index,
    columns=methylation_filtered.columns
)


# Copy number data preprocessing -------------------------------------------------->


# Drop genes with >20% missing values
gene_missing_threshold = 0.2
copy_number_data_filtered = copy_number_data.loc[
    copy_number_data.isnull().mean(axis=1) < gene_missing_threshold
]

# filling null values with gene-wise median
copy_number_imputed = copy_number_data_filtered.apply(
    lambda row: row.fillna(row.median()), axis=1
)

# Standardize across samples
scaler_cnv = StandardScaler()
copy_number_scaled = pd.DataFrame(
    scaler_cnv.fit_transform(copy_number_imputed.T).T,
    index=copy_number_imputed.index,
    columns=copy_number_imputed.columns
)


# Protein data preprocessing ------------------------------------------------------>

# Drop proteins with >20% missing values
protein_missing_threshold = 0.3
protein_data_filtered = protein_data.loc[
    protein_data.isnull().mean(axis=1) < protein_missing_threshold
]
# filling null values with protein-wise median
protein_imputed = protein_data_filtered.apply(
    lambda row: row.fillna(row.median()), axis=1
)

# Standardize across samples
scaler_protein = StandardScaler()
protein_scaled = pd.DataFrame(
    scaler_protein.fit_transform(protein_imputed.T).T,
    index=protein_imputed.index,
    columns=protein_imputed.columns
)

In [11]:
# Phenotype data preprocessing -------------------------------------------------->

subtype_column = 'primary_diagnosis.diagnoses'
print(f"Using '{subtype_column}' as the subtype column")
print(f"Subtype distribution:\n{phenotype_data[subtype_column].value_counts()}")
print()

# Checking missing values in the subtype column
missing_subtypes = phenotype_data[subtype_column].isnull().sum()
print(f"Missing values in subtype column: {missing_subtypes}")

if missing_subtypes > 0:
    phenotype_data_clean = phenotype_data.dropna(subset=[subtype_column])
    print(f"Removed {missing_subtypes} samples with missing subtypes")
    print(f"Remaining samples: {len(phenotype_data_clean)}")
else:
    phenotype_data_clean = phenotype_data.copy()
    print("No missing subtypes found")

Using 'primary_diagnosis.diagnoses' as the subtype column
Subtype distribution:
primary_diagnosis.diagnoses
Leiomyosarcoma, NOS                        76
Dedifferentiated liposarcoma               49
Undifferentiated sarcoma                   32
Fibromyxosarcoma                           20
Malignant fibrous histiocytoma             11
Malignant peripheral nerve sheath tumor     9
Synovial sarcoma, spindle cell              4
Giant cell sarcoma                          3
Pleomorphic liposarcoma                     2
Synovial sarcoma, NOS                       1
Liposarcoma, well differentiated            1
Synovial sarcoma, biphasic                  1
Myxoid leiomyosarcoma                       1
Name: count, dtype: int64

Missing values in subtype column: 0
No missing subtypes found


In [12]:
# Extract subtypes for the common samples
subtypes = phenotype_data_clean[subtype_column]

# Encode subtypes as numeric labels
label_encoder = LabelEncoder()
subtype_encoded = label_encoder.fit_transform(subtypes)

# Create mapping to encode subtype classes
subtype_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
print(f"\nSubtype encoding mapping:")
for subtype, encoded in subtype_mapping.items():
    print(f"  {encoded}: {subtype}")

# Converted to pandas Series to easily handle the index
subtype_encoded = pd.Series(subtype_encoded, index=subtypes.index, name='subtype_encoded')

print(f"\nEncoded subtype distribution:")
print(subtype_encoded.value_counts().sort_index())

# Update common samples with available subtypes
if missing_subtypes > 0:
    valid_samples = list(set(common_samples).intersection(set(phenotype_data_clean.index)))
    print(f"\nUpdating common samples from {len(common_samples)} to {len(valid_samples)} (removed samples with missing subtypes)")

    expression_data_scaled = expression_data_scaled[valid_samples]
    methylation_scaled = methylation_scaled[valid_samples]
    copy_number_scaled = copy_number_scaled[valid_samples]
    protein_scaled = protein_scaled[valid_samples]
    subtype_encoded = subtype_encoded.loc[valid_samples]
    
    common_samples = valid_samples
print()
print(f"Final data shapes after phenotype preprocessing:")
print(f"  Expression: {expression_data_scaled.shape}")
print(f"  Methylation: {methylation_scaled.shape}")
print(f"  Copy number: {copy_number_scaled.shape}")
print(f"  Protein: {protein_scaled.shape}")
print(f"  Phenotype: {phenotype_data_clean.shape}")
print(f"  Subtypes: {len(subtype_encoded)}")
print(f"  Common samples: {len(common_samples)}")


Subtype encoding mapping:
  0: Dedifferentiated liposarcoma
  1: Fibromyxosarcoma
  2: Giant cell sarcoma
  3: Leiomyosarcoma, NOS
  4: Liposarcoma, well differentiated
  5: Malignant fibrous histiocytoma
  6: Malignant peripheral nerve sheath tumor
  7: Myxoid leiomyosarcoma
  8: Pleomorphic liposarcoma
  9: Synovial sarcoma, NOS
  10: Synovial sarcoma, biphasic
  11: Synovial sarcoma, spindle cell
  12: Undifferentiated sarcoma

Encoded subtype distribution:
subtype_encoded
0     49
1     20
2      3
3     76
4      1
5     11
6      9
7      1
8      2
9      1
10     1
11     4
12    32
Name: count, dtype: int64

Final data shapes after phenotype preprocessing:
  Expression: (60660, 210)
  Methylation: (212863, 210)
  Copy number: (56646, 210)
  Protein: (457, 210)
  Phenotype: (210, 78)
  Subtypes: 210
  Common samples: 210


### Save preprocessed data

In [None]:
# Save processed data
# expression_data_scaled.to_csv("processed_expression.csv")
# methylation_scaled.to_csv("processed_methylation.csv")
# copy_number_scaled.to_csv("processed_cnv.csv")
# protein_scaled.to_csv("processed_protein.csv")
# phenotype_data_clean.to_csv("processed_phenotype.csv")
# subtype_encoded.to_csv("subtype_labels.csv")


# save subtype mapping
# subtype_mapping_df = pd.DataFrame(list(subtype_mapping.items()), columns=['subtype_name', 'encoded_label'])
# subtype_mapping_df.to_csv("subtype_mapping_with_protein.csv", index=False)

# print(f"\nFinal data shapes:")
# print(f"Expression: {expression_data_scaled.shape}")
# print(f"Methylation: {methylation_scaled.shape}")
# print(f"Copy number: {copy_number_scaled.shape}")
# print(f"Protein: {protein_scaled.shape}")
# print(f"Subtypes: {len(subtype_encoded)}")
# print(f"Subtype classes: {len(label_encoder.classes_)}")
# print("\nProcessed data saved to CSV files:")
# print("- processed_expression.csv")
# print("- processed_methylation.csv") 
# print("- processed_protein.csv")
# print("- processed_cnv.csv")
# print("- subtype_labels.csv")
# print("- subtype_mapping.csv")

In [None]:
print("Expression:", expression_data_scaled.shape)
print("Methylation:", methylation_scaled.shape)
print("Copy Number:", copy_number_scaled.shape)
print("Protein:", protein_scaled.shape)
print("Phenotype:", phenotype_data_clean.shape)
print("Subtype labels:", subtype_encoded.shape)

Expression: (60660, 210)
Methylation: (212863, 210)
Copy Number: (56646, 210)
Subtype labels: (210,)


In [None]:
print(expression_data_scaled.iloc[:5, :5])  # First 5 rows/columns
print(methylation_scaled.iloc[:5, :5])
print(copy_number_scaled.iloc[:5, :5])
print(protein_scaled.iloc[:5, :5])
print(subtype_encoded.head())

                    TCGA-DX-A6B9-01A  TCGA-DX-A8BR-01A  TCGA-DX-A1L3-01A  \
Ensembl_ID                                                                 
ENSG00000000003.15         -0.469188         -2.807430         -0.163443   
ENSG00000000005.6           2.268745          0.226222          0.660750   
ENSG00000000419.13         -0.519179          0.148284         -0.280435   
ENSG00000000457.14          0.329740          1.138170          2.741732   
ENSG00000000460.17         -1.261882          0.898457          1.845504   

                    TCGA-Z4-AAPF-01A  TCGA-DX-A7ET-01A  
Ensembl_ID                                              
ENSG00000000003.15         -0.104785         -0.504571  
ENSG00000000005.6           1.328134         -1.333194  
ENSG00000000419.13         -1.087689         -1.117240  
ENSG00000000457.14         -0.967435         -2.767352  
ENSG00000000460.17         -1.096206         -2.129919  
                       TCGA-DX-A6B9-01A  TCGA-DX-A8BR-01A  TCGA-DX-A

In [None]:
# # Label Encoding for Categorical Phenotype Variables
# print("LABEL ENCODING FOR PHENOTYPE DATA")
# print("=" * 80)

# # Separate numeric and categorical columns
# numeric_cols = phenotype_data_clean2.select_dtypes(include=[np.number]).columns
# categorical_cols = phenotype_data_clean2.select_dtypes(include=['object', 'category']).columns

# print(f"📊 Phenotype Data Analysis:")
# print(f"   Total columns: {len(phenotype_data_clean2.columns)}")
# print(f"   Numeric columns: {len(numeric_cols)}")
# print(f"   Categorical columns: {len(categorical_cols)}")

# print(f"\n📋 Numeric columns:")
# for col in numeric_cols:
#     print(f"   - {col}")

# # print(f"\n📋 Categorical columns:")
# # for col in categorical_cols:
# #     unique_vals = phenotype_data_clean2[col].nunique()
# #     print(f"   - {col} ({unique_vals} unique values)")

# # Create encoded phenotype dataset
# print(f"\n🔄 Applying Label Encoding to categorical variables...")
# phenotype_encoded = phenotype_data_clean2[numeric_cols].copy()

# # Encode categorical columns (only if they have reasonable number of categories)
# encoders = {}
# max_categories = 20  # Only encode if less than 20 unique values

# for col in categorical_cols:
#     n_unique = phenotype_data_clean2[col].nunique()
    
#     # Skip columns with too many unique values (likely IDs)
#     if n_unique >= max_categories:
#         print(f"   ⚠️  Skipping '{col}' ({n_unique} unique values - likely an ID column)")
#         continue
    
#     # Skip columns with all missing or single value
#     if n_unique <= 1:
#         print(f"   ⚠️  Skipping '{col}' ({n_unique} unique value - no information)")
#         continue
    
#     try:
#         le = LabelEncoder()
#         # Handle missing values by converting to string first
#         phenotype_encoded[f"{col}_encoded"] = le.fit_transform(
#             phenotype_data_clean2[col].fillna('Missing').astype(str)
#         )
#         encoders[col] = le
#         print(f"   ✅ Encoded '{col}' → '{col}_encoded' ({n_unique} categories)")
#     except Exception as e:
#         print(f"   ❌ Failed to encode '{col}': {str(e)}")

# print(f"\n📊 Encoded Phenotype Data Summary:")
# print(f"   Original shape: {phenotype_data_clean2.shape}")
# print(f"   Encoded shape: {phenotype_encoded.shape}")
# print(f"   Original features: {len(phenotype_data_clean2.columns)}")
# print(f"   Encoded features: {len(phenotype_encoded.columns)}")
# print(f"   Added encoded features: {len(phenotype_encoded.columns) - len(numeric_cols)}")

# # Check for missing values
# print(f"\n🔍 Missing Values Check:")
# missing_counts = phenotype_encoded.isnull().sum()
# if missing_counts.sum() == 0:
#     print(f"   ✅ No missing values in encoded phenotype data")
# else:
#     print(f"   ⚠️  Found {missing_counts.sum()} missing values:")
#     for col in missing_counts[missing_counts > 0].index:
#         print(f"      - {col}: {missing_counts[col]} missing")

# # Handle missing values strategically
# print(f"\n🔄 Handling missing values in phenotype data...")

# # Impute age/birth/diagnosis columns with median (small number of missing)
# age_related_cols = [col for col in phenotype_encoded.columns 
#                     if any(x in col.lower() for x in ['age', 'birth', 'diagnosis', 'collection'])
#                     and 'death' not in col.lower()]

# imputed_count = 0
# for col in age_related_cols:
#     if phenotype_encoded[col].isnull().any():
#         n_missing = phenotype_encoded[col].isnull().sum()
#         median_val = phenotype_encoded[col].median()
#         phenotype_encoded[col].fillna(median_val, inplace=True)
#         imputed_count += n_missing
#         if n_missing > 0:
#             print(f"   ✅ Imputed {n_missing} values in '{col}' with median")

# # Fill remaining missing values (death/annotation columns) with 0
# # These represent "no event" or "no data" which is informative
# remaining_missing = phenotype_encoded.isnull().sum().sum()
# if remaining_missing > 0:
#     print(f"\n   📝 Filling {remaining_missing} remaining missing values with 0 (informative missingness)")
#     phenotype_encoded.fillna(0, inplace=True)

# print(f"\n✅ Missing value handling completed!")
# print(f"   Imputed: {imputed_count} values (age/diagnosis related)")
# print(f"   Filled with 0: {remaining_missing} values (death/annotation columns)")
# print(f"   Final missing values: {phenotype_encoded.isnull().sum().sum()}")

# # Store the encoded phenotype for later use
# phenotype_data_encoded = phenotype_encoded.copy()
# # phenotype_data_encoded.to_csv("phenotype_data_encoded.csv")