In [1]:
import os
from pathlib import Path
import pandas as pd
import numpy as np

# Dataset classes

In [2]:
class SingleOmicsDataset:
    """One omics modality loaded from a delimited text file.

    The file is read with its first column as the index and then transposed,
    so the rows of ``self.data`` are named ``sample`` and the columns
    ``feature``. Every preprocessing method replaces ``self.data`` with the
    transformed frame.

    Args:
        path: Location of the delimited file to read.
        name: Display name of the modality.
        sep: Field separator handed to ``pandas.read_csv``.
    """

    def __init__(self, path: str, name: str, sep='\t'):
        self.path = path
        self.name = name
        self.sep = sep
        self.data = None

        self._load_data()

    def _load_data(self):
        """Read ``self.path`` and store it transposed (samples x features)."""
        self.data = pd.read_csv(self.path, sep=self.sep, index_col=0)
        self.data = self.data.transpose()
        self.data.index.names = ['sample']
        self.data.columns.names = ['feature']

    def remove_missing_labels(self, label_indices):
        """Keep only rows whose index is in ``label_indices``, sorted by index."""
        self.data = self.data[self.data.index.isin(label_indices)]
        self.data = self.data.sort_index(axis=0)

    def clean_missing_values(self):
        """Drop sparsely observed features and mean-impute the remaining gaps.

        Raises:
            ValueError: If any missing value is left after imputation.
        """
        # A feature/column is kept when it has at least int(0.9 * n_samples)
        # non-missing values.
        min_non_na = int(len(self.data.index) * 0.9)
        self.data = self.data.dropna(axis=1, thresh=min_non_na)

        # Fill NA values with the mean of each feature/column
        self.data = self.data.fillna(self.data.mean())

        # Check for any missing values in the DataFrame
        has_missing_values = self.data.isna().any().any()
        if has_missing_values:
            raise ValueError("The modality contains missing values. Please handle them before proceeding.")

    def normalize_data(self):
        """Min-max scale every feature/column separately.

        A constant feature has ``max == min``; dividing by that zero range
        would turn the whole column into NaN. Such a range is replaced by 1,
        so a constant feature is scaled to all zeros instead.
        """
        feature_min = self.data.min()
        feature_range = self.data.max() - feature_min
        feature_range = feature_range.replace(0, 1)
        self.data = (self.data - feature_min) / feature_range

    def remove_low_variance_features(self, threshold):
        """Keep features whose variance is at least ``threshold``.

        Args:
            threshold: Minimum variance to keep a feature; ``None`` leaves the
                data unchanged.
        """
        if threshold is not None:
            self.data = self.data.loc[:, self.data.var() >= threshold]

In [3]:
class MultiomicsDataset:
    """Load several omics modalities plus class labels and preprocess them.

    All work happens in the constructor: labels are read and filtered, each
    modality is loaded as a ``SingleOmicsDataset``, samples without a label
    are removed, and the optional cleaning / normalisation / variance
    filtering steps are applied in that order.

    Args:
        raw_data_dir: Sequence of ``(path, name)`` pairs, one per modality.
        raw_label_dir: Path of the delimited label file (first column is the
            index).
        num_omics: Stored on the instance; not otherwise used by this class.
        sep: Field separator used for the label file and every modality file.
        clean_missing: Whether to run ``clean_missing_values`` on each modality.
        normalize: Whether to run ``normalize_data`` on each modality.
        var_threshold: Optional sequence holding one variance threshold (or
            ``None``) per modality, in the order of ``raw_data_dir``.
        label_column_name: Column of the label file that holds the class
            label. Required.
        label_column_values: Optional collection of label values to keep.

    Raises:
        ValueError: If ``label_column_name`` is ``None`` or ``var_threshold``
            has fewer entries than ``raw_data_dir``.
    """

    def __init__(self,
                 raw_data_dir,
                 raw_label_dir,
                 num_omics,
                 sep='\t',
                 clean_missing=True,
                 normalize=True,
                 var_threshold=None,
                 label_column_name=None,
                 label_column_values=None):
        self.raw_data_dir = raw_data_dir
        self.raw_label_dir = raw_label_dir
        self.label_column_name = label_column_name
        self.label_column_values = label_column_values
        self.sep = sep
        self.num_omics = num_omics
        self.clean_missing = clean_missing
        self.normalize = normalize
        self.var_threshold = var_threshold
        self.data = []
        self.label = None

        self._process()

    def _process(self):
        """Run the whole load-and-preprocess pipeline."""
        # Fail fast with a clear message instead of crashing later on a None
        # label or after all the (potentially large) files have been loaded.
        if self.label_column_name is None:
            raise ValueError("label_column_name must be provided to select the class labels.")
        if self.var_threshold is not None and len(self.var_threshold) < len(self.raw_data_dir):
            raise ValueError(
                f"var_threshold has {len(self.var_threshold)} entries but "
                f"{len(self.raw_data_dir)} omic modalities were given."
            )

        # Keep only samples that have a label, optionally restricted to the
        # requested label values.
        self.label = pd.read_csv(self.raw_label_dir, sep=self.sep, index_col=0)
        self.label = self.label[self.label[self.label_column_name].notnull()]
        self.label = self.label[self.label_column_name]
        if self.label_column_values is not None:
            self.label = self.label[self.label.isin(self.label_column_values)]
        print(f"\nInput class labels:\n{self.label.value_counts(dropna=True)}\n")

        self.label = self.label.sort_index(axis=0)

        for path, name in self.raw_data_dir:
            omics_data = SingleOmicsDataset(path, name, self.sep)
            self.data.append(omics_data)
        self.print_omics_data("Raw/Input omic modalities")

        # remove samples with missing labels (their labels are not included in the label file)
        for omics_data in self.data:
            omics_data.remove_missing_labels(self.label.index)
        self.print_omics_data("After missing label removal")

        # check whether each index in label is present in at least one of the omic dataset
        is_available, labels_to_remove = self._check_label_indices_availability()
        self.label = self.label.drop(labels_to_remove)
        print(f"Are all labels available in omic modalities? {is_available}\n")

        # clean missing values
        if self.clean_missing:
            for omics_data in self.data:
                omics_data.clean_missing_values()
            self.print_omics_data("After missing value removal")

        # normalize features
        if self.normalize:
            for omics_data in self.data:
                omics_data.normalize_data()

        # remove low variance features
        if self.var_threshold is not None:
            for omics_idx, omics_data in enumerate(self.data):
                omics_data.remove_low_variance_features(self.var_threshold[omics_idx])
            self.print_omics_data("After low-variance feature removal")

    def _check_label_indices_availability(self):
        """Find labelled samples that appear in none of the modalities.

        Returns:
            A ``(all_available, labels_to_remove)`` tuple: a bool that is True
            when every label index occurs in at least one modality, and the
            list of label indices that occur in none.
        """
        # Union of the sample indices of every modality
        combined_omics_indices = set()
        for omics_data in self.data:
            combined_omics_indices.update(omics_data.data.index)

        labels_to_remove = [
            label_idx for label_idx in self.label.index
            if label_idx not in combined_omics_indices
        ]

        print(f"Number of labels not in combined omics indices: {len(labels_to_remove)}")

        return len(labels_to_remove) == 0, labels_to_remove

    def map_labels(self):
        """Replace class names by integer codes and rename the series to ``Class``.

        Codes are assigned in order of first appearance in ``self.label``.
        """
        unique_labels = self.label.unique()
        label_to_number = {label: i for i, label in enumerate(unique_labels)}
        self.label = self.label.map(label_to_number)
        self.label = self.label.rename("Class")
        print(f"Mapped class labels: {label_to_number}")
        print(f"\nMapped class labels:\n{self.label.value_counts(dropna=False)}\n")

    @staticmethod
    def _to_processed_path(raw_path):
        """Derive the processed CSV path from a raw path and create its directory."""
        processed_path = raw_path.replace("raw_data", "processed_data") + ".csv"
        parent_dir = os.path.dirname(processed_path)
        if parent_dir:
            # Do not rely on the caller having created the output directory.
            os.makedirs(parent_dir, exist_ok=True)
        return processed_path

    def save_result(self, save_label=True):
        """Write the labels (optionally) and every modality to CSV files.

        Each output path is the corresponding input path with ``raw_data``
        replaced by ``processed_data`` and ``.csv`` appended.

        Args:
            save_label: Whether to write the label series as well.
        """
        print("Saving the processed data...")
        if save_label:
            processed_label_path = self._to_processed_path(self.raw_label_dir)
            print(f" - label path: {processed_label_path}, label shape:{self.label.shape}")
            self.label.to_csv(processed_label_path, index=True)

        for omics_idx, omics_data in enumerate(self.data):
            raw_path, omics_name = self.raw_data_dir[omics_idx][0], self.raw_data_dir[omics_idx][1]
            processed_path = self._to_processed_path(raw_path)
            print(f" - {omics_name} path: {processed_path}, {omics_name} shape: {omics_data.data.shape}")
            omics_data.data.to_csv(processed_path, index=True)

    def print_omics_data(self, title):
        """Print the shape of every modality under the given ``title``."""
        print(f"\nOmic modality shape ({title}):")
        for omics_data in self.data:
            print(f" - {omics_data.name} shape: {omics_data.data.shape}")
        print()

# Breast Cancer

In [4]:
# Raw inputs and processed outputs of the BRCA cohort live in sibling folders.
base_raw_dir = os.path.join("..", "data", "raw_data", "BRCA")
base_processed_dir = os.path.join("..", "data", "processed_data", "BRCA")

# One (file path, modality name) pair per omics modality:
#   DNA   - DNA methylation HumanMethylation27
#   mRNA  - Gene expression RNAseq
#   miRNA - miRNA mature strand expression RNAseq
raw_data_dir = [
    (os.path.join(base_raw_dir, modality), modality)
    for modality in ("DNA", "mRNA", "miRNA")
]
raw_label_dir = os.path.join(base_raw_dir, "ClinicalMatrix")

# exist_ok avoids the check-then-create race and makes the cell safe to re-run.
os.makedirs(base_processed_dir, exist_ok=True)

In [5]:
# Preprocess the BRCA cohort, using the PAM50 call as the class label.
multiomics_data = MultiomicsDataset(
    raw_data_dir,
    raw_label_dir,
    num_omics=3,
    clean_missing=True,
    normalize=True,
    # One variance threshold per modality (DNA, mRNA, miRNA); None skips the filter.
    var_threshold=[0.04, 0.03, None],
    label_column_name='PAM50Call_RNAseq',
)
# Encode the class names as integers, then write labels and modalities to disk.
multiomics_data.map_labels()
multiomics_data.save_result()


Input class labels:
PAM50Call_RNAseq
LumA      434
LumB      194
Basal     142
Normal    119
Her2       67
Name: count, dtype: int64


Omic modality shape (Raw/Input omic modalities):
 - DNA shape: (345, 27578)
 - mRNA shape: (1218, 20530)
 - miRNA shape: (832, 2238)


Omic modality shape (After missing label removal):
 - DNA shape: (328, 27578)
 - mRNA shape: (956, 20530)
 - miRNA shape: (584, 2238)

Number of labels not in combined omics indices: 0
Are all labels available in omic modalities? True


Omic modality shape (After missing value removal):
 - DNA shape: (328, 24956)
 - mRNA shape: (956, 20530)
 - miRNA shape: (584, 436)


Omic modality shape (After low-variance feature removal):
 - DNA shape: (328, 6860)
 - mRNA shape: (956, 2608)
 - miRNA shape: (584, 436)

Mapped class labels: {'Normal': 0, 'LumA': 1, 'LumB': 2, 'Basal': 3, 'Her2': 4}

Mapped class labels:
Class
1    434
2    194
3    142
0    119
4     67
Name: count, dtype: int64

Saving the processed data...
 - label 

# Bladder Cancer

In [6]:
# Raw inputs and processed outputs of the BLCA cohort live in sibling folders.
base_raw_dir = os.path.join("..", "data", "raw_data", "BLCA")
base_processed_dir = os.path.join("..", "data", "processed_data", "BLCA")

# One (file path, modality name) pair per omics modality:
#   DNA   - DNA methylation HumanMethylation450
#   mRNA  - Gene expression RNAseq
#   miRNA - miRNA mature strand expression RNAseq
raw_data_dir = [
    (os.path.join(base_raw_dir, modality), modality)
    for modality in ("DNA", "mRNA", "miRNA")
]
raw_label_dir = os.path.join(base_raw_dir, "ClinicalMatrix")

# exist_ok avoids the check-then-create race and makes the cell safe to re-run.
os.makedirs(base_processed_dir, exist_ok=True)

In [7]:
# Preprocess the BLCA cohort, using the histologic grade as the class label.
multiomics_data = MultiomicsDataset(
    raw_data_dir,
    raw_label_dir,
    num_omics=3,
    # One variance threshold per modality (DNA, mRNA, miRNA); None skips the filter.
    var_threshold=[0.08, 0.03, None],
    label_column_name='neoplasm_histologic_grade',
)
# Encode the class names as integers, then write labels and modalities to disk.
multiomics_data.map_labels()
multiomics_data.save_result()


Input class labels:
neoplasm_histologic_grade
High Grade    412
Low Grade      21
Name: count, dtype: int64


Omic modality shape (Raw/Input omic modalities):
 - DNA shape: (434, 485577)
 - mRNA shape: (426, 20530)
 - miRNA shape: (429, 2210)


Omic modality shape (After missing label removal):
 - DNA shape: (431, 485577)
 - mRNA shape: (423, 20530)
 - miRNA shape: (426, 2210)

Number of labels not in combined omics indices: 0
Are all labels available in omic modalities? True


Omic modality shape (After missing value removal):
 - DNA shape: (431, 395362)
 - mRNA shape: (423, 20530)
 - miRNA shape: (426, 471)


Omic modality shape (After low-variance feature removal):
 - DNA shape: (431, 8592)
 - mRNA shape: (423, 5980)
 - miRNA shape: (426, 471)

Mapped class labels: {'High Grade': 0, 'Low Grade': 1}

Mapped class labels:
Class
0    412
1     21
Name: count, dtype: int64

Saving the processed data...
 - label path: ../data/processed_data/BLCA/ClinicalMatrix.csv, label shape:(433,)
 -

# Ovarian Cancer

In [8]:
# Raw inputs and processed outputs of the OV cohort live in sibling folders.
base_raw_dir = os.path.join("..", "data", "raw_data", "OV")
base_processed_dir = os.path.join("..", "data", "processed_data", "OV")

# One (file path, modality name) pair per omics modality:
#   DNA   - DNA methylation HumanMethylation27
#   mRNA  - Gene expression RNAseq
#   miRNA - miRNA mature strand expression RNAseq
raw_data_dir = [
    (os.path.join(base_raw_dir, modality), modality)
    for modality in ("DNA", "mRNA", "miRNA")
]
raw_label_dir = os.path.join(base_raw_dir, "ClinicalMatrix")
# Derived label file holding only the discretized "Class" column.
modified_label_dir = os.path.join(base_raw_dir, "ClinicalMatrix_modified")

# exist_ok avoids the check-then-create race and makes the cell safe to re-run.
os.makedirs(base_processed_dir, exist_ok=True)

In [9]:
def discretize_class(label):
    """Turn one clinical record into a survival class.

    Args:
        label: Mapping-like record with ``days_to_death`` and ``vital_status``.

    Returns:
        ``"long-term"`` when ``days_to_death / 365`` is at least 3, otherwise
        ``"short-term"`` when ``vital_status`` is ``'DECEASED'``, otherwise
        ``None``.
    """
    years_to_death = label['days_to_death'] / 365
    if years_to_death >= 3:
        return "long-term"
    if label['vital_status'] == 'DECEASED':
        return "short-term"
    return None

# Build the derived OV label file: one "Class" column with the survival class.
label = pd.read_csv(raw_label_dir, sep='\t', index_col=0)
# Copy the filtered rows so the new column is not written to a slice of the original frame.
label = label[label['days_to_death'].notnull()].copy()
label['Class'] = label.apply(discretize_class, axis=1)
# `label['Class'] != None` is True for every row in pandas, so it never
# filtered anything; notna() really drops the records without a class.
label = label.loc[label['Class'].notna(), ['Class']]
label.to_csv(modified_label_dir, sep='\t', index=True)

In [10]:
# Preprocess the OV cohort, using the derived survival class as the label.
multiomics_data = MultiomicsDataset(
    raw_data_dir,
    modified_label_dir,
    num_omics=3,
    # One variance threshold per modality (DNA, mRNA, miRNA); None skips the filter.
    var_threshold=[0.04, 0.03, None],
    label_column_name='Class',
)
# Encode the class names as integers, then write labels and modalities to disk.
multiomics_data.map_labels()
multiomics_data.save_result()


Input class labels:
Class
short-term    184
long-term     177
Name: count, dtype: int64


Omic modality shape (Raw/Input omic modalities):
 - DNA shape: (616, 27578)
 - mRNA shape: (308, 20530)
 - miRNA shape: (485, 2165)


Omic modality shape (After missing label removal):
 - DNA shape: (356, 27578)
 - mRNA shape: (184, 20530)
 - miRNA shape: (302, 2165)

Number of labels not in combined omics indices: 1
Are all labels available in omic modalities? False


Omic modality shape (After missing value removal):
 - DNA shape: (356, 24876)
 - mRNA shape: (184, 20530)
 - miRNA shape: (302, 448)


Omic modality shape (After low-variance feature removal):
 - DNA shape: (356, 5167)
 - mRNA shape: (184, 11308)
 - miRNA shape: (302, 448)

Mapped class labels: {'long-term': 0, 'short-term': 1}

Mapped class labels:
Class
1    184
0    176
Name: count, dtype: int64

Saving the processed data...
 - label path: ../data/processed_data/OV/ClinicalMatrix_modified.csv, label shape:(360,)
 - DNA path: ../