In [None]:
import os
import gdown
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import scipy as sp
import matplotlib.cm as cm

#np.random.seed(0)

from collections import Counter
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
from sklearn.cluster import KMeans, DBSCAN, SpectralClustering, AgglomerativeClustering
from sklearn.metrics import silhouette_score, confusion_matrix, adjusted_mutual_info_score, adjusted_rand_score, silhouette_samples
from sklearn.neighbors import NearestNeighbors
from scipy.spatial import distance_matrix

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Import utils.py
!gdown 'https://drive.google.com/uc?id=13I5w4WajPg6MObtLPQjxznm8w5hKlEY0' -O ./utils.py
from utils import *

# Retrieve datasets and reduce their dimension with PCA

Dimensionality reduction is needed because iClusterPlus is computationally very heavy: running it on the original omics (Lung and Kidney in particular) saturates the Colab RAM.

## Synthetic

Load omics

In [None]:
# Load dataset
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=12PArkc1RsOm2437mbysxRF4hQMddZOsc' -O ./mRNA.txt
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1aJkDF0ckxzY4vsnS53s-V89DdAnRVbPo' -O ./meth.txt
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1iS4u1SZH6r_Dvs7qRSKC444kc_bqmGhJ' -O ./prot.txt
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1UtHj4BzBx5hnQkkklJ9ugU5ERjLhjx4W' -O ./clusters.txt

# Read every synthetic omic (tab-separated, first column used as the row index)
# into the `ds` dictionary, keyed by omic name.
ds = {}
omics = ['mRNA','meth','prot']
for omic_name in omics:
    raw_table = pd.read_csv(f"{omic_name}.txt", sep='\t', index_col=0)
    raw_table.index.name = None
    # Transpose so that samples are the rows and features are the columns
    ds[omic_name] = raw_table.T

# True cluster label of each sample, indexed by the 'subjects' column
y = pd.read_csv('clusters.txt', sep='\t').set_index('subjects')
# Flatten to a 1-D array and shift the labels down by one
# (presumably the file numbers clusters from 1 — TODO confirm)
true_cluster_labels_synthetic = y.to_numpy().reshape(len(y)) - 1

Pre-process omics

In [None]:
# Standard deviation of the Gaussian noise per omic (the mean is always 0).
# Any omic not listed here (i.e. mRNA) falls back to a standard deviation of 4.
noise_sigma = {"meth": 0.4, "prot": 4}

for omic in omics:
    mu = 0
    sigma = noise_sigma.get(omic, 4)

    n_samples, n_features = ds[omic].shape
    n_cells = n_samples * n_features

    # Salt & pepper noise: every (sample, feature) cell receives Gaussian noise
    # with probability p. This is done by drawing a dense noise matrix and then
    # zeroing a random fraction (1 - p) of its cells.
    np.random.seed(42)   # fixed seed for reproducibility (re-applied for every omic)
    noise = np.random.normal(mu, sigma, size=(n_samples, n_features))
    p = 0.5
    silenced_flat = np.random.choice(np.arange(n_cells), replace=False, size=int(n_cells*(1-p)))
    noise[np.unravel_index(silenced_flat, (n_samples, n_features))] = 0

    print(f"Salt & pepper gaussian noise N({mu},{sigma**2}) is added to the {omic} dataset")
    ds[f'{omic}_noisy'] = ds[omic] + noise

    # Min-max normalize both the noisy omic and the original (noise-free) one
    ds[f'{omic}_normalized'] = MinMaxScaler().fit_transform(ds[f'{omic}_noisy'])
    ds[f'{omic}_normalized_no_noise'] = MinMaxScaler().fit_transform(ds[omic])

Reduce the number of features of each omic to 128 with PCA

In [None]:
# Project every normalized synthetic omic onto its first 128 principal
# components and save the result as a header-less CSV. The row index is kept
# as the first column of each file.
#
# The noisy and the noise-free variants go through exactly the same steps, so
# they share one loop: `ds_suffix` selects the entry of `ds` to read and
# `file_suffix` the name of the output file.
for ds_suffix, file_suffix in (('_normalized', ''), ('_normalized_no_noise', '_no_noise')):
    for omic in omics:
        # random_state is pinned so the projection is reproducible even when
        # scikit-learn selects its randomized SVD solver
        pca = PCA(128, random_state=0)
        # Reduce n. features
        principalComponents_df = pd.DataFrame(pca.fit_transform(ds[f'{omic}{ds_suffix}']))
        # Save reduced omic
        principalComponents_df.to_csv(f'{omic}_synthetic{file_suffix}_128.txt', sep=',', header=False)

## Lung

Load omics

In [None]:
# Load dataset
!gdown 'https://drive.google.com/uc?id=1kNQxRoCs6TIGVzdlpEpUkAqx8FyoSrsi' -O ./mRNA.txt
!gdown 'https://drive.google.com/uc?id=16V4tY8GeCUemOrf_KBqJwdixrQYZmFHv' -O ./miRNA.txt
!gdown 'https://drive.google.com/uc?id=1UuCNcXbxHS1lW3bRkixsbo77xIFsWxIG' -O ./meth.txt
!gdown 'https://drive.google.com/uc?id=1RT1EiQCj19gGD8bYr0UBjxvUPl8473FX' -O ./cnv.txt  # corrected for batch effects
!gdown 'https://drive.google.com/uc?id=1t1rTrZNVHqZRo_F0YA0heD3WbvijHox7' -O ./clusters.txt

# Start from an empty container and read every lung omic
# (tab-separated, first column used as the row index)
ds = {}
omics = ['mRNA','miRNA','meth','cnv']
for omic_name in omics:
    ds[omic_name] = pd.read_csv(f"{omic_name}.txt", sep='\t', index_col=0)

# True cluster label of each sample, flattened to a 1-D array
y = pd.read_csv('clusters.txt', sep='\t', index_col=0)
true_cluster_labels_lung = y.to_numpy().reshape(len(y))

Pre-process omics

In [None]:
# mRNA
# Keep only protein coding genes
# NB: the file idx_mRNA_prot_cod_lung.txt, containing the column indices of the
# protein coding genes, has been obtained in the colab notebook 'Find protein coding genes'
!gdown 'https://drive.google.com/uc?id=1i6lXPIh60BtCF-ujfaqf4WrvWM4XMvNF' -O ./idx_mRNA_prot_cod_lung.txt

# NOTE: this cell overwrites ds['mRNA'], ds['miRNA'] and ds['meth'] with their
# filtered / transformed versions, so it is meant to be run once, right after
# the loading cell.

# Positional column indices of the protein coding genes (column 'idx' of the file)
idx_mRNA_prot_cod = pd.read_csv('idx_mRNA_prot_cod_lung.txt')['idx'].values

# mRNA: keep the protein coding columns, drop the genes that are zero in every
# sample, then min-max normalize
mrna_prot_cod = ds['mRNA'].iloc[:, idx_mRNA_prot_cod]
ds['mRNA'] = mrna_prot_cod.loc[:, (mrna_prot_cod != 0).any(axis=0)]
ds['mRNA_normalized'] = MinMaxScaler().fit_transform(ds['mRNA'].values)


# miRNA: drop the sequences that are zero in every sample, apply log2(x + 1),
# then min-max normalize
mirna_nonzero = ds['miRNA'].loc[:, (ds['miRNA'] != 0).any(axis=0)]
ds['miRNA'] = np.log(mirna_nonzero + 1) / np.log(2)
ds['miRNA_normalized'] = MinMaxScaler().fit_transform(ds['miRNA'].values)


# meth: drop the columns that are zero in every sample, then min-max normalize
meth_all = ds['meth']
ds['meth'] = meth_all.loc[:, (meth_all != 0).any(axis=0)]
ds['meth_normalized'] = MinMaxScaler().fit_transform(ds['meth'].values)


# cnv: min-max normalize only (no column filtering)
ds['cnv_normalized'] = MinMaxScaler().fit_transform(ds['cnv'].values)

Reduce the number of features of each omic to 256 with PCA

In [None]:
# Project every normalized lung omic onto its first 256 principal components
# and save the result as a header-less CSV. The row index is kept as the first
# column of each file.
for omic in omics:
    # random_state is pinned so the projection is reproducible even when
    # scikit-learn selects its randomized SVD solver (no global seed is set)
    pca = PCA(256, random_state=0)
    # Reduce n. features
    principalComponents_df = pd.DataFrame(pca.fit_transform(ds[f'{omic}_normalized']))
    # Save reduced omic
    principalComponents_df.to_csv(f'{omic}_lung_256.txt', sep=',', header=False)

## Kidney

Load omics

In [None]:
# Load dataset
!gdown 'https://drive.google.com/uc?id=1i1do_UTzwXzPVIDDmYSFJEholK2Mp8g_' -O ./mRNA.txt
!gdown 'https://drive.google.com/uc?id=1liKeOBKjnbCi1CIjcOPA3Zxv2fRzCfa2' -O ./miRNA.txt
!gdown 'https://drive.google.com/uc?id=1qr9joY0bAVDLvjWsKF5xf3CaRolBu-mP' -O ./meth.txt
!gdown 'https://drive.google.com/uc?id=1R-U2iDgM4oEyzNRfBIA2kXMbKw_s0QtI' -O ./clusters.txt

# Start from an empty container and read every kidney omic
# (tab-separated, first column used as the row index)
ds = {}
omics = ['mRNA','miRNA','meth']
for omic_name in omics:
    ds[omic_name] = pd.read_csv(f"{omic_name}.txt", sep='\t', index_col=0)

# True cluster label of each sample, flattened to a 1-D array
y = pd.read_csv('clusters.txt', sep='\t', index_col=0)
true_cluster_labels_kidney = y.to_numpy().reshape(len(y))

Pre-process omics

In [None]:
# mRNA
# Keep only protein coding genes
# NB: the file idx_mRNA_prot_cod_kidney.txt, containing the column indices of the
# protein coding genes, has been obtained in the colab notebook 'Find protein coding genes'
if not os.path.exists("./idx_mRNA_prot_cod_kidney.txt"):
    !gdown 'https://drive.google.com/uc?id=1Pi4u8y_YAc2tmOWZYaeLn9wGdzu4cFC5' -O ./idx_mRNA_prot_cod_kidney.txt

# NOTE: this cell overwrites ds['mRNA'], ds['miRNA'] and ds['meth'] with their
# filtered / transformed versions, so it is meant to be run once, right after
# the loading cell.

# Positional column indices of the protein coding genes (column 'idx' of the file)
idx_mRNA_prot_cod = pd.read_csv('idx_mRNA_prot_cod_kidney.txt')['idx'].values

# mRNA: keep the protein coding columns, drop the genes that are zero in every
# sample, then min-max normalize
mrna_prot_cod = ds['mRNA'].iloc[:, idx_mRNA_prot_cod]
ds['mRNA'] = mrna_prot_cod.loc[:, (mrna_prot_cod != 0).any(axis=0)]
ds['mRNA_normalized'] = MinMaxScaler().fit_transform(ds['mRNA'].values)


# miRNA: drop the sequences that are zero in every sample, apply log2(x + 1),
# then min-max normalize
mirna_nonzero = ds['miRNA'].loc[:, (ds['miRNA'] != 0).any(axis=0)]
ds['miRNA'] = np.log(mirna_nonzero + 1) / np.log(2)
ds['miRNA_normalized'] = MinMaxScaler().fit_transform(ds['miRNA'].values)


# meth: drop the columns that are zero in every sample, then min-max normalize
meth_all = ds['meth']
ds['meth'] = meth_all.loc[:, (meth_all != 0).any(axis=0)]
ds['meth_normalized'] = MinMaxScaler().fit_transform(ds['meth'].values)

Reduce the number of features of each omic to 256 with PCA

In [None]:
# Project every normalized kidney omic onto its first 256 principal components
# and save the result as a header-less CSV. The row index is kept as the first
# column of each file.
for omic in omics:
    # random_state is pinned so the projection is reproducible even when
    # scikit-learn selects its randomized SVD solver (no global seed is set)
    pca = PCA(256, random_state=0)
    # Reduce n. features
    principalComponents_df = pd.DataFrame(pca.fit_transform(ds[f'{omic}_normalized']))
    # Save reduced omic
    principalComponents_df.to_csv(f'{omic}_kidney_256.txt', sep=',', header=False)

# Import R libraries and rpy

In [None]:
%load_ext rpy2.ipython

In [None]:
%%R
# Install BiocManager only when it is missing, then use it to install
# iClusterPlus and attach the package.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
    install.packages("BiocManager")
}
BiocManager::install("iClusterPlus")
library(iClusterPlus)

#install.packages('googledrive')
#library('googledrive')
#drive_deauth()

# Apply iClusterPlus

## Synthetic (noisy)

Load omics and adapt them to iClusterPlus accepted format

In [None]:
%%R
# Read each PCA-reduced omic (header-less CSV) as a plain numeric matrix.
# The first column is the row index written by pandas, so it is dropped, and
# the automatic column names are removed.
print('mRNA syntethic 128 loading...')
mRNA <- as.matrix(read.csv("mRNA_synthetic_128.txt", sep=",", header=FALSE))[, -1]
colnames(mRNA) <- NULL

print('meth syntethic 128 loading...')
meth <- as.matrix(read.csv("meth_synthetic_128.txt", sep=",", header=FALSE))[, -1]
colnames(meth) <- NULL

print('prot syntethic 128 loading...')
prot <- as.matrix(read.csv("prot_synthetic_128.txt", sep=",", header=FALSE))[, -1]
colnames(prot) <- NULL

Apply iClusterPlus

In [None]:
%%R
# iClusterPlus estimates the latent variables by MCMC sampling (see its
# n.burnin / n.draw arguments), so the RNG seed is fixed to make the saved
# latent matrix reproducible across runs.
set.seed(0)

# Fit the integrative model on the three PCA-reduced omics
r.icluster <- iClusterPlus::iClusterPlus(
  dt1=mRNA, # one matrix per omic, samples on the rows
  dt2=meth,
  dt3=prot,
  type=c("gaussian", "gaussian", "gaussian"), # every omic is modelled as continuous
  K=64, # number of latent factors to learn
  alpha=c(1,1,1), # elastic-net terms, one per omic
  lambda=c(.03,.03,.03) # lasso penalty terms, one per omic
)

# Extract the latent-variable matrix. It is called z in iCluster terminology
# and H in the rest of this notebook, and acts as the encoded (integrated)
# dataset: one row per sample, K = 64 columns.
# Only z is kept; the coefficient matrices (r.icluster$beta) are not used.
icluster.z <- r.icluster$meanZ

# Save H matrix as a CSV without header and without row names
write.table(icluster.z, file="icluster_z_mat_synthetic.txt", row.names=FALSE, col.names=FALSE, sep=',')

Results

In [None]:
# Load H matrix (iClusterPlus-encoded-integrated dataset)
H_synthetic = pd.read_csv("icluster_z_mat_synthetic.txt", sep=',', header=None)

# The 2D PCA projection (used only for visualization) and the silhouette of the
# true partition do not depend on k, so they are computed once, outside the loop
pca = PCA(2, random_state=0)
principalComponents = pca.fit_transform(H_synthetic)
silhouette_true = silhouette_score(H_synthetic, true_cluster_labels_synthetic)

for k in range(2,11):
    # Apply K-means on the full encoded dataset. The fitted centroids are left
    # untouched: they are not plotted, and overwriting `cluster_centers_` with
    # their 2D projection would leave the fitted model inconsistent.
    kmeans = KMeans(n_clusters=k, random_state=0)
    cluster_labels = kmeans.fit_predict(H_synthetic)

    # Plot the clustered dataset with cluster assignments and true cluster labels
    plot_2D_dataset(principalComponents, cluster_labels, title=f'Dataset visualization', caption=f'predicted {k} clusters')
    plot_2D_dataset(principalComponents, true_cluster_labels_synthetic, title=f'Dataset visualization', caption='true clusters')

    # Plot the confusion matrix
    plot_confusion_matrix(true_cluster_labels_synthetic, cluster_labels)

    # Silhouette on the iClusterPlus-encoded dataset with cluster assignments and true cluster labels
    print(f"Silhouette, predicted clusters: {silhouette_score(H_synthetic, cluster_labels)}")
    print(f"Silhouette, true clusters: {silhouette_true}")


We can clearly see from the visualized dataset that iClusterPlus is not robust to noise in the input data

## Synthetic (no noise)

Load omics and adapt them to iClusterPlus accepted format

In [None]:
%%R
# Read each PCA-reduced, noise-free omic (header-less CSV) as a plain numeric
# matrix. The first column is the row index written by pandas, so it is
# dropped, and the automatic column names are removed.
print('mRNA syntethic no noise 128 loading...')
mRNA <- as.matrix(read.csv("mRNA_synthetic_no_noise_128.txt", sep=",", header=FALSE))[, -1]
colnames(mRNA) <- NULL

print('meth syntethic no noise 128 loading...')
meth <- as.matrix(read.csv("meth_synthetic_no_noise_128.txt", sep=",", header=FALSE))[, -1]
colnames(meth) <- NULL

print('prot syntethic no noise 128 loading...')
prot <- as.matrix(read.csv("prot_synthetic_no_noise_128.txt", sep=",", header=FALSE))[, -1]
colnames(prot) <- NULL

Apply iClusterPlus

In [None]:
%%R
# iClusterPlus estimates the latent variables by MCMC sampling (see its
# n.burnin / n.draw arguments), so the RNG seed is fixed to make the saved
# latent matrix reproducible across runs.
set.seed(0)

# Fit the integrative model on the three PCA-reduced, noise-free omics
r.icluster <- iClusterPlus::iClusterPlus(
  dt1=mRNA, # one matrix per omic, samples on the rows
  dt2=meth,
  dt3=prot,
  type=c("gaussian", "gaussian", "gaussian"), # every omic is modelled as continuous
  K=64, # number of latent factors to learn
  alpha=c(1,1,1), # elastic-net terms, one per omic
  lambda=c(.03,.03,.03) # lasso penalty terms, one per omic
)

# Extract the latent-variable matrix. It is called z in iCluster terminology
# and H in the rest of this notebook, and acts as the encoded (integrated)
# dataset: one row per sample, K = 64 columns.
# Only z is kept; the coefficient matrices (r.icluster$beta) are not used.
icluster.z <- r.icluster$meanZ

# Save H matrix as a CSV without header and without row names
write.table(icluster.z, file="icluster_z_mat_synthetic_no_noise.txt", row.names=FALSE, col.names=FALSE, sep=',')

Results

In [None]:
# Load H matrix (iClusterPlus-encoded-integrated dataset)
H_synthetic = pd.read_csv("icluster_z_mat_synthetic_no_noise.txt", sep=',', header=None)

# The 2D PCA projection (used only for visualization) and the silhouette of the
# true partition do not depend on k, so they are computed once, outside the loop
pca = PCA(2, random_state=0)
principalComponents = pca.fit_transform(H_synthetic)
silhouette_true = silhouette_score(H_synthetic, true_cluster_labels_synthetic)

for k in range(2,11):
    # Apply K-means on the full encoded dataset. The fitted centroids are left
    # untouched: they are not plotted, and overwriting `cluster_centers_` with
    # their 2D projection would leave the fitted model inconsistent.
    kmeans = KMeans(n_clusters=k, random_state=0)
    cluster_labels = kmeans.fit_predict(H_synthetic)

    # Plot the clustered dataset with cluster assignments and true cluster labels
    plot_2D_dataset(principalComponents, cluster_labels, title=f'Dataset visualization', caption=f'predicted {k} clusters')
    plot_2D_dataset(principalComponents, true_cluster_labels_synthetic, title=f'Dataset visualization', caption='true clusters')

    # Plot the confusion matrix
    plot_confusion_matrix(true_cluster_labels_synthetic, cluster_labels)

    # Silhouette on the iClusterPlus-encoded dataset with cluster assignments and true cluster labels
    print(f"Silhouette, predicted clusters: {silhouette_score(H_synthetic, cluster_labels)}")
    print(f"Silhouette, true clusters: {silhouette_true}")


## Lung

Load omics and adapt them to iClusterPlus accepted format

In [None]:
%%R
# Read each PCA-reduced lung omic (header-less CSV) as a plain numeric matrix.
# The first column is the row index written by pandas, so it is dropped, and
# the automatic column names are removed.
print('mRNA lung 256 loading...')
mRNA <- as.matrix(read.csv("mRNA_lung_256.txt", sep=",", header=FALSE))[, -1]
colnames(mRNA) <- NULL

print('miRNA lung 256 loading...')
miRNA <- as.matrix(read.csv("miRNA_lung_256.txt", sep=",", header=FALSE))[, -1]
colnames(miRNA) <- NULL

print('meth lung 256 loading...')
meth <- as.matrix(read.csv("meth_lung_256.txt", sep=",", header=FALSE))[, -1]
colnames(meth) <- NULL

print('cnv lung 256 loading...')
cnv <- as.matrix(read.csv("cnv_lung_256.txt", sep=",", header=FALSE))[, -1]
colnames(cnv) <- NULL

Apply iClusterPlus

In [None]:
%%R
# iClusterPlus estimates the latent variables by MCMC sampling (see its
# n.burnin / n.draw arguments), so the RNG seed is fixed to make the saved
# latent matrix reproducible across runs.
set.seed(0)

# Fit the integrative model on the four PCA-reduced lung omics
r.icluster <- iClusterPlus::iClusterPlus(
  dt1=mRNA, # one matrix per omic, samples on the rows
  dt2=miRNA,
  dt3=meth,
  dt4=cnv,
  type=c("gaussian", "gaussian", "gaussian", "gaussian"), # every omic is modelled as continuous
  K=64, # number of latent factors to learn
  alpha=c(1,1,1,1), # elastic-net terms, one per omic
  lambda=c(.03,.03,.03,.03) # lasso penalty terms, one per omic
)

# Extract the latent-variable matrix. It is called z in iCluster terminology
# and H in the rest of this notebook, and acts as the encoded (integrated)
# dataset: one row per sample, K = 64 columns.
# Only z is kept; the coefficient matrices (r.icluster$beta) are not used.
icluster.z <- r.icluster$meanZ

# Save H matrix as a CSV without header and without row names
write.table(icluster.z, file="icluster_z_mat_lung.txt", row.names=FALSE, col.names=FALSE, sep=',')

Results

In [None]:
# Load H matrix (iClusterPlus-encoded-integrated dataset).
# It is stored under its own name so that it neither overwrites the synthetic
# matrix nor carries a misleading "synthetic" label.
H_lung = pd.read_csv("icluster_z_mat_lung.txt", sep=',', header=None)

# The 2D PCA projection (used only for visualization) and the silhouette of the
# true partition do not depend on k, so they are computed once, outside the loop.
# random_state keeps the projection identical across runs.
pca = PCA(2, random_state=0)
principalComponents = pca.fit_transform(H_lung)
silhouette_true = silhouette_score(H_lung, true_cluster_labels_lung)

for k in range(2,11):
    # Apply K-means on the full encoded dataset. The fitted centroids are left
    # untouched: they are not plotted, and overwriting `cluster_centers_` with
    # their 2D projection would leave the fitted model inconsistent.
    kmeans = KMeans(n_clusters=k, random_state=0)
    cluster_labels = kmeans.fit_predict(H_lung)

    # Plot the clustered dataset with cluster assignments and true cluster labels
    plot_2D_dataset(principalComponents, cluster_labels, title=f'Dataset visualization', caption=f'predicted {k} clusters')
    plot_2D_dataset(principalComponents, true_cluster_labels_lung, title=f'Dataset visualization', caption='true clusters')

    # Plot the confusion matrix
    plot_confusion_matrix(true_cluster_labels_lung, cluster_labels)

    # Silhouette on the iClusterPlus-encoded dataset with cluster assignments and true cluster labels
    print(f"Silhouette, predicted clusters: {silhouette_score(H_lung, cluster_labels)}")
    print(f"Silhouette, true clusters: {silhouette_true}")

    # Three blank lines to separate the output of consecutive values of k
    print("\n\n")

## Kidney

Load omics and adapt them to iClusterPlus accepted format

In [None]:
%%R
# Read each PCA-reduced kidney omic (header-less CSV) as a plain numeric matrix.
# The first column is the row index written by pandas, so it is dropped, and
# the automatic column names are removed.
print('mRNA kidney 256 loading...')
mRNA <- as.matrix(read.csv("mRNA_kidney_256.txt", sep=",", header=FALSE))[, -1]
colnames(mRNA) <- NULL

print('miRNA kidney 256 loading...')
miRNA <- as.matrix(read.csv("miRNA_kidney_256.txt", sep=",", header=FALSE))[, -1]
colnames(miRNA) <- NULL

print('meth kidney 256 loading...')
meth <- as.matrix(read.csv("meth_kidney_256.txt", sep=",", header=FALSE))[, -1]
colnames(meth) <- NULL

Apply iClusterPlus

In [None]:
%%R
# iClusterPlus estimates the latent variables by MCMC sampling (see its
# n.burnin / n.draw arguments), so the RNG seed is fixed to make the saved
# latent matrix reproducible across runs.
set.seed(0)

# Fit the integrative model on the three PCA-reduced kidney omics
r.icluster <- iClusterPlus::iClusterPlus(
  dt1=mRNA, # one matrix per omic, samples on the rows
  dt2=miRNA,
  dt3=meth,
  type=c("gaussian", "gaussian", "gaussian"), # every omic is modelled as continuous
  K=64, # number of latent factors to learn
  alpha=c(1,1,1), # elastic-net terms, one per omic
  lambda=c(.03,.03,.03) # lasso penalty terms, one per omic
)

# Extract the latent-variable matrix. It is called z in iCluster terminology
# and H in the rest of this notebook, and acts as the encoded (integrated)
# dataset: one row per sample, K = 64 columns.
# NOTE(review): the previous comment gave 783 samples, the same figure as in
# the lung section — confirm the actual kidney sample count.
# Only z is kept; the coefficient matrices (r.icluster$beta) are not used.
icluster.z <- r.icluster$meanZ

# Save H matrix as a CSV without header and without row names
write.table(icluster.z, file="icluster_z_mat_kidney.txt", row.names=FALSE, col.names=FALSE, sep=',')

Results

In [None]:
# Load H matrix (iClusterPlus-encoded-integrated dataset).
# It is stored under its own name so that it neither overwrites the synthetic
# matrix nor carries a misleading "synthetic" label.
H_kidney = pd.read_csv("icluster_z_mat_kidney.txt", sep=',', header=None)

# The 2D PCA projection (used only for visualization) and the silhouette of the
# true partition do not depend on k, so they are computed once, outside the loop.
# random_state keeps the projection identical across runs.
pca = PCA(2, random_state=0)
principalComponents = pca.fit_transform(H_kidney)
silhouette_true = silhouette_score(H_kidney, true_cluster_labels_kidney)

for k in range(2,11):
    # Apply K-means on the full encoded dataset. The fitted centroids are left
    # untouched: they are not plotted, and overwriting `cluster_centers_` with
    # their 2D projection would leave the fitted model inconsistent.
    kmeans = KMeans(n_clusters=k, random_state=0)
    cluster_labels = kmeans.fit_predict(H_kidney)

    # Plot the clustered dataset with cluster assignments and true cluster labels
    plot_2D_dataset(principalComponents, cluster_labels, title=f'Dataset visualization', caption=f'predicted {k} clusters')
    plot_2D_dataset(principalComponents, true_cluster_labels_kidney, title=f'Dataset visualization', caption='true clusters')

    # Plot the confusion matrix
    plot_confusion_matrix(true_cluster_labels_kidney, cluster_labels)

    # Silhouette on the iClusterPlus-encoded dataset with cluster assignments and true cluster labels
    print(f"Silhouette, predicted clusters: {silhouette_score(H_kidney, cluster_labels)}")
    print(f"Silhouette, true clusters: {silhouette_true}")