In [None]:
!pip install snfpy

# Standard library
import os

# Third-party
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
import snf
from sklearn.cluster import spectral_clustering
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score, confusion_matrix, adjusted_mutual_info_score, adjusted_rand_score, silhouette_samples
# `os` and `MinMaxScaler` are used by the cells below; no other visible import provides
# them, so they are imported explicitly instead of relying on the star import of utils.
from sklearn.preprocessing import MinMaxScaler

# Import utils.py
!gdown 'https://drive.google.com/uc?id=13I5w4WajPg6MObtLPQjxznm8w5hKlEY0' -O ./utils.py
from utils import *

# Synthetic

Load omics

In [None]:
# Load dataset
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=12PArkc1RsOm2437mbysxRF4hQMddZOsc' -O ./mRNA.txt
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1aJkDF0ckxzY4vsnS53s-V89DdAnRVbPo' -O ./meth.txt
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1iS4u1SZH6r_Dvs7qRSKC444kc_bqmGhJ' -O ./prot.txt
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1UtHj4BzBx5hnQkkklJ9ugU5ERjLhjx4W' -O ./clusters.txt

ds = {}     # this will contain each omic
omics = ['mRNA','meth','prot']
for omic_name in omics:
    path = omic_name + ".txt"
    ds[omic_name] = pd.read_csv(path, sep='\t', index_col=0)
    ds[omic_name].index.name = None
    ds[omic_name] = ds[omic_name].T
    # N.B.: the matrices have been transposed so that now we have samples as rows and features as columns


y = pd.read_csv('clusters.txt', sep='\t').set_index('subjects')    # this will contain the true cluster label of each sample
true_cluster_labels_synthetic = y.values.reshape(y.shape[0])-1

Pre-process omics

In [None]:
# Corrupt every synthetic omic with salt & pepper Gaussian noise, then min-max normalize it.
# For each omic this cell writes ds['<omic>_noisy'] and ds['<omic>_normalized'].

# (mean, standard deviation) of the Gaussian noise for each omic;
# omics that are not listed use default_noise_params.
noise_params = {"meth": (0, 0.4), "prot": (0, 4)}
default_noise_params = (0, 4)

# Salt & pepper noise: (Gaussian) noise is added to a particular feature of a
# particular sample with probability p
p = 0.5

# Seed once, before the loop. Re-seeding at every iteration restarts the generator
# for each omic, so all the omics would be corrupted with the same random stream
# instead of independent noise.
np.random.seed(42)   # fixed seed for reproducibility

for omic in omics:
    mu, sigma = noise_params.get(omic, default_noise_params)
    n_samples, n_features = ds[omic].shape

    # Draw noise for every entry, then reset it to zero on a random subset
    # holding a (1-p) fraction of the entries, which are therefore left untouched.
    noise = np.random.normal(mu, sigma, size=(n_samples, n_features))
    n_entries = n_samples * n_features
    raveled_indices = np.random.choice(np.arange(n_entries), replace=False, size=int(n_entries*(1-p)))
    indices = np.unravel_index(raveled_indices, (n_samples, n_features))
    noise[indices] = 0

    print(f"Salt & pepper gaussian noise N({mu},{sigma**2}) is added to the {omic} dataset")
    ds[f'{omic}_noisy'] = ds[omic] + noise

    # Normalize omic
    ds[f'{omic}_normalized'] = MinMaxScaler().fit_transform(ds[f'{omic}_noisy'])


Apply SNF

In [None]:
# One normalized matrix per omic: these are the views handed to SNF
normalized_views = [ds[f"{omic}_normalized"] for omic in omics]

# Affinity (W) matrix of each omic, fused into the overall status matrix P^(c)
affinity_networks = snf.make_affinity(normalized_views, metric='euclidean', K=5, mu=0.5)
fused_network = snf.snf(affinity_networks, K=5)

# Optimal n. of clusters estimated via an eigengap approach; the second value returned is discarded
best, _ = snf.get_n_clusters(fused_network)
print(best)
cluster_assignments = spectral_clustering(fused_network, n_clusters=best)

# Early integration: the normalized omics are concatenated along the feature axis
ds['early_integr'] = np.concatenate(normalized_views, axis=1)

# 2D PCA of the concatenated dataset, used only for visualization
pca = PCA(2)
principalComponents = pca.fit_transform(ds['early_integr'])

# Plot the dataset twice: coloured by predicted clusters, then by true clusters
for labels, caption in (
    (cluster_assignments, f'{best} predicted clusters'),
    (true_cluster_labels_synthetic, f'true clusters'),
):
    plot_2D_dataset(principalComponents, labels, title='Dataset', caption=caption)

# Confusion matrix between true and predicted clusters
plot_confusion_matrix(true_cluster_labels_synthetic, cluster_assignments)

# Silhouette of the predicted clusters, computed on the concatenated features
print(silhouette_score(ds['early_integr'], cluster_assignments))

# Lung

Load omics

In [None]:
# Load dataset
!gdown 'https://drive.google.com/uc?id=1kNQxRoCs6TIGVzdlpEpUkAqx8FyoSrsi' -O ./mRNA.txt
!gdown 'https://drive.google.com/uc?id=16V4tY8GeCUemOrf_KBqJwdixrQYZmFHv' -O ./miRNA.txt
!gdown 'https://drive.google.com/uc?id=1UuCNcXbxHS1lW3bRkixsbo77xIFsWxIG' -O ./meth.txt
# Batch effect corrected with combat
# Only autosomic genes are kept; duplicated features were removed before applying combat
!gdown 'https://drive.google.com/uc?id=1RT1EiQCj19gGD8bYr0UBjxvUPl8473FX' -O ./cnv.txt
!gdown 'https://drive.google.com/uc?id=1t1rTrZNVHqZRo_F0YA0heD3WbvijHox7' -O ./clusters.txt

ds = {}     # this will contain each omic
omics = ['mRNA','miRNA','meth','cnv']
for omic_name in omics:
    path = omic_name + ".txt"
    if omic_name not in ds:
        ds[omic_name] = pd.read_csv(path, sep='\t', index_col=0)

y = pd.read_csv('clusters.txt', sep='\t', index_col=0)
true_cluster_labels_lung = y.values.reshape(y.shape[0])

Pre-process omics

In [None]:
# mRNA
# Keep only protein coding genes
# NB: the file idx_mRNA_prot_cod.txt, containing the protein coding genes of the
# human genome, has been obtained in the colab notebook 'Find protein coding genes'
if not os.path.exists("./idx_mRNA_prot_cod_lung.txt"):
    !gdown 'https://drive.google.com/uc?id=1i6lXPIh60BtCF-ujfaqf4WrvWM4XMvNF' -O ./idx_mRNA_prot_cod_lung.txt

idx_mRNA_prot_cod = pd.read_csv('idx_mRNA_prot_cod_lung.txt')
idx_mRNA_prot_cod = idx_mRNA_prot_cod['idx'].values

ds['mRNA'] = ds['mRNA'].iloc[:, idx_mRNA_prot_cod]

# Delete genes with a zero expression value across all the samples
ds['mRNA'] = ds['mRNA'].loc[:, (ds['mRNA'] != 0).any(axis=0)]

# Normalize mRNA with MinMax Scaler
ds['mRNA_normalized'] = MinMaxScaler().fit_transform(ds['mRNA'].values)



# miRNA
# Delete sequences with a zero expression value across all the samples
ds['miRNA'] = ds['miRNA'].loc[:, (ds['miRNA'] != 0).any(axis=0)]

# Normalize with log2 normalization
ds['miRNA'] = np.log(ds['miRNA'] + 1) / np.log(2)

# Normalize with MinMaxScaler
ds['miRNA_normalized'] = MinMaxScaler().fit_transform(ds['miRNA'].values)



# meth
# Delete sequences with a zero expression value across all the samples
ds['meth'] = ds['meth'].loc[:, (ds['meth'] != 0).any(axis=0)]

# Normalize with MinMaxScaler
ds['meth_normalized'] = MinMaxScaler().fit_transform(ds['meth'].values)



# cnv
# Normalize with MinMaxScaler
ds['cnv_normalized'] = MinMaxScaler().fit_transform(ds['cnv'].values)

Apply SNF

In [None]:
# Normalized matrix of every omic, in the order given by `omics`
omic_views = [ds[f"{omic}_normalized"] for omic in omics]

# Build one affinity network per omic and fuse them with SNF
affinity_networks = snf.make_affinity(omic_views, metric='euclidean', K=5, mu=0.5)
fused_network = snf.snf(affinity_networks, K=5)

# Best and second-best number of clusters suggested by snf.get_n_clusters
best, second = snf.get_n_clusters(fused_network)
print(best)
cluster_assignments = spectral_clustering(fused_network, n_clusters=best)

# Early integration: the normalized omics are concatenated along the feature axis
ds['early_integr'] = np.concatenate(omic_views, axis=1)

# 2D PCA of the concatenated dataset, used only for visualization
pca = PCA(2)
principalComponents = pca.fit_transform(ds['early_integr'])

# Plot the dataset twice: coloured by predicted clusters, then by true clusters
for labels, caption in (
    (cluster_assignments, f'{best} predicted clusters'),
    (true_cluster_labels_lung, f'true clusters'),
):
    plot_2D_dataset(principalComponents, labels, title='Dataset', caption=caption)

# Confusion matrix between true and predicted clusters
plot_confusion_matrix(true_cluster_labels_lung, cluster_assignments)

# Silhouette of the predicted clusters, computed on the concatenated features
print(silhouette_score(ds['early_integr'], cluster_assignments))

# Kidney

Load omics

In [None]:
# Load dataset
!gdown 'https://drive.google.com/uc?id=1i1do_UTzwXzPVIDDmYSFJEholK2Mp8g_' -O ./mRNA.txt
!gdown 'https://drive.google.com/uc?id=1liKeOBKjnbCi1CIjcOPA3Zxv2fRzCfa2' -O ./miRNA.txt
!gdown 'https://drive.google.com/uc?id=1qr9joY0bAVDLvjWsKF5xf3CaRolBu-mP' -O ./meth.txt
!gdown 'https://drive.google.com/uc?id=1R-U2iDgM4oEyzNRfBIA2kXMbKw_s0QtI' -O ./clusters.txt

ds = {}     # this will contain each omic
omics = ['mRNA','miRNA','meth']
for omic_name in omics:
    path = omic_name + ".txt"
    if omic_name not in ds:
        ds[omic_name] = pd.read_csv(path, sep='\t', index_col=0)

y = pd.read_csv('clusters.txt', sep='\t', index_col=0)
true_cluster_labels_kidney = y.values.reshape(y.shape[0])

Pre-process omics

In [None]:
# mRNA
# Keep only protein coding genes
# NB: the file idx_mRNA_prot_cod.txt, containing the protein coding genes of the
# human genome, has been obtained in the colab notebook 'Find protein coding genes'
if not os.path.exists("./idx_mRNA_prot_cod_kidney.txt"):
    !gdown 'https://drive.google.com/uc?id=1Pi4u8y_YAc2tmOWZYaeLn9wGdzu4cFC5' -O ./idx_mRNA_prot_cod_kidney.txt

idx_mRNA_prot_cod = pd.read_csv('idx_mRNA_prot_cod_kidney.txt')
idx_mRNA_prot_cod = idx_mRNA_prot_cod['idx'].values

ds['mRNA'] = ds['mRNA'].iloc[:, idx_mRNA_prot_cod]

# Delete genes with a zero expression value across all the samples
ds['mRNA'] = ds['mRNA'].loc[:, (ds['mRNA'] != 0).any(axis=0)]

# Normalize mRNA with MinMax Scaler
ds['mRNA_normalized'] = MinMaxScaler().fit_transform(ds['mRNA'].values)



# miRNA
# Delete sequences with a zero expression value across all the samples
ds['miRNA'] = ds['miRNA'].loc[:, (ds['miRNA'] != 0).any(axis=0)]

# Normalize with log2 normalization
ds['miRNA'] = np.log(ds['miRNA'] + 1) / np.log(2)

# Normalize with MinMaxScaler
ds['miRNA_normalized'] = MinMaxScaler().fit_transform(ds['miRNA'].values)



# meth
# Delete sequences with a zero expression value across all the samples
ds['meth'] = ds['meth'].loc[:, (ds['meth'] != 0).any(axis=0)]

# Normalize with MinMaxScaler
ds['meth_normalized'] = MinMaxScaler().fit_transform(ds['meth'].values)

Apply SNF

In [None]:
# Normalized matrix of every omic, in the order given by `omics`
per_omic_matrices = [ds[f"{omic}_normalized"] for omic in omics]

# Build one affinity network per omic and fuse them with SNF
affinity_networks = snf.make_affinity(per_omic_matrices, metric='euclidean', K=5, mu=0.5)
fused_network = snf.snf(affinity_networks, K=5)

# Best and second-best number of clusters suggested by snf.get_n_clusters
best, second = snf.get_n_clusters(fused_network)
print(best)
cluster_assignments = spectral_clustering(fused_network, n_clusters=best)

# Early integration: the normalized omics are concatenated along the feature axis
ds['early_integr'] = np.concatenate(per_omic_matrices, axis=1)

# 2D PCA of the concatenated dataset, used only for visualization
pca = PCA(2)
principalComponents = pca.fit_transform(ds['early_integr'])

# Plot the dataset twice: coloured by predicted clusters, then by true clusters
for labels, caption in (
    (cluster_assignments, f'{best} predicted clusters'),
    (true_cluster_labels_kidney, f'true clusters'),
):
    plot_2D_dataset(principalComponents, labels, title='Dataset', caption=caption)

# Confusion matrix between true and predicted clusters
plot_confusion_matrix(true_cluster_labels_kidney, cluster_assignments)

# Silhouette of the predicted clusters, computed on the concatenated features
print(silhouette_score(ds['early_integr'], cluster_assignments))