# Similarity Network Fusion downstream analysis

Author: Casper de Visser (casper.devisser@radboudumc.nl), Radboud University Medical Center

## Introduction

This notebook contains the downstream analysis of the fused sample similarity matrix that was constructed with Similarity Network Fusion (SNF; Wang et al., 2014). Spectral clustering is performed on the fused sample similarities, and the resulting clusters are compared with the behavioral data and phenotypic covariates.

In [5]:
import os
import sys

import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
import snf
import statsmodels
import statsmodels.api as sm
import statsmodels.stats.multitest
from sklearn.cluster import spectral_clustering
from sklearn.metrics import v_measure_score
from snf import metrics

## Input file paths 

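The following variables are used by the code cells below but are not assigned in this notebook; they must be defined (e.g. injected as notebook parameters) before the cells are run:

snf_matrix_path <br />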
phenotypes_covariates_path <br />
metabolomics_path <br />
mca_dims_path <br />
output_dir_plots

## Load numpy array of fused network

In [7]:
# Load numpy array of fused network
# The matrix is read from the comma-separated text file at `snf_matrix_path`
# (a variable that has to be defined before this cell is run).
fused_network = np.loadtxt(snf_matrix_path, delimiter = ",")

## Perform spectral clustering on fused network

In [None]:
# Spectral clustering

# Fixed seed: spectral_clustering assigns labels with a randomly initialised
# k-means step, so without a seed the labels can change between runs.
RANDOM_STATE = 42

# determine optimal number of clusters (estimated via an eigengap approach)
best, second = snf.get_n_clusters(fused_network)

# Perform spectral clustering on the fused network, once for the best and once
# for the second-best number of clusters
labels = spectral_clustering(fused_network, n_clusters=best, random_state=RANDOM_STATE)
labels_second = spectral_clustering(fused_network, n_clusters=second, random_state=RANDOM_STATE)

In [None]:
# Functions


def sort_fused_network(fused_network, labels_array):
    """Reorder a square similarity matrix so samples with the same label are adjacent.

    Rows and columns are permuted with the same ordering (ascending label;
    samples sharing a label keep their original relative order) and the
    diagonal of the result is set to 0.

    Parameters
    ----------
    fused_network : array-like, shape (n, n)
        Sample-by-sample similarity matrix. It is not modified.
    labels_array : array-like, length n
        One cluster label per sample, in the same order as the matrix rows.

    Returns
    -------
    numpy.ndarray, shape (n, n)
        A new array holding the reordered matrix with a zeroed diagonal.

    Raises
    ------
    ValueError
        If the matrix is not square or the number of labels does not match it.
    """
    matrix = np.asarray(fused_network)
    sample_labels = np.asarray(labels_array).ravel()

    if matrix.ndim != 2 or matrix.shape[0] != matrix.shape[1]:
        raise ValueError("fused_network must be a square 2-D matrix, got shape {}".format(matrix.shape))
    if sample_labels.shape[0] != matrix.shape[0]:
        raise ValueError(
            "number of labels ({}) does not match matrix size ({})".format(
                sample_labels.shape[0], matrix.shape[0]
            )
        )

    # Stable sort keeps the within-cluster order of the samples predictable
    order = np.argsort(sample_labels, kind="stable")

    # Fancy indexing returns a fresh, writable array, so zeroing the diagonal
    # neither touches the input nor depends on pandas handing out a writable buffer
    sorted_matrix = matrix[np.ix_(order, order)]
    np.fill_diagonal(sorted_matrix, 0)
    return sorted_matrix

def make_heatmap(array, n_clusters):
    """Show a heatmap of a (cluster-sorted) fused similarity matrix.

    Parameters
    ----------
    array : 2-D array-like
        Matrix to draw; passed directly to ``plt.imshow``.
    n_clusters : int
        Number of clusters, shown in the figure title.

    Returns
    -------
    module
        The ``plt`` module, after the figure has been shown.
    """
    # Create heatmap
    heatmap = plt.imshow(array, cmap='hot', interpolation='nearest')

    # Set axis names, title etc.
    plt.xlabel('samples')
    plt.ylabel('samples')
    cbar = plt.colorbar(heatmap)
    cbar.ax.set_ylabel('sample correlations', loc="top")
    # The cluster count is an integer: print it as "3", not "3.00"
    plt.suptitle(
        'Fused network: sample correlations\nNumber of clusters: {:d}'.format(int(round(n_clusters)))
    )
    plt.show()

    return plt
    
# Sort Fused networks according to labels found by spectral clustering
# (one sorted matrix per clustering: `labels` and `labels_second`)
sorted_fused_network_best = sort_fused_network(fused_network, labels)
sorted_fused_network_second = sort_fused_network(fused_network, labels_second)

In [None]:
# Heatmap for the best number of clusters; the trailing ';' keeps the returned
# plt module from being echoed as cell output
make_heatmap(sorted_fused_network_best, best);

In [None]:
# Heatmap for the second-best number of clusters; the trailing ';' keeps the
# returned plt module from being echoed as cell output
make_heatmap(sorted_fused_network_second, second);

In [None]:
# Evaluation metrics

# Determine V-measure score (requiring true labels)
#v_score_1 = v_measure_score(labels, true_labels)
#v_score_2 = v_measure_score(labels_second, true_labels)

# Zero the diagonal on a copy, so `fused_network` itself stays unchanged and
# re-running earlier cells gives the same result as the first run
fused_network_no_diag = fused_network.copy()
np.fill_diagonal(fused_network_no_diag, 0)

# Silhouette score
sil = metrics.silhouette_score(fused_network_no_diag, labels)
sil2 = metrics.silhouette_score(fused_network_no_diag, labels_second)

# Affinity Z-score
zscore = metrics.affinity_zscore(fused_network_no_diag, labels)
zscore2 = metrics.affinity_zscore(fused_network_no_diag, labels_second)

# Show the scores of both clusterings as the cell output
pd.DataFrame(
    {
        "n_clusters": [best, second],
        "silhouette score": [sil, sil2],
        "affinity z-score": [zscore, zscore2],
    },
    index=["best", "second"],
)

## Compare SNF clusters to phenotype data

In [None]:
# Find common IDs with -omics dataframes used for SNF

# Read the metabolomics table (first column is the index) and keep only the
# rows without missing values; its index is used below to select samples.
metabolomics = pd.read_csv(metabolomics_path, index_col=0).dropna() #metabolomics_values_mapped

In [None]:
# Phenotypes process out

phenotypes_data = pd.read_csv(phenotypes_covariates_path , index_col=0) #phenotype_covariates_data.csv

# Keep the samples that are also in the metabolomics table; .copy() makes this
# an independent frame so that adding columns below does not write to a slice
phenotypes_data = phenotypes_data[phenotypes_data.index.isin(metabolomics.index)].copy()

# Labels are attached by position, so there must be exactly one per retained sample
# NOTE(review): this also assumes the rows are in the same sample order as the
# fused network - confirm against the SNF step
assert len(phenotypes_data) == len(labels) == len(labels_second), (
    "number of samples ({}) does not match number of cluster labels ({}, {})".format(
        len(phenotypes_data), len(labels), len(labels_second)
    )
)

# Add cluster labels from SNF as strings "SNF_<label>"

phenotypes_data["fused_label"] = ["SNF_" + str(label) for label in labels]
phenotypes_data["fused_label_2nd"] = ["SNF_" + str(label) for label in labels_second]

## Generate mosaic plots, comparing SNF clusters with phenotypic covariates 

In [None]:
# Function to generate the mosaic plot

from statsmodels.graphics.mosaicplot import mosaic

# Palette of 8 hex colours used by make_mosaic, which picks one colour per
# distinct value of the plotted covariate.
# NOTE(review): the values match the Okabe-Ito colour-blind-safe palette - confirm this is intended
colors = ['#e69F00', '#56b4e9', '#009e73', '#f0e442', '#0072b2', '#d55e00', '#cc79a7', '#000000']

def make_mosaic(df, col1, col2, out_dir):
    """Draw, save and show a mosaic plot of a covariate against cluster labels.

    Each tile is annotated with the percentage of all rows of `df` that fall in
    that (covariate value, cluster) combination. Tiles get one colour per
    covariate value and an increasing opacity per cluster.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding both `col1` and `col2`.
    col1 : str
        Column with the phenotypic covariate; used as the x-axis label.
    col2 : str
        Column with the cluster labels; used as the y-axis label.
    out_dir : str
        Path of the image file the figure is written to (passed to ``savefig``).

    Returns
    -------
    module or None
        The ``plt`` module after plotting, or None when `col1` has fewer than
        two distinct values (a message is printed and nothing is drawn).
    """
    # Sort df on pheno value that is plotted against the clusters
    df = df.sort_values(by=[col1])

    number_of_pheno_values = len(df[col1].value_counts())
    number_of_clusters = len(df[col2].value_counts())

    if number_of_pheno_values < 2:
        print('No differences are observed in this phenotypic feature among the subjects')
        return None

    # Adjust plot size to value counts (never narrower than for 4 values)
    fig, ax = plt.subplots(
        figsize=(max(number_of_pheno_values, 4) * 2, number_of_clusters * 1.5)
    )

    # Opacity step between successive clusters. The original formula reaches 0
    # at 6 clusters and turns negative beyond that, which would give invisible
    # tiles or an invalid alpha; fall back to an even spread in that case.
    alpha_step = 0.6 - number_of_clusters / 10
    if alpha_step <= 0:
        alpha_step = 1.0 / (number_of_clusters + 1)

    pheno_values = df[col1].unique()
    cluster_values = df[col2].unique()
    total_samples = len(df.index)

    # Figure color palette and figure labels (percentages), keyed per tile
    props = {}
    labels_dict = {}
    for color_index, pheno_value in enumerate(pheno_values):
        # Cycle through the palette so more values than colours cannot overflow it
        tile_color = colors[color_index % len(colors)]
        for alpha_index, cluster_value in enumerate(cluster_values, start=1):
            key = (str(pheno_value), str(cluster_value))
            props[key] = {'color': tile_color, 'alpha': min(1.0, alpha_step * alpha_index)}

            samples = len(df[(df[col1] == pheno_value) & (df[col2] == cluster_value)])
            percentage = round(samples / total_samples * 100, 2)
            labels_dict[key] = str(percentage) + '%'

    # Generate plot; tiles without a prepared label are left blank
    mosaic(df,
           [col1, col2],
           ax=ax,
           axes_label=False,
           properties=props,
           labelizer=lambda k: labels_dict.get(k, ''))
    ax.set_xlabel(col1, fontsize=20)
    ax.set_ylabel(col2, fontsize=20)
    fig.savefig(out_dir)
    plt.show()
    plt.close(fig)

    return plt

In [None]:
# os.path.join builds a valid path whether or not output_dir_plots ends with a
# separator; the trailing ';' hides the returned plt module from the cell output
make_mosaic(phenotypes_data, 'Age', 'fused_label', os.path.join(str(output_dir_plots), 'Age_l1.png'));

In [None]:
# os.path.join builds a valid path whether or not output_dir_plots ends with a
# separator; the trailing ';' hides the returned plt module from the cell output
make_mosaic(phenotypes_data, 'Age', 'fused_label_2nd', os.path.join(str(output_dir_plots), 'Age_l2.png'));

In [None]:
# os.path.join builds a valid path whether or not output_dir_plots ends with a
# separator; the trailing ';' hides the returned plt module from the cell output
make_mosaic(phenotypes_data, 'Sex', 'fused_label', os.path.join(str(output_dir_plots), 'sex_l1.png'));

In [None]:
# os.path.join builds a valid path whether or not output_dir_plots ends with a
# separator; the trailing ';' hides the returned plt module from the cell output
make_mosaic(phenotypes_data, 'Sex', 'fused_label_2nd', os.path.join(str(output_dir_plots), 'sex_l2.png'));

In [None]:
# os.path.join builds a valid path whether or not output_dir_plots ends with a
# separator; the trailing ';' hides the returned plt module from the cell output
make_mosaic(phenotypes_data, 'Sick', 'fused_label', os.path.join(str(output_dir_plots), 'Sick_l1.png'));

In [None]:
# os.path.join builds a valid path whether or not output_dir_plots ends with a
# separator; the trailing ';' hides the returned plt module from the cell output
make_mosaic(phenotypes_data, 'Sick', 'fused_label_2nd', os.path.join(str(output_dir_plots), 'Sick_l2.png'));

In [None]:
# os.path.join builds a valid path whether or not output_dir_plots ends with a
# separator; the trailing ';' hides the returned plt module from the cell output
make_mosaic(phenotypes_data, 'Menstruation', 'fused_label', os.path.join(str(output_dir_plots), 'Menstruation_l.png'));

In [None]:
# os.path.join builds a valid path whether or not output_dir_plots ends with a
# separator; the trailing ';' hides the returned plt module from the cell output
make_mosaic(phenotypes_data, 'Menstruation', 'fused_label_2nd', os.path.join(str(output_dir_plots), 'Menstruation_l2.png'));

In [None]:
# os.path.join builds a valid path whether or not output_dir_plots ends with a
# separator; the trailing ';' hides the returned plt module from the cell output
make_mosaic(phenotypes_data, 'Vitamines', 'fused_label', os.path.join(str(output_dir_plots), 'Vitamines_l.png'));

In [None]:
# os.path.join builds a valid path whether or not output_dir_plots ends with a
# separator; the trailing ';' hides the returned plt module from the cell output
make_mosaic(phenotypes_data, 'Vitamines', 'fused_label_2nd', os.path.join(str(output_dir_plots), 'Vitamines_l2.png'));

# Compare clusters on MCA dimensions

In [None]:
# Load in MCA dimensions

mca_coordinates = pd.read_csv(mca_dims_path, index_col=0)

# Restrict both tables to the samples they have in common. The .copy() makes
# mca_coordinates an independent frame, so the columns added below are not
# written to a slice of the frame that was read from disk.
mca_coordinates = mca_coordinates[mca_coordinates.index.isin(phenotypes_data.index)].copy()
phenotypes_data = phenotypes_data[phenotypes_data.index.isin(mca_coordinates.index)]


# Add cluster labels from SNF (matched on the sample index) and store them as
# strings. The statistics functions below treat the last two columns as the
# label columns, so these must stay the last columns of the frame.

for label_column in ("fused_label", "fused_label_2nd"):
    mca_coordinates[label_column] = phenotypes_data[label_column]
    mca_coordinates[label_column] = mca_coordinates[label_column].astype(str)

mca_coordinates.columns = mca_coordinates.columns.str.replace(' ', '_')

## Functions used for statistics 

In [None]:
def make_significant_bold(x):
    """Return the CSS font-weight declaration for one table cell.

    Values below 0.05 get ``'font-weight: bold'``; every other value gets a
    declaration with an empty weight.
    """
    if x < 0.05:
        weight = 'bold'
    else:
        weight = ''
    return 'font-weight: {}'.format(weight)


def make_pvalue_table(p_value_list):
    """Turn a list of test results into a table with FDR-corrected p-values.

    Parameters
    ----------
    p_value_list : list of [name, test statistic, p-value]
        One entry per test that was carried out.

    Returns
    -------
    pandas.DataFrame
        Indexed by test name (as string), with the float columns
        'test statistic', 'p-value' and 'FDR corrected p-value'. The corrected
        values are the second element returned by
        ``statsmodels.stats.multitest.fdrcorrection`` with its default settings.
    """
    # Build the numeric columns directly, rather than via a mixed str/float
    # numpy array that turns every number into text first
    df = pd.DataFrame(
        [row[1:] for row in p_value_list],
        index=[str(row[0]) for row in p_value_list],
        columns=['test statistic', 'p-value'],
        dtype=float,
    )

    corrected_p_values = statsmodels.stats.multitest.fdrcorrection(df['p-value'].to_numpy())
    df['FDR corrected p-value'] = corrected_p_values[1].tolist()

    # No styling here: a Styler only has an effect when it is displayed, and
    # this function returns the plain DataFrame
    return df


def man_whitney(group_list):
    """Run a Mann-Whitney U test per MCA dimension between two groups.

    Parameters
    ----------
    group_list : sequence of pandas.DataFrame
        The first two entries are compared. Every column of the global
        ``mca_coordinates`` except its last two is tested.

    Returns
    -------
    pandas.DataFrame
        Table produced by ``make_pvalue_table``, one row per tested column.
    """
    tested_columns = mca_coordinates.columns[0:-2]

    # One [column name, U statistic, p-value] record per MCA dimension
    rows = []
    for dimension in tested_columns:
        u_statistic, p_value = stats.mannwhitneyu(
            group_list[0][dimension], group_list[1][dimension]
        )
        rows.append([dimension, u_statistic, p_value])

    # Make nicer looking table for p-values
    return make_pvalue_table(rows)


def kruskal_wallis(groups_list):
    """Run a Kruskal-Wallis test per MCA dimension across several groups.

    Parameters
    ----------
    groups_list : sequence of pandas.DataFrame
        One frame per group. Every column of the global ``mca_coordinates``
        except its last two is tested across all groups.

    Returns
    -------
    pandas.DataFrame
        Table produced by ``make_pvalue_table``, one row per tested column,
        labelled with the column name (the same labelling ``man_whitney`` uses).
    """
    p_value_list = []

    for dimension in mca_coordinates.columns[0:-2]:
        # Values of this dimension, one sample per group
        samples = [group[dimension] for group in groups_list]
        h_statistic, p_value = stats.kruskal(*samples)
        # Label the row with the real column name rather than a running
        # counter, so the label cannot drift from the data it describes
        p_value_list.append([dimension, h_statistic, p_value])

    # Make nice looking table
    return make_pvalue_table(p_value_list)

## Shapiro test for MCA dimensions

In [None]:
# Shapiro tests show that MCA dimensions are not normally distributed
p_value_list = []

# One Shapiro-Wilk test per MCA dimension (all columns except the two label columns)
for i in mca_coordinates.columns[0:-2]:
    w_statistic, p_value = stats.shapiro(mca_coordinates[i])
    p_value_list.append([i, w_statistic, p_value])

df = make_pvalue_table(p_value_list)

# A Styler only has an effect when it is the displayed object, so it is the
# last expression of the cell. Only the p-value columns are highlighted.
# Styler.map replaces the deprecated Styler.applymap in newer pandas versions.
df_styler = df.style
highlight_cells = df_styler.map if hasattr(df_styler, 'map') else df_styler.applymap
highlight_cells(make_significant_bold, subset=['p-value', 'FDR corrected p-value'])

In [None]:
# Render a section heading that contains the best number of clusters (`best`)
from IPython.display import display, Markdown
display(Markdown("## Comparing " + str(best) + " clusters on MCA dimensions"))

In [None]:
# One table of MCA coordinates per SNF cluster (labels "SNF_0" ... "SNF_<best-1>")
snf_groups_best = [
    mca_coordinates[mca_coordinates['fused_label'] == 'SNF_' + str(i)]
    for i in range(best)
]

# More than two clusters: Kruskal-Wallis; otherwise Mann-Whitney U
if len(snf_groups_best) > 2:
    results = kruskal_wallis(snf_groups_best)
else:
    results = man_whitney(snf_groups_best)

# A Styler only has an effect when it is the displayed object, so it is the
# last expression of the cell. Only the p-value columns are highlighted.
# Styler.map replaces the deprecated Styler.applymap in newer pandas versions.
results_styler = results.style
highlight_cells = results_styler.map if hasattr(results_styler, 'map') else results_styler.applymap
highlight_cells(make_significant_bold, subset=['p-value', 'FDR corrected p-value'])

In [None]:
# Render a section heading that contains the second-best number of clusters (`second`);
# `display` and `Markdown` come from the IPython.display import in an earlier cell
display(Markdown("## Comparing " + str(second) + " clusters on MCA dimensions"))

In [None]:
# One table of MCA coordinates per SNF cluster (labels "SNF_0" ... "SNF_<second-1>")
snf_groups_second = [
    mca_coordinates[mca_coordinates['fused_label_2nd'] == 'SNF_' + str(i)]
    for i in range(second)
]

# More than two clusters: Kruskal-Wallis; otherwise Mann-Whitney U
if len(snf_groups_second) > 2:
    results = kruskal_wallis(snf_groups_second)
else:
    results = man_whitney(snf_groups_second)

# A Styler only has an effect when it is the displayed object, so it is the
# last expression of the cell. Only the p-value columns are highlighted.
# Styler.map replaces the deprecated Styler.applymap in newer pandas versions.
results_styler = results.style
highlight_cells = results_styler.map if hasattr(results_styler, 'map') else results_styler.applymap
highlight_cells(make_significant_bold, subset=['p-value', 'FDR corrected p-value'])