In [2]:
import sys
sys.path.append("..")

import dataInterpreter as dt
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering 
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform
from sklearn.metrics import adjusted_rand_score

In [None]:
# Load the NHBE expression table (healthy vs. SARS-CoV-2, series 1) together
# with the column names that belong to each condition.
data = dt.get_data('NHBE', 'healthy', 'sars-cov2', series=(1,))
columns_healthy = dt.get_columns('NHBE', 'healthy', series=(1,))
columns_sars_cov2 = dt.get_columns('NHBE', 'sars-cov2', series=(1,))

# Run the helper's 'mannwhitneyu' test between the two column groups.
# NOTE(review): the result is presumably restricted to significant rows and
# carries a 'p-value' column (dropped in the next cell) -- confirm in
# dataInterpreter.get_p_values.
filtered_data = dt.get_p_values(
    'mannwhitneyu',
    data,
    columns_healthy,
    columns_sars_cov2,
)
filtered_data

In [None]:
# Remove the helper 'p-value' column so only expression values are clustered.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it no longer raises KeyError, and the frame
# object returned by dt.get_p_values is not mutated behind the reader's back.
filtered_data = filtered_data.drop(columns=['p-value'], errors='ignore')
dt.plot_dendrogram(filtered_data)

In [None]:
# Cluster labels per method, keyed by the name of the dissimilarity used.
labels = {}

# Ward linkage is only defined for Euclidean distances, which is also the
# estimator's default, so no metric argument is passed. The former
# `affinity=` keyword was deprecated in scikit-learn 1.2 and removed in 1.4;
# omitting it keeps the cell working on both old and new releases.
models = AgglomerativeClustering(n_clusters=10, linkage='ward')
labels['distance'] = models.fit_predict(filtered_data)

In [None]:
# Display the per-row cluster labels produced by the Euclidean/Ward model.
labels['distance']

In [None]:
# Row-to-row dissimilarity from Pearson correlation: 1 - |r|, so strongly
# correlated and strongly anti-correlated rows both count as "close".
# NOTE(review): SciPy documents 'ward' as correctly defined only for Euclidean
# distances; confirm that using it on a correlation dissimilarity is intended.
dissimilarity = 1 - filtered_data.T.corr(method='pearson').abs()
hierarchy = linkage(squareform(dissimilarity), method='ward')

# Cut the tree into at most 10 flat clusters; fcluster ids start at 1, so
# shift them to be 0-based like the labels stored under 'distance'.
labels['pearson'] = fcluster(hierarchy, t=10, criterion='maxclust') - 1

In [None]:
# Display the per-row cluster labels from the Pearson-based hierarchy.
labels['pearson']

In [None]:
# Same procedure as the Pearson cell, but with rank (Spearman) correlation:
# dissimilarity is 1 - |rho| between every pair of rows.
# NOTE(review): SciPy documents 'ward' as correctly defined only for Euclidean
# distances; confirm that using it on a correlation dissimilarity is intended.
dissimilarity = 1 - filtered_data.T.corr(method='spearman').abs()
hierarchy = linkage(squareform(dissimilarity), method='ward')

# At most 10 flat clusters, re-based from fcluster's 1-based ids to 0-based.
labels['spearman'] = fcluster(hierarchy, t=10, criterion='maxclust') - 1

In [None]:
# Display the per-row cluster labels from the Spearman-based hierarchy.
labels['spearman']

In [None]:
from itertools import groupby

# clusters[method] is a list of clusters, each a list of row identifiers from
# filtered_data.index, ordered by ascending cluster label.
clusters = {}

for key1, label1 in labels.items():
    # Group the row ids by the label assigned by *this* method. The previous
    # version always grouped by labels['spearman'], so all three entries of
    # `clusters` silently held the Spearman partition.
    labelled_rows = sorted(zip(filtered_data.index, label1), key=lambda pair: pair[1])
    clusters[key1] = [
        [row_id for row_id, _ in members]
        for _, members in groupby(labelled_rows, key=lambda pair: pair[1])
    ]

    # Pairwise agreement between partitions (adjusted Rand index). The score
    # is symmetric, so each pair is printed once in each direction.
    for key2, label2 in labels.items():
        if key1 != key2:
            print(key1, ' vs ', key2, ': ', adjusted_rand_score(label1, label2))

In [None]:
# Print the members of every Euclidean/Ward cluster, one identifier per line.
# enumerate replaces the hand-maintained counter.
for i, c in enumerate(clusters['distance']):
    print("\n\nCluster", i, ":\n")
    for x in c:
        print(x)

In [None]:
# Print the members of every Pearson-based cluster, one identifier per line.
# enumerate replaces the hand-maintained counter.
for i, c in enumerate(clusters['pearson']):
    print("\n\nCluster", i, ":\n")
    for x in c:
        print(x)

In [None]:
# Print the members of every Spearman-based cluster, one identifier per line.
# enumerate replaces the hand-maintained counter.
for i, c in enumerate(clusters['spearman']):
    print("\n\nCluster", i, ":\n")
    for x in c:
        print(x)

In [None]:
import enrichmentAnalysis as ea

# One list of enrichment results per clustering method; entry k of a list
# holds the 'GO_Biological_Process_2021' result for that method's cluster k.
results_enrichment = {'distance': [], 'pearson': [], 'spearman': []}

for cluster_type, cluster_list in clusters.items():
    for cluster in cluster_list:
        enrichment = ea.getEnrichment(list(cluster), 'GO_Biological_Process_2021')
        results_enrichment[cluster_type].append(enrichment['GO_Biological_Process_2021'])

results_enrichment

In [None]:
import json

# Persist the enrichment results so later cells can reload them without
# recomputing. The previous cell body contained only a comment, which is an
# IndentationError -- and an empty `with open(..., 'w')` would have truncated
# the very file the next cell reads. Skip this cell if you only want to reuse
# an existing results_NHBE.json.
with open('results_NHBE.json', 'w') as file:
    json.dump(results_enrichment, file)  # use `json.load` to do the reverse

In [3]:
import json

# Reload the cached enrichment results from disk (the file written by the
# save cell), replacing whatever `results_enrichment` currently holds.
with open('results_NHBE.json') as file:
    results_enrichment = json.loads(file.read())

In [4]:
# Reload the expression data and the per-condition column names so this cell
# can run straight after the cached-results cell.
data = dt.get_data('NHBE', 'healthy', 'sars-cov2', series=(1,))
columns_healthy = dt.get_columns('NHBE', 'healthy', series=(1,))
columns_sars_cov2 = dt.get_columns('NHBE', 'sars-cov2', series=(1,))

# Column-wise accumulators for the summary table; `index` collects term[1]
# of every enriched term and becomes the DataFrame index.
dataset = {'p-value': [], 'Score': [], 'Cluster': [], 'Value': []}
index = []

# NOTE(review): term positions 1/4/5/6 are used as name / score / gene list /
# p-value respectively -- confirm against the enrichmentAnalysis output layout.
for cluster_num, cluster_terms in enumerate(results_enrichment['distance']):
    for term in cluster_terms:
        index.append(term[1])
        dataset['p-value'].append(term[6])
        dataset['Score'].append(term[4])
        dataset['Cluster'].append(cluster_num)

        # Per-gene mean of (healthy - infected): a positive value means the
        # gene is lower in the infected samples (down), negative means up.
        genes = term[5]
        healthy_values = data.loc[genes, columns_healthy].values
        infected_values = data.loc[genes, columns_sars_cov2].values
        avg_sub = np.mean(healthy_values - infected_values, axis=1)

        # Genes whose mean difference is exactly zero are counted in neither.
        downs = int(np.count_nonzero(avg_sub > 0))
        ups = int(np.count_nonzero(avg_sub < 0))

        dataset['Value'].append('%d up, %d down' % (ups, downs))

enrichment_dataset = pd.DataFrame(dataset, index=index)

In [9]:
# Number of enriched terms whose p-value is below 0.001.
enrichment_dataset.loc[enrichment_dataset['p-value'] < 0.001].shape[0]

20

In [None]:
# Show every row of the table below instead of pandas' truncated view.
pd.set_option("display.max_rows", None)

# Terms with p-value < 0.01, highest score first.
selection = (
    enrichment_dataset
    .loc[enrichment_dataset['p-value'] < 0.01]
    .sort_values('Score', ascending=False)
)

# Format the numeric columns as strings for presentation (done after sorting,
# since the formatted values no longer sort numerically).
selection['p-value'] = selection['p-value'].map('{:.2E}'.format)
selection['Score'] = selection['Score'].map('{:.2f}'.format)

#selection.to_csv('NHBE_table_with_updown.csv')
selection

In [None]:
# Enrichment for all rows of filtered_data at once (no clustering), as a
# baseline to compare against the per-cluster results.
results = ea.getEnrichment(list(filtered_data.index), 'GO_Biological_Process_2021')['GO_Biological_Process_2021']

# NOTE(review): term positions 1/4/6 are used as name / score / p-value --
# confirm against the enrichmentAnalysis output layout.
index_test = [term[1] for term in results]
dataset_test = {
    'p-value': [term[6] for term in results],
    'c-score': [term[4] for term in results],
}
enrichment_dataset_test = pd.DataFrame(dataset_test, index=index_test)

In [None]:
# Display the raw enrichment result obtained for all rows of filtered_data.
results

In [None]:
# Show every row of the table below instead of pandas' truncated view.
pd.set_option("display.max_rows", None)

# Top 25 terms with p-value < 0.01, ranked by descending c-score.
selection_test = (
    enrichment_dataset_test
    .loc[enrichment_dataset_test['p-value'] < 0.01]
    .sort_values('c-score', ascending=False)
    .head(25)
)

# Format the numeric columns as strings for presentation (after sorting).
selection_test['p-value'] = selection_test['p-value'].map('{:.2E}'.format)
selection_test['c-score'] = selection_test['c-score'].map('{:.2f}'.format)

#selection_test.to_csv('comparison_joined_table.csv')
selection_test

## Clustering on columns

In [None]:
# Dendrogram of the columns of filtered_data: transposing makes each column
# one observation for the Ward hierarchy.
fig = plt.figure(figsize=(25, 10))
dendrogram = sch.dendrogram(
    sch.linkage(filtered_data.T, method="ward"),
    # Label the leaves with the column names rather than positional indices.
    labels=list(filtered_data.columns),
)
# The original cell called plt.title(title) with an undefined `title`
# (NameError) and plt.xlabel() without its required label (TypeError).
plt.title('Hierarchical clustering of columns (Ward linkage)')
plt.xlabel('Columns')
plt.ylabel('Distances')
plt.show()

In [None]:
# Cluster labels for the columns of filtered_data, keyed by method name.
labels_columns = {}

# Two clusters over the transposed frame (one observation per column). Ward
# linkage implies Euclidean distances, which is the estimator's default; the
# `affinity=` keyword was deprecated in scikit-learn 1.2 and removed in 1.4,
# so it is omitted to stay compatible with old and new releases.
models = AgglomerativeClustering(n_clusters=2, linkage='ward')
labels_columns['distance'] = models.fit_predict(filtered_data.T)

labels_columns['distance']