In [None]:
import sys
sys.path.append("..")

import dataInterpreter as dt
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering 
from scipy.cluster.hierarchy import linkage, fcluster
from scipy.spatial.distance import squareform
from sklearn.metrics import adjusted_rand_score

In [None]:
# Experiment selection shared by the three loader calls below.
CELL_LINE = 'A549'
SERIES = (2, 5)

data = dt.get_data(CELL_LINE, 'healthy', 'sars-cov2', series=SERIES)
columns_healthy = dt.get_columns(CELL_LINE, 'healthy', series=SERIES)
columns_sars_cov2 = dt.get_columns(CELL_LINE, 'sars-cov2', series=SERIES)

# Mann-Whitney U comparison of the healthy vs. SARS-CoV-2 columns, called with
# limit=0.01 (presumably a p-value cut-off - confirm in dataInterpreter).
filtered_data = dt.get_p_values(
    'mannwhitneyu',
    data,
    columns_healthy,
    columns_sars_cov2,
    limit=0.01,
)
filtered_data

In [None]:
# Remove the helper 'p-value' column so only expression columns are clustered.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this cell
# re-runnable: a second execution no longer raises KeyError for the missing column.
filtered_data = filtered_data.drop(columns=['p-value'], errors='ignore')
dt.plot_dendrogram(filtered_data)

In [None]:
# Cluster labels per method, keyed by the method name.
labels = {}

# Ward linkage only supports Euclidean distances, which is also the default, so the
# distance keyword is omitted on purpose: `affinity=` was deprecated in scikit-learn 1.2
# and removed in 1.4 (renamed to `metric=`), and leaving it out works on every version.
models = AgglomerativeClustering(n_clusters=3, linkage='ward')
labels['distance'] = models.fit_predict(filtered_data)

In [None]:
# Display the cluster label assigned to each row of filtered_data by the
# AgglomerativeClustering model (n_clusters=3).
labels['distance']

In [None]:
# Correlation-based dissimilarity between rows: 1 - |Pearson r|.
# clip() guards against tiny negative values when rounding pushes |r| just above 1.
dissimilarity = (1 - filtered_data.T.corr().abs()).clip(lower=0)

# squareform() validates that the diagonal is *exactly* zero; floating-point rounding in
# corr() can leave residues of ~1e-16 there and trigger a ValueError. Only the upper
# triangle is used for the condensed form, so the validation is skipped.
# NOTE(review): SciPy documents Ward linkage as correctly defined only for Euclidean
# distances - confirm that using it on a correlation distance is intended.
hierarchy = linkage(squareform(dissimilarity.values, checks=False), method='ward')

# fcluster numbers clusters from 1; shift to 0-based to match the other label sets.
labels['pearson'] = fcluster(hierarchy, 3, criterion='maxclust') - 1

In [None]:
# Display the 0-based cluster labels obtained from the Pearson-based hierarchy.
labels['pearson']

In [None]:
# Rank-correlation dissimilarity between rows: 1 - |Spearman rho|.
# clip() guards against tiny negative values when rounding pushes |rho| just above 1.
dissimilarity = (1 - filtered_data.T.corr(method='spearman').abs()).clip(lower=0)

# squareform() validates that the diagonal is *exactly* zero; floating-point rounding in
# corr() can leave residues of ~1e-16 there and trigger a ValueError. Only the upper
# triangle is used for the condensed form, so the validation is skipped.
# NOTE(review): SciPy documents Ward linkage as correctly defined only for Euclidean
# distances - confirm that using it on a correlation distance is intended.
hierarchy = linkage(squareform(dissimilarity.values, checks=False), method='ward')

# fcluster numbers clusters from 1; shift to 0-based to match the other label sets.
labels['spearman'] = fcluster(hierarchy, 3, criterion='maxclust') - 1

In [None]:
# Display the 0-based cluster labels obtained from the Spearman-based hierarchy.
labels['spearman']

In [None]:
from itertools import groupby

# For every clustering method: the row identifiers of filtered_data grouped by cluster
# label (one inner list per cluster, in ascending label order).
clusters = {}

for key1, label1 in labels.items():
    # Group by the labels of *this* method. The previous version always grouped by
    # labels['spearman'], so all three entries of `clusters` were identical.
    assignments = sorted(zip(filtered_data.index, label1), key=lambda pair: pair[1])
    clusters[key1] = [
        [row_id for row_id, _ in members]
        for _, members in groupby(assignments, key=lambda pair: pair[1])
    ]

    # Pairwise agreement between this method and every other one.
    for key2, label2 in labels.items():
        if key1 != key2:
            print(key1, ' vs ', key2, ': ', adjusted_rand_score(label1, label2))

In [None]:
# List the members of every group stored under clusters['distance'],
# each group introduced by a numbered heading.
for cluster_id, members in enumerate(clusters['distance']):
    print("\n\nCluster", cluster_id, ":\n")
    for member in members:
        print(member)

In [None]:
# List the members of every group stored under clusters['pearson'],
# each group introduced by a numbered heading.
for cluster_id, members in enumerate(clusters['pearson']):
    print("\n\nCluster", cluster_id, ":\n")
    for member in members:
        print(member)

In [None]:
# List the members of every group stored under clusters['spearman'],
# each group introduced by a numbered heading.
for cluster_id, members in enumerate(clusters['spearman']):
    print("\n\nCluster", cluster_id, ":\n")
    for member in members:
        print(member)

In [None]:
import enrichmentAnalysis as ea

# Gene-set library passed to ea.getEnrichment and used as the key into its result.
gene_set_library = 'KEGG_2021_Human'

# One list per clustering method; each element is the enrichment result of one cluster.
results_enrichment = {'distance': [], 'pearson': [], 'spearman': []}

for cluster_type, cluster_members in clusters.items():
    for cluster in cluster_members:
        enrichment = ea.getEnrichment(list(cluster), gene_set_library)
        results_enrichment[cluster_type].append(enrichment[gene_set_library])

results_enrichment

In [None]:
import json

# Disabled on purpose: uncomment the two lines below to write `results_enrichment`
# to disk as JSON so it can be reloaded later.
# NOTE(review): this would write 'results_A549_KEGG.json', while the following cell
# reads 'results_A549.json' - confirm which file name is intended.
#with open('results_A549_KEGG.json', 'w') as file:
     #file.write(json.dumps(results_enrichment)) # use `json.loads` to do the reverse

In [None]:
import json

# Read enrichment results back from disk; this replaces whatever
# `results_enrichment` held before the cell ran.
with open('results_A549.json') as handle:
    results_enrichment = json.loads(handle.read())

In [None]:
# Build one table row per enriched term of the 'distance' clustering, then report how
# many terms pass p < 0.001. The count used to live in its own cell *above* this code,
# where `enrichment_dataset` did not exist yet on a fresh kernel (Restart & Run All failed).

data = dt.get_data('A549', 'healthy', 'sars-cov2', series = (2, 5))
# The gene names stored in the enrichment results are looked up in upper case.
data.index = data.index.str.upper()

columns_healthy = dt.get_columns('A549', 'healthy', series = (2, 5))
columns_sars_cov2 = dt.get_columns('A549', 'sars-cov2', series = (2, 5))

index = []
records = []

for cluster_num, terms in enumerate(results_enrichment['distance']):
    for term in terms:
        # Positional fields of a term: [1] term name, [4] score, [5] gene list,
        # [6] p-value. NOTE(review): presumably Enrichr's row layout - confirm
        # against enrichmentAnalysis.getEnrichment.
        genes = term[5]

        # Position-wise healthy minus infected expression, averaged per gene.
        # Requires both column groups to have the same number of columns.
        sub = data.loc[genes, columns_healthy].values - data.loc[genes, columns_sars_cov2].values
        avg_sub = np.mean(sub, axis = 1)

        # Positive average => lower in infected samples ("down");
        # negative => higher ("up"). Exact ties are counted as neither.
        downs = int((avg_sub > 0).sum())
        ups = int((avg_sub < 0).sum())

        index.append(term[1])
        records.append({
            'p-value': term[6],
            'Score': term[4],
            'Cluster': cluster_num,
            'Value': '%d up, %d down' % (ups, downs),
        })

# Explicit column list keeps the expected columns even when no term was found.
enrichment_dataset = pd.DataFrame(records, index = index, columns = ['p-value', 'Score', 'Cluster', 'Value'])

# Number of terms with p-value below 0.001.
len(enrichment_dataset[enrichment_dataset['p-value'] < 0.001].index)

In [None]:
# Show every row when the table is displayed.
pd.set_option("display.max_rows", None)

# Keep terms with p-value below 0.01, highest score first.
is_significant = enrichment_dataset['p-value'] < 0.01
selection = enrichment_dataset.loc[is_significant].sort_values('Score', ascending=False)

# Turn the numeric columns into display strings (scientific notation / two decimals).
selection['p-value'] = selection['p-value'].map('{:.2E}'.format)
selection['Score'] = selection['Score'].map('{:.2f}'.format)

#selection.to_csv('A549_table_with_updown.csv')
selection

In [None]:
import requests

# KEGG REST "find" endpoint restricted to the pathway database.
KEGG_FIND_URL = 'https://rest.kegg.jp/find/pathway/%s'

# Row labels for the final table: "<term> (\href{<KEGG entry URL>}{<pathway id>})".
new_index = []

for item in selection.index:
    # A timeout keeps a stalled connection from hanging the notebook, and HTTP
    # errors are raised instead of being parsed as if they were results.
    response = requests.get(KEGG_FIND_URL % item, timeout=30)
    response.raise_for_status()

    # Only the first result line is used; its first tab-separated field is expected
    # to look like "<prefix>:<pathway id>".
    first_hit = response.text.split('\n')[0].split('\t')[0]
    if ':' not in first_hit:
        # No match: keep the plain term rather than failing with an IndexError.
        print('No KEGG pathway found for %r; leaving it unlinked' % item)
        new_index.append(item)
        continue

    pathway = first_hit.split(':')[1]
    # The backslash is escaped explicitly: '\h' is an invalid escape sequence that
    # newer Python versions warn about.
    new_index.append('%s (\\href{https://www.kegg.jp/entry/%s}{%s})' % (item, pathway, pathway))

new_index

In [None]:
# Swap the row labels for the entries of `new_index`, write the table to CSV,
# then display it.
output_csv = 'A549_table_KEGG.csv'
selection.index = new_index
selection.to_csv(output_csv)
selection

## Clustering on columns

In [None]:
# Leaf labels, three per group.
# NOTE(review): this hard-codes the column order of filtered_data (healthy S2,
# infected S2, healthy S5, infected S5) - confirm against filtered_data.columns.
sample_labels = (
    ['Healthy S2'] * 3
    + ['SARS-CoV-2 S2'] * 3
    + ['Healthy S5'] * 3
    + ['SARS-CoV-2 S5'] * 3
)

fig = plt.figure(figsize=(25, 10))
# Ward hierarchy over the columns of filtered_data (hence the transpose).
column_linkage = sch.linkage(filtered_data.T, method="ward")
dendrogram = sch.dendrogram(column_linkage, labels=sample_labels)
plt.show()

In [None]:
# Display the column names of filtered_data (the items clustered in this section).
filtered_data.columns

In [None]:
# Cluster labels for the columns of filtered_data, keyed by method name.
labels_columns = {}

# Ward linkage only supports Euclidean distances, which is also the default, so the
# distance keyword is omitted on purpose: `affinity=` was deprecated in scikit-learn 1.2
# and removed in 1.4 (renamed to `metric=`), and leaving it out works on every version.
models = AgglomerativeClustering(n_clusters=2, linkage='ward')
labels_columns['distance'] = models.fit_predict(filtered_data.T)

# Reference partition: 0 for columns listed in columns_healthy, 1 for all others.
labels_true = [0 if col in columns_healthy else 1 for col in filtered_data.columns]

print("True: ", labels_true)
print("Predicted: ", labels_columns['distance'])
# The adjusted Rand index ignores label permutations, so 0/1 need not match literally.
print(adjusted_rand_score(labels_true, labels_columns['distance']))