In [None]:
import os
import pandas as pd
import numpy as np
import json
import re

In [None]:
### JSON to dict
# Read the KEGG hierarchy file as text; `json_str` is parsed with json.loads in the next cell.
# JSON interchange files are UTF-8, so decode explicitly instead of relying on the locale.
with open('/mnt/data1/wangj/GeneSets/hsa00001.json', 'r', encoding='utf-8') as file:
    json_str = file.read()
# Show only the beginning of the document: printing the whole file floods the cell output.
print(json_str[:1000])

In [None]:
# Parse the raw JSON text into nested Python containers; the GMT export cell below
# walks the result through its nested 'children' lists.
data_dict = json.loads(json_str)

In [None]:
### KEGG hierarchy -> GMT gene-set file
# Each written line is: KEGG_<pathway name> <TAB> <hsa id> <TAB> gene <TAB> gene ...
# The hierarchy is walked three levels deep (category -> sub-category -> pathway);
# the 'children' of a pathway entry are treated as its genes.

N_TOP_LEVEL = 8  # only the first 8 top-level categories are exported
KEGG_ID_RE = re.compile(r'(hsa\d+)')
KEGG_NAME_RE = re.compile(r'\d+ (.*) \[')

written = 0
skipped = []  # names of entries that could not be turned into a gene set
with open('/mnt/data1/wangj/GeneSets/KEGG.gmt','w') as file:
    for category in data_dict['children'][:N_TOP_LEVEL]:
        for subcategory in category['children']:
            for pathway in subcategory['children']:
                name = pathway['name']
                try:
                    KEGG_ID = KEGG_ID_RE.findall(name)[0]
                    KEGG_Name = 'KEGG_' + KEGG_NAME_RE.findall(name)[0]
                    # second space-separated token of the gene entry, without the ';'
                    symbols = [gene['name'].split(' ')[1].replace(';','')
                               for gene in pathway['children']]
                except (IndexError, KeyError):
                    # No hsa id / name pattern, no 'children', or a gene entry without a
                    # second token. Skip the entry as a whole: the line is assembled
                    # before anything is written, so a failure can no longer leave a
                    # truncated line (without newline) in the output file.
                    skipped.append(name)
                    continue
                # join() puts tabs only between fields, so lines carry no trailing tab
                file.write('\t'.join([KEGG_Name, KEGG_ID] + symbols) + '\n')
                written += 1

print(written, 'gene sets written;', len(skipped), 'entries skipped')

In [None]:
### network plot:KEGG enrichment
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import gseapy as gp
import os

# Work from the dataset folder so that relative paths used afterwards (e.g. the
# 'Endo_sense_marker.csv' read in the next cell) resolve against it.
# NOTE: os.chdir changes the working directory of the whole kernel process.
os.chdir('/mnt/data1/wangj/AgingScore/GSE163530_COVID-19/GSE171668_scnRNA/')

In [None]:
# Marker table: the row index supplies the gene list submitted for enrichment.
degs_up = pd.read_csv('Endo_sense_marker.csv', index_col=0)
marker_genes = degs_up.index.tolist()

# gseapy Enrichr analysis against the KEGG_2021_Human library; outdir=None as before.
enr_KEGG = gp.enrichr(marker_genes, gene_sets='KEGG_2021_Human', outdir=None)

# Keep only the terms whose adjusted p-value is strictly below 1.
below_one = enr_KEGG.results['Adjusted P-value'] < 1
Enrich_KEGG = enr_KEGG.results[below_one]

# (rows, columns) of the filtered result table
Enrich_KEGG.shape

In [None]:
# Terms whose name contains the literal text 'signaling pathway', ranked by adjusted p-value.
is_signaling = Enrich_KEGG['Term'].map(lambda term: 'signaling pathway' in term).astype(bool)
Enrich_KEGG_select = (
    Enrich_KEGG[is_signaling]
    .sort_values(by='Adjusted P-value', ascending=True, ignore_index=True)
)

# Two terms that are always shown, whatever their rank.
is_fixed_term = Enrich_KEGG['Term'].isin(['Cellular senescence','Coronavirus disease'])

# Table to plot: the ten best signaling pathways followed by the fixed terms.
Enrich_KEGG_plot = pd.concat(
    [Enrich_KEGG_select.head(10), Enrich_KEGG[is_fixed_term]],
    ignore_index=True,
)

In [None]:
# Build the enrichment-map tables from the selected terms. Downstream cells read
# `nodes` columns Term, 'Adjusted P-value' and Hits_ratio, and `edges` columns
# src_idx, targ_idx, jaccard_coef, overlap_coef and overlap_genes.
# NOTE(review): cutoff=0.5 presumably thresholds gseapy's default significance column
# and top_term=20 presumably caps the number of terms kept — confirm against the gseapy docs.
nodes, edges = gp.enrichment_map(Enrich_KEGG_plot,cutoff = 0.5,top_term = 20)
# Display (rows, columns) of the node table.
nodes.shape

In [None]:
# ---- Graph of enriched terms -------------------------------------------------------
# Node ids are the src_idx/targ_idx values of `edges`; they are looked up in the index
# of `nodes` (the label dictionary below relies on the same correspondence).
G = nx.from_pandas_edgelist(edges,
                            source='src_idx',
                            target='targ_idx',
                            edge_attr=['jaccard_coef', 'overlap_coef', 'overlap_genes'])
# A term without any edge never appears in the edge list. Add every row of `nodes`
# explicitly so each term gets a position, a colour, a size and a label; otherwise the
# per-node lists below would not match the graph and labelling would fail.
G.add_nodes_from(nodes.index)

# One fixed node order, used for the layout AND handed to nx.draw as `nodelist`.
node_list = sorted(G.nodes())
n = len(node_list)

# ---- Circular layout: node i sits at angle 2*pi*i/n on the unit circle ---------------
angle_dict = {}   # node id -> angle in radians (reused to place and rotate the labels)
pos = {}          # node id -> (x, y)
for i, node in enumerate(node_list):
    theta = 2.0*np.pi*i/n
    angle_dict[node] = theta
    pos[node] = (np.cos(theta), np.sin(theta))

labels = nodes.Term.to_dict()

fig, ax = plt.subplots(figsize=(8,8))
margin = 0.33   # wide margins leave room for the labels placed outside the circle
fig.subplots_adjust(margin, margin, 1.-margin, 1.-margin)
ax.axis('equal')

# Edge attributes come back in G.edges() order, which is also the default drawing order.
edge_weight = nx.get_edge_attributes(G, 'jaccard_coef').values()

# node_color / node_size are matched to `nodelist` by position, so both are looked up
# in node_list order instead of relying on the row order of `nodes`.
# nx.draw returns None; the name `net` is kept only because the original cell defined it.
net = nx.draw(G,
              pos=pos,
              nodelist=node_list,
              with_labels=False,
              ax=ax,
              width=[w*10 for w in edge_weight],
              edge_color='#CDDBD4',
              node_color=nodes.loc[node_list, 'Adjusted P-value'].tolist(),
              node_size=(nodes.loc[node_list, 'Hits_ratio']*3000).tolist(),
              cmap=plt.cm.RdYlBu)
description = nx.draw_networkx_labels(G,pos,labels=labels,font_size=15)

# ---- Move each label outside the circle, rotated along its radius --------------------
r = fig.canvas.get_renderer()
trans = ax.transData.inverted()   # display -> data coordinates
for node, t in description.items():
    theta = angle_dict[node]
    # text width expressed in data units
    bbdata = t.get_window_extent(renderer=r).transformed(trans)
    # the label is positioned by its centre, so push it out by half its width
    radius = 1.2+bbdata.width/2.
    t.set_position((radius*np.cos(theta), radius*np.sin(theta)))
    t.set_rotation(np.degrees(theta))
    t.set_clip_on(False)   # labels extend beyond the axes area

# ---- Colour bar for the node colours -------------------------------------------------
sm = plt.cm.ScalarMappable(cmap=plt.cm.RdYlBu,
                           norm=plt.Normalize(vmin=nodes['Adjusted P-value'].min(),
                                              vmax=nodes['Adjusted P-value'].max()))
# not used in this cell; import kept because other cells may rely on the name
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
ax_col = fig.add_axes([0.1, 0.04, 0.02, 0.15])
sns.set(font_scale=1)
ax_col.set_title('Adjusted P-value')
fig.colorbar(sm,shrink=0.4,cax=ax_col,orientation='vertical')
fig.savefig("/home/wangjing/wangj/codebase/HUSI/Figures/covid-19/Endo_state_KEGGenrich.png",dpi = 200,bbox_inches = 'tight')
plt.show()

In [8]:
### extract tables from pdf
import pdfplumber
import pandas as pd

In [22]:
# Pull one table from each selected page of the supplementary PDF and stack them.
TABLE_PAGES = range(7, 11)   # indices into pdf.pages (0-based, i.e. the 8th to 11th page)

page_tables = []
with pdfplumber.open("/home/wangjing/wangj/AgingScore/GSE150316_COVID-19_bulk/41467_2020_20139_MOESM1_ESM.pdf") as pdf:
    for i in TABLE_PAGES:
        table = pdf.pages[i].extract_table()
        if table is None:
            # no table detected on this page: nothing to add
            continue
        page_tables.append(pd.DataFrame(table))

# Concatenate once instead of growing the frame inside the loop.
df = pd.concat(page_tables, ignore_index=True) if page_tables else pd.DataFrame()

In [23]:
### tidy the extracted table and save it
# strip newline characters from the cell text (regex replacement over the whole frame)
table_clean = df.replace('\n','',regex=True)
# the first row carries the column names: use it as header and drop it from the data rows
header = table_clean.iloc[0,:]
df = table_clean.drop(index=0)
df.columns = header
df.to_csv("/home/wangjing/wangj/AgingScore/GSE150316_COVID-19_bulk/clinical.csv",index=False)