Notebook from Tine Claeys

In [3]:
import pandas as pd
import numpy as np
import os
import plotly.express as px
from matplotlib_venn import venn2
import plotly.graph_objects as go
pd.options.mode.chained_assignment = None

def find_tissue_profile(protein_list, path, only_present=False, normalisation="minmax"):
    """Make tissue profile using a list and a path to the folder containing the tissue predictor output. 
    If only_present is set to True, only the proteins with a mean abundance higher than zero are selected"""
    proteindf = pd.DataFrame(columns=['feature_name', 'importance', 'stdev', 'mean_abundance', 'tissue'])
    for filename in os.listdir(path):
        if filename.endswith('_mean_abundance_added.csv'):
            f = os.path.join(path,filename)
            tissue =filename.split('_')[0].split('.')[0]
            df=pd.read_csv(f, sep='/')
            #drop zero importance proteins
            df = df[df['importance'] > 0]
            if only_present:
                df = df[df['mean_abundance'] > 0]
            # add a ranking using their index
            df['rank'] = df.index
            if normalisation == "minmax":
                df['importance'] = (df['importance'] - df['importance'].min()) / (df['importance'].max() - df['importance'].min()) #min max normalisation to make it comparable between tissues
            if normalisation == "relative":
                df['importance'] = (df['importance'] / df.shape[0])*100 #relative normalisation to make it comparable between tissues
            if normalisation == "z-score":
                df['importance'] = (df['importance'] - df['importance'].mean()) / df['importance'].std()
            if normalisation == "rank":
                df['importance'] = df['rank']
            found_prot = df[df['feature_name'].isin(protein_list)]
            found_prot['tissue'] = tissue
            found_prot_values=found_prot.values.tolist()
            proteindf=pd.concat([proteindf, found_prot])
    proteindf.sort_values(by='importance', ascending=False)
    return proteindf

In [102]:
baseline=['A0MZ66','O00154','O00425','O00592','O00754','O43707','O43719','O60264','O60701','O75475','O75533','O75828','O75976','O95817','P00338','P04083','P05091','P05783','P05787','P07948','P08648','P08670','P09382','P11047','P11388','P11498','P11766','P12270','P12814','P12956','P14923','P15924','P16144','P16401','P16403','P16989','P17301','P17655','P18206','P19174','P19367','P20290','P20700','P20962','P21266','P21980','P23246','P26006','P26038','P26447','P27105','P29218','P29350','P29373','P30038','P30740','P31146','P31947','P32004','P32119','P35580','P35998','P40121','P43121','P46013','P46821','P48681','P48735','P48960','P49720','P50552','P51452','P52594','P61978','P62805','P62877','P78417','P78527','P80723','Q01970','Q02952','Q07065','Q08211','Q08380','Q08J23','Q10589','Q12965','Q13151','Q14116','Q14126','Q14151','Q14247','Q14315','Q14676','Q14764','Q15029','Q15149','Q15393','Q15459','Q15555','Q16352','Q16658','Q16762','Q3KQU3','Q53EP0','Q6NZI2','Q6WKZ4','Q7L5N7','Q7Z2K8','Q8IVF2','Q8IVT2','Q8IWE2','Q8IYS1','Q8N392','Q8NBS9','Q8NI08','Q8TBX8','Q8TDZ2','Q8TEM1','Q8WUF5','Q92616','Q92820','Q92922','Q96CX2','Q96DG6','Q96HQ2','Q96RS6','Q96TA1','Q99584','Q99943','Q99959','Q9BST9','Q9BW60','Q9BWF3','Q9BZK3','Q9BZQ8','Q9HC35','Q9NR30','Q9NS86','Q9NTX5','Q9NU22','Q9NUQ3','Q9NZM1','Q9NZN4','Q9UDT6','Q9UGI8','Q9UHD8','Q9UHR4','Q9UIG0','Q9UIQ6','Q9UJC3','Q9UKK3','Q9UKN8','Q9UQE7','Q9Y223','Q9Y446','Q9Y490','Q9Y5B9','Q9Y5Q9','Q9Y5V3','Q9Y5Z4']
crc = ["P08670","P11388","P16144","P20962","P27105","P46013","P50552","P52594","P62805","P62877","P80723","Q14151", "Q16352", "Q7L5N7", "Q8N392", "Q92922", "Q9BWF3", "Q9BZK3", "Q9UDT6", "Q9UHR4"]
breast = ["O00425","O75828","O75976","P05787","P11388","P12814","P20962","P21266","P29373","P30038","P48681","P48735","Q02952","Q14676","Q15149","Q15555","Q8IYS1","Q9NTX5","Q9UIQ6","Q9Y5V3"]
hcc = ["O00592","O60264","O60701","O75475","P05091","P11498","P21266","P29218","P51452","P52594","Q01970","Q08J23","Q15555","Q3KQU3","Q53EP0","Q8WUF5","Q96CX2","Q9BZQ8","Q9HC35","Q9UIQ6"]
hek = ["O00592","O60701","P09382","P16989","P20700","P26447","P50552","P62877","Q01970","Q12965","Q14676","Q15555","Q3KQU3","Q8IYS1","Q8TDZ2","Q96CX2","Q96HQ2","Q96RS6","Q96TA1","Q9BW60"]
hgsoc= ["O43719","O75976","P08648","P21980","P29218","P32004","P32119","P35998","P78417","Q08380","Q12965","Q14315","Q15555","Q53EP0","Q92820","Q96HQ2","Q99943","Q9BST9","Q9NTX5","Q9NUQ3"]
huvec=["O75533","P08648","P12270","P23246","P43121","P46013","P48960","P78527","Q08380","Q15029","Q15459","Q15555","Q53EP0","Q6NZI2","Q8N392","Q8NBS9","Q96CX2","Q9NR30","Q9NZN4","Q9UQE7"]
hela=["O00154","O00592","O43719","P19367","P26447","P27105","P35998","P48735","P48960","Q08J23","Q10589","Q12965","Q14116","Q8IVT2","Q8TEM1","Q96DG6","Q96HQ2","Q96RS6","Q9Y5V3","Q9Y5Z4"]
lymphoid=["A0MZ66","O00154","O60264","P12270","P17301","P29218","P29350","P31146","Q08211","Q13151","Q14247","Q15555","Q8IWE2","Q96TA1","Q9NZM1","Q9UHD8","Q9UIG0","Q9UIQ6","Q9Y223","Q9Y5B9"]
myeloid=["O00754","O75828","P07948","P11047","P14923","P15924","P17301","P19174","P19367","P27105","P30740","P32119","P35580","Q07065","Q08380","Q7L5N7","Q8IWE2","Q8N392","Q8TDZ2","Q9UKN8"]
neuroblastoma=["A0MZ66","P00338","P07948","P15924","P16989","P20290","P30740","P32004","P46821","P48681","P50552","Q12965","Q6NZI2","Q7Z2K8","Q8NI08","Q99584","Q99943","Q9UHD8","Q9UKN8","Q9Y223"]
endometrioid=["O75475","P05783","P05787","P08648","P12814","P20962","P29373","P35580","P40121","P43121","P80723","Q16658","Q8N392","Q92616","Q96CX2","Q9BZQ8","Q9NZN4","Q9UHD8","Q9UIG0","Q9Y5Q9"]
scc=["P16144","P19174","P20700","Q08211","Q14116","Q15029","Q3KQU3","Q6WKZ4","Q8IWE2","Q8TBX8","Q8TEM1","Q8WUF5","Q92616","Q9BZK3","Q9BZQ8","Q9NS86","Q9NTX5","Q9NZN4","Q9UIQ6","Q9UKN8"]
glioblastoma=["A0MZ66","P11388","P12814","P14923","P15924","P26006","P30038","P35580","P46821","P48960","P62805","P78417","Q14151","Q15149","Q16762","Q8IWE2","Q8TEM1","Q8WUF5","Q9UGI8","Q9Y5Q9"]
ipsc=["O00592","O43707","O75828","O95817","P17655","P20962","P21980","P48681","P48960","P61978","Q01970","Q14764","Q3KQU3","Q92922","Q9BW60","Q9BWF3","Q9BZK3","Q9NU22","Q9NZM1","Q9UIQ6","O00592"]
ovarian = ["O00592","P11047","P11766","P12814","P12956","P14923","P26006","P49720","Q02952","Q08J23","Q14126","Q16762","Q6NZI2","Q8IVF2","Q8IVT2","Q92820","Q96DG6","Q9BW60","Q9UJC3","Q9Y490"]


In [29]:
path = '/home/compomics/git/Tissue_prediction/Tissue_prediction_manuscript/Predictors/Feature_imp_class_filtered_tissue_nocart'


In [None]:
fig = go.Figure()
fig.update_layout(barmode='group')
for lijst, naam in zip([baseline,], ["baseline"]):
    print(naam)
    lijst_tp_rel = find_tissue_profile(lijst, path, only_present=True, normalisation="relative")
    lijst_tp_minmax = find_tissue_profile(lijst, path, only_present=True, normalisation="minmax")
    lijst_tp_zscore = find_tissue_profile(lijst, path, only_present=True, normalisation="z-score")
    lijst_tp_rank = find_tissue_profile(lijst, path, only_present=True, normalisation="rank")
    fig.add_trace(go.Bar(name="relative", x=lijst_tp_rel.groupby('tissue').sum().index, y=lijst_tp_rel.groupby('tissue').sum()['importance']/(lijst_tp_rel.groupby('tissue').count()['feature_name'])))
    fig.add_trace(go.Bar(name="minmax", x=lijst_tp_minmax.groupby('tissue').sum().index, y=lijst_tp_minmax.groupby('tissue').sum()['importance']/(lijst_tp_minmax.groupby('tissue').count()['feature_name'])))
    fig.add_trace(go.Bar(name="z-score", x=lijst_tp_zscore.groupby('tissue').sum().index, y=lijst_tp_zscore.groupby('tissue').sum()['importance']/(lijst_tp_zscore.groupby('tissue').count()['feature_name'])))
    fig.add_trace(go.Bar(name="rank", x=lijst_tp_rank.groupby('tissue').sum().index, y=lijst_tp_rank.groupby('tissue').mean()['rank']/lijst_tp_rank.groupby('tissue').count()['feature_name']))
fig.show()


In [123]:
fig = go.Figure()
fig.update_layout(barmode='group')
for lijst, naam in zip([baseline,], ["baseline"]):
    print(naam)
    lijst_tp_rel = find_tissue_profile(lijst, path, only_present=True, normalisation="relative")
    lijst_tp_minmax = find_tissue_profile(lijst, path, only_present=True, normalisation="minmax")
    lijst_tp_zscore = find_tissue_profile(lijst, path, only_present=True, normalisation="z-score")
    lijst_tp_rank = find_tissue_profile(lijst, path, only_present=True, normalisation="rank")
    fig.add_trace(go.Bar(name="relative", x=lijst_tp_rel.groupby('tissue').sum().index, y=lijst_tp_rel.groupby('tissue').sum()['importance']/(lijst_tp_rel.groupby('tissue').count()['feature_name'])))
    fig.add_trace(go.Bar(name="minmax", x=lijst_tp_minmax.groupby('tissue').sum().index, y=lijst_tp_minmax.groupby('tissue').sum()['importance']/(lijst_tp_minmax.groupby('tissue').count()['feature_name'])))
    fig.add_trace(go.Bar(name="z-score", x=lijst_tp_zscore.groupby('tissue').sum().index, y=lijst_tp_zscore.groupby('tissue').sum()['importance']/(lijst_tp_zscore.groupby('tissue').count()['feature_name'])))
    fig.add_trace(go.Bar(name="rank", x=lijst_tp_rank.groupby('tissue').sum().index, y=lijst_tp_rank.groupby('tissue').mean()['rank']/lijst_tp_rank.groupby('tissue').count()['feature_name']))
fig.show()


baseline


In [126]:
fig = go.Figure()
fig.update_layout(barmode='group')
for lijst, naam in zip([baseline, crc, breast, hcc, hek, hgsoc, huvec, hela, lymphoid, myeloid, neuroblastoma, endometrioid, scc, glioblastoma, ipsc, ovarian], ["baseline", "crc", "breast", "hcc", "hek", "hgsoc", "huvec", "hela", "lymphoid", "myeloid", "neuroblastoma", "endometrioid", "scc", "glioblastoma", "ipsc", "ovarian"]):
    lijst_tp = find_tissue_profile(lijst, path, only_present=True, normalisation="minmax")
    print(f"{naam} exisists out of {len(lijst)} proteins from which {len(np.unique(lijst_tp['feature_name']))} are present in the data")
    fig.add_trace(go.Bar(name=naam, x=lijst_tp.groupby('tissue').sum().index, y=lijst_tp.groupby('tissue').sum()['importance']/(lijst_tp.groupby('tissue').count()['feature_name'])))
fig.show()


baseline exisists out of 161 proteins from which 150 are present in the data
crc exisists out of 20 proteins from which 19 are present in the data
breast exisists out of 20 proteins from which 18 are present in the data
hcc exisists out of 20 proteins from which 19 are present in the data
hek exisists out of 20 proteins from which 20 are present in the data
hgsoc exisists out of 20 proteins from which 18 are present in the data
huvec exisists out of 20 proteins from which 20 are present in the data
hela exisists out of 20 proteins from which 18 are present in the data
lymphoid exisists out of 20 proteins from which 19 are present in the data
myeloid exisists out of 20 proteins from which 19 are present in the data
neuroblastoma exisists out of 20 proteins from which 19 are present in the data
endometrioid exisists out of 20 proteins from which 17 are present in the data
scc exisists out of 20 proteins from which 18 are present in the data
glioblastoma exisists out of 20 proteins from w

In [202]:
baseline_tp = find_tissue_profile(baseline, path, only_present=True, normalisation="minmax")
for lijst, naam in zip([crc, breast, hcc, hek, hgsoc, huvec, hela, lymphoid, myeloid, neuroblastoma, endometrioid, scc, glioblastoma, ipsc, ovarian], ["crc", "breast", "hcc", "hek", "hgsoc", "huvec", "hela", "lymphoid", "myeloid", "neuroblastoma", "endometrioid", "scc", "glioblastoma", "ipsc", "ovarian"]):
    lijst_tp = find_tissue_profile(lijst, path, only_present=True)
    if lijst_tp.shape[0] == 0:
        print(f"{naam} only contains not present data")
    else:
        print(f"{naam} exisists out of {len(lijst)} proteins from which {len(np.unique(lijst_tp['feature_name']))} are present in the data")
        fig = go.Figure(data=[
            go.Bar(name=naam, x=lijst_tp.groupby('tissue').sum().index, y=lijst_tp.groupby('tissue').sum()['importance']/(lijst_tp.groupby('tissue').count()['feature_name'])),
            go.Bar(name='baseline', x=baseline_tp.groupby('tissue').sum().index, y=baseline_tp.groupby('tissue').sum()['importance']/(baseline_tp.groupby('tissue').count()['feature_name']))
        ])
        # Change the bar mode
        fig.update_layout(barmode='group')
        fig.show()
        #save as html
        fig.write_html("/home/compomics/git/Tissue_prediction/Sam/{naam}_vs_baseline.html".format(naam=naam))

crc exisists out of 20 proteins from which 18 are present in the data


breast exisists out of 20 proteins from which 18 are present in the data


hcc exisists out of 20 proteins from which 16 are present in the data


hek exisists out of 20 proteins from which 19 are present in the data


hgsoc exisists out of 20 proteins from which 15 are present in the data


huvec exisists out of 20 proteins from which 19 are present in the data


hela exisists out of 20 proteins from which 17 are present in the data


lymphoid exisists out of 20 proteins from which 18 are present in the data


myeloid exisists out of 20 proteins from which 19 are present in the data


neuroblastoma exisists out of 20 proteins from which 20 are present in the data


endometrioid exisists out of 20 proteins from which 17 are present in the data


scc exisists out of 20 proteins from which 18 are present in the data


glioblastoma exisists out of 20 proteins from which 19 are present in the data


ipsc exisists out of 21 proteins from which 19 are present in the data


ovarian exisists out of 20 proteins from which 20 are present in the data


In [136]:
baseline_tp = find_tissue_profile(baseline, path, only_present=True, normalisation="minmax")
for lijst, naam in zip([crc, breast, hcc, hek, hgsoc, huvec, hela, lymphoid, myeloid, neuroblastoma, endometrioid, scc, glioblastoma, ipsc, ovarian], ["crc", "breast", "hcc", "hek", "hgsoc", "huvec", "hela", "lymphoid", "myeloid", "neuroblastoma", "endometrioid", "scc", "glioblastoma", "ipsc", "ovarian"]):
    lijst_tp = find_tissue_profile(lijst, path, only_present=True)
    baseline_df = baseline_tp.groupby('tissue').sum()['importance']/(baseline_tp.groupby('tissue').count()['feature_name'])
    spec_df = lijst_tp.groupby('tissue').sum()['importance']/(lijst_tp.groupby('tissue').count()['feature_name'])
    #subtract the baseline from the specific 
    diff_df = spec_df - baseline_df
    print(f"{naam} exisists out of {len(lijst)} proteins from which {len(np.unique(lijst_tp['feature_name']))} are present in the data")
    fig = go.Figure(data=[
        go.Bar(name=naam, x=diff_df.index, y=diff_df)])
    # Change the bar mode
    fig.update_layout(barmode='group')
    fig.show()

crc exisists out of 20 proteins from which 18 are present in the data


breast exisists out of 20 proteins from which 18 are present in the data


hcc exisists out of 20 proteins from which 16 are present in the data


hek exisists out of 20 proteins from which 19 are present in the data


hgsoc exisists out of 20 proteins from which 15 are present in the data


huvec exisists out of 20 proteins from which 19 are present in the data


hela exisists out of 20 proteins from which 17 are present in the data


lymphoid exisists out of 20 proteins from which 18 are present in the data


myeloid exisists out of 20 proteins from which 19 are present in the data


neuroblastoma exisists out of 20 proteins from which 20 are present in the data


endometrioid exisists out of 20 proteins from which 17 are present in the data


scc exisists out of 20 proteins from which 18 are present in the data


glioblastoma exisists out of 20 proteins from which 19 are present in the data


ipsc exisists out of 21 proteins from which 19 are present in the data


ovarian exisists out of 20 proteins from which 20 are present in the data


In [201]:
fig = go.Figure()
fig.update_layout(barmode='group')
baseline_tp = find_tissue_profile(baseline, path, only_present=True, normalisation="minmax")
for lijst, naam in zip([crc, breast, hcc, hek, hgsoc, huvec, hela, lymphoid, myeloid, neuroblastoma, endometrioid, scc, glioblastoma, ipsc, ovarian], ["crc", "breast", "hcc", "hek", "hgsoc", "huvec", "hela", "lymphoid", "myeloid", "neuroblastoma", "endometrioid", "scc", "glioblastoma", "ipsc", "ovarian"]):
    lijst_tp = find_tissue_profile(lijst, path, only_present=True)
    baseline_df = baseline_tp.groupby('tissue').sum()['importance']/(baseline_tp.groupby('tissue').count()['feature_name'])
    spec_df = lijst_tp.groupby('tissue').sum()['importance']/(lijst_tp.groupby('tissue').count()['feature_name'])
    #subtract the baseline from the specific 
    diff_df = spec_df - baseline_df
    print(f"{naam} exisists out of {len(lijst)} proteins from which {len(np.unique(lijst_tp['feature_name']))} are present in the data")
    fig.add_trace(go.Bar(name=naam, x=diff_df.index, y=diff_df))
    # Change the bar mode

fig.show()
#save figure as html
fig.write_html("/home/compomics/git/Tissue_prediction/Sam/cell_line_importance_minmax_totalproteins_minusbaseline.html")   

crc exisists out of 20 proteins from which 18 are present in the data
breast exisists out of 20 proteins from which 18 are present in the data
hcc exisists out of 20 proteins from which 16 are present in the data
hek exisists out of 20 proteins from which 19 are present in the data
hgsoc exisists out of 20 proteins from which 15 are present in the data
huvec exisists out of 20 proteins from which 19 are present in the data
hela exisists out of 20 proteins from which 17 are present in the data
lymphoid exisists out of 20 proteins from which 18 are present in the data
myeloid exisists out of 20 proteins from which 19 are present in the data
neuroblastoma exisists out of 20 proteins from which 20 are present in the data
endometrioid exisists out of 20 proteins from which 17 are present in the data
scc exisists out of 20 proteins from which 18 are present in the data
glioblastoma exisists out of 20 proteins from which 19 are present in the data
ipsc exisists out of 21 proteins from which 1

In [137]:
for lijst, naam in zip([crc, breast, hcc, hek, hgsoc, huvec, hela, lymphoid, myeloid, neuroblastoma, endometrioid, scc, glioblastoma, ipsc, ovarian], ["crc", "breast", "hcc", "hek", "hgsoc", "huvec", "hela", "lymphoid", "myeloid", "neuroblastoma", "endometrioid", "scc", "glioblastoma", "ipsc", "ovarian"]):
    print(naam)
    lijst_tp = find_tissue_profile(lijst, path, only_present=True, normalisation="minmax")
    lijst_heatmap = lijst_tp.pivot(index='feature_name', columns='tissue', values='importance')
    lijst_heatmap = lijst_heatmap.fillna(0)
    fig = px.imshow(lijst_heatmap, color_continuous_scale='Blues')
    fig.show()

crc


breast


hcc


hek


hgsoc


huvec


hela


lymphoid


myeloid


neuroblastoma


endometrioid


scc


glioblastoma


ipsc


ovarian


In [138]:
hpa = pd.read_csv('/home/compomics/git/Tissue_prediction/Tissue_prediction_manuscript/HPA_comp/antibody_atlas_2206.csv', sep=',')

In [142]:
baseline_hpa = hpa[hpa['Uniprot_id'].isin(baseline)]
baseline_hpa.head()

Unnamed: 0,Uniprot_id,Organ_id,Tissue,Level
1772,A0MZ66,0.0,Adipose tissue,0.0
1773,A0MZ66,1.0,Adrenal gland,0.0
1774,A0MZ66,3.0,Appendix,0.0
1775,A0MZ66,6.0,Bone marrow,0.0
1776,A0MZ66,7.0,Caudate,1.0


In [143]:
baseline_hpa.pivot(index='Uniprot_id', columns='Tissue', values='Level').head()

Tissue,Adipose tissue,Adrenal gland,Appendix,Bone marrow,Breast,Bronchus,Caudate,Cerebellum,Cerebral cortex,Cervix,...,Smooth muscle,Soft tissue 1,Soft tissue 2,Spleen,Stomach 1,Stomach 2,Testis,Tonsil,Urinary bladder,Vagina
Uniprot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0MZ66,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0
O00154,0.0,1.0,0.5,0.0,0.0,0.0,2.0,0.0,1.5,0.0,...,1.0,1.0,2.5,0.0,0.0,0.0,0.0,1.0,0.0,0.0
O00425,0.0,1.0,0.5,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0
O00592,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
O00754,0.0,3.0,2.0,2.0,2.0,3.0,0.5,0.0,1.0,2.5,...,0.0,2.0,1.5,1.5,2.0,3.0,2.0,1.0,2.0,2.0


In [162]:
hpa_shap_proteins = pd.DataFrame()
for lijst, naam in zip([crc, breast, hcc, hek, hgsoc, huvec, hela, lymphoid, myeloid, neuroblastoma, endometrioid, scc, glioblastoma, ipsc, ovarian], ["crc", "breast", "hcc", "hek", "hgsoc", "huvec", "hela", "lymphoid", "myeloid", "neuroblastoma", "endometrioid", "scc", "glioblastoma", "ipsc", "ovarian"]):
    lijst_hpa = hpa[hpa['Uniprot_id'].isin(lijst)]
    subset = pd.DataFrame(lijst_hpa[lijst_hpa['Level']>0]['Tissue'].value_counts()).T.reset_index()
    subset['index'] = naam
    #add subset to hpa_shap_proteins dataframe
    hpa_shap_proteins = pd.concat([hpa_shap_proteins, subset], axis=0)

In [206]:
#use plotly to make a grouped barplot of hpa_shap_proteins row wise
fig = go.Figure()
for i in range(hpa_shap_proteins.shape[0]):
    fig.add_trace(go.Bar(x=hpa_shap_proteins.columns[1:], y=hpa_shap_proteins.iloc[i,1:], name=hpa_shap_proteins.iloc[i,0]))
fig.update_layout(barmode='group')
fig.show()
fig.write_html("/home/compomics/git/Tissue_prediction/Sam/HPA_sum_level_tissuespec.html")
    

In [167]:
hpa_shap_proteins.head()

Unnamed: 0,index,Bone marrow,Endometrium 1,Endometrium 2,Caudate,Epididymis,Urinary bladder,Seminal vesicle,Cervix,Spleen,...,Soft tissue 1,Pancreas,Skin 2,Smooth muscle,Adipose tissue,Colon,Skeletal muscle,Salivary gland,Heart muscle,Cerebellum
0,crc,16,16,16,15,15,15,15,15,15,...,11,11,11,10,10,10,9,8,8,8
0,breast,9,13,11,10,11,13,12,11,8,...,8,10,8,9,8,10,8,12,9,7
0,hcc,6,13,10,12,12,13,14,11,9,...,5,11,11,6,4,11,7,10,9,9
0,hek,9,12,9,9,9,11,10,10,9,...,10,10,8,9,9,10,7,9,8,3
0,hgsoc,9,11,9,11,12,11,12,10,9,...,8,10,9,11,8,11,11,10,12,7


In [204]:
hpa_shap_proteins.to_csv('/home/compomics/git/Tissue_prediction/Sam/hpa_sum_of_tissuespec.csv', sep=',')

In [197]:
importance_df = pd.DataFrame()
for lijst, naam in zip([baseline, crc, breast, hcc, hek, hgsoc, huvec, hela, lymphoid, myeloid, neuroblastoma, endometrioid, scc, glioblastoma, ipsc, ovarian], ['baseline', "crc", "breast", "hcc", "hek", "hgsoc", "huvec", "hela", "lymphoid", "myeloid", "neuroblastoma", "endometrioid", "scc", "glioblastoma", "ipsc", "ovarian"]):
    lijst_tp = find_tissue_profile(lijst, path, only_present=True, normalisation="minmax")
    spec_df = lijst_tp.groupby('tissue').sum()['importance']/(lijst_tp.groupby('tissue').count()['feature_name'])
    spec_df = pd.DataFrame(spec_df).T.reset_index()
    spec_df['index'] = naam
    importance_df = pd.concat([importance_df, spec_df], axis=0)

In [203]:
importance_df.to_csv('/home/compomics/git/Tissue_prediction/Sam/importances.csv', sep=',')