In [2]:
from neo4j import GraphDatabase
from warnings import filterwarnings
filterwarnings("ignore")
import pandas as pd
from statistics import harmonic_mean


pd.set_option('display.max_rows', 500)

In [3]:

# URI examples: "neo4j://localhost", "neo4j+s://xxx.databases.neo4j.io"
URI = "neo4j://localhost:7687"
AUTH = ("test", "666666")

with GraphDatabase.driver(URI, auth=AUTH) as driver: 

    driver.verify_connectivity() 

cell_type_now='astrocyte of the cerebral cortex'

query_jaccard="""
MATCH (ct:CellType {cell_type_name: '""" + cell_type_now + """'})<-[:GeneEnrichedInCellType]-(g:Gene)
WITH g
MATCH (s1:Species)<-[:GeneFromSpecies]-(g:Gene)-[:GeneInOrthologousGroup]->(og:OrthologousGroup)
RETURN s1.species_scientific_name, count(DISTINCT og) as num_og"""


with driver.session() as session:
    result_ct = session.run(query_jaccard)
    df_ct = result_ct.to_df()

df_ct

query2="""MATCH (s1:Species)<-[:CellTypeFromSpecies]-(ct:CellType {cell_type_name: '""" + cell_type_now + """'})<-[:GeneEnrichedInCellType]-(g1:Gene)-[:GeneInOrthologousGroup]->(gf:OrthologousGroup),
    (g1:Gene)-[:GeneFromSpecies]->(s1),
    (s2:Species)<-[:CellTypeFromSpecies]-(ct)<-[:GeneEnrichedInCellType]-(g2:Gene)-[:GeneInOrthologousGroup]->(gf:OrthologousGroup),
    (g2:Gene)-[:GeneFromSpecies]->(s2)
WHERE s1 <> s2
WITH s1, s2, COUNT(DISTINCT g1) AS genes1, COUNT(DISTINCT g2) AS genes2, ct, count(DISTINCT gf) AS num_og
RETURN s1.species_scientific_name, s2.species_scientific_name, genes1, genes2, num_og"""

with driver.session() as session:
    result2 = session.run(query2)
    df2 = result2.to_df()


df2

In [4]:
query_ct = """
MATCH (s1:Species)<-[:GeneFromSpecies]-(g:Gene)-[:GeneInOrthologousGroup]->(og:OrthologousGroup {eggnog_dataset_name: 'primates'})<-[:GeneInOrthologousGroup]-(g2:Gene)-[:GeneFromSpecies]->(s2:Species),
(g)-[:GeneEnrichedInCellType]->(c:CellType)<-[:GeneEnrichedInCellType]-(g2)
WHERE id(s1) <> id(s2)
RETURN s1.species_scientific_name, s2.species_scientific_name, c.cell_type_name, size(collect(DISTINCT og)) AS intersection
ORDER BY s1.species_scientific_name, s2.species_scientific_name
"""


with driver.session(database='broadtaxo') as session:
    result_ct = session.run(query_ct)
    df_ct = result_ct.to_df()

In [5]:
df_ct

Unnamed: 0,s1.species_scientific_name,s2.species_scientific_name,c.cell_type_name,intersection
0,cjacchus,ggorilla,CGE-derived,152
1,cjacchus,ggorilla,Micro-PVM,235
2,cjacchus,ggorilla,VLMC,274
3,cjacchus,ggorilla,OPC,107
4,cjacchus,ggorilla,Non-IT,120
...,...,...,...,...
195,ptroglodytes,mmulatta,Astro,233
196,ptroglodytes,mmulatta,Non-IT,121
197,ptroglodytes,mmulatta,IT,114
198,ptroglodytes,mmulatta,CGE-derived,169


In [8]:
## this is for each cell type, want to do this for broad type at different levels

cell_type_col = 'cell_type_name'


# all-by-all cell tyope enriched genes belongs to the same OG
# note the change to counting genes, instead of OGs, to account for many in-paralogs


In [17]:

def get_cell_type_intersection(cell_type_col:str):

    query_ct_all = f"""
    MATCH (s1:Species)<-[:GeneFromSpecies]-(g:Gene)-[:GeneInOrthologousGroup]->(og:OrthologousGroup)<-[:GeneInOrthologousGroup]-(g2:Gene)-[:GeneFromSpecies]->(s2:Species),
    (g)-[:GeneEnrichedInCellType]->(c:CellType),
    (c2:CellType)<-[:GeneEnrichedInCellType]-(g2)
    WHERE id(s1) <> id(s2)
    RETURN s1.species_scientific_name, s2.species_scientific_name, c.{cell_type_col}, c2.{cell_type_col}, og.eggnog_dataset_name, size(collect(DISTINCT g)) AS num_gene_sp1, size(collect(DISTINCT g2)) AS num_gene_sp2
    ORDER BY s1.species_scientific_name, s2.species_scientific_name, c.{cell_type_col}, c2.{cell_type_col}, og.eggnog_dataset_name
    """

    ## all by all cell type enriched genes belongs to the same OG

    with driver.session(database='broadtaxo') as session:
        result_ct_all = session.run(query_ct_all)
        df_ct_all = result_ct_all.to_df()

    return df_ct_all



In [18]:
get_cell_type_intersection('broad_type').to_csv("results/taxo_cs/cell_type_enriched_genes_same_og_all_to_all_all_datasets_broad_type.csv", index=False)
get_cell_type_intersection('cell_type_name').to_csv("results/taxo_cs/cell_type_enriched_genes_same_og_all_to_all_all_datasets_cell_type_name.csv", index=False)    
get_cell_type_intersection('broad_type_2').to_csv("results/taxo_cs/cell_type_enriched_genes_same_og_all_to_all_all_datasets_broad_type_2.csv", index=False)    
get_cell_type_intersection('broad_type_3').to_csv("results/taxo_cs/cell_type_enriched_genes_same_og_all_to_all_all_datasets_broad_type_3.csv", index=False)      

query_score= """MATCH (s1:Species)<-[:GeneFromSpecies]-(g:Gene)-[:GeneInOrthologousGroup]->(og:OrthologousGroup)<-[:GeneInOrthologousGroup]-(g2:Gene)-[:GeneFromSpecies]->(s2:Species),
(g)-[e1:GeneEnrichedInCellType]->(c:CellType)<-[e2:GeneEnrichedInCellType]-(g2)
WHERE id(s1) <> id(s2)
RETURN s1.species_scientific_name, s2.species_scientific_name, c.cell_type_name, og.id, g.external_gene_name, g2.external_gene_name, e1.specificity_score, e2.specificity_score
ORDER BY s1.species_scientific_name, s2.species_scientific_name"""

with driver.session(database='broadtaxo') as session:
    result_score = session.run(query_score)
    df_score = result_score.to_df()

df_score

df_score.to_csv("intersection_enriched_score_metazoa.csv")

In [11]:

def get_enrichment_score(cell_type_col:str):


    query_score_all= f"""MATCH (s1:Species)<-[:GeneFromSpecies]-(g:Gene)-[:GeneInOrthologousGroup]->(og:OrthologousGroup)<-[:GeneInOrthologousGroup]-(g2:Gene)-[:GeneFromSpecies]->(s2:Species),
    (g)-[e1:GeneEnrichedInCellType]->(c:CellType),
    (c2:CellType)<-[e2:GeneEnrichedInCellType]-(g2)
    WHERE id(s1) <> id(s2)
    RETURN s1.species_scientific_name, s2.species_scientific_name, c.{cell_type_col}, c2.{cell_type_col}, og.id, og.eggnog_dataset_name, g.external_gene_name, g2.external_gene_name, e1.specificity_score, e2.specificity_score
    ORDER BY s1.species_scientific_name, s2.species_scientific_name, c.{cell_type_col}, c2.{cell_type_col}, og.eggnog_dataset_name"""


    ## all by all cell type enriched genes belongs to the same OG

    with driver.session(database='broadtaxo') as session:
        result_score_all = session.run(query_score_all)
        df_score_all = result_score_all.to_df()
        df_score_all = df_score_all.drop_duplicates()

    return df_score_all

In [12]:
get_enrichment_score("cell_type_name").to_csv("results/taxo_cs/intersection_enriched_score_all_datasets_cell_type_name.csv")
get_enrichment_score("broad_type").to_csv("results/taxo_cs/intersection_enriched_score_all_datasets_broad_type.csv")
get_enrichment_score("broad_type_2").to_csv("results/taxo_cs/intersection_enriched_score_all_datasets_broad_type_2.csv")
get_enrichment_score("broad_type_3").to_csv("results/taxo_cs/intersection_enriched_score_all_datasets_broad_type_3.csv")

query_union = """MATCH (s1:Species)<-[:GeneFromSpecies]-(g1:Gene)-[:GeneInOrthologousGroup]->(og1:OrthologousGroup),
(s2:Species)<-[:GeneFromSpecies]-(g2:Gene)-[:GeneInOrthologousGroup]->(og2:OrthologousGroup),
(g1)-[:GeneEnrichedInCellType]->(c:CellType)<-[:GeneEnrichedInCellType]-(g2)
WHERE id(s1) <> id(s2)
RETURN s1.species_scientific_name, s2.species_scientific_name, c.cell_type_name, size(collect(DISTINCT og1)) AS og_1, size(collect(DISTINCT og2)) AS og_2
ORDER BY s1.species_scientific_name, s2.species_scientific_name
"""

with driver.session() as session:
    result_union = session.run(query_union)
    df_union = result_union.to_df()

df_union


In [14]:

# get all enriched genes in cell type, including orphan genes that does not belong to a OG, which is allowed

def get_all_enriched(cell_type_col:str):


    query_perct_all = f"""MATCH (s1:Species)<-[:GeneFromSpecies]-(g1:Gene),
    (g1)-[:GeneEnrichedInCellType]->(c:CellType)
    WITH s1, g1, c
    OPTIONAL MATCH (g1:Gene)-[:GeneInOrthologousGroup]->(og:OrthologousGroup)
    RETURN s1.species_scientific_name, c.{cell_type_col}, og.eggnog_dataset_name, size(collect(DISTINCT g1)) AS num_genes_all_species
    ORDER BY s1.species_scientific_name, c.{cell_type_col}, og.eggnog_dataset_name
    """

    with driver.session(database='broadtaxo') as session:
        result_perct_all= session.run(query_perct_all)
        df_perct_all = result_perct_all.to_df()

    return df_perct_all


In [15]:

get_all_enriched('cell_type_name').to_csv("results/taxo_cs/cell_type_enriched_genes_per_ct_all_datasets_cell_type_name.csv", index=False)
get_all_enriched('broad_type').to_csv("results/taxo_cs/cell_type_enriched_genes_per_ct_all_datasets_broad_type.csv", index=False)
get_all_enriched('broad_type_2').to_csv("results/taxo_cs/cell_type_enriched_genes_per_ct_all_datasets_broad_type_2.csv", index=False)
get_all_enriched('broad_type_3').to_csv("results/taxo_cs/cell_type_enriched_genes_per_ct_all_datasets_broad_type_3.csv", index=False)

# sum per species per ct minus intersection is the union

In [19]:

for level in ['cell_type_name', 'broad_type', 'broad_type_2', 'broad_type_3']:

      df_ct_all = pd.read_csv(f"results/taxo_cs/cell_type_enriched_genes_same_og_all_to_all_all_datasets_{level}.csv")
      df_perct_all = pd.read_csv(f"results/taxo_cs/cell_type_enriched_genes_per_ct_all_datasets_{level}.csv")

      df_ct_all['num_genes_total_same_og'] = df_ct_all['num_gene_sp1'] + df_ct_all['num_gene_sp2']
      df_ct_all
      # represent genes from both species in shared OGs
      df_perct_all
      # represent total specific genes, irrespective of OGs, from a certain species
      df_lonely_genes = df_perct_all.loc[df_perct_all['og.eggnog_dataset_name'].isna(), :]
      df_lonely_genes

      df_lonely_genes.loc[df_lonely_genes['s1.species_scientific_name'] == 'ggorilla', :]
      df_lonely_genes.drop(columns=['og.eggnog_dataset_name'], inplace=True)
      df_lonely_genes
      df_perct_all = df_perct_all.loc[~df_perct_all['og.eggnog_dataset_name'].isna(), :]
      df_perct_all_new = pd.merge(df_perct_all, df_lonely_genes, on=['s1.species_scientific_name',  f"c.{level}"], how='left',  suffixes=('', '_lonely'))
      df_perct_all_new.num_genes_all_species_lonely.fillna(0, inplace=True)
      df_perct_all_new['total_specific_genes_species']= df_perct_all_new['num_genes_all_species'] + df_perct_all_new['num_genes_all_species_lonely']
      df_perct_all_new.drop(columns=['num_genes_all_species_lonely', 'num_genes_all_species'], inplace=True)
      df_perct_all_new.loc[df_perct_all_new['s1.species_scientific_name'] == 'ggorilla', :].dropna()
      df_jaccard_all = df_ct_all.merge(df_perct_all_new, on=["s1.species_scientific_name", f"c.{level}", "og.eggnog_dataset_name"]).rename(columns={"total_specific_genes_species": "total_specific_genes_species_sp1"}). \
            merge(df_perct_all_new, left_on=["s2.species_scientific_name", f"c2.{level}", "og.eggnog_dataset_name"], 
                  right_on=["s1.species_scientific_name", f"c.{level}", "og.eggnog_dataset_name"]).rename(columns={"total_specific_genes_species": "total_specific_genes_species_sp2"}).drop(['s1.species_scientific_name_y', f'c.{level}_y'], axis=1). \
                  rename(columns={"s1.species_scientific_name_x": "s1.species_scientific_name", f"c.{level}_x": f"c.{level}"})
      
      df_jaccard_all['union'] = df_jaccard_all['total_specific_genes_species_sp1'] + df_jaccard_all['total_specific_genes_species_sp2']  ## this is not really a jaccard index because intersection happens at OG level, not gene level
      # it is more like for genes in the same OG / total number of genes
      df_jaccard_all['jaccard'] = df_jaccard_all['num_genes_total_same_og'] / df_jaccard_all['union']
      df_jaccard_all.to_csv(f"results/taxo_cs/jaccard_cell_type_enriched_all_to_all_all_datasets_{level}.csv", index=False)

In [22]:
cell_type_col="cell_type_name"

query_per_species_enriched = f"""MATCH (s1:Species)<-[:GeneFromSpecies]-(g:Gene)-[:GeneInOrthologousGroup]->(og:OrthologousGroup),
    (g)-[enr:GeneEnrichedInCellType]->(c:CellType)
    RETURN s1.species_scientific_name, c.{cell_type_col}, og.id, og.eggnog_dataset_name, g.external_gene_name, g.id, g.ensembl_peptide_id, enr.specificity_score
    ORDER BY s1.species_scientific_name, c.{cell_type_col}"""

with driver.session(database='broadtaxo') as session:
    result_query_per_species_enriched = session.run(query_per_species_enriched)
    df_per_species_enriched = result_query_per_species_enriched.to_df()

df_per_species_enriched.head()

Unnamed: 0,s1.species_scientific_name,c.cell_type_name,og.id,og.eggnog_dataset_name,g.external_gene_name,g.id,g.ensembl_peptide_id,enr.specificity_score
0,cjacchus,Astro,8Z904,mammalia,WIF1,ENSCJAG00000014629,ENSCJAP00000026960,10.065118789672852
1,cjacchus,Astro,4ZYDU,primates,WIF1,ENSCJAG00000014629,ENSCJAP00000026960,10.065118789672852
2,cjacchus,Astro,HVKHF,metazoa,WIF1,ENSCJAG00000014629,ENSCJAP00000026960,10.065118789672852
3,cjacchus,Astro,9G6H5,vertebrata,WIF1,ENSCJAG00000014629,ENSCJAP00000026960,10.065118789672852
4,cjacchus,Astro,H3WA5,bilateria,WIF1,ENSCJAG00000014629,ENSCJAP00000026960,10.065118789672852


In [23]:
df_per_species_enriched.to_csv("results/taxo_cs/per_species_enriched.csv", index=False)

In [24]:
cell_type_col="cell_type_name"

query_per_species_enhanced = f"""MATCH (s1:Species)<-[:GeneFromSpecies]-(g:Gene)-[:GeneInOrthologousGroup]->(og:OrthologousGroup),
    (g)-[enh:GeneEnhancedInCellType]->(c:CellType)
    RETURN s1.species_scientific_name, c.{cell_type_col}, og.id, og.eggnog_dataset_name, g.external_gene_name, g.id, g.ensembl_peptide_id, enh.specificity_score
    ORDER BY s1.species_scientific_name, c.{cell_type_col}"""

with driver.session(database='broadtaxo') as session:
    result_query_per_species_enhanced = session.run(query_per_species_enhanced)
    df_per_species_enhanced = result_query_per_species_enhanced.to_df()

df_per_species_enhanced.head()

Unnamed: 0,s1.species_scientific_name,c.cell_type_name,og.id,og.eggnog_dataset_name,g.external_gene_name,g.id,g.ensembl_peptide_id,enh.specificity_score
0,cjacchus,Astro,8ZMK0,mammalia,TTPA,ENSCJAG00000009205,ENSCJAP00000016858,4.600818157196045
1,cjacchus,Astro,4ZUQ6,primates,TTPA,ENSCJAG00000009205,ENSCJAP00000016858,4.600818157196045
2,cjacchus,Astro,HVK72,metazoa,TTPA,ENSCJAG00000009205,ENSCJAP00000016858,4.600818157196045
3,cjacchus,Astro,9GCAS,vertebrata,TTPA,ENSCJAG00000009205,ENSCJAP00000016858,4.600818157196045
4,cjacchus,Astro,H4TR6,bilateria,TTPA,ENSCJAG00000009205,ENSCJAP00000016858,4.600818157196045


In [25]:
df_per_species_enhanced.to_csv("results/taxo_cs/per_species_enhanced.csv", index=False)

In [27]:
df_ct

Unnamed: 0,s1.species_scientific_name,s2.species_scientific_name,c.cell_type_name,intersection
0,cjacchus,ggorilla,CGE-derived,152
1,cjacchus,ggorilla,Micro-PVM,235
2,cjacchus,ggorilla,VLMC,274
3,cjacchus,ggorilla,OPC,107
4,cjacchus,ggorilla,Non-IT,120
...,...,...,...,...
195,ptroglodytes,mmulatta,Astro,233
196,ptroglodytes,mmulatta,Non-IT,121
197,ptroglodytes,mmulatta,IT,114
198,ptroglodytes,mmulatta,CGE-derived,169


In [28]:
data_final = pd.DataFrame()

for cell_type_now in df_ct.iloc[:, 2]:
    print(cell_type_now)

    # get all cell type senriched genes
    query="""MATCH (s1:Species)<-[:CellTypeFromSpecies]-(ct:CellType {cell_type_name: '""" + cell_type_now + """'})<-[:GeneEnrichedInCellType]-(g3:Gene)-[:GeneFromSpecies]->(s1)
    WITH s1,  COUNT(DISTINCT g3) AS total1, ct
    RETURN s1.species_scientific_name, total1"""

    with driver.session(database='broadtaxo') as session:
        result = session.run(query)
        df = result.to_df()

    # get cell type enriched genes in same OG
    query2="""MATCH (s1:Species)<-[:CellTypeFromSpecies]-(ct:CellType {cell_type_name: '""" + cell_type_now + """'})<-[:GeneEnrichedInCellType]-(g1:Gene)-[:GeneInOrthologousGroup]->(gf:OrthologousGroup),
        (g1:Gene)-[:GeneFromSpecies]->(s1),
        (s2:Species)<-[:CellTypeFromSpecies]-(ct)<-[:GeneEnrichedInCellType]-(g2:Gene)-[:GeneInOrthologousGroup]->(gf:OrthologousGroup),
        (g2:Gene)-[:GeneFromSpecies]->(s2)
    WHERE s1 <> s2
    WITH s1, s2, COUNT(DISTINCT g1) AS genes1, COUNT(DISTINCT g2) AS genes2, ct
    RETURN s1.species_scientific_name, s2.species_scientific_name, genes1, genes2"""

    with driver.session(database='broadtaxo') as session:
        result2 = session.run(query2)
        df2 = result2.to_df()

    # calculate pct and hmean
    df_new = df.merge(df2)
    df_new.columns = ['sp1', 'sp1_total_enriched', 'sp2', 'num_enriched_share_og_sp1', 'num_enriched_share_og_sp2']
    df_all = df_new.merge(df, left_on='sp2', right_on='s1.species_scientific_name').drop('s1.species_scientific_name', axis=1)
    df_all = df_all.rename(columns = {'total1':'sp2_total_enriched'})
    df_all['sp1_pct'] = df_all['num_enriched_share_og_sp1'] / df_all['sp1_total_enriched']
    df_all['sp2_pct'] = df_all['num_enriched_share_og_sp2'] / df_all['sp2_total_enriched']
    df_all['hmean_pct'] = df_all.apply(lambda x: harmonic_mean([x['sp1_pct'], x['sp2_pct']]), axis=1)
    df_all['cell_type'] = cell_type_now
    df_all['specifcity_category'] = 'gene enriched in cell type'

    if data_final.shape[0] == 0:
        data_final = df_all
    else:
        data_final = pd.concat([data_final, df_all]).reset_index(drop=True)


    
    


    

CGE-derived
Micro-PVM
VLMC
OPC
Non-IT
MGE-derived
IT
Oligo
Astro
Endo
Micro-PVM
Oligo
OPC
MGE-derived
CGE-derived
Astro
Endo
VLMC
Non-IT
IT
Micro-PVM
Endo
OPC
VLMC
Oligo
Astro
MGE-derived
IT
Non-IT
CGE-derived
CGE-derived
Micro-PVM
OPC
VLMC
Endo
Non-IT
MGE-derived
IT
Astro
Oligo
CGE-derived
Micro-PVM
VLMC
OPC
Non-IT
MGE-derived
IT
Oligo
Astro
Endo
Micro-PVM
Oligo
Non-IT
OPC
Astro
VLMC
MGE-derived
IT
CGE-derived
Endo
Micro-PVM
Oligo
OPC
VLMC
Non-IT
Astro
Endo
MGE-derived
IT
CGE-derived
CGE-derived
Micro-PVM
OPC
Astro
VLMC
Non-IT
MGE-derived
IT
Endo
Oligo
Micro-PVM
Oligo
OPC
MGE-derived
CGE-derived
Astro
Endo
VLMC
Non-IT
IT
Micro-PVM
Oligo
Non-IT
OPC
Astro
VLMC
MGE-derived
IT
CGE-derived
Endo
Micro-PVM
Oligo
Non-IT
MGE-derived
IT
CGE-derived
Endo
OPC
VLMC
Astro
Micro-PVM
OPC
Astro
VLMC
Non-IT
MGE-derived
IT
CGE-derived
Endo
Oligo
Micro-PVM
Endo
OPC
VLMC
Oligo
Astro
MGE-derived
IT
Non-IT
CGE-derived
Micro-PVM
Oligo
OPC
VLMC
Non-IT
Astro
Endo
MGE-derived
IT
CGE-derived
Micro-PVM
Oligo
Non-

data_final.head(10)

In [29]:
data_final.groupby(['cell_type']).apply(lambda x: x.sort_values(['hmean_pct'], ascending=False))[['sp1', 'sp2', 'hmean_pct', 'cell_type']].drop_duplicates('hmean_pct').reset_index(drop=True).to_csv("results/taxo_cs/enriched_genes_hmean_pct.csv")


In [31]:
data_final_enhanced = pd.DataFrame()

for cell_type_now in df_ct.iloc[:, 2]:
    print(cell_type_now)

    # get all cell type enriched genes
    query="""MATCH (s1:Species)<-[:CellTypeFromSpecies]-(ct:CellType {cell_type_name: '""" + cell_type_now + """'})<-[:GeneEnhancedInCellType]-(g3:Gene)-[:GeneFromSpecies]->(s1)
    WITH s1,  COUNT(DISTINCT g3) AS total1, ct
    RETURN s1.species_scientific_name, total1"""

    with driver.session(database='broadtaxo') as session:
        result = session.run(query)
        df = result.to_df()

    # get cell type enriched genes in same OG
    query2="""MATCH (s1:Species)<-[:CellTypeFromSpecies]-(ct:CellType {cell_type_name: '""" + cell_type_now + """'})<-[:GeneEnhancedInCellType]-(g1:Gene)-[:GeneInOrthologousGroup]->(gf:OrthologousGroup),
        (g1:Gene)-[:GeneFromSpecies]->(s1),
        (s2:Species)<-[:CellTypeFromSpecies]-(ct)<-[:GeneEnhancedInCellType]-(g2:Gene)-[:GeneInOrthologousGroup]->(gf:OrthologousGroup),
        (g2:Gene)-[:GeneFromSpecies]->(s2)
    WHERE s1 <> s2
    WITH s1, s2, COUNT(DISTINCT g1) AS genes1, COUNT(DISTINCT g2) AS genes2, ct
    RETURN s1.species_scientific_name, s2.species_scientific_name, genes1, genes2"""

    with driver.session(database='broadtaxo') as session:
        result2 = session.run(query2)
        df2 = result2.to_df()

    # calculate pct and hmean
    df_new = df.merge(df2)
    df_new.columns = ['sp1', 'sp1_total_enriched', 'sp2', 'num_enriched_share_og_sp1', 'num_enriched_share_og_sp2']
    df_all = df_new.merge(df, left_on='sp2', right_on='s1.species_scientific_name').drop('s1.species_scientific_name', axis=1)
    df_all = df_all.rename(columns = {'total1':'sp2_total_enriched'})
    df_all['sp1_pct'] = df_all['num_enriched_share_og_sp1'] / df_all['sp1_total_enriched']
    df_all['sp2_pct'] = df_all['num_enriched_share_og_sp2'] / df_all['sp2_total_enriched']
    df_all['hmean_pct'] = df_all.apply(lambda x: harmonic_mean([x['sp1_pct'], x['sp2_pct']]), axis=1)
    df_all['cell_type'] = cell_type_now
    df_all['specifcity_category'] = 'gene enhanced in cell type'

    if data_final.shape[0] == 0:
        data_final_enhanced = df_all
    else:
        data_final_enhanced = pd.concat([data_final_enhanced, df_all]).reset_index(drop=True)

CGE-derived
Micro-PVM
VLMC
OPC
Non-IT
MGE-derived
IT
Oligo
Astro
Endo
Micro-PVM
Oligo
OPC
MGE-derived
CGE-derived
Astro
Endo
VLMC
Non-IT
IT
Micro-PVM
Endo
OPC
VLMC
Oligo
Astro
MGE-derived
IT
Non-IT
CGE-derived
CGE-derived
Micro-PVM
OPC
VLMC
Endo
Non-IT
MGE-derived
IT
Astro
Oligo
CGE-derived
Micro-PVM
VLMC
OPC
Non-IT
MGE-derived
IT
Oligo
Astro
Endo
Micro-PVM
Oligo
Non-IT
OPC
Astro
VLMC
MGE-derived
IT
CGE-derived
Endo
Micro-PVM
Oligo
OPC
VLMC
Non-IT
Astro
Endo
MGE-derived
IT
CGE-derived
CGE-derived
Micro-PVM
OPC
Astro
VLMC
Non-IT
MGE-derived
IT
Endo
Oligo
Micro-PVM
Oligo
OPC
MGE-derived
CGE-derived
Astro
Endo
VLMC
Non-IT
IT
Micro-PVM
Oligo
Non-IT
OPC
Astro
VLMC
MGE-derived
IT
CGE-derived
Endo
Micro-PVM
Oligo
Non-IT
MGE-derived
IT
CGE-derived
Endo
OPC
VLMC
Astro
Micro-PVM
OPC
Astro
VLMC
Non-IT
MGE-derived
IT
CGE-derived
Endo
Oligo
Micro-PVM
Endo
OPC
VLMC
Oligo
Astro
MGE-derived
IT
Non-IT
CGE-derived
Micro-PVM
Oligo
OPC
VLMC
Non-IT
Astro
Endo
MGE-derived
IT
CGE-derived
Micro-PVM
Oligo
Non-

In [32]:
pd.concat([data_final, data_final_enhanced]).reset_index(drop=True).to_csv("results/taxo_cs/sum_enhanced_enriched_1TPM.csv")

In [33]:
data_final_ct = pd.DataFrame()

for cell_type_now in df_ct.iloc[:, 2]:
    print(cell_type_now)


    query1="""MATCH (s1:Species)<-[:CellTypeFromSpecies]-(ct:CellType {cell_type_name: '""" + cell_type_now + """'})<-[:GeneEnhancedInCellType {specificity_category: 'cell type enhanced'}]-(g3:Gene)-[:GeneFromSpecies]->(s1)
    WITH s1,  COUNT(DISTINCT g3) AS total1, ct
    RETURN s1.species_scientific_name, total1"""

    with driver.session(database='broadtaxo') as session:
        result1 = session.run(query1)
        df1 = result1.to_df()

    df1['specificity_category'] = 'cell type enhanced'
    df1['cell_type'] = cell_type_now


    query2="""MATCH (s1:Species)<-[:CellTypeFromSpecies]-(ct:CellType {cell_type_name: '""" + cell_type_now + """'})<-[:GeneEnrichedInCellType {specificity_category: 'cell type enriched'}]-(g3:Gene)-[:GeneFromSpecies]->(s1)
    WITH s1,  COUNT(DISTINCT g3) AS total1, ct
    RETURN s1.species_scientific_name, total1"""

    with driver.session(database='broadtaxo') as session:
        result2 = session.run(query2)
        df2 = result2.to_df()

    df2['specificity_category'] = 'cell type enriched'
    df2['cell_type'] = cell_type_now

    query3="""MATCH (s1:Species)<-[:CellTypeFromSpecies]-(ct:CellType {cell_type_name: '""" + cell_type_now + """'})<-[:GeneEnhancedInCellType {specificity_category: 'group enhanced'}]-(g3:Gene)-[:GeneFromSpecies]->(s1)
    WITH s1,  COUNT(DISTINCT g3) AS total1, ct
    RETURN s1.species_scientific_name, total1"""

    with driver.session(database='broadtaxo') as session:
        result3 = session.run(query3)
        df3 = result3.to_df()

    df3['specificity_category'] = 'group enhanced'
    df3['cell_type'] = cell_type_now


    query4="""MATCH (s1:Species)<-[:CellTypeFromSpecies]-(ct:CellType {cell_type_name: '""" + cell_type_now + """'})<-[:GeneEnrichedInCellType {specificity_category: 'group enriched'}]-(g3:Gene)-[:GeneFromSpecies]->(s1)
    WITH s1,  COUNT(DISTINCT g3) AS total1, ct
    RETURN s1.species_scientific_name, total1"""

    with driver.session(database='broadtaxo') as session:
        result4 = session.run(query4)
        df4 = result4.to_df()

    df4['specificity_category'] = 'group enriched'
    df4['cell_type'] = cell_type_now


    df_all=pd.concat([df1, df2, df3, df4]).reset_index(drop=True)

    if data_final_ct.shape[0] == 0:
        data_final_ct = df_all
    else:
        data_final_ct= pd.concat([data_final_ct, df_all]).reset_index(drop=True)
   

CGE-derived
Micro-PVM
VLMC
OPC
Non-IT
MGE-derived
IT
Oligo
Astro
Endo
Micro-PVM
Oligo
OPC
MGE-derived
CGE-derived
Astro
Endo
VLMC
Non-IT
IT
Micro-PVM
Endo
OPC
VLMC
Oligo
Astro
MGE-derived
IT
Non-IT
CGE-derived
CGE-derived
Micro-PVM
OPC
VLMC
Endo
Non-IT
MGE-derived
IT
Astro
Oligo
CGE-derived
Micro-PVM
VLMC
OPC
Non-IT
MGE-derived
IT
Oligo
Astro
Endo
Micro-PVM
Oligo
Non-IT
OPC
Astro
VLMC
MGE-derived
IT
CGE-derived
Endo
Micro-PVM
Oligo
OPC
VLMC
Non-IT
Astro
Endo
MGE-derived
IT
CGE-derived
CGE-derived
Micro-PVM
OPC
Astro
VLMC
Non-IT
MGE-derived
IT
Endo
Oligo
Micro-PVM
Oligo
OPC
MGE-derived
CGE-derived
Astro
Endo
VLMC
Non-IT
IT
Micro-PVM
Oligo
Non-IT
OPC
Astro
VLMC
MGE-derived
IT
CGE-derived
Endo
Micro-PVM
Oligo
Non-IT
MGE-derived
IT
CGE-derived
Endo
OPC
VLMC
Astro
Micro-PVM
OPC
Astro
VLMC
Non-IT
MGE-derived
IT
CGE-derived
Endo
Oligo
Micro-PVM
Endo
OPC
VLMC
Oligo
Astro
MGE-derived
IT
Non-IT
CGE-derived
Micro-PVM
Oligo
OPC
VLMC
Non-IT
Astro
Endo
MGE-derived
IT
CGE-derived
Micro-PVM
Oligo
Non-

In [34]:
data_final_ct.drop_duplicates().to_csv("results/taxo_cs/sum_enhanced_enriched_1TPM_by_category.csv")