In [17]:
import os
from neo4j import GraphDatabase
from warnings import filterwarnings
filterwarnings("ignore")
import pandas as pd
from statistics import harmonic_mean


In [18]:
# Let pandas render up to 500 rows before truncating displayed frames.
pd.options.display.max_rows = 500

In [19]:

# URI examples: "neo4j://localhost", "neo4j+s://xxx.databases.neo4j.io"
# Connection settings may be overridden through environment variables so that
# real credentials do not have to live in the notebook; the fallbacks are the
# values this notebook has always used.
URI = os.environ.get("NEO4J_URI", "neo4j://localhost:7687")
AUTH = (
    os.environ.get("NEO4J_USER", "test"),
    os.environ.get("NEO4J_PASSWORD", "666666"),
)

# The driver is intentionally NOT created in a `with` block: leaving such a
# block closes the driver, yet every later cell calls `driver.session()`.
# Call `driver.close()` once the analysis is finished.
driver = GraphDatabase.driver(URI, auth=AUTH)
try:
    # Fail early (and release the driver) if the server cannot be reached.
    driver.verify_connectivity()
except Exception:
    driver.close()
    raise

In [20]:
# Cell type used by the single-cell-type queries in the following cells.
cell_type_now = "astrocyte of the cerebral cortex"

In [22]:
# Per species: number of distinct orthologous groups that contain a gene
# enriched in the cell type named by `cell_type_now`.
# NOTE(review): the cell type name is interpolated into the Cypher text, so a
# name containing a single quote would break the query.
query_jaccard = f"""
MATCH (ct:CellType {{cell_type_name: '{cell_type_now}'}})<-[:GeneEnrichedInCellType]-(g:Gene)
WITH g
MATCH (s1:Species)<-[:GeneFromSpecies]-(g:Gene)-[:GeneInOrthologousGroup]->(og:OrthologousGroup)
RETURN s1.species_scientific_name, count(DISTINCT og) as num_og"""

In [23]:

# Execute the per-species OG count query and load the records into a DataFrame.
with driver.session() as session:
    df_ct = session.run(query_jaccard).to_df()

In [24]:
df_ct

Unnamed: 0,s1.species_scientific_name,num_og
0,ptroglodytes,447
1,ggorilla,437
2,cjacchus,297
3,mmulatta,400
4,hsapiens,445


In [25]:
# For the selected cell type and every ordered pair of distinct species, count
#   genes1 / genes2 - distinct enriched genes of s1 / s2 that sit in an
#                     orthologous group also holding an enriched gene of the
#                     other species,
#   num_og          - distinct orthologous groups shared that way.
# The cell type name is passed as the Cypher parameter $cell_type_name rather
# than concatenated into the query text, so quotes in a name cannot break or
# alter the query.
query2 = """MATCH (s1:Species)<-[:CellTypeFromSpecies]-(ct:CellType {cell_type_name: $cell_type_name})<-[:GeneEnrichedInCellType]-(g1:Gene)-[:GeneInOrthologousGroup]->(gf:OrthologousGroup),
    (g1:Gene)-[:GeneFromSpecies]->(s1),
    (s2:Species)<-[:CellTypeFromSpecies]-(ct)<-[:GeneEnrichedInCellType]-(g2:Gene)-[:GeneInOrthologousGroup]->(gf:OrthologousGroup),
    (g2:Gene)-[:GeneFromSpecies]->(s2)
WHERE s1 <> s2
WITH s1, s2, COUNT(DISTINCT g1) AS genes1, COUNT(DISTINCT g2) AS genes2, ct, count(DISTINCT gf) AS num_og
RETURN s1.species_scientific_name, s2.species_scientific_name, genes1, genes2, num_og"""

with driver.session() as session:
    result2 = session.run(query2, cell_type_name=cell_type_now)
    df2 = result2.to_df()


In [26]:
df2

Unnamed: 0,s1.species_scientific_name,s2.species_scientific_name,genes1,genes2,num_og
0,ptroglodytes,ggorilla,238,233,229
1,ptroglodytes,mmulatta,195,195,188
2,ptroglodytes,hsapiens,226,224,218
3,ptroglodytes,cjacchus,129,126,125
4,ggorilla,ptroglodytes,233,238,229
5,ggorilla,cjacchus,124,122,122
6,ggorilla,mmulatta,183,183,180
7,ggorilla,hsapiens,216,218,212
8,cjacchus,ggorilla,122,124,122
9,cjacchus,mmulatta,127,131,126


In [27]:
# Intersection term of the Jaccard index: for every ordered pair of distinct
# species and every cell type, the number of distinct orthologous groups that
# contain a gene enriched in that cell type from BOTH species.
# Species are compared directly (`s1 <> s2`) instead of through the deprecated
# `id()` function, and `count(DISTINCT ...)` replaces `size(collect(DISTINCT ...))`,
# matching the other queries in this notebook.
query_ct = """
MATCH (s1:Species)<-[:GeneFromSpecies]-(g:Gene)-[:GeneInOrthologousGroup]->(og:OrthologousGroup)<-[:GeneInOrthologousGroup]-(g2:Gene)-[:GeneFromSpecies]->(s2:Species),
(g)-[:GeneEnrichedInCellType]->(c:CellType)<-[:GeneEnrichedInCellType]-(g2)
WHERE s1 <> s2
RETURN s1.species_scientific_name, s2.species_scientific_name, c.cell_type_name, count(DISTINCT og) AS intersection
ORDER BY s1.species_scientific_name, s2.species_scientific_name
"""


with driver.session() as session:
    result_ct = session.run(query_ct)
    df_ct = result_ct.to_df()

In [28]:
df_ct

Unnamed: 0,s1.species_scientific_name,s2.species_scientific_name,c.cell_type_name,intersection
0,cjacchus,ggorilla,astrocyte of the cerebral cortex,122
1,cjacchus,ggorilla,oligodendrocyte,90
2,cjacchus,ggorilla,vascular leptomeningeal cell,210
3,cjacchus,ggorilla,microglial cell,213
4,cjacchus,ggorilla,oligodendrocyte precursor cell,63
5,cjacchus,ggorilla,cerebral cortex endothelial cell,266
6,cjacchus,ggorilla,near-projecting glutamatergic cortical neuron,11
7,cjacchus,ggorilla,corticothalamic-projecting glutamatergic corti...,7
8,cjacchus,ggorilla,L6b glutamatergic cortical neuron,8
9,cjacchus,ggorilla,L5 extratelencephalic projecting glutamatergic...,11


In [29]:
# Per-species terms of the Jaccard index: for every ordered pair of distinct
# species and every cell type in which both have an enriched gene with an
# orthologous group, count the distinct groups of s1's enriched genes (og_1)
# and of s2's enriched genes (og_2).
# Species are compared directly (`s1 <> s2`) instead of through the deprecated
# `id()` function, and `count(DISTINCT ...)` replaces `size(collect(DISTINCT ...))`,
# matching the other queries in this notebook.
# NOTE(review): g1/og1 and g2/og2 are only linked through the cell type, so the
# pattern enumerates every combination of the two sides before aggregating;
# this may be slow on large graphs.
query_union = """MATCH (s1:Species)<-[:GeneFromSpecies]-(g1:Gene)-[:GeneInOrthologousGroup]->(og1:OrthologousGroup),
(s2:Species)<-[:GeneFromSpecies]-(g2:Gene)-[:GeneInOrthologousGroup]->(og2:OrthologousGroup),
(g1)-[:GeneEnrichedInCellType]->(c:CellType)<-[:GeneEnrichedInCellType]-(g2)
WHERE s1 <> s2
RETURN s1.species_scientific_name, s2.species_scientific_name, c.cell_type_name, count(DISTINCT og1) AS og_1, count(DISTINCT og2) AS og_2
ORDER BY s1.species_scientific_name, s2.species_scientific_name
"""


In [30]:
# Run the per-species OG count query and keep the records as a DataFrame.
with driver.session() as session:
    df_union = session.run(query_union).to_df()

# Show the result.
df_union

Unnamed: 0,s1.species_scientific_name,s2.species_scientific_name,c.cell_type_name,og_1,og_2
0,cjacchus,ggorilla,astrocyte of the cerebral cortex,297,437
1,cjacchus,ggorilla,oligodendrocyte,253,340
2,cjacchus,ggorilla,vascular leptomeningeal cell,569,753
3,cjacchus,ggorilla,microglial cell,601,756
4,cjacchus,ggorilla,oligodendrocyte precursor cell,195,235
5,cjacchus,ggorilla,cerebral cortex endothelial cell,646,824
6,cjacchus,ggorilla,near-projecting glutamatergic cortical neuron,53,84
7,cjacchus,ggorilla,corticothalamic-projecting glutamatergic corti...,33,55
8,cjacchus,ggorilla,L6b glutamatergic cortical neuron,50,67
9,cjacchus,ggorilla,L5 extratelencephalic projecting glutamatergic...,44,71


In [31]:
# Combine the shared-OG counts (df_ct) with the per-species OG counts (df_union).
# df_ct only has rows for pairs sharing at least one orthologous group, so an
# inner merge would silently drop pairs whose Jaccard index is 0. Keep every
# row of df_union and treat a missing intersection as 0 instead.
jaccard_keys = ['s1.species_scientific_name', 's2.species_scientific_name', 'c.cell_type_name']
df_jaccard = df_ct.merge(df_union, how='right', on=jaccard_keys)
df_jaccard['intersection'] = df_jaccard['intersection'].fillna(0).astype(int)

In [32]:
# |A ∪ B| = |A| + |B| - |A ∩ B|
df_jaccard['union'] = (
    df_jaccard['og_1']
    .add(df_jaccard['og_2'])
    .sub(df_jaccard['intersection'])
)

In [33]:
# Jaccard index = |A ∩ B| / |A ∪ B|
df_jaccard['jaccard'] = df_jaccard['intersection'].div(df_jaccard['union'])

In [38]:
df_jaccard.sort_values(by=['jaccard'], ascending=False)

Unnamed: 0,s1.species_scientific_name,s2.species_scientific_name,c.cell_type_name,intersection,og_1,og_2,union,jaccard
358,ptroglodytes,ggorilla,oligodendrocyte,186,347,340,501,0.371257
148,ggorilla,ptroglodytes,oligodendrocyte,186,340,347,501,0.371257
379,ptroglodytes,hsapiens,oligodendrocyte,199,347,405,553,0.359855
232,hsapiens,ptroglodytes,oligodendrocyte,199,405,347,553,0.359855
106,ggorilla,hsapiens,oligodendrocyte,193,340,405,552,0.349638
190,hsapiens,ggorilla,oligodendrocyte,193,405,340,552,0.349638
147,ggorilla,ptroglodytes,astrocyte of the cerebral cortex,229,437,447,655,0.349618
357,ptroglodytes,ggorilla,astrocyte of the cerebral cortex,229,447,437,655,0.349618
360,ptroglodytes,ggorilla,microglial cell,410,871,756,1217,0.336894
150,ggorilla,ptroglodytes,microglial cell,410,756,871,1217,0.336894


In [39]:
# Write the Jaccard table to CSV, highest similarity first.
(
    df_jaccard
    .sort_values(by='jaccard', ascending=False)
    .to_csv("cell_type_og_jaccard_1TPM.csv")
)

In [14]:
# Fetch the name of every CellType node. From here on `df_ct` holds this list
# of cell types (it replaces the frame of the same name built earlier).
query_ct = (
    "MATCH (n:CellType)\n"
    "RETURN n.cell_type_name"
)

with driver.session() as session:
    df_ct = session.run(query_ct).to_df()

print(df_ct)

                                     n.cell_type_name
0                    astrocyte of the cerebral cortex
1                                     oligodendrocyte
2                        vascular leptomeningeal cell
3                                     microglial cell
4                      oligodendrocyte precursor cell
5                    cerebral cortex endothelial cell
6       near-projecting glutamatergic cortical neuron
7   corticothalamic-projecting glutamatergic corti...
8                   L6b glutamatergic cortical neuron
9   L5 extratelencephalic projecting glutamatergic...
10  caudal ganglionic eminence derived GABAergic c...
11                 vip GABAergic cortical interneuron
12                sncg GABAergic cortical interneuron
13               lamp5 GABAergic cortical interneuron
14                 sst GABAergic cortical interneuron
15               pvalb GABAergic cortical interneuron
16    chandelier pvalb GABAergic cortical interneuron
17  L5 intratelencephalic pr

In [6]:
# Take the first cell type returned by the listing query as the worked example.
cell_type_now = df_ct.iat[0, 0]
cell_type_now

'astrocyte of the cerebral cortex'

In [7]:
# Per species: number of distinct genes enriched in the selected cell type.
# The cell type name is passed as the Cypher parameter $cell_type_name rather
# than concatenated into the query text, so quotes in a name cannot break or
# alter the query.
query = """MATCH (s1:Species)<-[:CellTypeFromSpecies]-(ct:CellType {cell_type_name: $cell_type_name})<-[:GeneEnrichedInCellType]-(g3:Gene)-[:GeneFromSpecies]->(s1)
WITH s1,  COUNT(DISTINCT g3) AS total1, ct
RETURN s1.species_scientific_name, total1"""

with driver.session() as session:
    result = session.run(query, cell_type_name=cell_type_now)
    df = result.to_df()

print(df)

  s1.species_scientific_name  total1
0                   cjacchus     221
1                   mmulatta     295
2                   hsapiens     320
3                   ggorilla     312
4               ptroglodytes     303


In [8]:
# For the selected cell type and every ordered pair of distinct species, count
# the distinct enriched genes of s1 (genes1) and of s2 (genes2) that sit in an
# orthologous group also holding an enriched gene of the other species.
# The cell type name is passed as the Cypher parameter $cell_type_name rather
# than concatenated into the query text, so quotes in a name cannot break or
# alter the query.
query2 = """MATCH (s1:Species)<-[:CellTypeFromSpecies]-(ct:CellType {cell_type_name: $cell_type_name})<-[:GeneEnrichedInCellType]-(g1:Gene)-[:GeneInOrthologousGroup]->(gf:OrthologousGroup),
    (g1:Gene)-[:GeneFromSpecies]->(s1),
    (s2:Species)<-[:CellTypeFromSpecies]-(ct)<-[:GeneEnrichedInCellType]-(g2:Gene)-[:GeneInOrthologousGroup]->(gf:OrthologousGroup),
    (g2:Gene)-[:GeneFromSpecies]->(s2)
WHERE s1 <> s2
WITH s1, s2, COUNT(DISTINCT g1) AS genes1, COUNT(DISTINCT g2) AS genes2, ct
RETURN s1.species_scientific_name, s2.species_scientific_name, genes1, genes2"""

with driver.session() as session:
    result2 = session.run(query2, cell_type_name=cell_type_now)
    df2 = result2.to_df()

print(df2)

   s1.species_scientific_name s2.species_scientific_name  genes1  genes2
0                    cjacchus               ptroglodytes      93      94
1                    cjacchus                   ggorilla      93      94
2                    cjacchus                   mmulatta      94      96
3                    cjacchus                   hsapiens      89      92
4                    mmulatta                   ggorilla     135     134
5                    mmulatta                   hsapiens     147     147
6                    mmulatta               ptroglodytes     141     139
7                    mmulatta                   cjacchus      96      94
8                    hsapiens               ptroglodytes     167     166
9                    hsapiens                   ggorilla     148     147
10                   hsapiens                   cjacchus      92      89
11                   hsapiens                   mmulatta     147     147
12                   ggorilla                   hsa

In [9]:
# Attach each species' total enriched-gene count to its pairwise rows
# (joined on the columns the two frames have in common).
df_new = pd.merge(df, df2)

In [10]:
df_new

Unnamed: 0,s1.species_scientific_name,total1,s2.species_scientific_name,genes1,genes2
0,cjacchus,221,ptroglodytes,93,94
1,cjacchus,221,ggorilla,93,94
2,cjacchus,221,mmulatta,94,96
3,cjacchus,221,hsapiens,89,92
4,mmulatta,295,ggorilla,135,134
5,mmulatta,295,hsapiens,147,147
6,mmulatta,295,ptroglodytes,141,139
7,mmulatta,295,cjacchus,96,94
8,hsapiens,320,ptroglodytes,167,166
9,hsapiens,320,ggorilla,148,147


In [11]:
# Give the merged columns descriptive names (assigned by position).
df_new = df_new.set_axis(
    [
        'sp1',
        'sp1_total_enriched',
        'sp2',
        'num_enriched_share_og_sp1',
        'num_enriched_share_og_sp2',
    ],
    axis=1,
)

In [12]:
df2

Unnamed: 0,s1.species_scientific_name,s2.species_scientific_name,genes1,genes2
0,cjacchus,ptroglodytes,93,94
1,cjacchus,ggorilla,93,94
2,cjacchus,mmulatta,94,96
3,cjacchus,hsapiens,89,92
4,mmulatta,ggorilla,135,134
5,mmulatta,hsapiens,147,147
6,mmulatta,ptroglodytes,141,139
7,mmulatta,cjacchus,96,94
8,hsapiens,ptroglodytes,167,166
9,hsapiens,ggorilla,148,147


In [13]:
# Look up the second species' total enriched-gene count, then discard the
# duplicate species-name column brought in by the join.
df_all = (
    df_new
    .merge(df, left_on='sp2', right_on='s1.species_scientific_name')
    .drop(columns='s1.species_scientific_name')
)

In [14]:
# The total joined in for the second species gets its final column name.
df_all = df_all.rename({'total1': 'sp2_total_enriched'}, axis='columns')

In [15]:
df_all

Unnamed: 0,sp1,sp1_total_enriched,sp2,num_enriched_share_og_sp1,num_enriched_share_og_sp2,sp2_total_enriched
0,cjacchus,221,ptroglodytes,93,94,303
1,mmulatta,295,ptroglodytes,141,139,303
2,hsapiens,320,ptroglodytes,167,166,303
3,ggorilla,312,ptroglodytes,162,165,303
4,cjacchus,221,ggorilla,93,94,312
5,mmulatta,295,ggorilla,135,134,312
6,hsapiens,320,ggorilla,148,147,312
7,ptroglodytes,303,ggorilla,165,162,312
8,cjacchus,221,mmulatta,94,96,295
9,hsapiens,320,mmulatta,147,147,295


In [16]:
# Fraction of each species' enriched genes that share an orthologous group
# with an enriched gene of the other species.
for prefix in ('sp1', 'sp2'):
    df_all[f'{prefix}_pct'] = df_all[f'num_enriched_share_og_{prefix}'].div(
        df_all[f'{prefix}_total_enriched']
    )

In [17]:
# Harmonic mean of the two per-species fractions, evaluated row by row.
df_all['hmean_pct'] = df_all[['sp1_pct', 'sp2_pct']].apply(
    lambda pcts: harmonic_mean([pcts['sp1_pct'], pcts['sp2_pct']]),
    axis=1,
)

In [18]:
df_all

Unnamed: 0,sp1,sp1_total_enriched,sp2,num_enriched_share_og_sp1,num_enriched_share_og_sp2,sp2_total_enriched,sp1_pct,sp2_pct,hmean_pct
0,cjacchus,221,ptroglodytes,93,94,303,0.420814,0.310231,0.357159
1,mmulatta,295,ptroglodytes,141,139,303,0.477966,0.458746,0.468159
2,hsapiens,320,ptroglodytes,167,166,303,0.521875,0.547855,0.534549
3,ggorilla,312,ptroglodytes,162,165,303,0.519231,0.544554,0.531591
4,cjacchus,221,ggorilla,93,94,312,0.420814,0.301282,0.351155
5,mmulatta,295,ggorilla,135,134,312,0.457627,0.429487,0.443111
6,hsapiens,320,ggorilla,148,147,312,0.4625,0.471154,0.466787
7,ptroglodytes,303,ggorilla,165,162,312,0.544554,0.519231,0.531591
8,cjacchus,221,mmulatta,94,96,295,0.425339,0.325424,0.368733
9,hsapiens,320,mmulatta,147,147,295,0.459375,0.498305,0.478049


In [19]:
# Rank species pairs by harmonic mean; rows with an identical score (the
# (A, B) / (B, A) mirror rows) are collapsed to one.
(
    df_all[['sp1', 'sp2', 'hmean_pct']]
    .sort_values('hmean_pct', ascending=False)
    .drop_duplicates('hmean_pct')
    .reset_index(drop=True)
)

Unnamed: 0,sp1,sp2,hmean_pct
0,hsapiens,ptroglodytes,0.534549
1,ggorilla,ptroglodytes,0.531591
2,hsapiens,mmulatta,0.478049
3,mmulatta,ptroglodytes,0.468159
4,ggorilla,hsapiens,0.466787
5,ggorilla,mmulatta,0.443111
6,mmulatta,cjacchus,0.368733
7,cjacchus,ptroglodytes,0.357159
8,cjacchus,ggorilla,0.351155
9,cjacchus,hsapiens,0.335491


In [20]:
# Tag every row with the cell type it was computed for.
df_all = df_all.assign(cell_type=cell_type_now)

In [21]:
df_all

Unnamed: 0,sp1,sp1_total_enriched,sp2,num_enriched_share_og_sp1,num_enriched_share_og_sp2,sp2_total_enriched,sp1_pct,sp2_pct,hmean_pct,cell_type
0,cjacchus,221,ptroglodytes,93,94,303,0.420814,0.310231,0.357159,astrocyte of the cerebral cortex
1,mmulatta,295,ptroglodytes,141,139,303,0.477966,0.458746,0.468159,astrocyte of the cerebral cortex
2,hsapiens,320,ptroglodytes,167,166,303,0.521875,0.547855,0.534549,astrocyte of the cerebral cortex
3,ggorilla,312,ptroglodytes,162,165,303,0.519231,0.544554,0.531591,astrocyte of the cerebral cortex
4,cjacchus,221,ggorilla,93,94,312,0.420814,0.301282,0.351155,astrocyte of the cerebral cortex
5,mmulatta,295,ggorilla,135,134,312,0.457627,0.429487,0.443111,astrocyte of the cerebral cortex
6,hsapiens,320,ggorilla,148,147,312,0.4625,0.471154,0.466787,astrocyte of the cerebral cortex
7,ptroglodytes,303,ggorilla,165,162,312,0.544554,0.519231,0.531591,astrocyte of the cerebral cortex
8,cjacchus,221,mmulatta,94,96,295,0.425339,0.325424,0.368733,astrocyte of the cerebral cortex
9,hsapiens,320,mmulatta,147,147,295,0.459375,0.498305,0.478049,astrocyte of the cerebral cortex


In [15]:
# Pairwise cross-species conservation of cell-type ENRICHED genes, computed for
# every cell type listed in the first column of `df_ct`; the per-cell-type
# tables are stacked into `data_final`.
#
# The cell type name is sent as the Cypher parameter $cell_type_name instead of
# being pasted into the query text, so a name containing quotes cannot break or
# alter the query. (Relationship types cannot be parameters and stay literal.)

# Per species: number of distinct genes enriched in the cell type.
query = """MATCH (s1:Species)<-[:CellTypeFromSpecies]-(ct:CellType {cell_type_name: $cell_type_name})<-[:GeneEnrichedInCellType]-(g3:Gene)-[:GeneFromSpecies]->(s1)
WITH s1,  COUNT(DISTINCT g3) AS total1, ct
RETURN s1.species_scientific_name, total1"""

# Per ordered species pair: distinct enriched genes of each species that share
# an orthologous group with an enriched gene of the other species.
query2 = """MATCH (s1:Species)<-[:CellTypeFromSpecies]-(ct:CellType {cell_type_name: $cell_type_name})<-[:GeneEnrichedInCellType]-(g1:Gene)-[:GeneInOrthologousGroup]->(gf:OrthologousGroup),
    (g1:Gene)-[:GeneFromSpecies]->(s1),
    (s2:Species)<-[:CellTypeFromSpecies]-(ct)<-[:GeneEnrichedInCellType]-(g2:Gene)-[:GeneInOrthologousGroup]->(gf:OrthologousGroup),
    (g2:Gene)-[:GeneFromSpecies]->(s2)
WHERE s1 <> s2
WITH s1, s2, COUNT(DISTINCT g1) AS genes1, COUNT(DISTINCT g2) AS genes2, ct
RETURN s1.species_scientific_name, s2.species_scientific_name, genes1, genes2"""

# Collect one frame per cell type and concatenate once after the loop instead
# of re-copying the accumulated frame on every iteration.
enriched_frames = []

with driver.session() as session:
    for cell_type_now in df_ct.iloc[:, 0]:
        print(cell_type_now)

        # get all cell type enriched genes
        df = session.run(query, cell_type_name=cell_type_now).to_df()

        # get cell type enriched genes in same OG
        df2 = session.run(query2, cell_type_name=cell_type_now).to_df()

        # Rename by column name rather than by position so a change in column
        # order cannot silently mislabel the counts.
        df_new = df.merge(df2, on='s1.species_scientific_name').rename(columns={
            's1.species_scientific_name': 'sp1',
            'total1': 'sp1_total_enriched',
            's2.species_scientific_name': 'sp2',
            'genes1': 'num_enriched_share_og_sp1',
            'genes2': 'num_enriched_share_og_sp2',
        })
        df_all = (
            df_new
            .merge(df, left_on='sp2', right_on='s1.species_scientific_name')
            .drop('s1.species_scientific_name', axis=1)
            .rename(columns={'total1': 'sp2_total_enriched'})
        )

        # calculate pct and hmean
        df_all['sp1_pct'] = df_all['num_enriched_share_og_sp1'] / df_all['sp1_total_enriched']
        df_all['sp2_pct'] = df_all['num_enriched_share_og_sp2'] / df_all['sp2_total_enriched']
        # Built from a plain list so a cell type with no shared genes (empty
        # frame) yields an empty column instead of failing inside `apply`.
        df_all['hmean_pct'] = pd.Series(
            [harmonic_mean([a, b]) for a, b in zip(df_all['sp1_pct'], df_all['sp2_pct'])],
            index=df_all.index,
            dtype='float64',
        )
        df_all['cell_type'] = cell_type_now
        # NOTE: the column name is misspelled ("specifcity"); it is kept as-is
        # because it ends up in the exported CSV files.
        df_all['specifcity_category'] = 'gene enriched in cell type'

        enriched_frames.append(df_all)

data_final = (
    pd.concat(enriched_frames, ignore_index=True) if enriched_frames else pd.DataFrame()
)


    
    


    

astrocyte of the cerebral cortex
oligodendrocyte
vascular leptomeningeal cell
microglial cell
oligodendrocyte precursor cell
cerebral cortex endothelial cell
near-projecting glutamatergic cortical neuron
corticothalamic-projecting glutamatergic cortical neuron
L6b glutamatergic cortical neuron
L5 extratelencephalic projecting glutamatergic cortical neuron
caudal ganglionic eminence derived GABAergic cortical interneuron
vip GABAergic cortical interneuron
sncg GABAergic cortical interneuron
lamp5 GABAergic cortical interneuron
sst GABAergic cortical interneuron
pvalb GABAergic cortical interneuron
chandelier pvalb GABAergic cortical interneuron
L5 intratelencephalic projecting glutamatergic neuron
L4 intratelencephalic projecting glutamatergic neuron
L2/3 intratelencephalic projecting glutamatergic neuron
L6 intratelencephalic projecting glutamatergic neuron


In [16]:
data_final.head(10)

Unnamed: 0,sp1,sp1_total_enriched,sp2,num_enriched_share_og_sp1,num_enriched_share_og_sp2,sp2_total_enriched,sp1_pct,sp2_pct,hmean_pct,cell_type,specifcity_category
0,ptroglodytes,458,ggorilla,238,233,441,0.519651,0.528345,0.523962,astrocyte of the cerebral cortex,gene enriched in cell type
1,cjacchus,296,ggorilla,122,124,441,0.412162,0.281179,0.334298,astrocyte of the cerebral cortex,gene enriched in cell type
2,mmulatta,409,ggorilla,183,183,441,0.447433,0.414966,0.430588,astrocyte of the cerebral cortex,gene enriched in cell type
3,hsapiens,456,ggorilla,218,216,441,0.47807,0.489796,0.483862,astrocyte of the cerebral cortex,gene enriched in cell type
4,ptroglodytes,458,mmulatta,195,195,409,0.425764,0.476773,0.449827,astrocyte of the cerebral cortex,gene enriched in cell type
5,ggorilla,441,mmulatta,183,183,409,0.414966,0.447433,0.430588,astrocyte of the cerebral cortex,gene enriched in cell type
6,cjacchus,296,mmulatta,127,131,409,0.429054,0.320293,0.366781,astrocyte of the cerebral cortex,gene enriched in cell type
7,hsapiens,456,mmulatta,195,196,409,0.427632,0.479218,0.451957,astrocyte of the cerebral cortex,gene enriched in cell type
8,ptroglodytes,458,hsapiens,226,224,456,0.49345,0.491228,0.492336,astrocyte of the cerebral cortex,gene enriched in cell type
9,ggorilla,441,hsapiens,216,218,456,0.489796,0.47807,0.483862,astrocyte of the cerebral cortex,gene enriched in cell type


In [24]:
# Export, per cell type, the species pairs ranked by harmonic mean (best first),
# keeping one row per unordered species pair.
#
# Every pair appears twice in `data_final`, as (A, B) and (B, A), with the same
# hmean_pct. De-duplicating on hmean_pct alone across the whole table would
# also delete unrelated rows (other pairs or other cell types) that happen to
# have the same value, so duplicates are identified by (cell type, unordered
# species pair) instead.
# A two-key sort replaces groupby(...).apply(sort): same row order, without
# relying on groupby.apply handing the grouping column to the applied function.
species_pair = pd.Series(
    [tuple(sorted(pair)) for pair in zip(data_final['sp1'], data_final['sp2'])],
    index=data_final.index,
    dtype=object,
)
(
    data_final
    .assign(species_pair=species_pair)
    .sort_values(['cell_type', 'hmean_pct'], ascending=[True, False])
    .drop_duplicates(['cell_type', 'species_pair'])
    [['sp1', 'sp2', 'hmean_pct', 'cell_type']]
    .reset_index(drop=True)
    .to_csv("enriched_genes_hmean_pct.csv")
)


In [25]:
# Pairwise cross-species conservation of cell-type ENHANCED genes, computed for
# every cell type listed in the first column of `df_ct`; the per-cell-type
# tables are stacked into `data_final_enhanced`.
#
# Fixes over the previous version of this cell:
#   * the accumulator check tested `data_final` (the enriched table) instead of
#     `data_final_enhanced`, so whenever `data_final` was empty each iteration
#     overwrote the result and only the last cell type survived;
#   * the cell type name is sent as the Cypher parameter $cell_type_name
#     instead of being pasted into the query text.
# Column names keep the "enriched" wording so this table lines up with
# `data_final` when the two are concatenated.

# Per species: number of distinct genes enhanced in the cell type.
query = """MATCH (s1:Species)<-[:CellTypeFromSpecies]-(ct:CellType {cell_type_name: $cell_type_name})<-[:GeneEnhancedInCellType]-(g3:Gene)-[:GeneFromSpecies]->(s1)
WITH s1,  COUNT(DISTINCT g3) AS total1, ct
RETURN s1.species_scientific_name, total1"""

# Per ordered species pair: distinct enhanced genes of each species that share
# an orthologous group with an enhanced gene of the other species.
query2 = """MATCH (s1:Species)<-[:CellTypeFromSpecies]-(ct:CellType {cell_type_name: $cell_type_name})<-[:GeneEnhancedInCellType]-(g1:Gene)-[:GeneInOrthologousGroup]->(gf:OrthologousGroup),
    (g1:Gene)-[:GeneFromSpecies]->(s1),
    (s2:Species)<-[:CellTypeFromSpecies]-(ct)<-[:GeneEnhancedInCellType]-(g2:Gene)-[:GeneInOrthologousGroup]->(gf:OrthologousGroup),
    (g2:Gene)-[:GeneFromSpecies]->(s2)
WHERE s1 <> s2
WITH s1, s2, COUNT(DISTINCT g1) AS genes1, COUNT(DISTINCT g2) AS genes2, ct
RETURN s1.species_scientific_name, s2.species_scientific_name, genes1, genes2"""

# Collect one frame per cell type and concatenate once after the loop.
enhanced_frames = []

with driver.session() as session:
    for cell_type_now in df_ct.iloc[:, 0]:
        print(cell_type_now)

        # get all cell type enhanced genes
        df = session.run(query, cell_type_name=cell_type_now).to_df()

        # get cell type enhanced genes in same OG
        df2 = session.run(query2, cell_type_name=cell_type_now).to_df()

        # Rename by column name rather than by position so a change in column
        # order cannot silently mislabel the counts.
        df_new = df.merge(df2, on='s1.species_scientific_name').rename(columns={
            's1.species_scientific_name': 'sp1',
            'total1': 'sp1_total_enriched',
            's2.species_scientific_name': 'sp2',
            'genes1': 'num_enriched_share_og_sp1',
            'genes2': 'num_enriched_share_og_sp2',
        })
        df_all = (
            df_new
            .merge(df, left_on='sp2', right_on='s1.species_scientific_name')
            .drop('s1.species_scientific_name', axis=1)
            .rename(columns={'total1': 'sp2_total_enriched'})
        )

        # calculate pct and hmean
        df_all['sp1_pct'] = df_all['num_enriched_share_og_sp1'] / df_all['sp1_total_enriched']
        df_all['sp2_pct'] = df_all['num_enriched_share_og_sp2'] / df_all['sp2_total_enriched']
        # Built from a plain list so a cell type with no shared genes (empty
        # frame) yields an empty column instead of failing inside `apply`.
        df_all['hmean_pct'] = pd.Series(
            [harmonic_mean([a, b]) for a, b in zip(df_all['sp1_pct'], df_all['sp2_pct'])],
            index=df_all.index,
            dtype='float64',
        )
        df_all['cell_type'] = cell_type_now
        # NOTE: the column name is misspelled ("specifcity"); it is kept as-is
        # because it ends up in the exported CSV files.
        df_all['specifcity_category'] = 'gene enhanced in cell type'

        enhanced_frames.append(df_all)

data_final_enhanced = (
    pd.concat(enhanced_frames, ignore_index=True) if enhanced_frames else pd.DataFrame()
)

astrocyte of the cerebral cortex
oligodendrocyte
vascular leptomeningeal cell
microglial cell
oligodendrocyte precursor cell
cerebral cortex endothelial cell
near-projecting glutamatergic cortical neuron
corticothalamic-projecting glutamatergic cortical neuron
L6b glutamatergic cortical neuron
L5 extratelencephalic projecting glutamatergic cortical neuron
caudal ganglionic eminence derived GABAergic cortical interneuron
vip GABAergic cortical interneuron
sncg GABAergic cortical interneuron
lamp5 GABAergic cortical interneuron
sst GABAergic cortical interneuron
pvalb GABAergic cortical interneuron
chandelier pvalb GABAergic cortical interneuron
L5 intratelencephalic projecting glutamatergic neuron
L4 intratelencephalic projecting glutamatergic neuron
L2/3 intratelencephalic projecting glutamatergic neuron
L6 intratelencephalic projecting glutamatergic neuron


In [26]:
# Stack the enriched and enhanced tables under a fresh 0..n-1 index and write
# the combined table to CSV.
pd.concat(
    [data_final, data_final_enhanced],
    ignore_index=True,
).to_csv("sum_enhanced_enriched.csv")

In [27]:
data_final_enhanced

Unnamed: 0,sp1,sp1_total_enriched,sp2,num_enriched_share_og_sp1,num_enriched_share_og_sp2,sp2_total_enriched,sp1_pct,sp2_pct,hmean_pct,cell_type,specifcity_category
0,cjacchus,197,hsapiens,53,53,231,0.269036,0.229437,0.247664,astrocyte of the cerebral cortex,gene enhanced in cell type
1,mmulatta,211,hsapiens,66,67,231,0.312796,0.290043,0.30099,astrocyte of the cerebral cortex,gene enhanced in cell type
2,ggorilla,205,hsapiens,62,61,231,0.302439,0.264069,0.281955,astrocyte of the cerebral cortex,gene enhanced in cell type
3,ptroglodytes,214,hsapiens,85,85,231,0.397196,0.367965,0.382022,astrocyte of the cerebral cortex,gene enhanced in cell type
4,cjacchus,197,ggorilla,41,41,205,0.208122,0.2,0.20398,astrocyte of the cerebral cortex,gene enhanced in cell type
5,mmulatta,211,ggorilla,50,51,205,0.236967,0.24878,0.24273,astrocyte of the cerebral cortex,gene enhanced in cell type
6,hsapiens,231,ggorilla,61,62,205,0.264069,0.302439,0.281955,astrocyte of the cerebral cortex,gene enhanced in cell type
7,ptroglodytes,214,ggorilla,72,73,205,0.336449,0.356098,0.345994,astrocyte of the cerebral cortex,gene enhanced in cell type
8,cjacchus,197,ptroglodytes,37,37,214,0.187817,0.172897,0.180049,astrocyte of the cerebral cortex,gene enhanced in cell type
9,mmulatta,211,ptroglodytes,58,58,214,0.274882,0.271028,0.272941,astrocyte of the cerebral cortex,gene enhanced in cell type
