In [1]:
import os
from statistics import harmonic_mean
from warnings import filterwarnings

# NOTE: this silences every warning, including deprecation warnings.
filterwarnings("ignore")

import pandas as pd
from neo4j import GraphDatabase


# Let pandas render up to 500 rows before truncating a displayed frame.
pd.options.display.max_rows = 500

In [2]:

# Connection settings. Each value can be overridden through an environment
# variable so credentials do not have to live in the notebook; the fallbacks
# are the local values this notebook has been using.
# URI examples: "neo4j://localhost", "neo4j+s://xxx.databases.neo4j.io"
URI = os.environ.get("NEO4J_URI", "neo4j://localhost:7687")
AUTH = (
    os.environ.get("NEO4J_USER", "test"),
    os.environ.get("NEO4J_PASSWORD", "666666"),
)

# Keep the driver open for the rest of the notebook: later cells call
# driver.session(). Creating it in a `with` block would close it as soon as
# the block exits, leaving those cells with a closed driver.
# Call driver.close() once the analysis is finished.
driver = GraphDatabase.driver(URI, auth=AUTH)
driver.verify_connectivity()

cell_type_now = 'astrocyte of the cerebral cortex'

# Per species: number of distinct orthologous groups that contain at least one
# gene enriched in the selected cell type (no eggNOG dataset filter is applied).
# The cell type is passed as the $cell_type query parameter rather than being
# spliced into the Cypher text, so names containing quotes cannot break or
# alter the query.
query_jaccard = """
MATCH (ct:CellType {cell_type_name: $cell_type})<-[:GeneEnrichedInCellType]-(g:Gene)
WITH g
MATCH (s1:Species)<-[:GeneFromSpecies]-(g:Gene)-[:GeneInOrthologousGroup]->(og:OrthologousGroup)
RETURN s1.species_scientific_name, count(DISTINCT og) as num_og"""


with driver.session() as session:
    result_ct = session.run(query_jaccard, cell_type=cell_type_now)
    df_ct = result_ct.to_df()

# A bare expression in the middle of a cell is not rendered; display() is.
display(df_ct)

# For every ordered pair of distinct species that both have the selected cell
# type: the number of enriched genes on each side (genes1, genes2) that share
# an orthologous group with an enriched gene of the other species, and the
# number of such shared groups (num_og).
# The cell type is supplied as the $cell_type parameter instead of being
# concatenated into the query text.
query2 = """MATCH (s1:Species)<-[:CellTypeFromSpecies]-(ct:CellType {cell_type_name: $cell_type})<-[:GeneEnrichedInCellType]-(g1:Gene)-[:GeneInOrthologousGroup]->(gf:OrthologousGroup),
    (g1:Gene)-[:GeneFromSpecies]->(s1),
    (s2:Species)<-[:CellTypeFromSpecies]-(ct)<-[:GeneEnrichedInCellType]-(g2:Gene)-[:GeneInOrthologousGroup]->(gf:OrthologousGroup),
    (g2:Gene)-[:GeneFromSpecies]->(s2)
WHERE s1 <> s2
WITH s1, s2, COUNT(DISTINCT g1) AS genes1, COUNT(DISTINCT g2) AS genes2, ct, count(DISTINCT gf) AS num_og
RETURN s1.species_scientific_name, s2.species_scientific_name, genes1, genes2, num_og"""

with driver.session() as session:
    result2 = session.run(query2, cell_type=cell_type_now)
    df2 = result2.to_df()


df2

In [3]:
# For every ordered pair of distinct species and every cell type: the number
# of distinct 'primates' orthologous groups that contain a gene enriched in
# that cell type in both species.
# Nodes are compared directly (s1 <> s2) and counted with count(DISTINCT ...),
# matching the other queries in this notebook and avoiding the id() function.
# NOTE: both orderings of each species pair are returned.
query_ct = """
MATCH (s1:Species)<-[:GeneFromSpecies]-(g:Gene)-[:GeneInOrthologousGroup]->(og:OrthologousGroup {eggnog_dataset_name: 'primates'})<-[:GeneInOrthologousGroup]-(g2:Gene)-[:GeneFromSpecies]->(s2:Species),
(g)-[:GeneEnrichedInCellType]->(c:CellType)<-[:GeneEnrichedInCellType]-(g2)
WHERE s1 <> s2
RETURN s1.species_scientific_name, s2.species_scientific_name, c.cell_type_name, count(DISTINCT og) AS intersection
ORDER BY s1.species_scientific_name, s2.species_scientific_name
"""


with driver.session() as session:
    result_ct = session.run(query_ct)
    df_ct = result_ct.to_df()

In [5]:
# Inspect the pairwise shared-OG counts held in df_ct.
df_ct

Unnamed: 0,s1.species_scientific_name,s2.species_scientific_name,c.cell_type_name,intersection
0,cjacchus,ggorilla,vip GABAergic cortical interneuron,13
1,cjacchus,ggorilla,sncg GABAergic cortical interneuron,10
2,cjacchus,ggorilla,microglial cell,208
3,cjacchus,ggorilla,vascular leptomeningeal cell,205
4,cjacchus,ggorilla,oligodendrocyte precursor cell,63
...,...,...,...,...
415,ptroglodytes,mmulatta,caudal ganglionic eminence derived GABAergic c...,23
416,ptroglodytes,mmulatta,vip GABAergic cortical interneuron,29
417,ptroglodytes,mmulatta,sncg GABAergic cortical interneuron,23
418,ptroglodytes,mmulatta,sst GABAergic cortical interneuron,17


In [6]:
# All-by-all comparison: for every ordered pair of distinct species, every
# pair of cell types (c in s1, c2 in s2) and every eggNOG dataset, count the
# distinct orthologous groups that hold a gene enriched in c and a gene
# enriched in c2.
# Nodes are compared directly (s1 <> s2) and counted with count(DISTINCT ...),
# matching the other queries in this notebook and avoiding the id() function.
query_ct_all = """
MATCH (s1:Species)<-[:GeneFromSpecies]-(g:Gene)-[:GeneInOrthologousGroup]->(og:OrthologousGroup)<-[:GeneInOrthologousGroup]-(g2:Gene)-[:GeneFromSpecies]->(s2:Species),
(g)-[:GeneEnrichedInCellType]->(c:CellType),
(c2:CellType)<-[:GeneEnrichedInCellType]-(g2)
WHERE s1 <> s2
RETURN s1.species_scientific_name, s2.species_scientific_name, c.cell_type_name, c2.cell_type_name, og.eggnog_dataset_name, count(DISTINCT og) AS intersection
ORDER BY s1.species_scientific_name, s2.species_scientific_name, c.cell_type_name, c2.cell_type_name, og.eggnog_dataset_name
"""

with driver.session() as session:
    result_ct_all = session.run(query_ct_all)
    df_ct_all = result_ct_all.to_df()

df_ct_all.head()

Unnamed: 0,s1.species_scientific_name,s2.species_scientific_name,c.cell_type_name,c2.cell_type_name,og.eggnog_dataset_name,intersection
0,cjacchus,ggorilla,L2/3 intratelencephalic projecting glutamaterg...,L2/3 intratelencephalic projecting glutamaterg...,bilateria,11
1,cjacchus,ggorilla,L2/3 intratelencephalic projecting glutamaterg...,L2/3 intratelencephalic projecting glutamaterg...,mammalia,9
2,cjacchus,ggorilla,L2/3 intratelencephalic projecting glutamaterg...,L2/3 intratelencephalic projecting glutamaterg...,metazoa,12
3,cjacchus,ggorilla,L2/3 intratelencephalic projecting glutamaterg...,L2/3 intratelencephalic projecting glutamaterg...,primates,9
4,cjacchus,ggorilla,L2/3 intratelencephalic projecting glutamaterg...,L2/3 intratelencephalic projecting glutamaterg...,vertebrata,10


In [7]:
# Persist the all-by-all shared-OG counts (row index not written).
df_ct_all.to_csv(path_or_buf="cell_type_enriched_genes_same_og_all_to_all_all_datasets.csv", index=False)

In [8]:
# One row per (species pair, cell type, orthologous group, gene pair): two
# genes from different species in the same orthologous group, both enriched in
# the same cell type, with the specificity score stored on each enrichment
# relationship. No eggNOG dataset filter is applied.
# Species nodes are compared directly (s1 <> s2), matching the other queries
# in this notebook and avoiding the id() function.
query_score = """MATCH (s1:Species)<-[:GeneFromSpecies]-(g:Gene)-[:GeneInOrthologousGroup]->(og:OrthologousGroup)<-[:GeneInOrthologousGroup]-(g2:Gene)-[:GeneFromSpecies]->(s2:Species),
(g)-[e1:GeneEnrichedInCellType]->(c:CellType)<-[e2:GeneEnrichedInCellType]-(g2)
WHERE s1 <> s2
RETURN s1.species_scientific_name, s2.species_scientific_name, c.cell_type_name, og.id, g.external_gene_name, g2.external_gene_name, e1.specificity_score, e2.specificity_score
ORDER BY s1.species_scientific_name, s2.species_scientific_name"""

with driver.session() as session:
    result_score = session.run(query_score)
    df_score = result_score.to_df()

In [9]:
# Inspect the per-gene-pair specificity scores held in df_score.
df_score

Unnamed: 0,s1.species_scientific_name,s2.species_scientific_name,c.cell_type_name,og.id,g.external_gene_name,g2.external_gene_name,e1.specificity_score,e2.specificity_score
0,cjacchus,ggorilla,astrocyte of the cerebral cortex,4ZRTM,TRPS1,TRPS1,4.302997969268695,5.932227819264398
1,cjacchus,ggorilla,astrocyte of the cerebral cortex,9FWMC,TRPS1,TRPS1,4.302997969268695,5.932227819264398
2,cjacchus,ggorilla,astrocyte of the cerebral cortex,HT37R,TRPS1,TRPS1,4.302997969268695,5.932227819264398
3,cjacchus,ggorilla,astrocyte of the cerebral cortex,8YY54,TRPS1,TRPS1,4.302997969268695,5.932227819264398
4,cjacchus,ggorilla,astrocyte of the cerebral cortex,H4VD8,TRPS1,TRPS1,4.302997969268695,5.932227819264398
...,...,...,...,...,...,...,...,...
223387,ptroglodytes,mmulatta,oligodendrocyte,4ZZ6R,GJC2,GJC2,5.907806533826207,8.901623725891113
223388,ptroglodytes,mmulatta,oligodendrocyte,HSWM1,GJC2,GJC2,5.907806533826207,8.901623725891113
223389,ptroglodytes,mmulatta,oligodendrocyte,8Z5BV,GJC2,GJC2,5.907806533826207,8.901623725891113
223390,ptroglodytes,mmulatta,oligodendrocyte,9FFQ1,GJB1,GJB1,25.550498962402344,6.8717949937124745


In [10]:
# Persist the per-gene-pair scores; the row index is written as the first column.
# NOTE(review): the filename says "metazoa" but query_score applies no eggNOG
# dataset filter - confirm the intended contents.
df_score.to_csv(path_or_buf="intersection_enriched_score_metazoa.csv")

In [11]:
# All-by-all version of the score query: gene pairs from different species in
# the same orthologous group, where g is enriched in cell type c and g2 is
# enriched in cell type c2 (c and c2 may differ). Also returns the group's
# eggNOG dataset name and both specificity scores.
# Species nodes are compared directly (s1 <> s2), matching the other queries
# in this notebook and avoiding the id() function.
query_score_all = """MATCH (s1:Species)<-[:GeneFromSpecies]-(g:Gene)-[:GeneInOrthologousGroup]->(og:OrthologousGroup)<-[:GeneInOrthologousGroup]-(g2:Gene)-[:GeneFromSpecies]->(s2:Species),
(g)-[e1:GeneEnrichedInCellType]->(c:CellType),
(c2:CellType)<-[e2:GeneEnrichedInCellType]-(g2)
WHERE s1 <> s2
RETURN s1.species_scientific_name, s2.species_scientific_name, c.cell_type_name, c2.cell_type_name, og.id, og.eggnog_dataset_name, g.external_gene_name, g2.external_gene_name, e1.specificity_score, e2.specificity_score
ORDER BY s1.species_scientific_name, s2.species_scientific_name, c.cell_type_name, c2.cell_type_name, og.eggnog_dataset_name"""

with driver.session() as session:
    result_score_all = session.run(query_score_all)
    df_score_all = result_score_all.to_df()

df_score_all.head()

Unnamed: 0,s1.species_scientific_name,s2.species_scientific_name,c.cell_type_name,c2.cell_type_name,og.id,og.eggnog_dataset_name,g.external_gene_name,g2.external_gene_name,e1.specificity_score,e2.specificity_score
0,cjacchus,ggorilla,L2/3 intratelencephalic projecting glutamaterg...,L2/3 intratelencephalic projecting glutamaterg...,H4BV1,bilateria,ANKRD33B,ANKRD33B,4.253041879712056,8.212555345302412
1,cjacchus,ggorilla,L2/3 intratelencephalic projecting glutamaterg...,L2/3 intratelencephalic projecting glutamaterg...,H50SP,bilateria,NWD2,NWD2,4.0008192613833815,4.695822189172423
2,cjacchus,ggorilla,L2/3 intratelencephalic projecting glutamaterg...,L2/3 intratelencephalic projecting glutamaterg...,H3BC5,bilateria,GRM1,GRM2,4.007138221226412,4.9509887797914685
3,cjacchus,ggorilla,L2/3 intratelencephalic projecting glutamaterg...,L2/3 intratelencephalic projecting glutamaterg...,H40SM,bilateria,CPNE4,CPNE4,5.105036232546879,7.51276252624591
4,cjacchus,ggorilla,L2/3 intratelencephalic projecting glutamaterg...,L2/3 intratelencephalic projecting glutamaterg...,H6SF4,bilateria,SV2B,SV2B,10.76340847554009,38.70605574126589


In [12]:
# Persist the all-by-all gene-pair scores; the row index is written as the first column.
df_score_all.to_csv(path_or_buf="intersection_enriched_score_all_datasets.csv")

In [8]:
# For every ordered pair of distinct species and every cell type in which both
# have an enriched gene: og_1 / og_2 are the numbers of distinct orthologous
# groups containing an enriched gene of s1 / s2 respectively.
# Nodes are compared directly (s1 <> s2) and counted with count(DISTINCT ...),
# matching the other queries in this notebook and avoiding the id() function.
# NOTE(review): no eggnog_dataset_name filter is applied here, whereas
# query_ct restricts the intersection to one dataset - confirm that the two
# are meant to be combined into a Jaccard index.
# NOTE(review): og1 and og2 are matched independently for each (g1, g2) pair,
# so the intermediate result may be large.
query_union = """MATCH (s1:Species)<-[:GeneFromSpecies]-(g1:Gene)-[:GeneInOrthologousGroup]->(og1:OrthologousGroup),
(s2:Species)<-[:GeneFromSpecies]-(g2:Gene)-[:GeneInOrthologousGroup]->(og2:OrthologousGroup),
(g1)-[:GeneEnrichedInCellType]->(c:CellType)<-[:GeneEnrichedInCellType]-(g2)
WHERE s1 <> s2
RETURN s1.species_scientific_name, s2.species_scientific_name, c.cell_type_name, count(DISTINCT og1) AS og_1, count(DISTINCT og2) AS og_2
ORDER BY s1.species_scientific_name, s2.species_scientific_name
"""

with driver.session() as session:
    result_union = session.run(query_union)
    df_union = result_union.to_df()

df_union


In [13]:
# Per species, cell type and eggNOG dataset: number of distinct orthologous
# groups containing at least one gene enriched in that cell type.
# count(DISTINCT og) replaces size(collect(DISTINCT og)); it yields the same
# number without building the intermediate list.
query_perct_all = """MATCH (s1:Species)<-[:GeneFromSpecies]-(g1:Gene)-[:GeneInOrthologousGroup]->(og:OrthologousGroup),
(g1)-[:GeneEnrichedInCellType]->(c:CellType)
RETURN s1.species_scientific_name, c.cell_type_name, og.eggnog_dataset_name, count(DISTINCT og) AS og_all
ORDER BY s1.species_scientific_name, c.cell_type_name, og.eggnog_dataset_name
"""

with driver.session() as session:
    result_perct_all = session.run(query_perct_all)
    df_perct_all = result_perct_all.to_df()

df_perct_all.head()


Unnamed: 0,s1.species_scientific_name,c.cell_type_name,og.eggnog_dataset_name,og_all
0,cjacchus,L2/3 intratelencephalic projecting glutamaterg...,bilateria,36
1,cjacchus,L2/3 intratelencephalic projecting glutamaterg...,mammalia,38
2,cjacchus,L2/3 intratelencephalic projecting glutamaterg...,metazoa,36
3,cjacchus,L2/3 intratelencephalic projecting glutamaterg...,primates,38
4,cjacchus,L2/3 intratelencephalic projecting glutamaterg...,vertebrata,38


In [14]:

# Persist the per-species, per-cell-type OG counts (row index not written).
# The union for a species pair is the sum of the two per-species counts for
# that cell type minus their intersection.
df_perct_all.to_csv(path_or_buf="cell_type_enriched_genes_per_ct_all_datasets.csv", index=False)

In [15]:
# Preview the all-by-all intersection counts before merging.
df_ct_all.head()

Unnamed: 0,s1.species_scientific_name,s2.species_scientific_name,c.cell_type_name,c2.cell_type_name,og.eggnog_dataset_name,intersection
0,cjacchus,ggorilla,L2/3 intratelencephalic projecting glutamaterg...,L2/3 intratelencephalic projecting glutamaterg...,bilateria,11
1,cjacchus,ggorilla,L2/3 intratelencephalic projecting glutamaterg...,L2/3 intratelencephalic projecting glutamaterg...,mammalia,9
2,cjacchus,ggorilla,L2/3 intratelencephalic projecting glutamaterg...,L2/3 intratelencephalic projecting glutamaterg...,metazoa,12
3,cjacchus,ggorilla,L2/3 intratelencephalic projecting glutamaterg...,L2/3 intratelencephalic projecting glutamaterg...,primates,9
4,cjacchus,ggorilla,L2/3 intratelencephalic projecting glutamaterg...,L2/3 intratelencephalic projecting glutamaterg...,vertebrata,10


In [16]:
# Preview the first 30 rows of the per-species, per-cell-type OG counts.
df_perct_all.head(n=30)

Unnamed: 0,s1.species_scientific_name,c.cell_type_name,og.eggnog_dataset_name,og_all
0,cjacchus,L2/3 intratelencephalic projecting glutamaterg...,bilateria,36
1,cjacchus,L2/3 intratelencephalic projecting glutamaterg...,mammalia,38
2,cjacchus,L2/3 intratelencephalic projecting glutamaterg...,metazoa,36
3,cjacchus,L2/3 intratelencephalic projecting glutamaterg...,primates,38
4,cjacchus,L2/3 intratelencephalic projecting glutamaterg...,vertebrata,38
5,cjacchus,L4 intratelencephalic projecting glutamatergic...,bilateria,48
6,cjacchus,L4 intratelencephalic projecting glutamatergic...,mammalia,49
7,cjacchus,L4 intratelencephalic projecting glutamatergic...,metazoa,48
8,cjacchus,L4 intratelencephalic projecting glutamatergic...,primates,47
9,cjacchus,L4 intratelencephalic projecting glutamatergic...,vertebrata,48


In [17]:
# Attach the per-species OG totals to every intersection row twice: once for
# the (s1, c) side as og_all_sp1 and once for the (s2, c2) side as og_all_sp2.
# The totals frame is renamed to the target key names before each merge, so
# no suffixed duplicate columns are created.
df_jaccard_all = (
    df_ct_all
    .merge(
        df_perct_all.rename(columns={"og_all": "og_all_sp1"}),
        how="inner",
        on=["s1.species_scientific_name", "c.cell_type_name", "og.eggnog_dataset_name"],
    )
    .merge(
        df_perct_all.rename(columns={
            "s1.species_scientific_name": "s2.species_scientific_name",
            "c.cell_type_name": "c2.cell_type_name",
            "og_all": "og_all_sp2",
        }),
        how="inner",
        on=["s2.species_scientific_name", "c2.cell_type_name", "og.eggnog_dataset_name"],
    )
)

df_jaccard_all.head()

Unnamed: 0,s1.species_scientific_name,s2.species_scientific_name,c.cell_type_name,c2.cell_type_name,og.eggnog_dataset_name,intersection,og_all_sp1,og_all_sp2
0,cjacchus,ggorilla,L2/3 intratelencephalic projecting glutamaterg...,L2/3 intratelencephalic projecting glutamaterg...,bilateria,11,36,57
1,cjacchus,ggorilla,L4 intratelencephalic projecting glutamatergic...,L2/3 intratelencephalic projecting glutamaterg...,bilateria,6,48,57
2,cjacchus,ggorilla,L5 extratelencephalic projecting glutamatergic...,L2/3 intratelencephalic projecting glutamaterg...,bilateria,8,43,57
3,cjacchus,ggorilla,L5 intratelencephalic projecting glutamatergic...,L2/3 intratelencephalic projecting glutamaterg...,bilateria,7,35,57
4,cjacchus,ggorilla,L6 intratelencephalic projecting glutamatergic...,L2/3 intratelencephalic projecting glutamaterg...,bilateria,10,46,57


In [18]:
# |A union B| = |A| + |B| - |A intersect B|
df_jaccard_all['union'] = (
    df_jaccard_all['og_all_sp1']
    .add(df_jaccard_all['og_all_sp2'])
    .sub(df_jaccard_all['intersection'])
)

In [19]:
# Jaccard index = intersection size / union size
df_jaccard_all['jaccard'] = df_jaccard_all['intersection'].div(df_jaccard_all['union'])

In [20]:
# Persist the all-by-all Jaccard table (row index not written).
df_jaccard_all.to_csv(path_or_buf="jaccard_cell_type_enriched_all_to_all_all_datasets.csv", index=False)

In [10]:
# Combine the same-cell-type intersection counts with the per-side OG counts.
# NOTE: df_ct is re-assigned further down in this notebook (to the list of
# cell types), so this cell must run while df_ct still holds the intersections.
df_jaccard = pd.merge(
    df_ct,
    df_union,
    how='inner',
    on=['s1.species_scientific_name', 's2.species_scientific_name', 'c.cell_type_name'],
)

In [11]:
# |A union B| = |A| + |B| - |A intersect B|
df_jaccard['union'] = df_jaccard['og_1'].add(df_jaccard['og_2']).sub(df_jaccard['intersection'])

In [12]:
# Jaccard index = intersection size / union size
df_jaccard['jaccard'] = df_jaccard['intersection'].div(df_jaccard['union'])

In [13]:
# Show species pairs and cell types from highest to lowest Jaccard index.
df_jaccard.sort_values('jaccard', ascending=False)

Unnamed: 0,s1.species_scientific_name,s2.species_scientific_name,c.cell_type_name,intersection,og_1,og_2,union,jaccard
148,ggorilla,ptroglodytes,oligodendrocyte,183,324,327,468,0.391026
358,ptroglodytes,ggorilla,oligodendrocyte,183,327,324,468,0.391026
232,hsapiens,ptroglodytes,oligodendrocyte,197,382,327,512,0.384766
379,ptroglodytes,hsapiens,oligodendrocyte,197,327,382,512,0.384766
357,ptroglodytes,ggorilla,astrocyte of the cerebral cortex,231,416,418,603,0.383085
...,...,...,...,...,...,...,...,...
335,mmulatta,ptroglodytes,L6 intratelencephalic projecting glutamatergic...,9,54,65,110,0.081818
230,hsapiens,mmulatta,L6 intratelencephalic projecting glutamatergic...,9,68,54,113,0.079646
314,mmulatta,hsapiens,L6 intratelencephalic projecting glutamatergic...,9,54,68,113,0.079646
303,mmulatta,hsapiens,L5 extratelencephalic projecting glutamatergic...,8,58,67,117,0.068376


In [14]:
# Write the Jaccard table ordered from highest to lowest index; the row index
# is written as the first column.
# NOTE(review): the filename says "metazoa" - confirm that it matches the
# eggNOG dataset the intersection and union counts were computed on.
(
    df_jaccard
    .sort_values('jaccard', ascending=False)
    .to_csv("cell_type_og_jaccard_1TPM_metazoa.csv")
)

In [61]:
# List the name of every CellType node.
# NOTE: this re-uses the names query_ct / df_ct, replacing the intersection
# query and frame assigned earlier in the notebook; later cells read the
# cell-type names from df_ct.
query_ct = """MATCH (n:CellType)
RETURN n.cell_type_name"""

with driver.session() as session:
    result_ct = session.run(query_ct)
    df_ct = result_ct.to_df()

# Ending the cell with the frame gives the rich table rendering instead of the
# plain-text print() output.
df_ct

                                     n.cell_type_name
0                    astrocyte of the cerebral cortex
1                                     oligodendrocyte
2                        vascular leptomeningeal cell
3                                     microglial cell
4                      oligodendrocyte precursor cell
5                    cerebral cortex endothelial cell
6       near-projecting glutamatergic cortical neuron
7   corticothalamic-projecting glutamatergic corti...
8                   L6b glutamatergic cortical neuron
9   L5 extratelencephalic projecting glutamatergic...
10  caudal ganglionic eminence derived GABAergic c...
11                 vip GABAergic cortical interneuron
12                sncg GABAergic cortical interneuron
13               lamp5 GABAergic cortical interneuron
14                 sst GABAergic cortical interneuron
15               pvalb GABAergic cortical interneuron
16    chandelier pvalb GABAergic cortical interneuron
17  L5 intratelencephalic pr

In [6]:
# Pick the value in the first row and first column of df_ct as the current cell type.
cell_type_now = df_ct.iat[0, 0]
cell_type_now

'astrocyte of the cerebral cortex'

In [7]:
# Per species: total number of distinct genes enriched in the selected cell
# type. The cell type is supplied as the $cell_type parameter instead of being
# concatenated into the query text, so quotes in a name cannot break the query.
query = """MATCH (s1:Species)<-[:CellTypeFromSpecies]-(ct:CellType {cell_type_name: $cell_type})<-[:GeneEnrichedInCellType]-(g3:Gene)-[:GeneFromSpecies]->(s1)
WITH s1,  COUNT(DISTINCT g3) AS total1, ct
RETURN s1.species_scientific_name, total1"""

with driver.session() as session:
    result = session.run(query, cell_type=cell_type_now)
    df = result.to_df()

# Ending the cell with the frame gives the rich table rendering instead of the
# plain-text print() output.
df

  s1.species_scientific_name  total1
0                   cjacchus     221
1                   mmulatta     295
2                   hsapiens     320
3                   ggorilla     312
4               ptroglodytes     303


In [72]:
# Every (species, cell type, gene name) triple where the gene is enriched in
# the cell type and belongs to that species.
query_all_enrich = """MATCH (s1:Species)<-[:CellTypeFromSpecies]-(ct:CellType)<-[:GeneEnrichedInCellType]-(g3:Gene)-[:GeneFromSpecies]->(s1)
RETURN s1.species_scientific_name, ct.cell_type_name, g3.external_gene_name"""

with driver.session() as session:
    result_all_enrich = session.run(query_all_enrich)
    df_all_enrich = result_all_enrich.to_df()

# Write first, preview last: only the final expression of a cell is rendered,
# so a head() placed before to_csv() is never shown.
df_all_enrich.to_csv("all_enriched_genes.csv")
df_all_enrich.head()

In [8]:
# For every ordered pair of distinct species that both have the selected cell
# type: the number of enriched genes on each side (genes1, genes2) that share
# an orthologous group with an enriched gene of the other species.
# The cell type is supplied as the $cell_type parameter instead of being
# concatenated into the query text.
query2 = """MATCH (s1:Species)<-[:CellTypeFromSpecies]-(ct:CellType {cell_type_name: $cell_type})<-[:GeneEnrichedInCellType]-(g1:Gene)-[:GeneInOrthologousGroup]->(gf:OrthologousGroup),
    (g1:Gene)-[:GeneFromSpecies]->(s1),
    (s2:Species)<-[:CellTypeFromSpecies]-(ct)<-[:GeneEnrichedInCellType]-(g2:Gene)-[:GeneInOrthologousGroup]->(gf:OrthologousGroup),
    (g2:Gene)-[:GeneFromSpecies]->(s2)
WHERE s1 <> s2
WITH s1, s2, COUNT(DISTINCT g1) AS genes1, COUNT(DISTINCT g2) AS genes2, ct
RETURN s1.species_scientific_name, s2.species_scientific_name, genes1, genes2"""

with driver.session() as session:
    result2 = session.run(query2, cell_type=cell_type_now)
    df2 = result2.to_df()

# Ending the cell with the frame gives the rich table rendering instead of the
# plain-text print() output.
df2

   s1.species_scientific_name s2.species_scientific_name  genes1  genes2
0                    cjacchus               ptroglodytes      93      94
1                    cjacchus                   ggorilla      93      94
2                    cjacchus                   mmulatta      94      96
3                    cjacchus                   hsapiens      89      92
4                    mmulatta                   ggorilla     135     134
5                    mmulatta                   hsapiens     147     147
6                    mmulatta               ptroglodytes     141     139
7                    mmulatta                   cjacchus      96      94
8                    hsapiens               ptroglodytes     167     166
9                    hsapiens                   ggorilla     148     147
10                   hsapiens                   cjacchus      92      89
11                   hsapiens                   mmulatta     147     147
12                   ggorilla                   hsa

In [9]:
# Join the per-species totals (df) onto the pairwise rows (df2) through the one
# column the two query results share, the s1 species name.
df_new = pd.merge(df, df2, how="inner", on="s1.species_scientific_name")

In [10]:
# Inspect the merged totals and pairwise counts.
df_new

Unnamed: 0,s1.species_scientific_name,total1,s2.species_scientific_name,genes1,genes2
0,cjacchus,221,ptroglodytes,93,94
1,cjacchus,221,ggorilla,93,94
2,cjacchus,221,mmulatta,94,96
3,cjacchus,221,hsapiens,89,92
4,mmulatta,295,ggorilla,135,134
5,mmulatta,295,hsapiens,147,147
6,mmulatta,295,ptroglodytes,141,139
7,mmulatta,295,cjacchus,96,94
8,hsapiens,320,ptroglodytes,167,166
9,hsapiens,320,ggorilla,148,147


In [11]:
# Relabel the five columns by position with analysis-friendly names.
df_new = df_new.set_axis(
    [
        'sp1',
        'sp1_total_enriched',
        'sp2',
        'num_enriched_share_og_sp1',
        'num_enriched_share_og_sp2',
    ],
    axis=1,
)

In [12]:
# Inspect the pairwise shared-OG gene counts.
df2

Unnamed: 0,s1.species_scientific_name,s2.species_scientific_name,genes1,genes2
0,cjacchus,ptroglodytes,93,94
1,cjacchus,ggorilla,93,94
2,cjacchus,mmulatta,94,96
3,cjacchus,hsapiens,89,92
4,mmulatta,ggorilla,135,134
5,mmulatta,hsapiens,147,147
6,mmulatta,ptroglodytes,141,139
7,mmulatta,cjacchus,96,94
8,hsapiens,ptroglodytes,167,166
9,hsapiens,ggorilla,148,147


In [13]:
# Look up the second species' total (df) for each pair, then discard the
# redundant join-key column that the merge brings in.
df_all = (
    pd.merge(df_new, df, how='inner', left_on='sp2', right_on='s1.species_scientific_name')
    .drop(columns='s1.species_scientific_name')
)

In [14]:
# The total merged in for the second species arrives as 'total1'; name it for sp2.
df_all = df_all.rename({'total1': 'sp2_total_enriched'}, axis='columns')

In [15]:
# Inspect the pairwise table with both species' totals attached.
df_all

Unnamed: 0,sp1,sp1_total_enriched,sp2,num_enriched_share_og_sp1,num_enriched_share_og_sp2,sp2_total_enriched
0,cjacchus,221,ptroglodytes,93,94,303
1,mmulatta,295,ptroglodytes,141,139,303
2,hsapiens,320,ptroglodytes,167,166,303
3,ggorilla,312,ptroglodytes,162,165,303
4,cjacchus,221,ggorilla,93,94,312
5,mmulatta,295,ggorilla,135,134,312
6,hsapiens,320,ggorilla,148,147,312
7,ptroglodytes,303,ggorilla,165,162,312
8,cjacchus,221,mmulatta,94,96,295
9,hsapiens,320,mmulatta,147,147,295


In [16]:
# Fraction of each species' enriched genes that share an orthologous group
# with an enriched gene of the other species.
df_all['sp1_pct'] = df_all['num_enriched_share_og_sp1'].div(df_all['sp1_total_enriched'])
df_all['sp2_pct'] = df_all['num_enriched_share_og_sp2'].div(df_all['sp2_total_enriched'])

In [17]:
# Row-wise harmonic mean of the two fractions.
df_all['hmean_pct'] = df_all[['sp1_pct', 'sp2_pct']].apply(
    lambda pcts: harmonic_mean(pcts.tolist()),
    axis=1,
)

In [18]:
# Inspect the table with the percentage and harmonic-mean columns added.
df_all

Unnamed: 0,sp1,sp1_total_enriched,sp2,num_enriched_share_og_sp1,num_enriched_share_og_sp2,sp2_total_enriched,sp1_pct,sp2_pct,hmean_pct
0,cjacchus,221,ptroglodytes,93,94,303,0.420814,0.310231,0.357159
1,mmulatta,295,ptroglodytes,141,139,303,0.477966,0.458746,0.468159
2,hsapiens,320,ptroglodytes,167,166,303,0.521875,0.547855,0.534549
3,ggorilla,312,ptroglodytes,162,165,303,0.519231,0.544554,0.531591
4,cjacchus,221,ggorilla,93,94,312,0.420814,0.301282,0.351155
5,mmulatta,295,ggorilla,135,134,312,0.457627,0.429487,0.443111
6,hsapiens,320,ggorilla,148,147,312,0.4625,0.471154,0.466787
7,ptroglodytes,303,ggorilla,165,162,312,0.544554,0.519231,0.531591
8,cjacchus,221,mmulatta,94,96,295,0.425339,0.325424,0.368733
9,hsapiens,320,mmulatta,147,147,295,0.459375,0.498305,0.478049


In [19]:
# Rank the species pairs by harmonic mean, highest first. Rows with a repeated
# hmean_pct value are dropped so that only the first of them is shown.
(
    df_all.loc[:, ['sp1', 'sp2', 'hmean_pct']]
    .sort_values(by='hmean_pct', ascending=False)
    .drop_duplicates(subset='hmean_pct')
    .reset_index(drop=True)
)

Unnamed: 0,sp1,sp2,hmean_pct
0,hsapiens,ptroglodytes,0.534549
1,ggorilla,ptroglodytes,0.531591
2,hsapiens,mmulatta,0.478049
3,mmulatta,ptroglodytes,0.468159
4,ggorilla,hsapiens,0.466787
5,ggorilla,mmulatta,0.443111
6,mmulatta,cjacchus,0.368733
7,cjacchus,ptroglodytes,0.357159
8,cjacchus,ggorilla,0.351155
9,cjacchus,hsapiens,0.335491


In [20]:
# Tag every row with the cell type this table was computed for.
df_all = df_all.assign(cell_type=cell_type_now)

In [21]:
# Inspect the finished single-cell-type table.
df_all

Unnamed: 0,sp1,sp1_total_enriched,sp2,num_enriched_share_og_sp1,num_enriched_share_og_sp2,sp2_total_enriched,sp1_pct,sp2_pct,hmean_pct,cell_type
0,cjacchus,221,ptroglodytes,93,94,303,0.420814,0.310231,0.357159,astrocyte of the cerebral cortex
1,mmulatta,295,ptroglodytes,141,139,303,0.477966,0.458746,0.468159,astrocyte of the cerebral cortex
2,hsapiens,320,ptroglodytes,167,166,303,0.521875,0.547855,0.534549,astrocyte of the cerebral cortex
3,ggorilla,312,ptroglodytes,162,165,303,0.519231,0.544554,0.531591,astrocyte of the cerebral cortex
4,cjacchus,221,ggorilla,93,94,312,0.420814,0.301282,0.351155,astrocyte of the cerebral cortex
5,mmulatta,295,ggorilla,135,134,312,0.457627,0.429487,0.443111,astrocyte of the cerebral cortex
6,hsapiens,320,ggorilla,148,147,312,0.4625,0.471154,0.466787,astrocyte of the cerebral cortex
7,ptroglodytes,303,ggorilla,165,162,312,0.544554,0.519231,0.531591,astrocyte of the cerebral cortex
8,cjacchus,221,mmulatta,94,96,295,0.425339,0.325424,0.368733,astrocyte of the cerebral cortex
9,hsapiens,320,mmulatta,147,147,295,0.459375,0.498305,0.478049,astrocyte of the cerebral cortex


In [62]:
# Build, for every cell type listed in df_ct, the pairwise table of enriched
# genes that share an orthologous group between two species, and stack the
# per-cell-type tables into data_final.

# The Cypher templates are defined once. The cell type is supplied on each run
# as the $cell_type parameter, so names containing quotes cannot break the query.

# get all cell type enriched genes (total per species)
query = """MATCH (s1:Species)<-[:CellTypeFromSpecies]-(ct:CellType {cell_type_name: $cell_type})<-[:GeneEnrichedInCellType]-(g3:Gene)-[:GeneFromSpecies]->(s1)
WITH s1,  COUNT(DISTINCT g3) AS total1, ct
RETURN s1.species_scientific_name, total1"""

# get cell type enriched genes in same OG (per ordered species pair)
query2 = """MATCH (s1:Species)<-[:CellTypeFromSpecies]-(ct:CellType {cell_type_name: $cell_type})<-[:GeneEnrichedInCellType]-(g1:Gene)-[:GeneInOrthologousGroup]->(gf:OrthologousGroup),
    (g1:Gene)-[:GeneFromSpecies]->(s1),
    (s2:Species)<-[:CellTypeFromSpecies]-(ct)<-[:GeneEnrichedInCellType]-(g2:Gene)-[:GeneInOrthologousGroup]->(gf:OrthologousGroup),
    (g2:Gene)-[:GeneFromSpecies]->(s2)
WHERE s1 <> s2
WITH s1, s2, COUNT(DISTINCT g1) AS genes1, COUNT(DISTINCT g2) AS genes2, ct
RETURN s1.species_scientific_name, s2.species_scientific_name, genes1, genes2"""

# One table per cell type is collected here and concatenated once after the
# loop, instead of re-concatenating the growing result on every iteration.
enriched_frames = []

for cell_type_now in df_ct.iloc[:, 0]:
    print(cell_type_now)

    with driver.session() as session:
        df = session.run(query, cell_type=cell_type_now).to_df()
        df2 = session.run(query2, cell_type=cell_type_now).to_df()

    # Join totals onto the pairwise counts and rename by column name, so a
    # change in column order cannot silently mislabel the data.
    df_new = df.merge(df2, on='s1.species_scientific_name').rename(columns={
        's1.species_scientific_name': 'sp1',
        'total1': 'sp1_total_enriched',
        's2.species_scientific_name': 'sp2',
        'genes1': 'num_enriched_share_og_sp1',
        'genes2': 'num_enriched_share_og_sp2',
    })
    df_all = (
        df_new
        .merge(df, left_on='sp2', right_on='s1.species_scientific_name')
        .drop(columns='s1.species_scientific_name')
        .rename(columns={'total1': 'sp2_total_enriched'})
    )

    # A cell type with no shared genes contributes no rows.
    if df_all.empty:
        continue

    # calculate pct and hmean
    df_all['sp1_pct'] = df_all['num_enriched_share_og_sp1'] / df_all['sp1_total_enriched']
    df_all['sp2_pct'] = df_all['num_enriched_share_og_sp2'] / df_all['sp2_total_enriched']
    df_all['hmean_pct'] = [
        harmonic_mean([pct1, pct2])
        for pct1, pct2 in zip(df_all['sp1_pct'], df_all['sp2_pct'])
    ]
    df_all['cell_type'] = cell_type_now
    # Column name kept exactly as spelled in the existing outputs.
    df_all['specifcity_category'] = 'gene enriched in cell type'

    enriched_frames.append(df_all)

data_final = pd.concat(enriched_frames, ignore_index=True) if enriched_frames else pd.DataFrame()


    
    


    

astrocyte of the cerebral cortex
oligodendrocyte
vascular leptomeningeal cell
microglial cell
oligodendrocyte precursor cell
cerebral cortex endothelial cell
near-projecting glutamatergic cortical neuron
corticothalamic-projecting glutamatergic cortical neuron
L6b glutamatergic cortical neuron
L5 extratelencephalic projecting glutamatergic cortical neuron
caudal ganglionic eminence derived GABAergic cortical interneuron
vip GABAergic cortical interneuron
sncg GABAergic cortical interneuron
lamp5 GABAergic cortical interneuron
sst GABAergic cortical interneuron
pvalb GABAergic cortical interneuron
chandelier pvalb GABAergic cortical interneuron
L5 intratelencephalic projecting glutamatergic neuron
L4 intratelencephalic projecting glutamatergic neuron
L2/3 intratelencephalic projecting glutamatergic neuron
L6 intratelencephalic projecting glutamatergic neuron


In [63]:
# Preview the first ten rows of the stacked enriched-gene table.
data_final.head(10)

Unnamed: 0,sp1,sp1_total_enriched,sp2,num_enriched_share_og_sp1,num_enriched_share_og_sp2,sp2_total_enriched,sp1_pct,sp2_pct,hmean_pct,cell_type,specifcity_category
0,ptroglodytes,458,ggorilla,238,233,441,0.519651,0.528345,0.523962,astrocyte of the cerebral cortex,gene enriched in cell type
1,cjacchus,296,ggorilla,122,124,441,0.412162,0.281179,0.334298,astrocyte of the cerebral cortex,gene enriched in cell type
2,mmulatta,409,ggorilla,183,183,441,0.447433,0.414966,0.430588,astrocyte of the cerebral cortex,gene enriched in cell type
3,hsapiens,456,ggorilla,218,216,441,0.47807,0.489796,0.483862,astrocyte of the cerebral cortex,gene enriched in cell type
4,ptroglodytes,458,cjacchus,129,126,296,0.281659,0.425676,0.339006,astrocyte of the cerebral cortex,gene enriched in cell type
5,ggorilla,441,cjacchus,124,122,296,0.281179,0.412162,0.334298,astrocyte of the cerebral cortex,gene enriched in cell type
6,mmulatta,409,cjacchus,131,127,296,0.320293,0.429054,0.366781,astrocyte of the cerebral cortex,gene enriched in cell type
7,hsapiens,456,cjacchus,121,118,296,0.265351,0.398649,0.31862,astrocyte of the cerebral cortex,gene enriched in cell type
8,ptroglodytes,458,mmulatta,195,195,409,0.425764,0.476773,0.449827,astrocyte of the cerebral cortex,gene enriched in cell type
9,cjacchus,296,mmulatta,127,131,409,0.429054,0.320293,0.366781,astrocyte of the cerebral cortex,gene enriched in cell type


In [64]:
# Rank species pairs by hmean_pct within each cell type and write one row per
# unordered species pair.
# - data_final holds each pair in both orientations, (a, b) and (b, a). They
#   are collapsed through an explicit order-independent pair key, instead of
#   relying on equal floating-point hmean_pct values, which could also drop
#   unrelated rows that happen to share a value.
# - A two-key sort replaces groupby(...).apply(sort_values), so cell_type stays
#   an ordinary column whatever the pandas version.
(
    data_final
    .assign(_pair=[
        ' | '.join(sorted(pair))
        for pair in zip(data_final['sp1'], data_final['sp2'])
    ])
    .sort_values(['cell_type', 'hmean_pct'], ascending=[True, False])
    .drop_duplicates(['cell_type', '_pair'])
    .loc[:, ['sp1', 'sp2', 'hmean_pct', 'cell_type']]
    .reset_index(drop=True)
    .to_csv("enriched_genes_hmean_pct.csv")
)


In [65]:
# Same analysis as for enriched genes, but following GeneEnhancedInCellType
# relationships: for every cell type listed in df_ct, build the pairwise table
# of enhanced genes that share an orthologous group between two species, and
# stack the per-cell-type tables into data_final_enhanced.

# The Cypher templates are defined once. The cell type is supplied on each run
# as the $cell_type parameter, so names containing quotes cannot break the query.

# get all cell type enhanced genes (total per species)
query = """MATCH (s1:Species)<-[:CellTypeFromSpecies]-(ct:CellType {cell_type_name: $cell_type})<-[:GeneEnhancedInCellType]-(g3:Gene)-[:GeneFromSpecies]->(s1)
WITH s1,  COUNT(DISTINCT g3) AS total1, ct
RETURN s1.species_scientific_name, total1"""

# get cell type enhanced genes in same OG (per ordered species pair)
query2 = """MATCH (s1:Species)<-[:CellTypeFromSpecies]-(ct:CellType {cell_type_name: $cell_type})<-[:GeneEnhancedInCellType]-(g1:Gene)-[:GeneInOrthologousGroup]->(gf:OrthologousGroup),
    (g1:Gene)-[:GeneFromSpecies]->(s1),
    (s2:Species)<-[:CellTypeFromSpecies]-(ct)<-[:GeneEnhancedInCellType]-(g2:Gene)-[:GeneInOrthologousGroup]->(gf:OrthologousGroup),
    (g2:Gene)-[:GeneFromSpecies]->(s2)
WHERE s1 <> s2
WITH s1, s2, COUNT(DISTINCT g1) AS genes1, COUNT(DISTINCT g2) AS genes2, ct
RETURN s1.species_scientific_name, s2.species_scientific_name, genes1, genes2"""

# Tables are collected in a list and concatenated once after the loop. The
# earlier version decided whether to start or extend the result by checking
# data_final (the enriched table) instead of data_final_enhanced; collecting
# in a list removes that check altogether.
enhanced_frames = []

for cell_type_now in df_ct.iloc[:, 0]:
    print(cell_type_now)

    with driver.session() as session:
        df = session.run(query, cell_type=cell_type_now).to_df()
        df2 = session.run(query2, cell_type=cell_type_now).to_df()

    # Join totals onto the pairwise counts and rename by column name, so a
    # change in column order cannot silently mislabel the data.
    # (The "enriched" column names are kept so the table stacks with data_final.)
    df_new = df.merge(df2, on='s1.species_scientific_name').rename(columns={
        's1.species_scientific_name': 'sp1',
        'total1': 'sp1_total_enriched',
        's2.species_scientific_name': 'sp2',
        'genes1': 'num_enriched_share_og_sp1',
        'genes2': 'num_enriched_share_og_sp2',
    })
    df_all = (
        df_new
        .merge(df, left_on='sp2', right_on='s1.species_scientific_name')
        .drop(columns='s1.species_scientific_name')
        .rename(columns={'total1': 'sp2_total_enriched'})
    )

    # A cell type with no shared genes contributes no rows.
    if df_all.empty:
        continue

    # calculate pct and hmean
    df_all['sp1_pct'] = df_all['num_enriched_share_og_sp1'] / df_all['sp1_total_enriched']
    df_all['sp2_pct'] = df_all['num_enriched_share_og_sp2'] / df_all['sp2_total_enriched']
    df_all['hmean_pct'] = [
        harmonic_mean([pct1, pct2])
        for pct1, pct2 in zip(df_all['sp1_pct'], df_all['sp2_pct'])
    ]
    df_all['cell_type'] = cell_type_now
    # Column name kept exactly as spelled in the existing outputs.
    df_all['specifcity_category'] = 'gene enhanced in cell type'

    enhanced_frames.append(df_all)

data_final_enhanced = pd.concat(enhanced_frames, ignore_index=True) if enhanced_frames else pd.DataFrame()

astrocyte of the cerebral cortex
oligodendrocyte
vascular leptomeningeal cell
microglial cell
oligodendrocyte precursor cell
cerebral cortex endothelial cell
near-projecting glutamatergic cortical neuron
corticothalamic-projecting glutamatergic cortical neuron
L6b glutamatergic cortical neuron
L5 extratelencephalic projecting glutamatergic cortical neuron
caudal ganglionic eminence derived GABAergic cortical interneuron
vip GABAergic cortical interneuron
sncg GABAergic cortical interneuron
lamp5 GABAergic cortical interneuron
sst GABAergic cortical interneuron
pvalb GABAergic cortical interneuron
chandelier pvalb GABAergic cortical interneuron
L5 intratelencephalic projecting glutamatergic neuron
L4 intratelencephalic projecting glutamatergic neuron
L2/3 intratelencephalic projecting glutamatergic neuron
L6 intratelencephalic projecting glutamatergic neuron


In [66]:
# Stack the enriched and enhanced tables under a fresh 0..n-1 index and write
# them to a single CSV (the index is written as the first column).
pd.concat([data_final, data_final_enhanced], ignore_index=True).to_csv("sum_enhanced_enriched_1TPM.csv")

Unnamed: 0,sp1,sp1_total_enriched,sp2,num_enriched_share_og_sp1,num_enriched_share_og_sp2,sp2_total_enriched,sp1_pct,sp2_pct,hmean_pct,cell_type,specifcity_category
0,ptroglodytes,270,ggorilla,78,78,254,0.288889,0.307087,0.29771,astrocyte of the cerebral cortex,gene enhanced in cell type
1,cjacchus,216,ggorilla,42,41,254,0.194444,0.161417,0.176398,astrocyte of the cerebral cortex,gene enhanced in cell type
2,mmulatta,231,ggorilla,48,48,254,0.207792,0.188976,0.197938,astrocyte of the cerebral cortex,gene enhanced in cell type
3,hsapiens,284,ggorilla,67,68,254,0.235915,0.267717,0.250812,astrocyte of the cerebral cortex,gene enhanced in cell type
4,ptroglodytes,270,mmulatta,52,51,231,0.192593,0.220779,0.205725,astrocyte of the cerebral cortex,gene enhanced in cell type
5,cjacchus,216,mmulatta,44,44,231,0.203704,0.190476,0.196868,astrocyte of the cerebral cortex,gene enhanced in cell type
6,ggorilla,254,mmulatta,48,48,231,0.188976,0.207792,0.197938,astrocyte of the cerebral cortex,gene enhanced in cell type
7,hsapiens,284,mmulatta,71,71,231,0.25,0.307359,0.275728,astrocyte of the cerebral cortex,gene enhanced in cell type
8,ptroglodytes,270,hsapiens,85,85,284,0.314815,0.299296,0.306859,astrocyte of the cerebral cortex,gene enhanced in cell type
9,cjacchus,216,hsapiens,55,54,284,0.25463,0.190141,0.21771,astrocyte of the cerebral cortex,gene enhanced in cell type
