In [70]:
import os
from statistics import harmonic_mean
from warnings import filterwarnings

# Silence warnings before the third-party imports below.
filterwarnings("ignore")

import pandas as pd
from neo4j import GraphDatabase


In [24]:

# Connection settings. Each value can be overridden through an environment
# variable so real credentials do not have to be edited into the notebook;
# the fallbacks are the original local-development values.
# URI examples: "neo4j://localhost", "neo4j+s://xxx.databases.neo4j.io"
URI = os.environ.get("NEO4J_URI", "neo4j://localhost:7687")
AUTH = (
    os.environ.get("NEO4J_USER", "test"),
    os.environ.get("NEO4J_PASSWORD", "666666"),
)

# The driver is deliberately NOT created in a `with` block: that form closes
# the driver as soon as the block exits, while the query cells further down
# still call `driver.session()`. Call `driver.close()` once the analysis is done.
driver = GraphDatabase.driver(URI, auth=AUTH)

# Fail fast here if the server is unreachable or the credentials are rejected.
driver.verify_connectivity()

In [31]:
# Per species: count the distinct genes of that species that are enriched in
# the L2/3 intratelencephalic projecting glutamatergic neuron cell type.
query = """MATCH (s1:Species)<-[:CellTypeFromSpecies]-(ct:CellType {cell_type_name: 'L2/3 intratelencephalic projecting glutamatergic neuron'})<-[:GeneEnrichedInCellType]-(g3:Gene)-[:GeneFromSpecies]->(s1)
WITH s1,  COUNT(DISTINCT g3) AS total1, ct
RETURN s1.species_scientific_name, total1"""

# Execute the query and materialise the records as a DataFrame while the
# session is still open.
with driver.session() as neo4j_session:
    df = neo4j_session.run(query).to_df()

print(df)

  s1.species_scientific_name  total1
0                   marmoset       4
1                    macaque      24
2                      human      32
3                    gorilla      16
4                      chimp      12


In [32]:
# For every ordered pair of distinct species (s1, s2): count the distinct
# enriched genes on each side (genes1 for s1, genes2 for s2) that sit in an
# orthologous group also containing an enriched gene of the other species.
query2 = """MATCH (s1:Species)<-[:CellTypeFromSpecies]-(ct:CellType {cell_type_name: 'L2/3 intratelencephalic projecting glutamatergic neuron'})<-[:GeneEnrichedInCellType]-(g1:Gene)-[:GeneInOrthologousGroup]->(gf:OrthologousGroup),
    (g1:Gene)-[:GeneFromSpecies]->(s1),
    (s2:Species)<-[:CellTypeFromSpecies]-(ct)<-[:GeneEnrichedInCellType]-(g2:Gene)-[:GeneInOrthologousGroup]->(gf:OrthologousGroup),
    (g2:Gene)-[:GeneFromSpecies]->(s2)
WHERE s1 <> s2
WITH s1, s2, COUNT(DISTINCT g1) AS genes1, COUNT(DISTINCT g2) AS genes2, ct
RETURN s1.species_scientific_name, s2.species_scientific_name, genes1, genes2"""

# Execute the pairwise query and materialise the records as a DataFrame while
# the session is still open.
with driver.session() as neo4j_session:
    df2 = neo4j_session.run(query2).to_df()

print(df2)

   s1.species_scientific_name s2.species_scientific_name  genes1  genes2
0                    marmoset                      human       1       1
1                    marmoset                    macaque       1       1
2                     macaque                    gorilla       3       3
3                     macaque                      chimp       2       2
4                     macaque                      human       3       3
5                     macaque                   marmoset       1       1
6                       human                      chimp       5       5
7                       human                    gorilla       3       3
8                       human                    macaque       3       3
9                       human                   marmoset       1       1
10                    gorilla                      chimp       2       2
11                    gorilla                      human       3       3
12                    gorilla                    ma

In [46]:
# Attach each first species' total enriched-gene count to its pairwise rows.
# With no explicit key, merge joins on the column names the two frames share.
# genes2 is not used downstream, so it is dropped in the same chain.
df_new = df.merge(df2).drop(columns=['genes2'])

In [47]:
# Give the merged columns analysis-friendly names. Mapping by name (instead of
# assigning a positional list) keeps every label attached to the right data
# even if the column order of the merge result ever changes.
df_new = df_new.rename(columns={
    's1.species_scientific_name': 'sp1',
    'total1': 'sp1_total_enriched',
    's2.species_scientific_name': 'sp2',
    'genes1': 'num_enriched_share_og',
})

In [48]:
# Display the pairwise shared-orthologous-group counts fetched by query2.
df2

Unnamed: 0,s1.species_scientific_name,s2.species_scientific_name,genes1,genes2
0,marmoset,human,1,1
1,marmoset,macaque,1,1
2,macaque,gorilla,3,3
3,macaque,chimp,2,2
4,macaque,human,3,3
5,macaque,marmoset,1,1
6,human,chimp,5,5
7,human,gorilla,3,3
8,human,macaque,3,3
9,human,marmoset,1,1


In [53]:
# Bring in the second species' total by joining the per-species totals on sp2.
# The right-hand join key duplicates sp2 after the merge, so it is removed.
df_all = (
    df_new
    .merge(df, left_on='sp2', right_on='s1.species_scientific_name')
    .drop(columns=['s1.species_scientific_name'])
)

In [56]:
# The total1 column that came in with the sp2 join is the second species'
# total enriched-gene count; label it accordingly.
df_all = df_all.rename({'total1': 'sp2_total_enriched'}, axis='columns')

In [59]:
# Share of each species' enriched genes that fall in a shared orthologous
# group. Despite the "_pct" suffix these are fractions (not multiplied by 100).
# NOTE(review): both columns use the sp1-side gene count as numerator (genes2
# was dropped earlier); this presumably assumes genes1 == genes2 for every
# pair -- confirm against the query2 output.
df_all['sp1_pct'] = df_all['num_enriched_share_og'].div(df_all['sp1_total_enriched'])
df_all['sp2_pct'] = df_all['num_enriched_share_og'].div(df_all['sp2_total_enriched'])

In [74]:
# Harmonic mean of the two per-species fractions for every pair. Iterating
# the two columns directly avoids building a Series per row (as a row-wise
# apply does) and also yields a valid, empty column when df_all has no rows.
df_all['hmean_pct'] = [
    harmonic_mean([sp1_frac, sp2_frac])
    for sp1_frac, sp2_frac in zip(df_all['sp1_pct'], df_all['sp2_pct'])
]

In [84]:
# Rank species pairs by the harmonic mean of their shared-enrichment fractions.
ranked = df_all.sort_values('hmean_pct', ascending=False)[['sp1', 'sp2', 'hmean_pct']]

# A pair can appear twice, as (A, B) and as (B, A). Collapse the mirrors on an
# order-independent pair key instead of on the float score, so that two
# different pairs which happen to score the same are both kept.
pair_key = pd.Series(
    [tuple(sorted(pair)) for pair in zip(ranked['sp1'], ranked['sp2'])],
    index=ranked.index,
    dtype=object,
)
ranked[~pair_key.duplicated()].reset_index(drop=True)

Unnamed: 0,sp1,sp2,hmean_pct
0,chimp,human,0.227273
1,gorilla,macaque,0.15
2,chimp,gorilla,0.142857
3,gorilla,human,0.125
4,chimp,macaque,0.111111
5,macaque,human,0.107143
6,marmoset,macaque,0.071429
7,marmoset,human,0.055556
