# Example Query: Get Gene–Disease Relationships from ClinGen
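This notebook queries the Biofilter database for the entity relationships loaded from the ClinGen data source, then previews the processed ClinGen Parquet files those relationships were built from.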

### 📦 Step 1: Import Biofilter and connect to the database

In [1]:
import os

from sqlalchemy import select
from sqlalchemy.orm import aliased

from biofilter import Biofilter
from biofilter.query import Query

In [2]:
# Connection string for the Biofilter database.
# Set the BIOFILTER_DB_URI environment variable to target your own environment
# and to keep real credentials out of the notebook; when it is unset, the
# local development default below is used.
db_uri = os.environ.get(
    "BIOFILTER_DB_URI",
    "postgresql+psycopg2://bioadmin:bioadmin@localhost/biofilter",
)

# Biofilter instance, plus a Query helper bound to a session obtained from it
bf = Biofilter(db_uri)
q = Query(bf.db.get_session())

### 📦 Step 2: Get Models

In [11]:
# Look up, by name, each model the relationship query below refers to.
# The lookups run in the order the names are listed.
(
    EntityRelationship,
    EntityAlias,
    Entity,
    EntityGroup,
    EntityRelationshipType,
    ETLDataSource,
) = (
    q.get_model(model_name)
    for model_name in (
        "EntityRelationship",
        "EntityAlias",
        "Entity",
        "EntityGroup",
        "EntityRelationshipType",
        "ETLDataSource",
    )
)

### 📦 Step 3: Run Query

In [21]:
# Data source used to filter the relationships.
# NOTE(review): 41 is a hard-coded row ID — verify it matches ClinGen in your database.
ds_id = 41  # DS ClinGen

# Entity, EntityAlias and EntityGroup are each joined twice (once per side of
# the relationship), so each side gets its own named alias.

# --- Entity 1 side ---
e1 = aliased(Entity, name="entity1")
a1 = aliased(EntityAlias, name="alias1")
g1 = aliased(EntityGroup, name="group1")

# --- Entity 2 side ---
e2 = aliased(Entity, name="entity2")
a2 = aliased(EntityAlias, name="alias2")
g2 = aliased(EntityGroup, name="group2")

In [27]:
# Columns returned for every relationship row.
result_columns = [
    a1.alias_value.label("Entity1_Alias"),
    g1.name.label("Entity1_Group"),
    EntityRelationshipType.code.label("Relationship_Type"),
    a2.alias_value.label("Entity2_Alias"),
    g2.name.label("Entity2_Group"),
    ETLDataSource.name.label("DataSource"),
]

# Each entity is joined to its primary alias only; because these are inner
# joins, a relationship whose entity has no primary alias is not returned.
# `== True` is deliberate: on a column it builds a SQL comparison expression.
alias1_on = (a1.entity_id == e1.id) & (a1.is_primary == True)  # noqa: E712
alias2_on = (a2.entity_id == e2.id) & (a2.is_primary == True)  # noqa: E712

stmt = (
    select(*result_columns)
    .select_from(EntityRelationship)
    # --- Entity 1 side ---
    .join(e1, e1.id == EntityRelationship.entity_1_id)
    .join(a1, alias1_on)
    .join(g1, g1.id == e1.group_id)
    # --- Relationship type ---
    .join(
        EntityRelationshipType,
        EntityRelationshipType.id == EntityRelationship.relationship_type_id,
    )
    # --- Entity 2 side ---
    .join(e2, e2.id == EntityRelationship.entity_2_id)
    .join(a2, alias2_on)
    .join(g2, g2.id == e2.group_id)
    # --- DataSource ---
    .join(ETLDataSource, ETLDataSource.id == EntityRelationship.data_source_id)
    # Keep only the relationships recorded for the selected data source
    .where(EntityRelationship.data_source_id == ds_id)
    # .limit(50)
)

In [29]:
# Execute the statement, asking the Query helper for a DataFrame result
df = q.run_query(stmt, return_df=True)

# Report how many relationship rows came back
link_count = len(df)
print(f"We have: {link_count} links between Genes and Diseases from ClinGen")

# Bare last expression so the notebook renders the result
df

We have: 3240 links between Genes and Diseases from ClinGen


Unnamed: 0,Entity1_Alias,Entity1_Group,Relationship_Type,Entity2_Alias,Entity2_Group,DataSource
0,RGS9,Genes,part_of,MONDO:0012033,Diseases,clingen
1,RHO,Genes,part_of,MONDO:0019118,Diseases,clingen
2,GRK1,Genes,part_of,MONDO:0019152,Diseases,clingen
3,RIPK1,Genes,part_of,MONDO:0020849,Diseases,clingen
4,RIT1,Genes,part_of,MONDO:0018997,Diseases,clingen
...,...,...,...,...,...,...
3235,RFC1,Genes,part_of,MONDO:0044720,Diseases,clingen
3236,RFX3,Genes,part_of,MONDO:0100038,Diseases,clingen
3237,RFX5,Genes,part_of,MONDO:0008855,Diseases,clingen
3238,RFXANK,Genes,part_of,MONDO:0008855,Diseases,clingen


In [20]:
# Roll back the current transaction on the Query helper's session.
# NOTE(review): presumably a recovery step after a failed query — confirm
# whether this cell is still needed in a top-to-bottom run.
q.session.rollback()

In [34]:
import pandas as pd
from pathlib import Path

# Locations of the three processed ClinGen Parquet files.
# Only the first one is read in this cell.
parquet_file = Path(
    "/home/bioadmin/biofilter/biofilter_data/processed/CLINGEN/clingen/curation_activity_summary.parquet"
)
parquet_file2 = Path(
    "/home/bioadmin/biofilter/biofilter_data/processed/CLINGEN/clingen/gene_disease_validity.parquet"
)
parquet_file3 = Path(
    "/home/bioadmin/biofilter/biofilter_data/processed/CLINGEN/clingen/gene_dosage.parquet"
)

# Read the curation activity summary into a DataFrame
df = pd.read_parquet(parquet_file)

# Print the column names, then render the first 10 rows
column_names = list(df.columns)
print("Columns in file:", column_names)
print("\nFirst 10 rows:")
display(df.iloc[:10])

Columns in file: ['gene_symbol', 'hgnc_id', 'disease_label', 'mondo_id', 'mode_of_inheritance', 'gene_disease_validity_assertion_classifications', 'gene_disease_validity_assertion_reports', 'gene_disease_validity_gceps']

First 10 rows:


Unnamed: 0,gene_symbol,hgnc_id,disease_label,mondo_id,mode_of_inheritance,gene_disease_validity_assertion_classifications,gene_disease_validity_assertion_reports,gene_disease_validity_gceps
0,A2ML1,HGNC:23336,Noonan syndrome,MONDO:0018997,Autosomal dominant inheritance,disputing (06/07/2018),https://search.clinicalgenome.org/kb/gene-vali...,RASopathy
1,A4GALT,HGNC:18149,A4GALT-congenital disorder of glycosylation,MONDO:0100587,Autosomal recessive inheritance,definitive evidence (05/30/2025),https://search.clinicalgenome.org/kb/gene-vali...,Congenital Disorders of Glycosylation
2,AAGAB,HGNC:25662,"palmoplantar keratoderma, punctate type 1A",MONDO:0007858,,,,
3,AARS1,HGNC:20,Charcot-Marie-Tooth disease axonal type 2N,MONDO:0013212,Autosomal dominant inheritance,definitive evidence (03/14/2024),https://search.clinicalgenome.org/kb/gene-vali...,Charcot-Marie-Tooth
4,AARS1,HGNC:20,AARS1-related leukoencephalopathy,MONDO:1010132,Autosomal dominant inheritance,limited evidence (08/27/2025),https://search.clinicalgenome.org/kb/gene-vali...,Leukodystrophy and Leukoencephalopathy
5,AARS2,HGNC:21022,mitochondrial disease,MONDO:0044970,Autosomal recessive inheritance,definitive evidence (04/18/2022),https://search.clinicalgenome.org/kb/gene-vali...,Mitochondrial Diseases
6,AARS2,HGNC:21022,"leukoencephalopathy, progressive, with ovarian...",MONDO:0014387,,,,
7,AASS,HGNC:17366,hyperlysinemia,MONDO:0009388,Autosomal recessive inheritance,definitive evidence (10/14/2022),https://search.clinicalgenome.org/kb/gene-vali...,Aminoacidopathy
8,ABAT,HGNC:23,genetic developmental and epileptic encephalop...,MONDO:0100062,Autosomal recessive inheritance,moderate evidence (04/19/2022),https://search.clinicalgenome.org/kb/gene-vali...,Epilepsy
9,ABAT,HGNC:23,GABA aminotransaminase deficiency,MONDO:0013166,,,,


In [31]:
# Take the first record and transpose it, so every column becomes a row
first_row = df.iloc[:1].T
print("First row (transposed):")
display(first_row)

First row (transposed):


Unnamed: 0,0
gene_symbol,A2ML1
hgnc_id,HGNC:23336
disease_label,Noonan syndrome
mondo_id,MONDO:0018997
mode_of_inheritance,Autosomal dominant inheritance
gene_disease_validity_assertion_classifications,disputing (06/07/2018)
gene_disease_validity_assertion_reports,https://search.clinicalgenome.org/kb/gene-vali...
gene_disease_validity_gceps,RASopathy


In [36]:
# Paths to the three processed ClinGen Parquet files, named after their contents
curation_activity_summary = Path(
    "/home/bioadmin/biofilter/biofilter_data/processed/CLINGEN/clingen/curation_activity_summary.parquet"
)
gene_disease_validity = Path(
    "/home/bioadmin/biofilter/biofilter_data/processed/CLINGEN/clingen/gene_disease_validity.parquet"
)
gene_dosage = Path(
    "/home/bioadmin/biofilter/biofilter_data/processed/CLINGEN/clingen/gene_dosage.parquet"
)

In [44]:
import duckdb

# Open a DuckDB connection with no database file given
con = duckdb.connect()

# Preview queries: up to 10 rows from each Parquet file, scanned in place
query = f"SELECT * FROM parquet_scan('{curation_activity_summary}') LIMIT 10"
query_2 = f"SELECT * FROM parquet_scan('{gene_disease_validity}') LIMIT 10"

# Run both previews and fetch each result as a DataFrame
df = con.execute(query).df()
df2 = con.execute(query_2).df()

# First row of curation_activity_summary, transposed so columns read as rows
first_row = df.iloc[:1].T
print("\nFirst row to curation_activity_summary:")
display(first_row)

# First row of gene_disease_validity, transposed the same way
first_row = df2.iloc[:1].T
print("\nFirst row to gene_disease_validity:")
display(first_row)



First row to curation_activity_summary:


Unnamed: 0,0
gene_symbol,A2ML1
hgnc_id,HGNC:23336
disease_label,Noonan syndrome
mondo_id,MONDO:0018997
mode_of_inheritance,Autosomal dominant inheritance
gene_disease_validity_assertion_classifications,disputing (06/07/2018)
gene_disease_validity_assertion_reports,https://search.clinicalgenome.org/kb/gene-vali...
gene_disease_validity_gceps,RASopathy



First row to gene_disease_validity:


Unnamed: 0,0
hgnc_id,HGNC:10004
gene_symbol,RGS9
mondo_id,MONDO:0012033
disease_label,bradyopsia
moi,AR
sop_version,8
classification,Moderate
assertion_date,2021-09-08
gcep,Retina Gene Curation Expert Panel
report_url,https://search.clinicalgenome.org/kb/gene-vali...
