# Example Query: Get Relationships Genes - Diseases from ClinGen
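This notebook queries the Biofilter database for Gene–Disease relationships loaded from the ClinGen data source, then inspects the processed ClinGen Parquet files.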

### 📦 Step 1: Import Biofilter and connect to the database

In [1]:
from biofilter import Biofilter
from biofilter.query import Query
from sqlalchemy import select
from sqlalchemy.orm import aliased

In [2]:
import os

# Connection string for the Biofilter database.
# It is read from the BIOFILTER_DB_URI environment variable so that credentials
# do not have to be committed with the notebook; the fallback is the local
# development default and should be replaced for any other environment.
db_uri = os.environ.get(
    "BIOFILTER_DB_URI",
    "postgresql+psycopg2://bioadmin:bioadmin@localhost/biofilter",
)

# Biofilter instance and a Query helper bound to one of its database sessions
bf = Biofilter(db_uri)
q = Query(bf.db.get_session())

### 📦 Step 2: Get Models

In [3]:
# Look up, by name, the ORM model classes the relationship query needs.
# The lookups run in the same order as the names are listed.
(
    EntityRelationship,
    EntityAlias,
    Entity,
    EntityGroup,
    EntityRelationshipType,
    ETLDataSource,
) = (
    q.get_model(model_name)
    for model_name in (
        "EntityRelationship",
        "EntityAlias",
        "Entity",
        "EntityGroup",
        "EntityRelationshipType",
        "ETLDataSource",
    )
)

### 📦 Step 3: Run Query

In [4]:
# Data source to filter relationships on.
ds_id = 41  # DS ClinGen

# A relationship row references two entities, so Entity, EntityAlias and
# EntityGroup are each aliased twice: once per side of the relationship.
e1, a1, g1 = (
    aliased(Entity, name="entity1"),
    aliased(EntityAlias, name="alias1"),
    aliased(EntityGroup, name="group1"),
)
e2, a2, g2 = (
    aliased(Entity, name="entity2"),
    aliased(EntityAlias, name="alias2"),
    aliased(EntityGroup, name="group2"),
)

In [5]:
# Output columns, in the order they appear in the result.
result_columns = (
    a1.alias_value.label("Entity1_Alias"),
    g1.name.label("Entity1_Group"),
    EntityRelationshipType.code.label("Relationship_Type"),
    a2.alias_value.label("Entity2_Alias"),
    g2.name.label("Entity2_Group"),
    ETLDataSource.name.label("DataSource"),
)

# Join each entity to its primary alias only.
# `== True` is intentional: it builds a SQL comparison, whereas Python's
# `is True` would be evaluated immediately instead of becoming part of the query.
entity1_primary_alias = (a1.entity_id == e1.id) & (a1.is_primary == True)  # noqa: E712
entity2_primary_alias = (a2.entity_id == e2.id) & (a2.is_primary == True)  # noqa: E712

stmt = select(*result_columns).select_from(EntityRelationship)

# --- Entity 1 side: entity -> primary alias -> group ---
stmt = (
    stmt
    .join(e1, e1.id == EntityRelationship.entity_1_id)
    .join(a1, entity1_primary_alias)
    .join(g1, g1.id == e1.group_id)
)

# --- Relationship type ---
stmt = stmt.join(
    EntityRelationshipType,
    EntityRelationshipType.id == EntityRelationship.relationship_type_id,
)

# --- Entity 2 side: entity -> primary alias -> group ---
stmt = (
    stmt
    .join(e2, e2.id == EntityRelationship.entity_2_id)
    .join(a2, entity2_primary_alias)
    .join(g2, g2.id == e2.group_id)
)

# --- DataSource: join for its name, then keep only the selected source ---
stmt = (
    stmt
    .join(ETLDataSource, ETLDataSource.id == EntityRelationship.data_source_id)
    .where(EntityRelationship.data_source_id == ds_id)
)
# Append `.limit(50)` to the statement above for a quick preview.

In [6]:
# Execute the statement and collect the rows as a DataFrame.
df = q.run_query(stmt, return_df=True)

# Report how many relationship rows came back, then show the frame.
n_links = len(df)
print(f"We have: {n_links} links between Genes and Diseases from ClinGen")
df

We have: 3281 links between Genes and Diseases from ClinGen


Unnamed: 0,Entity1_Alias,Entity1_Group,Relationship_Type,Entity2_Alias,Entity2_Group,DataSource
0,NPM1,Genes,part_of,MONDO:0000159,Diseases,clingen
1,CHRNB2,Genes,part_of,MONDO:0000030,Diseases,clingen
2,SLC2A1,Genes,part_of,MONDO:0000188,Diseases,clingen
3,CHRNA4,Genes,part_of,MONDO:0000030,Diseases,clingen
4,CHRNA2,Genes,part_of,MONDO:0000030,Diseases,clingen
...,...,...,...,...,...,...
3276,INTU,Genes,part_of,MONDO:1060154,Diseases,clingen
3277,HPCA,Genes,part_of,MONDO:1060159,Diseases,clingen
3278,HAVCR2,Genes,part_of,MONDO:1060169,Diseases,clingen
3279,C1QA,Genes,part_of,MONDO:1060174,Diseases,clingen


In [7]:
# Roll back the current transaction on the Query helper's session.
# NOTE(review): presumably this clears an open or failed transaction left by an
# earlier cell — confirm it is still needed in a clean top-to-bottom run.
q.session.rollback()

In [8]:
import os
import pandas as pd
from pathlib import Path

# Directory holding the processed ClinGen Parquet files.
# Override it with the BIOFILTER_CLINGEN_DIR environment variable; the default
# is the location this notebook was originally written against.
CLINGEN_DIR = Path(
    os.environ.get(
        "BIOFILTER_CLINGEN_DIR",
        "/home/bioadmin/biofilter/biofilter_data/processed/CLINGEN/clingen",
    )
)

# Paths to the Parquet files
parquet_file = CLINGEN_DIR / "curation_activity_summary.parquet"
parquet_file2 = CLINGEN_DIR / "gene_disease_validity.parquet"
parquet_file3 = CLINGEN_DIR / "gene_dosage.parquet"

# Fail early with an actionable message instead of a bare reader error, so `df`
# is never silently left pointing at an earlier cell's result.
if not parquet_file.is_file():
    raise FileNotFoundError(
        f"ClinGen Parquet file not found: {parquet_file}. "
        "Set the BIOFILTER_CLINGEN_DIR environment variable to the directory "
        "that contains the processed ClinGen files."
    )

# Load Parquet into a DataFrame
df = pd.read_parquet(parquet_file)

# Show the column names and the first 10 rows
print("Columns in file:", df.columns.tolist())
print("\nFirst 10 rows:")
display(df.head(10))

FileNotFoundError: [Errno 2] No such file or directory: '/home/bioadmin/biofilter/biofilter_data/processed/CLINGEN/clingen/curation_activity_summary.parquet'

In [31]:
# Turn the first record on its side so each column becomes a row, which is
# easier to read than a single very wide row.
first_row = df.iloc[:1].T
print("First row (transposed):")
display(first_row)

First row (transposed):


Unnamed: 0,0
gene_symbol,A2ML1
hgnc_id,HGNC:23336
disease_label,Noonan syndrome
mondo_id,MONDO:0018997
mode_of_inheritance,Autosomal dominant inheritance
gene_disease_validity_assertion_classifications,disputing (06/07/2018)
gene_disease_validity_assertion_reports,https://search.clinicalgenome.org/kb/gene-vali...
gene_disease_validity_gceps,RASopathy


In [36]:
# Path to your Parquet file
curation_activity_summary = Path("/home/bioadmin/biofilter/biofilter_data/processed/CLINGEN/clingen/curation_activity_summary.parquet")
gene_disease_validity = Path("/home/bioadmin/biofilter/biofilter_data/processed/CLINGEN/clingen/gene_disease_validity.parquet")
gene_dosage = Path("/home/bioadmin/biofilter/biofilter_data/processed/CLINGEN/clingen/gene_dosage.parquet")

In [44]:
import duckdb

# In-memory DuckDB connection used to query the Parquet files directly
con = duckdb.connect()

# Build both preview queries first (10 rows each), then run them.
query = f"SELECT * FROM parquet_scan('{curation_activity_summary}') LIMIT 10"
query_2 = f"SELECT * FROM parquet_scan('{gene_disease_validity}') LIMIT 10"

df = con.execute(query).df()
df2 = con.execute(query_2).df()

# First record of curation_activity_summary, turned on its side for readability
first_row = df.iloc[:1].T
print("\nFirst row to curation_activity_summary:")
display(first_row)

# First record of gene_disease_validity, shown the same way
first_row = df2.iloc[:1].T
print("\nFirst row to gene_disease_validity:")
display(first_row)



First row to curation_activity_summary:


Unnamed: 0,0
gene_symbol,A2ML1
hgnc_id,HGNC:23336
disease_label,Noonan syndrome
mondo_id,MONDO:0018997
mode_of_inheritance,Autosomal dominant inheritance
gene_disease_validity_assertion_classifications,disputing (06/07/2018)
gene_disease_validity_assertion_reports,https://search.clinicalgenome.org/kb/gene-vali...
gene_disease_validity_gceps,RASopathy



First row to gene_disease_validity:


Unnamed: 0,0
hgnc_id,HGNC:10004
gene_symbol,RGS9
mondo_id,MONDO:0012033
disease_label,bradyopsia
moi,AR
sop_version,8
classification,Moderate
assertion_date,2021-09-08
gcep,Retina Gene Curation Expert Panel
report_url,https://search.clinicalgenome.org/kb/gene-vali...
