In [None]:
from omop_alchemy import get_engine_name, load_environment, TEST_PATH, ROOT_PATH
from orm_loader.helpers import get_logger
from dotenv import load_dotenv
from pathlib import Path
import os
import pandas as pd
# old enumerator classes from monolithic version of omop_alchemy - selection of cancer-relevant codes
import concept_enums

# Destination folder for the trimmed vocabulary files produced by this notebook.
base_path = TEST_PATH / "fixtures" / "athena_source"

# load_dotenv() runs before the lookup below so that any SOURCE_PATH it loads
# (presumably from a local .env file) is visible in the environment.
load_dotenv()

# Folder containing the full Athena download. The fallback is only a placeholder:
# set SOURCE_PATH (or edit the default) before running the rest of the notebook.
source_path = Path(
    os.environ.get('SOURCE_PATH', 'update/path/to/athena/source/as/required')
)

In [2]:
def _read_athena_table(file_name, **read_options):
    """Read one tab-separated vocabulary file located under ``source_path``.

    Extra keyword arguments are forwarded unchanged to ``pd.read_csv``.
    """
    return pd.read_csv(source_path / file_name, sep='\t', **read_options)


# CONCEPT is the only table loaded with low_memory=False; the small reference
# tables use the pandas defaults.
concept = _read_athena_table('CONCEPT.csv', low_memory=False)
concept_class = _read_athena_table('CONCEPT_CLASS.csv')
relationship = _read_athena_table('RELATIONSHIP.csv')
domain = _read_athena_table('DOMAIN.csv')
vocabulary = _read_athena_table('VOCABULARY.csv')

In [None]:
# Concept ids that the reference tables (concept class, relationship, domain,
# vocabulary) point at through their own *_concept_id column. These rows must be
# present in the subset so those references resolve.
required_concepts = (
    set(concept_class.concept_class_concept_id)
    | set(relationship.relationship_concept_id)
    | set(domain.domain_concept_id)
    | set(vocabulary.vocabulary_concept_id)
)
required_concepts_df = concept[concept.concept_id.isin(required_concepts)]

# Up to 50 standard concepts (standard_concept == 'S') per domain.
# The standard-concept filter does not depend on the domain, so it is applied once.
standard_concepts = concept[concept.standard_concept == 'S']

selected = []
# Domains are visited in sorted order: iterating a set of strings has a
# hash-dependent order, which made the row order of the output differ between
# kernel sessions even though each per-domain sample is seeded.
for d in sorted(domain.domain_id.dropna().unique()):
    c = standard_concepts[standard_concepts.domain_id == d]
    if c.empty:
        # sample(min(50, len(c))) can never over-sample, so the ValueError the
        # original code waited for was never raised; report empty domains
        # explicitly and skip them instead of appending an empty frame.
        print(f"Not enough standard concepts in domain {d}")
        continue
    selected.append(c.sample(min(50, len(c)), random_state=1))

In [None]:
# Stack the per-domain samples into one frame.
standard_concept_by_domain_df = pd.concat(selected)

# Collect every value returned by member_values() on the objects in
# concept_enums that provide such a method (the old enumerator classes).
additional_test_concepts = {
    concept_id
    for attr_name in dir(concept_enums)
    if hasattr(concept_enums.__dict__[attr_name], 'member_values')
    for concept_id in concept_enums.__dict__[attr_name].member_values()
}

# Rows of the full concept table whose id appears in that collection.
enum_concept_mask = concept.concept_id.isin(additional_test_concepts)
additional_test_concept_df = concept[enum_concept_mask]

# Every concept in the Metadata and Language domains is kept.
metadata = concept[concept.domain_id == 'Metadata']
language = concept[concept.domain_id == 'Language']

# Rows whose standard_concept flag is populated; shared by the samples below.
has_standard_flag = concept.standard_concept.notna()

# 10% of the flagged 'Location' class concepts. random_state is fixed (same seed
# as the per-domain sampling) so re-running the notebook regenerates the same
# fixture rows instead of a new random subset each time.
locations = concept[(concept.concept_class_id == 'Location') & has_standard_flag].sample(
    frac=0.1, replace=False, random_state=1
)

# Fraction of flagged concepts to keep from each cancer-related vocabulary.
cancer_vocab_fractions = {'Cancer Modifier': 1.0, 'HemOnc': 0.1, 'ICDO3': 0.05}

additional_cancer_ones = [
    concept[(concept.vocabulary_id == vocab) & has_standard_flag].sample(
        frac=frac, replace=False, random_state=1
    )
    for vocab, frac in cancer_vocab_fractions.items()
]

cancer_specific_df = pd.concat(additional_cancer_ones)

# Union of every concept subset gathered so far. The subsets overlap, so exact
# duplicate rows are removed after stacking.
concept_subsets = [
    standard_concept_by_domain_df,
    required_concepts_df,
    additional_test_concept_df,
    cancer_specific_df,
    locations,
    metadata,
    language,
]
stacked_concepts = pd.concat(concept_subsets)
selected_concept_df = stacked_concepts.drop_duplicates()

In [None]:
def _filter_athena_table(file_name, id_columns, keep_ids, chunksize=100000):
    """Stream a large tab-separated file and keep rows that reference kept concepts.

    Parameters
    ----------
    file_name : str
        File under ``source_path`` to read.
    id_columns : list of str
        Columns that must ALL contain an id present in ``keep_ids`` for a row
        to be retained.
    keep_ids : array-like
        Concept ids that made it into the selected subset.
    chunksize : int
        Number of rows parsed per chunk, so the whole file is never held in memory.

    Returns
    -------
    pandas.DataFrame
        The matching rows. When nothing matches, an empty frame that still has
        the file's columns is returned; the previous code hit ``pd.concat([])``,
        which raises ValueError.
    """
    table_path = source_path / file_name
    matches = []
    for chunk in pd.read_csv(table_path, delimiter='\t', low_memory=False, chunksize=chunksize):
        keep = pd.Series(True, index=chunk.index)
        for column in id_columns:
            keep &= chunk[column].isin(keep_ids)
        filtered = chunk[keep]
        if not filtered.empty:
            matches.append(filtered)
    if matches:
        return pd.concat(matches)
    # No matching rows: read only the header so the result keeps the columns.
    return pd.read_csv(table_path, delimiter='\t', nrows=0)


# Computed once instead of being re-derived for every chunk of every file.
selected_concept_ids = selected_concept_df.concept_id.unique()

# Relationship and ancestor rows are kept only when BOTH ends are selected
# concepts; synonym rows only need their single concept_id to be selected.
selected_relationship_df = _filter_athena_table(
    'CONCEPT_RELATIONSHIP.csv', ['concept_id_1', 'concept_id_2'], selected_concept_ids
)
selected_ancestry_df = _filter_athena_table(
    'CONCEPT_ANCESTOR.csv', ['ancestor_concept_id', 'descendant_concept_id'], selected_concept_ids
)
selected_synonyms_df = _filter_athena_table(
    'CONCEPT_SYNONYM.csv', ['concept_id'], selected_concept_ids
)

# Make sure the destination exists so the first to_csv does not fail on a
# fresh checkout.
os.makedirs(base_path, exist_ok=True)

# Output file name -> frame, written in the same order as before.
fixture_tables = {
    'CONCEPT_RELATIONSHIP.csv': selected_relationship_df,
    'CONCEPT_SYNONYM.csv': selected_synonyms_df,
    'CONCEPT_ANCESTOR.csv': selected_ancestry_df,
    'CONCEPT.csv': selected_concept_df,
    'DOMAIN.csv': domain,
    'VOCABULARY.csv': vocabulary,
    'RELATIONSHIP.csv': relationship,
    'CONCEPT_CLASS.csv': concept_class,
}
for fixture_name, fixture_table in fixture_tables.items():
    fixture_table.to_csv(base_path / fixture_name, sep='\t', index=False)

In [None]:
# Referential check: every column whose name contains 'concept_id', in each of
# the frames below, may only hold ids present in the selected concept subset.
frames_with_concept_refs = [
    domain,
    vocabulary,
    relationship,
    concept_class,
    selected_relationship_df,
    selected_ancestry_df,
    selected_synonyms_df,
]
for frame in frames_with_concept_refs:
    concept_id_columns = [name for name in frame.columns if 'concept_id' in name]
    for col in concept_id_columns:
        all_ids_selected = frame[col].isin(selected_concept_df.concept_id).all()
        if not all_ids_selected:
            raise ValueError(f"Found concept_id in {col} not in selected concepts")

In [None]:
# Every relationship_id used by the selected relationship rows must exist in the
# relationship reference table.
known_relationship_ids = relationship.relationship_id.unique()
assert selected_relationship_df.relationship_id.isin(known_relationship_ids).all(), "Found relationship_id not in selected relationships"

# The full concept table may only use class, domain and vocabulary ids that are
# defined in the corresponding reference tables.
known_class_ids = concept_class.concept_class_id.unique()
assert concept.concept_class_id.isin(known_class_ids).all(), "Found concept_class_id not in selected concepts"

known_domain_ids = domain.domain_id.unique()
assert concept.domain_id.isin(known_domain_ids).all(), "Found domain_id not in selected domains"

known_vocabulary_ids = vocabulary.vocabulary_id.unique()
assert concept.vocabulary_id.isin(known_vocabulary_ids).all(), "Found vocabulary_id not in selected vocabularies"

In [None]:
# No frame below may contain fully duplicated rows. Frames are keyed by name so
# a failure reports WHICH frame is affected and how many rows; the previous
# message interpolated the whole DataFrame repr, which was unreadable.
frames_to_check = {
    'selected_concept_df': selected_concept_df,
    'domain': domain,
    'vocabulary': vocabulary,
    'relationship': relationship,
    'concept_class': concept_class,
    'selected_relationship_df': selected_relationship_df,
    'selected_ancestry_df': selected_ancestry_df,
}
for frame_name, frame in frames_to_check.items():
    duplicate_count = int(frame.duplicated().sum())
    assert duplicate_count == 0, f"Found {duplicate_count} duplicated rows in {frame_name}"

In [None]:
# Known import issue: rows whose vocabulary_id was read as null are given a
# placeholder id, in place, on the loaded vocabulary frame.
# TODO: add primary-key null normalisation on load.
missing_vocabulary_id = vocabulary['vocabulary_id'].isna()
vocabulary.loc[missing_vocabulary_id, 'vocabulary_id'] = 'Unknown_Vocabulary'

In [None]:
# Spot check: display the Metadata-domain row with concept_id 1147138.
metadata.loc[metadata.concept_id.eq(1147138)]

In [None]:
# Number of concept rows in the selected subset.
selected_concept_df.shape[0]

In [None]:
# Show the SOURCE_PATH value currently in the environment (None when unset).
os.getenv('SOURCE_PATH')