# Define standardized organelle terms

Pulled from the Gene Ontology (GO), these terms provide a standardized vocabulary for known organelles.

The terms exist in the overall parent: `cellular_component` -> `cellular_anatomical_entity` -> `organelle`

Only terms that descend from one of the following child terms are kept. These represent intracellular and extracellular, membrane-bounded and non-membrane-bounded organelles:

| go_term    | go_id                                        | go_last_parent          |
|-----------:| :--------------------------------------------|:------------------------|
| GO:0043231 | intracellular membrane-bounded organelle     | intracellular organelle |
| GO:0065010 | extracellular membrane-bounded organelle     | extracellular organelle |
| GO:0043232 | intracellular non-membrane-bounded organelle | intracellular organelle |
| GO:0043264 | extracellular non-membrane-bounded organelle | extracellular organelle |

In [1]:
import pathlib
import pandas as pd
from pronto import Ontology

In [2]:
# Destination path of the final organelle table (written tab-separated by the
# `to_csv` call in the last cell)
output_file = "organelles.tsv"

In [3]:
# Gene Ontology source: https://www.ebi.ac.uk/ols/ontologies/go
# This cell does not download anything; the OWL file is read from a local
# path relative to the working directory.
go_owl = str(pathlib.Path("data") / "go.owl")

# Parse the OWL file into an ontology object
go = Ontology(go_owl)

In [4]:
# Define the "organelle" GO term
# GO:0043226 is the accession of the root "organelle" term; the terms
# collected in the following cells are all descendants of it.
organelle = "GO:0043226"
# Look the term up in the loaded ontology by its accession
organell_go_term = go[organelle]

In [5]:
# Collect every descendant of the root organelle term, keyed by accession.
# Each record is [term name, name of the last parent listed for that term].
build_info = {
    descendant.id: [descendant.name, descendant.parents[-1].name]
    for descendant in organell_go_term.rchildren()
}

# One row per descendant; the dict keys become the first column.
# NOTE: despite the column names, `go_term` holds the GO accession and
# `go_id` holds the human-readable term name.
organelle_df = pd.DataFrame(build_info).T.reset_index()
organelle_df.columns = ["go_term", "go_id", "go_last_parent"]

print(organelle_df.shape)
organelle_df.head(3)

(395, 3)


Unnamed: 0,go_term,go_id,go_last_parent
0,GO:0043227,membrane-bounded organelle,organelle
1,GO:0043228,non-membrane-bounded organelle,organelle
2,GO:0043229,intracellular organelle,organelle


In [6]:
# Keep only the terms whose last parent is the extracellular or the
# intracellular organelle term
levels = ["extracellular organelle", "intracellular organelle"]
go_parents_df = organelle_df[
    organelle_df["go_last_parent"].isin(levels)
].reset_index(drop=True)
go_parents_df

Unnamed: 0,go_term,go_id,go_last_parent
0,GO:0043231,intracellular membrane-bounded organelle,intracellular organelle
1,GO:0065010,extracellular membrane-bounded organelle,extracellular organelle
2,GO:0043232,intracellular non-membrane-bounded organelle,intracellular organelle
3,GO:0043264,extracellular non-membrane-bounded organelle,extracellular organelle


In [7]:
# Gather every descendant of the selected intra-/extracellular, membrane- and
# non-membrane-bounded organelle terms.
# Each record is [descendant name, parent name, parent accession]; a
# descendant reachable from several parents keeps the record of the parent
# processed last.
all_organelle_info = {}
for parent_accession in go_parents_df["go_term"]:
    parent_term = go[parent_accession]
    all_organelle_info.update(
        {
            child.id: [child.name, parent_term.name, parent_term.id]
            for child in parent_term.rchildren()
        }
    )

In [8]:
# Build the final table and save it to disk.
# The dict keys (child term accessions) become the first column; the three
# entries of each record fill the remaining columns.
suborganelle_df = pd.DataFrame(all_organelle_info).T.reset_index()
suborganelle_df.columns = ["go_term", "go_organelle", "go_membrane_term", "go_membrane_id"]

# Order rows alphabetically by organelle name, ignoring case
suborganelle_df = suborganelle_df.sort_values(
    by="go_organelle", key=lambda names: names.str.lower()
)
suborganelle_df = suborganelle_df.reset_index(drop=True)

# Tab-separated output without the positional index
suborganelle_df.to_csv(output_file, sep="\t", index=False)

print(suborganelle_df.shape)
suborganelle_df.head(3)

(357, 4)


Unnamed: 0,go_term,go_organelle,go_membrane_term,go_membrane_id
0,GO:0020022,acidocalcisome,intracellular membrane-bounded organelle,GO:0043231
1,GO:0001669,acrosomal vesicle,intracellular membrane-bounded organelle,GO:0043231
2,GO:0015629,actin cytoskeleton,intracellular non-membrane-bounded organelle,GO:0043232
