# Create the Gene Ontology dictionary

### Parse the .obo file and build a dictionary from it

### Code based on https://github.com/bio-ontology-research-group/deepgo

In [22]:
import copy
import pickle

def geneontology(filename="go.obo"):
    """Parse an OBO file into a dict of non-obsolete molecular-function terms.

    Args:
        filename: Path of the ``.obo`` file to read. Defaults to ``"go.obo"``
            in the current working directory.

    Returns:
        A dict mapping GO ids to term dicts. Every term dict carries the keys
        ``is_a`` (list of parent ids), ``is_obsolete``, ``is_mf`` and
        ``children`` (set of ids), plus ``id`` / ``name`` when the stanza
        declares them. Each ``alt_id`` of a term is an extra key pointing at
        the *same* term dict, so ``go[alt]['id']`` is the primary id.
        Because alias keys are walked as well when ``children`` is filled,
        a parent's ``children`` set can contain alias ids next to primary ids.
    """
    go = {}
    obj = None
    alt = []

    def register(term, alt_ids):
        # Store a finished stanza under its primary id and all of its alt ids.
        if term is None or 'id' not in term:
            return
        go[term['id']] = term
        for alt_id in alt_ids:
            go[alt_id] = term

    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            if line in ('[Term]', '[Typedef]'):
                # A new stanza header closes the previous term. This has to
                # happen for [Typedef] too, otherwise the last term before the
                # typedef section would be lost.
                register(obj, alt)
                alt = []
                if line == '[Term]':
                    obj = {'is_a': [], 'is_obsolete': False, 'is_mf': False}
                else:
                    obj = None  # typedef tags must not be parsed as term tags
                continue
            if obj is None:
                continue
            # Split on the first ": " only, so values that themselves contain
            # ": " (e.g. some term names) stay intact.
            key, _, value = line.partition(': ')
            if key == 'id':
                obj['id'] = value
            elif key == 'is_a':
                # "GO:xxxxxxx ! parent name" -> keep only the id
                obj['is_a'].append(value.split(' ! ')[0])
            elif key == 'name':
                obj['name'] = value
            elif key == 'is_obsolete' and value == 'true':
                obj['is_obsolete'] = True
            elif key == 'namespace' and value == 'molecular_function':
                obj['is_mf'] = True
            elif key == 'alt_id':
                alt.append(value)
    # The file may end inside a [Term] stanza; store it together with its alt ids.
    register(obj, alt)

    # Keep only live molecular-function terms. Iterate over a snapshot of the
    # keys because entries are deleted while looping.
    for go_id in list(go):
        term = go[go_id]
        if term['is_obsolete'] or not term['is_mf']:
            del go[go_id]

    # Invert the is_a relation; parents that were filtered out are skipped.
    for go_id, val in go.items():
        val.setdefault('children', set())
        for p_id in val['is_a']:
            if p_id in go:
                go[p_id].setdefault('children', set()).add(go_id)

    return go

In [23]:
# Parse go.obo once; the resulting module-level `go` dict is what
# getAncestors/getParents look terms up in.
go = geneontology()

In [24]:
# Serialize the parsed term dictionary to hierarchy.pkl.
with open('hierarchy.pkl', 'wb') as out_file:
    pickle.dump(go, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# Get a term together with all of its ancestors

In [25]:
def getAncestors(id):
    """Return the term stored under ``id`` together with all its is_a ancestors.

    Args:
        id: Key into the module-level ``go`` dict. (The name shadows the
            builtin ``id``; it is kept so existing keyword callers still work.)

    Returns:
        A dict mapping GO id -> term dict. The first entry is the term itself,
        keyed by ``term['id']``.

    Raises:
        KeyError: if ``id`` or any id listed in an ``is_a`` chain is not a key
            of ``go``.
    """
    term = go[id]
    parents = dict()
    return getParents(term, parents)


def getParents(term, parents):
    """Depth-first collect ``term``'s is_a ancestors into ``parents``.

    Args:
        term: A term dict with at least the keys ``id`` and ``is_a``.
        parents: Accumulator dict (GO id -> term dict). It is mutated in place.
            When it is empty, ``term`` itself is inserted first.

    Returns:
        The same ``parents`` dict that was passed in.
    """
    # The former `term == 'GO:0003674'` guard compared a dict with a string and
    # could never be true, so it has been removed. The walk stops by itself at
    # any term whose `is_a` list is empty.
    if not parents:
        parents[term['id']] = term
    for parent in term['is_a']:
        # Terms reachable through several paths are expanded only once.
        if parent not in parents:
            parents[parent] = go[parent]
            getParents(go[parent], parents)
    return parents

### Check that the code is correct and collect all prediction classes

In [26]:
# GO ids whose ancestor sets are collected into the prediction classes.
myTerms = [
    "GO:0042802",
    "GO:0042803",
    "GO:0003677",
    "GO:0043565",
    "GO:0003700",
    "GO:0046982",
    "GO:0008270",
    "GO:0004674",
    "GO:0000978",
    "GO:0001228",
]

In [27]:
# Union of every selected term and all of its ancestors.
classes = set()
for term_id in myTerms:
    # getAncestors returns a dict; update() takes its keys (the GO ids).
    classes.update(getAncestors(term_id))

In [28]:
# Drop the root term GO:0003674 and fix a deterministic (sorted) class order.
classes = sorted(classes - {"GO:0003674"})
len(classes)

38

### the classes we are creating predictions for

In [29]:
# One line per class: "<GO id><TAB><term name>".
for term_id in classes:
    print(f"{term_id}\t{go[term_id]['name']}")

GO:0000976	transcription regulatory region sequence-specific DNA binding
GO:0000977	RNA polymerase II regulatory region sequence-specific DNA binding
GO:0000978	RNA polymerase II proximal promoter sequence-specific DNA binding
GO:0000981	DNA-binding transcription factor activity, RNA polymerase II-specific
GO:0000987	proximal promoter sequence-specific DNA binding
GO:0001012	RNA polymerase II regulatory region DNA binding
GO:0001067	regulatory region nucleic acid binding
GO:0001216	DNA-binding transcription activator activity
GO:0001228	DNA-binding transcription activator activity, RNA polymerase II-specific
GO:0003676	nucleic acid binding
GO:0003677	DNA binding
GO:0003690	double-stranded DNA binding
GO:0003700	DNA-binding transcription factor activity
GO:0003824	catalytic activity
GO:0004672	protein kinase activity
GO:0004674	protein serine/threonine kinase activity
GO:0005488	binding
GO:0005515	protein binding
GO:0008270	zinc ion binding
GO:0016301	kinase activity
GO:0016740	transfer

In [30]:
# Map each class id to its position in the sorted `classes` list.
classesDict = dict(zip(classes, range(len(classes))))

In [32]:
# Serialize the class-id -> index mapping to classes.pkl.
with open('classes.pkl', 'wb') as out_file:
    pickle.dump(classesDict, out_file)

# Calculate all ancestors of the GO terms appearing in the data set

In [33]:
import pandas as pd
import numpy as np

In [34]:
# Load the dataset and turn each row's comma-separated GO annotation string
# into a list of GO ids.
df = pd.read_pickle("../1_Dataset/Dataset.pkl")
df = df.drop("index", axis=1)
# Assign the whole column at once. The former `df.loc[e]["Go_terms"] = ...` was
# chained indexing: it writes into an intermediate row object that pandas may
# hand out as a copy, in which case `df` is silently left unchanged.
df["Go_terms"] = df["Go_terms"].str.split(",")
# Remove GO:0005395 from the annotations of row 33310, using a single label
# based `.at` access instead of the chained, positional `df.loc[33310][1]`.
# NOTE(review): the original addressed this cell as column position 1; this
# assumes that column is "Go_terms" (it was iterated as a list of GO ids).
# NOTE(review): presumably this id is dropped because it is not a key of `go`
# and getAncestors would raise KeyError for it - confirm.
df.at[33310, "Go_terms"] = [
    term_id for term_id in df.at[33310, "Go_terms"] if term_id != "GO:0005395"
]

In [38]:
# Build one multi-hot label vector per row: position classesDict[c] is 1 when
# class c is one of the row's GO terms or an ancestor of one of them.
n_classes = len(classesDict)  # derived from the mapping instead of a hard-coded 38
preds = []
for terms in df["Go_terms"]:
    vector = np.zeros(n_classes)
    for term_id in terms:
        for anc_id in getAncestors(term_id):
            # Ancestors outside the prediction classes (e.g. the root) are
            # skipped explicitly rather than through a bare `except: pass`,
            # which would also have hidden unrelated errors.
            idx = classesDict.get(anc_id)
            if idx is not None:
                vector[idx] = 1
    preds.append(vector)
df["preds"] = preds

In [41]:
# Row mask: True where position 37 of the row's label vector is set;
# the last expression is the number of such rows.
a = [label_vec[37] == 1 for label_vec in df["preds"]]
df[a].shape[0]

1326

In [42]:
# For every class index, count the rows whose label vector has that index set.
appDict = {
    col: sum(1 for label_vec in df["preds"] if label_vec[col] == 1)
    for col in range(len(classesDict))
}

In [49]:
# Per class: id, index, row count, then the row count of each direct parent
# (root excluded). Every field, including the last, is followed by a tab.
for term_id, col in classesDict.items():
    fields = [term_id, str(col), str(appDict[col])]
    fields.extend(
        str(appDict[classesDict[parent_id]])
        for parent_id in go[term_id]["is_a"]
        if parent_id != "GO:0003674"
    )
    print("\t".join(fields) + "\t")

GO:0000976	0	1191	1509	1326	
GO:0000977	1	910	1191	922	
GO:0000978	2	612	910	681	
GO:0000981	3	1006	1853	
GO:0000987	4	681	1191	
GO:0001012	5	922	1509	
GO:0001067	6	1515	4915	
GO:0001216	7	602	1853	
GO:0001228	8	534	1006	602	
GO:0003676	9	4915	6768	6659	
GO:0003677	10	3467	4915	
GO:0003690	11	1551	3467	
GO:0003700	12	1853	2415	
GO:0003824	13	15822	
GO:0004672	14	1261	1828	1642	3883	
GO:0004674	15	817	1261	
GO:0005488	16	18942	
GO:0005515	17	10979	18942	
GO:0008270	18	560	1182	
GO:0016301	19	1828	2330	
GO:0016740	20	5980	15822	
GO:0016772	21	2330	5980	
GO:0016773	22	1642	2330	
GO:0042802	23	3562	10979	
GO:0042803	24	1435	3562	1939	
GO:0043167	25	4173	18942	
GO:0043169	26	2253	4173	
GO:0043565	27	2199	3467	
GO:0044212	28	1509	1515	3467	
GO:0046872	29	2100	2253	
GO:0046914	30	1182	2100	
GO:0046982	31	684	1939	
GO:0046983	32	1939	10979	
GO:0097159	33	6768	18942	
GO:0140096	34	3883	15822	
GO:0140110	35	2415	
GO:1901363	36	6659	18942	
GO:1990837	37	1326	1551	2199	


In [50]:
# Write the DataFrame (including its "preds" column) to Dataset_withVector.pkl.
df.to_pickle("Dataset_withVector.pkl")