In [None]:
# Mount Google Drive at /content/drive (Google Colab only); the input files
# read further down are addressed under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Upgrade pandas and numpy in the running environment.
# NOTE(review): the versions are not pinned, so results may differ between runs.
!pip install -U pandas
!pip install -U numpy



### Import the libraries

In [None]:
import os
import pandas as pd
import re
import numpy as np

## DOID ontology
**DOID ONTOLOGY** has been developed as a standardized ontology for human disease with the purpose of providing the biomedical community with consistent, reusable and sustainable descriptions of human disease terms, phenotype characteristics and related medical vocabulary disease concepts.

**Source:** https://bioportal.bioontology.org/ontologies/DOID

In [None]:
# Root of the dataset tree, given relative to the current working directory.
GENERAL_PATH = "../DATASET"
# The "data" sub-directory of GENERAL_PATH.
DATA_PATH = f"{GENERAL_PATH}/data"

### Read DOID ontology

In [None]:
# Location of the DOID ontology CSV export on the mounted Google Drive.
file_path = '/content/drive/MyDrive/gena-db-master/src/DATASET/data/resource_csv/DOID.csv'

# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, so columns do not end up with mixed per-chunk
# types (the situation pandas reports with a DtypeWarning).
# NOTE(review): the stray `doid_df = pd.read_csv(file_path)` line echoed in this
# cell's saved output looks like the tail of such a warning - confirm.
doid_df = pd.read_csv(file_path, low_memory=False)
doid_df.columns

  doid_df = pd.read_csv(file_path)


Index(['Class ID', 'Preferred Label', 'Synonyms', 'Definitions', 'Obsolete',
       'CUI', 'Semantic Types', 'Parents', 'adjacent_to', 'auto-generated-by',
       ...
       'related via exposure to',
       'relation between physical entity and a process or stage', 'saved-by',
       'sexually_transmitted_infectious_disease', 'subset_property',
       'tick-borne_infectious_disease', 'title', 'TopNodes_DOcancerslim',
       'transmitted by', 'zoonotic_infectious_disease'],
      dtype='object', length=107)

### Ontologies that have been used in DOID

In [None]:
# Pattern for IRIs such as http://purl.obolibrary.org/obo/DOID_8986:
# group 1 is everything up to the "//", group 2 the intermediate path segments,
# group 3 the ontology prefix and group 4 the "_<digits>" suffix; trailing
# slashes are tolerated.
regex = r"^(.+//)(.+/)*(.+)(_\d+)\/*$"
# Pattern for IRIs of the form http://purl.obolibrary.org/obo/doid#<name>;
# group 1 is <name>.
regex_2 = r"http://purl.obolibrary.org/obo/doid#(.+)"
# Quick check on a known IRI: group 3 is the ontology prefix.
re.match(regex, 'http://purl.obolibrary.org/obo/DOID_8986').group(3)

'DOID'

In [None]:
def _ontology_prefix(class_id):
    """Return the ontology prefix encoded in a class IRI.

    IRIs shaped like ``.../obo/DOID_8986`` yield the text before the
    ``_<digits>`` suffix (``'DOID'``); IRIs shaped like ``.../obo/doid#<name>``
    yield ``<name>`` upper-cased.

    :param class_id: class IRI string taken from the 'Class ID' column
    :return: the ontology prefix as a string
    :raises ValueError: if the IRI matches neither pattern
    """
    # Each pattern is evaluated at most once per IRI (previously the first
    # regex ran twice: once for the test and once for the extraction).
    match = re.match(regex, class_id)
    if match:
        return match.group(3)
    match = re.match(regex_2, class_id)
    if match:
        return match.group(1).upper()
    # Previously this case failed with an AttributeError on None that did not
    # say which IRI was at fault.
    raise ValueError(f"Unrecognised Class ID format: {class_id!r}")


doid_df['Type'] = doid_df['Class ID'].map(_ontology_prefix)
# The 'SEQUENCE' fragment name is reported under the label 'SO'
# (presumably the Sequence Ontology - confirm).
doid_df['Type'] = doid_df['Type'].map(lambda x: 'SO' if x == 'SEQUENCE' else x)
doid_df['Type'].value_counts()

Type
DOID         13322
NCBITaxon     1095
SYMP           847
UBERON         779
HP             439
CHEBI          374
SO             246
OMIM           202
CL             184
FOODON          56
TRANS           32
GENO            31
ECO             13
DISDRIV          7
ExO              4
UPHENO           1
Name: count, dtype: int64

### Drop the columns that contain no values

In [None]:
# A column counts as "non-value" when every entry is missing. All such columns
# are detected in one pass and dropped with a single call, instead of running
# value_counts() per column and rebuilding the frame once per dropped column.
non_value_columns = [
    column for column in doid_df.columns if doid_df[column].isna().all()
]
doid_df = doid_df.drop(columns=non_value_columns)
doid_df.head(3)


Unnamed: 0,Class ID,Preferred Label,Synonyms,Definitions,Obsolete,Parents,adjacent_to,comment,contributes to condition,created_by,...,http://data.bioontology.org/metadata/prefixIRI,http://purl.obolibrary.org/obo/doid#has_symptom,http://www.w3.org/2004/02/skos/core#notation,id,IEDB alternative term,in_subset,label,owl:deprecated,transmitted by,Type
0,http://purl.obolibrary.org/obo/DOID_8986,narcolepsy,"Narcolepsy, without cataplexy|paroxysmal sleep",A sleep disorder that involves an excessive ur...,False,http://purl.obolibrary.org/obo/DOID_535,,Xref MGI.,,,...,,,DOID:8986,DOID:8986,,http://purl.obolibrary.org/obo/doid#DO_rare_sl...,narcolepsy,,,DOID
1,http://purl.obolibrary.org/obo/DOID_7233,adult central nervous system embryonal carcinoma,Embryonal carcinoma of the adult central nervo...,A central nervous system adult germ cell tumor...,False,http://purl.obolibrary.org/obo/DOID_5349,,,,,...,,,DOID:7233,DOID:7233,,http://purl.obolibrary.org/obo/doid#NCIthesaurus,adult central nervous system embryonal carcinoma,,,DOID
2,http://purl.obolibrary.org/obo/HP_0011138,Abnormality of skin adnexa morphology,,,False,http://purl.obolibrary.org/obo/HP_0001574,,,,,...,,,HP:0011138,HP:0011138,,,Abnormality of skin adnexa morphology,,,HP


### The ontologies that may be useful
* **DOID**: human disease
* **FOODON**: food ontology
* **CHEBI**: Chemical Entities of Biological Interest Ontology

### Get fad_df (food and disease)

In [None]:
# Ontology prefixes kept for the food-and-disease subset.
types = ['DOID', 'FOODON', 'CHEBI']
fad_df = doid_df[doid_df['Type'].isin(types)].reset_index(drop=True)
# Rows are deliberately NOT passed through dropna(): almost every row has
# missing values in some relation column, so that would discard nearly all of
# them. (The former bare `fad_df.dropna()` discarded its result and therefore
# had no effect; it has been removed.)
# When several rows share a Class ID, the last one is kept.
fad_df = fad_df.drop_duplicates(subset='Class ID', keep='last')
fad_df.head(5)

Unnamed: 0,Class ID,Preferred Label,Synonyms,Definitions,Obsolete,Parents,adjacent_to,comment,contributes to condition,created_by,...,http://data.bioontology.org/metadata/prefixIRI,http://purl.obolibrary.org/obo/doid#has_symptom,http://www.w3.org/2004/02/skos/core#notation,id,IEDB alternative term,in_subset,label,owl:deprecated,transmitted by,Type
0,http://purl.obolibrary.org/obo/DOID_8986,narcolepsy,"Narcolepsy, without cataplexy|paroxysmal sleep",A sleep disorder that involves an excessive ur...,False,http://purl.obolibrary.org/obo/DOID_535,,Xref MGI.,,,...,,,DOID:8986,DOID:8986,,http://purl.obolibrary.org/obo/doid#DO_rare_sl...,narcolepsy,,,DOID
1,http://purl.obolibrary.org/obo/DOID_7233,adult central nervous system embryonal carcinoma,Embryonal carcinoma of the adult central nervo...,A central nervous system adult germ cell tumor...,False,http://purl.obolibrary.org/obo/DOID_5349,,,,,...,,,DOID:7233,DOID:7233,,http://purl.obolibrary.org/obo/doid#NCIthesaurus,adult central nervous system embryonal carcinoma,,,DOID
2,http://purl.obolibrary.org/obo/DOID_5236,subungual glomus tumor,Subungual Glomus tumour|Subungual Glomus tumor...,,False,http://purl.obolibrary.org/obo/DOID_2431,,,,,...,,,DOID:5236,DOID:5236,,http://purl.obolibrary.org/obo/doid#NCIthesaurus,subungual glomus tumor,,,DOID
3,http://purl.obolibrary.org/obo/DOID_1934,dysostosis,,A bone development disease that results in def...,False,http://purl.obolibrary.org/obo/DOID_0080006,,,,,...,,,DOID:1934,DOID:1934,,http://purl.obolibrary.org/obo/doid#NCIthesaurus,dysostosis,,,DOID
4,http://purl.obolibrary.org/obo/DOID_1086,obsolete congenital chromosomal disease,,,True,,,,,,...,,,DOID:1086,DOID:1086,,,obsolete congenital chromosomal disease,True,,DOID


### Information dictionaries

In [None]:
# Class ID -> preferred label (for a repeated Class ID the last row wins).
dct_classID = dict(zip(fad_df['Class ID'], fad_df['Preferred Label']))

# Preferred label -> its synonyms and definitions (for a repeated label the
# last row wins).
dct_label = {
    label: {'synonyms': synonyms, 'definitions': definitions}
    for label, synonyms, definitions in zip(
        fad_df['Preferred Label'], fad_df['Synonyms'], fad_df['Definitions']
    )
}

### Get the relations between the entities in fad_df

In [None]:
columns = list(fad_df.columns)
# The relation columns are listed by name. Selecting them by position
# (columns[6:7], columns[8], columns[13:24], columns[-2]) silently picks
# different columns whenever the set of dropped empty columns changes, e.g.
# with another version of the ontology export.
relation_columns = [
    'adjacent_to',
    'contributes to condition',
    'derives_from',
    'disease has basis in',
    'disease has location',
    'existence starts during',
    'has allergic trigger',
    'has broader match',
    'has exact match',
    'has exposure stressor',
    'has material basis in',
    'has narrower match',
    'has phenotype',
    'transmitted by',
]
# Keep only the relations that are actually present in this frame.
list_relations = [column for column in relation_columns if column in columns]
list_relations

['adjacent_to',
 'contributes to condition',
 'derives_from',
 'disease has basis in',
 'disease has location',
 'existence starts during',
 'has allergic trigger',
 'has broader match',
 'has exact match',
 'has exposure stressor',
 'has material basis in',
 'has narrower match',
 'has phenotype',
 'transmitted by']

In [None]:
# A relation value is kept only when it mentions FOODON, DOID or CHEBI.
regex_fad = r'.*(FOODON|DOID|CHEBI).*$'
# One (class id, preferred label, relation name, raw relation value) tuple per
# non-missing, matching relation cell, in row-then-relation order.
fad_data = [
    (row['Class ID'], dct_classID.get(row['Class ID']), relation, row[relation])
    for _, row in fad_df.iterrows()
    for relation in list_relations
    if not pd.isna(row[relation]) and re.match(regex_fad, row[relation])
]

In [None]:
# Number of relation tuples collected in fad_data.
len(fad_data)

128

In [None]:
# Peek at the first three (class id, label, relation, value) tuples.
fad_data[:3]

[('http://purl.obolibrary.org/obo/DOID_0040102',
  "N,N'-diethylthiourea allergic contact dermatitis",
  'has allergic trigger',
  'http://purl.obolibrary.org/obo/CHEBI_82448'),
 ('http://purl.obolibrary.org/obo/DOID_251',
  'alcohol-induced mental disorder',
  'has exposure stressor',
  'http://purl.obolibrary.org/obo/CHEBI_30879'),
 ('http://purl.obolibrary.org/obo/DOID_252',
  'alcoholic psychosis',
  'has exposure stressor',
  'http://purl.obolibrary.org/obo/CHEBI_30879')]

### Flatten the tuples

In [None]:
# Expand '|'-separated relation values into one tuple per tail entry.
fad_final = []
for (h, d, r, t) in fad_data:
    ts = t.split('|')
    if len(ts) > 1:
        # In a multi-valued cell only the members that mention
        # FOODON/DOID/CHEBI are kept; a single value is passed through as is.
        ts = [mem for mem in ts if re.match(regex_fad, mem)]
    # extend() appends in place; the former `fad_final = fad_final + [...]`
    # copied the whole accumulated list on every iteration.
    fad_final.extend((h, d, r, sub_t) for sub_t in ts)
len(fad_final)

128

### Create a DataFrame from the relation tuples

In [None]:
# NOTE: this rebinds `fad_df`. From here on it holds the flattened relation
# tuples (head_id, head_name, relation, tail_id), no longer the filtered
# ontology table it referred to in the cells above.
fad_df = pd.DataFrame(fad_final, columns=['head_id', 'head_name', 'relation', 'tail_id'])
fad_df.head(5)

Unnamed: 0,head_id,head_name,relation,tail_id
0,http://purl.obolibrary.org/obo/DOID_0040102,"N,N'-diethylthiourea allergic contact dermatitis",has allergic trigger,http://purl.obolibrary.org/obo/CHEBI_82448
1,http://purl.obolibrary.org/obo/DOID_251,alcohol-induced mental disorder,has exposure stressor,http://purl.obolibrary.org/obo/CHEBI_30879
2,http://purl.obolibrary.org/obo/DOID_252,alcoholic psychosis,has exposure stressor,http://purl.obolibrary.org/obo/CHEBI_30879
3,http://purl.obolibrary.org/obo/DOID_0040010,mepivacaine allergy,has allergic trigger,http://purl.obolibrary.org/obo/CHEBI_6759
4,http://purl.obolibrary.org/obo/DOID_0040103,cefotiam allergy,has allergic trigger,http://purl.obolibrary.org/obo/CHEBI_355510


----

## BC5CDR Corpus
**BC5CDR Corpus** is a corpus of chemicals, diseases, and the relations between them (Chemical–Disease Relation, CDR).

**Source**: https://biocreative.bioinformatics.udel.edu/tasks/biocreative-v/track-3-cdr/

In [None]:
def get_cdr(content: list) -> list:
    """
    Extract the tab-separated annotation lines from a list of text lines.

    Lines that contain no tab character are skipped. Every remaining line has
    its newline characters removed and is split on tabs.

    :param content: lines of text, e.g. as returned by ``readlines()``
    :return: a list of lists of string fields, one inner list per kept line
    """
    return [
        raw_line.replace("\n", "").split("\t")
        for raw_line in content
        if "\t" in raw_line
    ]

In [None]:
# Directory on the mounted Drive that holds the three BC5CDR PubTator files.
bc5cdr_dir = "/content/drive/MyDrive/gena-db-master/src/DATASET/data/BC5CDR"
bc5cdr_files = [
    "CDR_TrainingSet.PubTator.txt",
    "CDR_DevelopmentSet.PubTator.txt",
    "CDR_TestSet.PubTator.txt",
]

# Concatenate the lines of all three files, in training/development/test order.
content = []
for file_name in bc5cdr_files:
    # The encoding is explicit so the result does not depend on the locale of
    # the machine running the notebook.
    with open(f"{bc5cdr_dir}/{file_name}", "r", encoding="utf-8") as f:
        content.extend(f.readlines())

In [None]:
cdrs = get_cdr(content)
# Rows with exactly six fields are treated as entity mentions; their last field
# is the identifier, which may bundle several ids separated by '|'.
entities = [line for line in cdrs if len(line) == 6]
entities_sub_id = []
entities_no_sub_id = []
for ent in entities:
    sub_ids = ent[-1].split('|')  # split once and reuse
    if len(sub_ids) > 1:
        # One copy of the row per individual id. extend() avoids re-copying the
        # accumulated list on every iteration.
        entities_sub_id.extend(ent[:-1] + [sub_id] for sub_id in sub_ids)
    else:
        entities_no_sub_id.append(ent)
# Expanded rows come first, followed by the rows that needed no expansion.
entities = entities_sub_id + entities_no_sub_id
entities[:2]

[['2234245', '250', '270', 'audiovisual toxicity', 'Disease', 'D014786'],
 ['2234245', '250', '270', 'audiovisual toxicity', 'Disease', 'D006311']]

In [None]:
# Column names are passed at construction time: assigning `.columns` afterwards
# raises a length-mismatch ValueError when `entities` is empty, because the
# frame then has zero columns.
entities_df = pd.DataFrame(
    entities, columns=["id", "start", "end", "name", "type", "mesh_id"]
)
# Keep only the identifier, the mention text and the entity type.
entities_df = entities_df[['mesh_id', 'name', 'type']]
entities_df.head(5)

Unnamed: 0,mesh_id,name,type
0,D014786,audiovisual toxicity,Disease
1,D006311,audiovisual toxicity,Disease
2,D014786,audiovisual toxicity,Disease
3,D006311,audiovisual toxicity,Disease
4,D005355,fibrous myopathy,Disease


In [None]:
# Rows with exactly four fields are treated as relation records.
relations = [line for line in cdrs if len(line) == 4]
# Column names are passed at construction time: assigning `.columns` afterwards
# raises a length-mismatch ValueError when `relations` is empty, because the
# frame then has zero columns.
relations_df = pd.DataFrame(
    relations, columns=["id", "relation", "head_id", "tail_id"]
)
relations_df = relations_df[["head_id", "tail_id", "relation"]]
relations_df.head(5)

Unnamed: 0,head_id,tail_id,relation
0,D008750,D007022,CID
1,D008012,D006323,CID
2,D013390,D005207,CID
3,D012601,D062787,CID
4,D008094,D006973,CID


In [None]:
# Rows with exactly seven fields: field 5 is collected as the id string and
# field 6 as the name string of each row.
others = [line for line in cdrs if len(line) == 7]

mesh_ids_list = []
mesh_names_list = []
for other in others:
    mesh_ids = other[5]
    # pop() returns field 6 and removes it from the row. The row objects are
    # the same lists held by `cdrs`, so those rows shrink to six fields too.
    mesh_names = other.pop(6)
    mesh_ids_list.append(mesh_ids)
    mesh_names_list.append(mesh_names)

In [None]:
# Flatten the '|'-separated id and name strings into three parallel lists.
mesh_ids = []
mesh_names = []
mesh_types = []
# `others` holds the seven-field source rows in the same order in which
# mesh_ids_list / mesh_names_list were filled, so the three can be zipped.
for source_row, ids_field, names_field in zip(others, mesh_ids_list, mesh_names_list):
    sub_ids = ids_field.split('|')
    mesh_ids += sub_ids
    mesh_names += names_field.split('|')
    # Take the entity type from the source row (field 4, the column named
    # "type" for the six-field rows - assumed to hold the same for seven-field
    # rows, TODO confirm). The former test on the first letter of the id
    # labelled every id starting with 'D' as a Disease, although ids such as
    # D008750 ('alpha-methyldopa' in the relation table) are not diseases, and
    # it raised IndexError on an empty id.
    mesh_types += [source_row[4]] * len(sub_ids)
# The three lengths should be equal; a mismatch means ids and names are misaligned.
len(mesh_ids), len(mesh_names), len(mesh_types)

(504, 504, 504)

In [None]:
# Pair up the three parallel lists into (mesh_id, name, type) records.
mesh = list(zip(mesh_ids, mesh_names, mesh_types))
mesh_df = pd.DataFrame(mesh, columns=["mesh_id", "name", "type"])
# Append the extra records below the existing entities; the index is not
# renumbered here, so index labels may repeat.
entities_df = pd.concat([entities_df, mesh_df])
entities_df.head(5)

Unnamed: 0,mesh_id,name,type
0,D014786,audiovisual toxicity,Disease
1,D006311,audiovisual toxicity,Disease
2,D014786,audiovisual toxicity,Disease
3,D006311,audiovisual toxicity,Disease
4,D005355,fibrous myopathy,Disease


In [None]:
def name_of_id_mesh(id_mesh, mesh_df):
    """
    Get name from given id in MeSH DataFrame

    :param id_mesh: identifier to look up in the ``mesh_id`` column
    :param mesh_df: DataFrame with at least the columns ``mesh_id`` and ``name``
    :return: the ``name`` of the first row whose ``mesh_id`` equals ``id_mesh``
    :raises KeyError: if no row carries that id
    """
    matches = mesh_df.loc[mesh_df['mesh_id'] == id_mesh, 'name']
    if matches.empty:
        # Previously this surfaced as a bare `KeyError: 0` that did not say
        # which id was missing.
        raise KeyError(f"MeSH id {id_mesh!r} not found in the 'mesh_id' column")
    # iloc is positional, so the first match is returned whatever the index
    # labels are (they may repeat); no reset_index() copy is needed.
    return matches.iloc[0]

In [None]:
def _entity_name(mesh_id):
    """Return the name recorded in entities_df for the given MeSH id."""
    return name_of_id_mesh(mesh_id, entities_df)


# Attach a readable name to both ends of every relation, then fix the column order.
relations_df = relations_df.assign(
    head_name=relations_df["head_id"].map(_entity_name),
    tail_name=relations_df["tail_id"].map(_entity_name),
)
relations_df = relations_df[['head_id', 'head_name', 'tail_id', 'tail_name', 'relation']]
relations_df.head(5)

Unnamed: 0,head_id,head_name,tail_id,tail_name,relation
0,D008750,alpha-methyldopa,D007022,hyper- or hypotension,CID
1,D008012,Lidocaine,D006323,cardiac asystole,CID
2,D013390,Suxamethonium,D005207,fasciculations,CID
3,D012601,scopolamine,D062787,overdosage,CID
4,D008094,lithium,D006973,hyper- or hypotension,CID


In [None]:
# Remove fully identical rows from both tables and renumber their indexes
# from 0.
relations_df = relations_df.drop_duplicates().reset_index(drop=True)
entities_df = entities_df.drop_duplicates().reset_index(drop=True)

In [None]:
# Preview the de-duplicated relation table.
relations_df.head(5)

Unnamed: 0,head_id,head_name,tail_id,tail_name,relation
0,D008750,alpha-methyldopa,D007022,hyper- or hypotension,CID
1,D008012,Lidocaine,D006323,cardiac asystole,CID
2,D013390,Suxamethonium,D005207,fasciculations,CID
3,D012601,scopolamine,D062787,overdosage,CID
4,D008094,lithium,D006973,hyper- or hypotension,CID


-------------------------------

## Merge 2 types of relations

In [None]:
# Preview the ontology-derived relation tuples before merging.
fad_df.head(5)

Unnamed: 0,head_id,head_name,relation,tail_id
0,http://purl.obolibrary.org/obo/DOID_0040102,"N,N'-diethylthiourea allergic contact dermatitis",has allergic trigger,http://purl.obolibrary.org/obo/CHEBI_82448
1,http://purl.obolibrary.org/obo/DOID_251,alcohol-induced mental disorder,has exposure stressor,http://purl.obolibrary.org/obo/CHEBI_30879
2,http://purl.obolibrary.org/obo/DOID_252,alcoholic psychosis,has exposure stressor,http://purl.obolibrary.org/obo/CHEBI_30879
3,http://purl.obolibrary.org/obo/DOID_0040010,mepivacaine allergy,has allergic trigger,http://purl.obolibrary.org/obo/CHEBI_6759
4,http://purl.obolibrary.org/obo/DOID_0040103,cefotiam allergy,has allergic trigger,http://purl.obolibrary.org/obo/CHEBI_355510


In [None]:
# Resolve every tail id to its preferred label; ids that are not keys of
# dct_classID give None.
fad_df = fad_df.assign(tail_name=fad_df['tail_id'].map(dct_classID.get))
# Bring fad_df into the column order of relations_df and stack the two tables.
shared_columns = list(relations_df.columns)
dn_df = pd.concat([relations_df, fad_df[shared_columns]]).reset_index(drop=True)
dn_df.head(5)

Unnamed: 0,head_id,head_name,tail_id,tail_name,relation
0,D008750,alpha-methyldopa,D007022,hyper- or hypotension,CID
1,D008012,Lidocaine,D006323,cardiac asystole,CID
2,D013390,Suxamethonium,D005207,fasciculations,CID
3,D012601,scopolamine,D062787,overdosage,CID
4,D008094,lithium,D006973,hyper- or hypotension,CID


In [None]:
# Reorder the columns so each row reads head -> relation -> tail, then list the
# distinct relation names present after the merge.
dn_df = dn_df[['head_id', 'head_name', 'relation', 'tail_id', 'tail_name']]
dn_df['relation'].unique()

array(['CID', 'has allergic trigger', 'has exposure stressor'],
      dtype=object)

In [None]:
# Report the number of rows in the merged relation table.
print(f'There are {len(dn_df)} available relations between Nutrition and Disease.')

There are 2562 available relations between Nutrition and Disease.


In [None]:
# Save to file
output_dir = "/content/drive/MyDrive/gena-db-master/src/DATASET/Results/final_data"
# to_csv() does not create missing parent directories and fails with
# "OSError: Cannot save file into a non-existent directory", so make sure the
# target directory exists first (no error if it already does).
os.makedirs(output_dir, exist_ok=True)
dn_df.to_csv(os.path.join(output_dir, "available_fad.csv"), index=False)

OSError: Cannot save file into a non-existent directory: '/content/drive/MyDrive/gena-db-master/src/DATASET/Results/final_data'