Update the variables below to match your local file layout.  
`INPUT_FILE_PATH` is where the processed SemRep triples are stored (output of Stage 2).  
`OUTPUT_FILE_PATH` is where the integrated triples (the output of Stage 4) will be stored, as a .csv file.  
`NODE_MAPPING_FILE_PATH` is where the node information for the processed triples is stored (output of Stage 2).  

In [1]:
# Processed SemRep triples (output of Stage 2); read below to obtain the output column order.
INPUT_FILE_PATH = '../data/semrep_relationships_processed.csv'
# Integrated triples (output of this stage); written once, then appended to per external source.
OUTPUT_FILE_PATH = '../data/semrep_relationships_integrated.csv'
# Node information for the processed triples (output of Stage 2).
NODE_MAPPING_FILE_PATH = '../data/node_id_mapping.csv'

In [2]:
# Directory holding the processed data from the external sources; it is joined
# with each source file name when integrate_edgefile is called below.
DATA_DIR = 'integration_data_processed' # processed data from sources

In [3]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib

In [4]:
# Column names used in the edges file. Each per-entity column exists twice,
# suffixed with "1" (first entity of the triple) and "2" (second entity).
id_col = 'id'  # the preferred name of the entity
id_col_1, id_col_2 = f"{id_col}1", f"{id_col}2"

identifier_col = 'identifier'  # the CUI of the entity
identifier_col_1, identifier_col_2 = f"{identifier_col}1", f"{identifier_col}2"

type_col = 'type'  # the semantic group of the entity
type_col_1, type_col_2 = f"{type_col}1", f"{type_col}2"

info_col = 'info'  # synonyms/original texts for the entity
info_col_1, info_col_2 = f"{info_col}1", f"{info_col}2"

# Columns that describe the triple itself rather than one of its entities.
source_col = 'evidence_source'  # what source the triple is from
rel_col = ':TYPE'  # relationship
weight_col = 'weight'  # name of column with triple edge weight

# Names of the aggregated (pluralised) columns.
identifier_col_agg = f"{identifier_col}s"
type_col_agg = f"{type_col}s"
info_col_agg = f"{info_col}s"

In [5]:
# Load the processed triples; their column order is kept in `header` and
# defines the column format/order of everything written to the output file.
literature_df = pd.read_csv(INPUT_FILE_PATH)
header = literature_df.columns.tolist()

# Start the output file with these triples (header row included); the
# integration calls further down append their rows to the same file.
literature_df.to_csv(OUTPUT_FILE_PATH, index=False)

In [6]:
def standardize_ents(df, nodes_df):
    """Replace both entity identifiers in `df` with the node-mapping fields.

    For each side of the triple ("1" then "2"), `df` is left-joined to
    `nodes_df` on that side's identifier column. The join key columns are
    dropped and the node's id, aggregated info, aggregated type and aggregated
    identifier columns are renamed to that side's id/info/type/identifier
    column names.

    df: edges with the two per-side identifier columns.
    nodes_df: node mapping table containing the identifier, id and the three
        aggregated columns.
    Returns the merged DataFrame.
    """
    node_fields = [identifier_col, id_col, info_col_agg, type_col_agg, identifier_col_agg]
    sides = (
        (identifier_col_1, id_col_1, info_col_1, type_col_1),
        (identifier_col_2, id_col_2, info_col_2, type_col_2),
    )
    for side_identifier, side_id, side_info, side_type in sides:
        merged = df.merge(nodes_df[node_fields], left_on=side_identifier, right_on=identifier_col, how='left')
        # Both join keys go away; the aggregated identifier takes the side's identifier name below.
        merged = merged.drop([identifier_col, side_identifier], axis=1)
        renames = {
            identifier_col_agg: side_identifier,
            type_col_agg: side_type,
            info_col_agg: side_info,
            id_col: side_id,
        }
        df = merged.rename(columns=renames)
    return df

def integrate_edgefile(header, nodes_filename, source_filename, output_filename, ent1_type, ent1_col, ent2_type, ent2_col, source, weight_col_source, processing_function, weight_col, source_col, identifier_col, id_col, type_col, relationship, ent1_gene=False, ent2_gene=False):
    """Filter one source edge file to known nodes and append it to the output CSV.

    header: output column format/order; columns missing after processing are
        added and filled with NaN.
    nodes_filename: CSV file that has the desired knowledge graph nodes.
    source_filename: input CSV filename of the source relationships.
    output_filename: output CSV filename; rows are appended without a header row.

    ent1_type: type the first entity should be; a falsy value means any node.
    ent1_col: name of the column in the source that has the first entity.
    ent2_type: type the second entity should be; a falsy value means any node.
    ent2_col: name of the column in the source that has the second entity.

    source: source name of the relationships (stored in `source_col`).
    weight_col_source: weight column in the source file; if falsy, every edge
        gets weight 0.
    processing_function: callable taking (edges_df, nodes_df) and returning the
        edges with standardized entities.

    weight_col: name of the weight column in the output.
    source_col: name of the source column in the output.

    identifier_col: name of the nodes-file column that holds the node ID (used
        to filter the source).
    id_col: accepted for interface compatibility; not read in this function.
    type_col: name of the nodes-file column that holds the node type.
    relationship: value written to the ":TYPE" column.
    ent1_gene, ent2_gene: accepted for interface compatibility; not read in
        this function.

    Returns None. Prints the number of edges appended.

    Note: the per-side column names (identifier_col_1/2, id_col_1/2) are read
    from module level, not derived from the parameters.
    """
    nodes_df = pd.read_csv(nodes_filename)

    def _allowed_ids(ent_type):
        # IDs of the nodes of the requested type, or of all nodes when no type is given.
        if ent_type:
            return set(nodes_df.loc[nodes_df[type_col] == ent_type, identifier_col])
        return set(nodes_df[identifier_col])

    # Each side is filtered on its own, so giving only one of the two types
    # is honoured instead of silently dropping the type filter altogether.
    ent1_all_ids = _allowed_ids(ent1_type)
    ent2_all_ids = _allowed_ids(ent2_type)

    # filter for required nodes
    source_df = pd.read_csv(source_filename)
    keep = source_df[ent1_col].isin(ent1_all_ids) & source_df[ent2_col].isin(ent2_all_ids)
    # Explicit copy: columns are assigned on the filtered frame below.
    source_df = source_df[keep].copy()

    # set weight to zero if there is no weight column in the data
    if not weight_col_source:
        weight_col_source = 'weight_col_source'
        source_df[weight_col_source] = 0

    edges_df = source_df.loc[:, [ent1_col, ent2_col, weight_col_source]].copy()
    edges_df.columns = [identifier_col_1, identifier_col_2, weight_col]
    edges_df[source_col] = source
    edges_df[":TYPE"] = relationship

    # standardize entities
    edges_df = processing_function(edges_df, nodes_df)

    # make sure columns in edges file output are present and in correct order
    for col in header:
        if col not in edges_df.columns:
            edges_df[col] = np.nan

    edges_df = edges_df[header].drop_duplicates()
    edges_df = edges_df[edges_df[id_col_1] != edges_df[id_col_2]] # remove self loops
    print(f'{len(edges_df)} edges added from source {source_filename}')
    print()
    edges_df.to_csv(output_filename, mode='a', header=False, index=False)


In [7]:
# Keyword arguments that are the same for every external source.
_common_kwargs = dict(
    header=header,
    nodes_filename=NODE_MAPPING_FILE_PATH,
    output_filename=OUTPUT_FILE_PATH,
    ent1_type=None,
    ent2_type=None,
    processing_function=standardize_ents,
    weight_col=weight_col,
    source_col=source_col,
    identifier_col=identifier_col,
    id_col=id_col,
    type_col=type_col,
)

# One row per source, integrated in this order:
# (file name, first entity column, second entity column, source name,
#  weight column in the source or None, relationship)
_edge_sources = [
    ('disease_gene_relationships.csv', 'CUI_disease', 'CUI_gene', 'DisGeNET', 'score', 'disease-gene'),
    ('drug_gene_relationships.csv', 'CUI_drug', 'CUI_gene', 'DGIdb', 'interaction_group_score', 'drug-gene'),
    ('gene_gene_relationships.csv', 'CUI_gene_1', 'CUI_gene_2', 'STRING', 'combined_score', 'gene-gene'),
    ('gene_GO_relationships.csv', 'CUI_gene', 'CUI_GO', 'Uniprot', None, 'gene-GO'),
    ('gene_phenotype_relationships.csv', 'CUI_gene', 'CUI_HPO', 'HPO', None, 'gene-phenotype'),
    ('disease_phenotype_relationships.csv', 'CUI_disease', 'CUI_HPO', 'HPO', None, 'disease-phenotype'),
]

for _filename, _ent1_col, _ent2_col, _source, _weight_col_source, _relationship in _edge_sources:
    integrate_edgefile(
        source_filename=os.path.join(DATA_DIR, _filename),
        ent1_col=_ent1_col,
        ent2_col=_ent2_col,
        source=_source,
        weight_col_source=_weight_col_source,
        relationship=_relationship,
        **_common_kwargs,
    )


5679 edges added from source integration_data_processed/disease_gene_relationships.csv

2891 edges added from source integration_data_processed/drug_gene_relationships.csv

21700 edges added from source integration_data_processed/gene_gene_relationships.csv

6837 edges added from source integration_data_processed/gene_GO_relationships.csv

7879 edges added from source integration_data_processed/gene_phenotype_relationships.csv

1748 edges added from source integration_data_processed/disease_phenotype_relationships.csv

