The variables below should be updated accordingly.  
`INPUT_FILE_PATH` is where the .csv file with SemRep relationships is located (i.e. the output from Stage 1).  
`OUTPUT_FILE_PATH` is where the processed triples/the output from Stage 2 will be stored (as a .csv file).  
`NODE_MAPPING_FILE_PATH` is where node information for the processed triples will be stored (as a .csv file).  

In [1]:
INPUT_FILE_PATH = '../data/semrep_relationships.csv'
OUTPUT_FILE_PATH = '../data/semrep_relationships_processed.csv'
NODE_MAPPING_FILE_PATH = '../data/node_id_mapping.csv'
NEO4J_OUTPUT_DIR = '../neo4j-import'

In [2]:
import numpy as np
import pandas as pd
import re, os

In [3]:
subj_col = 'subj_preferred_name'
rel_col = 'relation'
obj_col = 'obj_preferred_name'

In [4]:
kg_df = pd.read_csv(INPUT_FILE_PATH, keep_default_na=False)
kg_df = kg_df[[
    'paper_id', 'subj_CUI', 'subj_preferred_name', 'subj_semantic_type', 'subj_gene_id', 'subj_gene_name', \
    'subj_original_text', 'subj_negated', 'subj_confidence_score', 'subj_start', 'subj_end' , \
    'relation_type', 'relation', 'relation_negated', 'relation_start', 'relation_end', 'relation_original_text', \
    'obj_CUI', 'obj_preferred_name', 'obj_semantic_type', 'obj_gene_id', 'obj_gene_name', 'obj_original_text', \
    'obj_negated', 'obj_confidence_score', 'obj_start', 'obj_end', \
    'sentence']]
print("Number of relationships directly outputted by SemRep =", len(kg_df))

Number of relationships directly outputted by SemRep = 1088236


# Cleaning data

In [5]:
# replace whitespace with NaN
kg_df.replace(r'^\s*$', np.nan, regex=True, inplace=True)

# remove NaN for subj, relationship, and obj cols
kg_df = kg_df.dropna(axis=0, subset=[subj_col, "subj_original_text", rel_col, "relation_original_text", obj_col, "obj_original_text"])
print("Number of relationships (after removing empty strings) =", len(kg_df))

# check that all subj, verb, obj cells are non-null
missing = kg_df[(kg_df[subj_col].isnull()) | (kg_df[rel_col].isnull()) | (kg_df[obj_col].isnull())]
assert len(missing) == 0

Number of relationships (after removing empty strings) = 1079157


In [6]:
# drop duplicates
kg_df['sentence_lower'] = kg_df['sentence'].str.lower().str.strip()
kg_df = kg_df.drop_duplicates(subset=['paper_id', 'subj_start', 'subj_end', 'obj_start', 'obj_end', 'relation_start', 'relation_end', 'sentence_lower', subj_col, rel_col, obj_col])

print("Number of relationships (after dropping duplicates) =", len(kg_df))

Number of relationships (after dropping duplicates) = 1078916


In [7]:
# lowercase original text
kg_df['subj_original_text'] = kg_df['subj_original_text'].str.lower()
kg_df['obj_original_text'] = kg_df['obj_original_text'].str.lower()

In [8]:
print('After cleaning data:')

nodes = set(kg_df[subj_col]).union(set(kg_df[obj_col]))
print("number of nodes =", len(nodes))
rels = set(kg_df[rel_col])
print("number of unique relationship types =", len(rels))
print("number of unique edges (includes all edge properties) =", len(kg_df))
num_relationships = len(kg_df.groupby([subj_col, rel_col, obj_col]).count())
print("number of unique triples =", num_relationships)

print('number of unique paper IDs =', len(set(kg_df['paper_id'])))

After cleaning data:
number of nodes = 50182
number of unique relationship types = 64
number of unique edges (includes all edge properties) = 1078916
number of unique triples = 373327
number of unique paper IDs = 238816


# Standardize entities

In [9]:
# use gene name as preferred name
def set_gene_name_as_preferred_name(row, pref_name_col, gene_name_col):
    if str(row[gene_name_col]) != 'nan':
        return row[gene_name_col]
    else:
        return row[pref_name_col]
    
kg_df[subj_col] = kg_df.apply(lambda row: set_gene_name_as_preferred_name(row, subj_col, 'subj_gene_name'), axis=1)
kg_df[obj_col] = kg_df.apply(lambda row: set_gene_name_as_preferred_name(row, obj_col, 'obj_gene_name'), axis=1)


In [10]:
# SARS-CoV-2 terms
coronavirus = []
with open(os.path.join("covid_19_dictionaries", "Virus_SARS-CoV-2.txt")) as f:
    for line in f:
        coronavirus.append(line.strip())
print(len(coronavirus), "SARS-CoV-2 (virus) terms")

223 SARS-CoV-2 (virus) terms


In [11]:
# COVID-19 terms
covid = []
covid_exact = []
covid_all = []
with open(os.path.join("covid_19_dictionaries", "Disease_COVID-19.txt")) as f:
    for line in f:
        term = line.strip()
        covid_all.append(term)
        if term in ["wurs", "ncp", "sars2"] or term.count(" ") >= 4:
            covid_exact.append(term)
        else:
            covid.append(term)
print(len(covid), "COVID-19 (disease) terms (partial match)")
print(len(covid_exact), "COVID-19 (disease) terms (long/exact match)")
print(len(covid_all), "COVID-19 (disease) terms (all)")

1192 COVID-19 (disease) terms (partial match)
11723 COVID-19 (disease) terms (long/exact match)
12915 COVID-19 (disease) terms (all)


In [12]:
# dictionary for standardization of COVID-19-related entities
temp = pd.read_csv('table_s1.csv')
d_replace = dict(zip(list(temp['original term']), list(temp['normalized'])))
virus_terms = []
covid_terms = []
        
for k in d_replace:
    if 'SARS-CoV-2' in d_replace[k]:
        virus_terms.append(k)
    elif 'COVID-19' in d_replace[k]:
        covid_terms.append(k)
    else:
        raise Exception('Normalized term must contain SARS-CoV-2 or COVID-19', d[k])

In [13]:
# update semantic type for COVID-19-related terms
# either 'SARS-CoV-2 (virus)' or 'COVID-19 (disease)'

def update_covid_semtype(row, original_text_col, semtype_col):
    if str(row[original_text_col]).lower() in virus_terms:
        return "SARS-CoV-2 (virus)"
    elif str(row[original_text_col]).lower() in covid_terms:
        return "COVID-19 (disease)"
    else:
        return row[semtype_col]

kg_df['subj_semantic_type'] = kg_df.apply(lambda x: update_covid_semtype(x, 'subj_original_text', 'subj_semantic_type'), axis=1)
kg_df['obj_semantic_type'] = kg_df.apply(lambda x: update_covid_semtype(x, 'obj_original_text', 'obj_semantic_type'), axis=1)

In [14]:
# add COVID-19-related preferred names based on standardization
def update_covid_preferred_name(row, original_text_col, semantic_type_col, preferred_name_col):
    if str(row[semantic_type_col]) == "SARS-CoV-2 (virus)" or str(row[semantic_type_col]) == "COVID-19 (disease)":
        return str(row[original_text_col]).lower()
    else:
        return row[preferred_name_col]
    
kg_df[subj_col] = kg_df.apply(lambda x: update_covid_preferred_name(x, "subj_original_text", "subj_semantic_type", subj_col), axis=1)
kg_df[obj_col] = kg_df.apply(lambda x: update_covid_preferred_name(x, "obj_original_text", "obj_semantic_type", obj_col), axis=1)

kg_df[subj_col] = kg_df[subj_col].replace(d_replace) 
kg_df[obj_col] = kg_df[obj_col].replace(d_replace)

In [15]:
covid_semtypes = ["SARS-CoV-2 (virus)", "COVID-19 (disease)"] # custom semantic types representing coronavirus/covid
coronavirus_subj_terms = set(kg_df[kg_df["subj_semantic_type"].isin(covid_semtypes)][subj_col])
coronavirus_obj_terms = set(kg_df[kg_df["obj_semantic_type"].isin(covid_semtypes)][obj_col])
coronavirus_terms = coronavirus_subj_terms.union(coronavirus_obj_terms)

print("There are", len(coronavirus_terms), "coronavirus-related terms (preferred name after normalizing COVID-19 and SARS-CoV-2)")
print(coronavirus_terms)

There are 23 coronavirus-related terms (preferred name after normalizing COVID-19 and SARS-CoV-2)
{'SARS-CoV-2', 'SARS-CoV-2 spread', 'COVID-19 animal', 'SARS-CoV-2 protein', 'SARS-CoV-2 PCR', 'COVID-19 country', 'COVID-19 subject', 'COVID-19', 'COVID-19 treatment', 'COVID-19 re-infection', 'COVID-19 testing', 'COVID-19 symptom', 'COVID-19 suspected', 'SARS-CoV-2 related virus', 'COVID-19 patient', 'Asymptomatic COVID-19', 'SARS-CoV-2 antibody', 'Severe COVID-19', 'SARS-CoV-2 gene', 'COVID-19 related infection', 'COVID-19 acquired infection', 'COVID-19 vaccine', 'COVID-19 co-infection'}


In [16]:
# map semtype abbreviations to semantic group
semtypes_df = pd.read_csv(os.path.join('semantic_type_files', 'SemanticTypes_2018AB.txt'), sep="|", header=None)
semgroups_df = pd.read_csv(os.path.join('semantic_type_files', 'SemGroups_2018.txt'), sep="|", header=None)
semgroups_df.columns = ['semgroup_abbreviation', 'semgroup', 'TUI', 'semtype']
semtypes_df.columns = ['semtype_abbreviation', 'TUI', 'semtype']
semtypes_df = semtypes_df.merge(semgroups_df, on=['TUI', 'semtype'])

keys = semtypes_df["semtype_abbreviation"]
values = semtypes_df["semgroup"]
semtype_dictionary = dict(zip(keys, values))

kg_df.replace({'subj_semantic_type': semtype_dictionary}, inplace=True)
kg_df.replace({'obj_semantic_type': semtype_dictionary}, inplace=True)

# Standardize relationships

In [17]:
kg_df[rel_col] = kg_df[rel_col].str.replace("\(SPEC\)", "")
kg_df[rel_col] = kg_df[rel_col].str.replace("\(INFER\)", "")

# Filter relationships

In [18]:
# get most confidence and non-negated relationships
kg_df = kg_df[(kg_df["subj_confidence_score"]>=800)&(kg_df["obj_confidence_score"]>=800)]
print("number of relationships (after removing low confidence entities) =", len(kg_df))

number of relationships (after removing low confidence entities) = 983462


In [19]:
# remove self-loops
kg_df = kg_df[(kg_df[subj_col]!=kg_df[obj_col])&(kg_df['subj_CUI']!=kg_df['obj_CUI'])]
print("number of relationships (after removing self-loops) =", len(kg_df))

number of relationships (after removing self-loops) = 981153


In [20]:
# filter relationships to only keep triples that occur at least twice
weight_col = 'weight'
weight_df = kg_df.groupby([subj_col, rel_col, obj_col])['subj_start'].count().reset_index().rename(columns={'subj_start': weight_col})
thresh = 2
weight_thresh_df = weight_df[weight_df[weight_col] >= thresh]
kg_df = kg_df.merge(weight_thresh_df, on=[subj_col, rel_col, obj_col], how='inner')

print('=== Before thresholding ===')
print('Number of edges (sum of weights):', sum(weight_df[weight_col]))
print('Number of unique edges:', len(weight_df))
print()
print('=== After thresholding ===')
print('Number of edges (sum of weights):', sum(weight_thresh_df[weight_col]))
print('Number of unique edges:', len(weight_thresh_df))
print()

=== Before thresholding ===
Number of edges (sum of weights): 981153
Number of unique edges: 340194

=== After thresholding ===
Number of edges (sum of weights): 778965
Number of unique edges: 138006



In [21]:
# drop_duplicates()
kg_df = kg_df.drop_duplicates(subset=['subj_start', 'subj_end', 'obj_start', 'obj_end', 'relation_start', 'relation_end', 'sentence_lower', subj_col, rel_col, obj_col])
print("number of relationships (after dropping duplicates) =", len(kg_df))

number of relationships (after dropping duplicates) = 632830


# Format Text entities/nodes and Text-Document relationships for Neo4j Import

In [22]:
# extract text/sentence information from triple
sentences_id_df = kg_df[['paper_id', 'sentence']].drop_duplicates().reset_index(drop=True).reset_index().rename(columns={'index': 'text_id'})


In [23]:
sentences_id_df['text_id'] = sentences_id_df['text_id'].astype(int)
sentences_df = sentences_id_df.copy()
sentences_df = sentences_df[['text_id', 'sentence']]
sentences_df.columns = ['text_id:ID(Text)', 'text:STRING']
sentences_df['section:STRING'] = 'abstract'
sentences_df[':LABEL'] = 'Text'
sentences_df.to_csv(os.path.join(NEO4J_OUTPUT_DIR, 'text_nodes.csv'), index=False)

In [24]:
# Text-Document relationships
kg_df_sent = kg_df.merge(sentences_id_df, on=['sentence', 'paper_id'], how='left')
sentences_edges_df = kg_df_sent[['text_id', 'paper_id']].drop_duplicates()    
sentences_edges_df.columns = ['text_id:START_ID(Text)', 'doc_id:END_ID(Document)']
sentences_edges_df[':TYPE'] = 'in_document'
sentences_edges_df.to_csv(os.path.join(NEO4J_OUTPUT_DIR, 'text_edges.csv'), index=False)

# Create file for processed triples and their corresponding entities/nodes

In [25]:
# columns in the edges file
id_col = 'id' # the preferred name of the entity
id_col_1 = id_col + "1"
id_col_2 = id_col + "2"

identifier_col = 'identifier' # the CUI of the entity
identifier_col_1 = identifier_col + "1"
identifier_col_2 = identifier_col + "2"

type_col = 'type' # the semantic group of the entity
type_col_1 = type_col + "1"
type_col_2 = type_col + "2"

info_col = 'info' # synonyms/original texts for the entity
info_col_1 = info_col + "1"
info_col_2 = info_col + "2"

source_col = 'evidence_source' # what source the triple is from
rel_col = ':TYPE' # relationship

# what to name the columns after aggregation
identifier_col_agg = identifier_col + "s"
type_col_agg = type_col + "s"
info_col_agg = info_col + "s"

In [26]:
# update headers
kg_df = kg_df_sent
header = ['subj_preferred_name', 'subj_CUI', 'subj_semantic_type', 'subj_original_text', 'subj_start', 'subj_end',
          'obj_preferred_name', 'obj_CUI', 'obj_semantic_type', 'obj_original_text', 'obj_start', 'obj_end',
          'relation', 'relation_start', 'relation_end', 'text_id', weight_col]
header_updated = [id_col_1, identifier_col_1, type_col_1, info_col_1, 'start1', 'end1',
                  id_col_2, identifier_col_2, type_col_2, info_col_2, 'start2', 'end2',
                  rel_col, 'startr', 'endr', 'text_id', weight_col]
kg_df = kg_df[header]
kg_df.columns = header_updated
kg_df[source_col] = 'CORD-19'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [27]:
# collect nodes from triples
temp1 = kg_df[[identifier_col_1, id_col_1, type_col_1, info_col_1]].rename(columns={identifier_col_1: identifier_col, id_col_1: id_col, info_col_1: info_col, type_col_1: type_col})
temp2 = kg_df[[identifier_col_2, id_col_2, type_col_2, info_col_2]].rename(columns={identifier_col_2: identifier_col, id_col_2: id_col, info_col_2: info_col, type_col_2: type_col})
node_df = temp1.append(temp2).drop_duplicates().dropna()

In [28]:
# group entities by id (preferred name) and collect different identifiers, types, and infos
temp = node_df.groupby(id_col, as_index=False).agg({identifier_col: lambda x: ';'.join(set(x)), type_col: lambda x: ';'.join(set(x)), info_col: lambda x: ';'.join(set(x))})
node_id_mapping_df = node_df[[identifier_col, id_col, type_col, info_col]].merge(temp, on=[id_col])
node_id_mapping_df.columns = [identifier_col, id_col, type_col, info_col, identifier_col_agg, type_col_agg, info_col_agg]
node_id_mapping_df.to_csv(NODE_MAPPING_FILE_PATH , index=False)

In [29]:
# update the node information in the triples
kg_df = kg_df.merge(node_id_mapping_df[[id_col, info_col_agg, type_col_agg, identifier_col_agg]].drop_duplicates(), left_on=id_col_1, right_on=id_col, how='left').drop([id_col, identifier_col_1, type_col_1, info_col_1], axis=1).rename(columns={identifier_col_agg: identifier_col_1, type_col_agg: type_col_1, info_col_agg: info_col_1})
kg_df = kg_df.merge(node_id_mapping_df[[id_col, info_col_agg, type_col_agg, identifier_col_agg]].drop_duplicates(), left_on=id_col_2, right_on=id_col, how='left').drop([id_col, identifier_col_2, type_col_2, info_col_2], axis=1).rename(columns={identifier_col_agg: identifier_col_2, type_col_agg: type_col_2, info_col_agg: info_col_2})

In [30]:
# saved the processed triples
kg_df.to_csv(OUTPUT_FILE_PATH, index=False)

In [31]:
kg_df

Unnamed: 0,id1,start1,end1,id2,start2,end2,:TYPE,startr,endr,text_id,weight,evidence_source,info1,type1,identifier1,info2,type2,identifier2
0,Suicidal behavior,50,67,Participant,33,45,PROCESS_OF,46,49,0,3,CORD-19,sb;suicide behavior;suicide behaviors;suicidal...,Disorders,C1760428,gpps;participants;participant's;participant,Living Beings,C0679646
1,Suicidal behavior,894,911,Participant,877,889,PROCESS_OF,890,893,1,3,CORD-19,sb;suicide behavior;suicide behaviors;suicidal...,Disorders,C1760428,gpps;participants;participant's;participant,Living Beings,C0679646
2,Suicidal behavior,110,127,Participant,0,12,PROCESS_OF,76,79,2,3,CORD-19,sb;suicide behavior;suicide behaviors;suicidal...,Disorders,C1760428,gpps;participants;participant's;participant,Living Beings,C0679646
3,Endosomes,118,127,Virus,76,83,LOCATION_OF,115,117,3,10,CORD-19,endosomal;endosomes;endosome,Anatomy,C0034850,"ndv;wiv;imnv;general, enteric viruses;eeev;vir...",Living Beings,C0042776
4,Endosomes,194,202,Virus,179,184,LOCATION_OF,185,189,4,10,CORD-19,endosomal;endosomes;endosome,Anatomy,C0034850,"ndv;wiv;imnv;general, enteric viruses;eeev;vir...",Living Beings,C0042776
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
632825,Anesthesia procedures,98,109,Midazolam,58,67,USES,85,88,136429,2,CORD-19,anesthesia;ofa;anaesthesia;na;lfa;aaga;oa,Procedures,C0002903,midazolam;pb/m,Chemicals & Drugs,C0026056
632826,Anesthesia procedures,20,31,Midazolam,48,57,USES,32,36,410608,2,CORD-19,anesthesia;ofa;anaesthesia;na;lfa;aaga;oa,Procedures,C0002903,midazolam;pb/m,Chemicals & Drugs,C0026056
632827,sevoflurane,100,111,Old World rabbit,42,49,ADMINISTERED_TO,50,59,410609,3,CORD-19,sevoflurane;spc,Chemicals & Drugs,C0074414,domestic rabbits;rabbits;european rabbit,Living Beings,C0324889
632828,sevoflurane,21,32,Old World rabbit,3,10,ADMINISTERED_TO,11,20,134111,3,CORD-19,sevoflurane;spc,Chemicals & Drugs,C0074414,domestic rabbits;rabbits;european rabbit,Living Beings,C0324889
