In [1]:
import numpy as np 
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from pyecharts import options as opts
from pyecharts.charts import Graph
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset,TensorDataset,DataLoader
from keras.preprocessing import sequence
from transformers import BertTokenizer, BertForSequenceClassification

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

In [2]:
# Directory that holds marked_sentence.csv. Kept as a single constant so the
# notebook can be pointed at another machine's copy by editing one line.
DATA_DIR = 'C:/Users/HP/Desktop/final499b/CSE499B/COVID-19 knowledge graph'
marked_sentence_df = pd.read_csv('%s/marked_sentence.csv' % DATA_DIR)

In [3]:
marked_sentence_df.sample(5)

Unnamed: 0,start_entity,end_entity,start_entity_type,end_entity_type,marked_sentence
10658,C2,CD2,Gene,Gene,Further clonal analysis of this patient indica...
31705,AOC3,CD16,Gene,Gene,CONCLUSIONS: Our data suggest that end_entity ...
16300,Eczema,Infection,Disease,Disease,1 Million South Korean Health-Care Records Gwa...
21869,CCL2,GLS2,Gene,Gene,"PHF, PTGDR and end_entity could significantly ..."
15804,"Rhinitis, Allergic",Wounds and Injuries,Disease,Disease,Pieters A245 How to Define Chronic Cough: Base...


In [4]:
def Build_graph(df,relation=False,repulsion=40,title='COVID-19 knowledge graph',labelShow=False):
    """Render an entity-pair DataFrame as a pyecharts ``Graph`` for the notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'start_entity', 'end_entity', 'start_entity_type' and
        'end_entity_type'; additionally 'pred' when ``relation`` is true.
        Entity types must be 'Disease', 'Gene' or 'Chemical' (anything else
        raises ``KeyError`` when the node category is looked up).
    relation : bool
        When true every edge carries the row's 'pred' value as its 'value'.
    repulsion : int
        Forwarded to ``Graph.add`` as ``repulsion``.
    title : str
        Chart title.
    labelShow : bool
        Forwarded to ``opts.LabelOpts(is_show=...)``.

    Returns
    -------
    Whatever ``Graph.render_notebook()`` returns.
    """
    color = {'Disease':'#FF7F50','Gene':'#48D1CC','Chemical':'#B3EE3A'}
    cate =  {'Disease':0,'Gene':1,'Chemical':2}
    # one legend category per entity type, listed in category-index order
    categories = [{'name': type_name, 'itemStyle': {'normal': {'color': color[type_name]}}}
                  for type_name in sorted(cate, key=cate.get)]

    # entity -> type; the type seen in the end_entity column wins when an entity occurs in both columns
    entity_type_dic = dict(df.drop_duplicates(['start_entity']).set_index(['start_entity'])['start_entity_type'])
    entity_type_dic.update(dict(df.drop_duplicates(['end_entity']).set_index(['end_entity'])['end_entity_type']))

    # Count, in a single pass, the rows in which each entity appears as start or end.
    # Iterating over the set {start, end} counts a self-loop row once, which matches
    # filtering the frame with (start == entity) | (end == entity).
    row_count = {}
    for start, end in zip(df['start_entity'], df['end_entity']):
        for entity in {start, end}:
            row_count[entity] = row_count.get(entity, 0) + 1

    # node size grows with log(1 + number of rows), floored to a multiple of 1 and never below 10
    nodes = [{'name': entity,
              'symbolSize': max(10,np.log1p(n_rows)*10//1),
              'category': cate[entity_type_dic[entity]]}
             for entity, n_rows in row_count.items()]

    # Build edges column-wise; this does not depend on the index labels being unique.
    if not relation:
        links = [{'source': start, 'target': end}
                 for start, end in zip(df['start_entity'], df['end_entity'])]
    else:
        links = [{'source': start, 'target': end, 'value': pred}
                 for start, end, pred in zip(df['start_entity'], df['end_entity'], df['pred'])]

    g = (
        Graph()
        .add('', nodes, links,categories, repulsion=repulsion,label_opts=opts.LabelOpts(is_show=labelShow))
        .set_global_opts(title_opts=opts.TitleOpts(title=title),legend_opts=opts.LegendOpts(orient='vertical', pos_left='2%', pos_top='40%',legend_icon='circle'))
        .render_notebook()
        )
    return g

In [5]:
g = Build_graph(marked_sentence_df.sample(100),title='subsample of  graph')
g

In [6]:
class Args:
    """Run-time settings read by the prediction helpers in this notebook."""
    # entity-type pair to classify, written as '<start type>-<end type>';
    # later cells reassign it on the instance before each Predict() call
    task_type = 'chemical-disease'
    # token-id sequences are padded / truncated to this length before prediction
    max_seq_len = 64
    # batch size handed to the prediction data loader
    bs = 64

class Conf:
    """Static lookup tables used when filtering predicted relations."""
    # some information can be found in:
    # Percha B, Altman R B. A global network of biomedical relationships derived from text[J]. Bioinformatics, 2018, 34(15): 2614-2624.
    # Maps an ordered entity-type pair ('<start type>-<end type>') to the relation
    # codes accepted for that direction; Predict() uses it to decide whether the
    # original or the swapped entity order yields a valid relation.
    relation_type = {'chemical-disease':['T', 'C', 'Sa', 'Pr', 'Pa', 'J'],
                     'disease-chemical':['Mp'],
                     'chemical-gene':['A+', 'A-', 'B', 'E+', 'E-', 'E', 'N'],
                     'gene-chemical':['O', 'K', 'Z'],
                     'gene-disease':['U', 'Ud', 'D', 'J', 'Te', 'Y', 'G'],
                     'disease-gene':['Md', 'X', 'L'],
                     'gene-gene':['B', 'W', 'V+', 'E+', 'E', 'I', 'H', 'Rg', 'Q'],
                     }

# instantiate the settings holders that the helper functions read as globals
args, conf = Args(), Conf()
# use the first CUDA GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
device

device(type='cpu')

In [7]:
# load pretrained Bert model
def Bert_model(taskType,bertPath,labelDir='C:/Users/HP/Desktop/final499b/CSE499B/COVID-19 knowledge graph'):
    """Load the label table, tokenizer and fine-tuned classifier for one task type.

    Parameters
    ----------
    taskType : str
        Task name; the label table is read from ``<labelDir>/<taskType>_label.csv``.
    bertPath : str
        Directory (or model id) passed to both ``from_pretrained`` calls.
    labelDir : str, optional
        Directory containing the ``*_label.csv`` files. Defaults to the
        location this notebook was originally run from, so existing calls
        behave as before.

    Returns
    -------
    tuple
        ``(label_df, tokenizer, model)``; the classifier head is sized to the
        number of distinct values in ``label_df['label']``.
    """
    label_df = pd.read_csv('%s/%s_label.csv'%(labelDir,taskType))
    # cased vocabulary: do not lower-case the input text
    tokenizer = BertTokenizer.from_pretrained(bertPath,do_lower_case=False)
    model = BertForSequenceClassification.from_pretrained(bertPath, num_labels=label_df['label'].nunique())
    return label_df,tokenizer,model

# build data loader
def Data_loader(x,y=None,bs=128,shuffle=False,numWorkers=0):
    """Wrap one or two tensors in a ``TensorDataset`` and return a ``DataLoader`` over it.

    ``x`` is always included; ``y`` is added as a second tensor only when it is
    not ``None``. ``bs``, ``shuffle`` and ``numWorkers`` are forwarded as
    ``batch_size``, ``shuffle`` and ``num_workers``.
    """
    tensors = (x,) if y is None else (x, y)
    return DataLoader(
        dataset=TensorDataset(*tensors),
        batch_size=bs,
        shuffle=shuffle,
        num_workers=numWorkers,
    )

def Prepare_predict_data(tokenizer,bs):
    """Build the prediction inputs for the task type currently set in ``args``.

    Selects the rows of the global ``marked_sentence_df`` whose lower-cased
    start / end entity types match the two halves of ``args.task_type``,
    converts each marked sentence to token ids (padded / truncated at the end
    to ``args.max_seq_len``), and does the same for a copy of each sentence in
    which the 'start_entity' and 'end_entity' markers are exchanged.

    Returns
    -------
    tuple
        ``(sentences, loader)``: the selected marked sentences as an array and
        a data loader yielding ``(ids, reverse_ids)`` batches of size ``bs``.
    """
    def encode(sentences):
        # tokens -> vocabulary ids, then pad / cut every sequence to the same length
        token_ids = sentences.apply(lambda text:tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))).tolist()
        return sequence.pad_sequences(token_ids,args.max_seq_len, truncating='post', padding='post')

    first_type = args.task_type.split('-')[0]
    second_type = args.task_type.split('-')[1]
    start_matches = marked_sentence_df['start_entity_type'].apply(lambda t:t.lower())==first_type
    end_matches = marked_sentence_df['end_entity_type'].apply(lambda t:t.lower())==second_type
    marked_sentences = marked_sentence_df.loc[start_matches&end_matches,'marked_sentence']

    # the order of the two entities is not known, so the swapped order is scored as well;
    # the temporary 'init_start_entity' marker keeps the two replacements from colliding
    reverse_marked_sentences = marked_sentences.apply(
        lambda text:text.replace('start_entity','init_start_entity')
                        .replace('end_entity','start_entity')
                        .replace('init_start_entity','end_entity'))

    ids = encode(marked_sentences)
    reverse_ids = encode(reverse_marked_sentences)
    predict_data_loader = Data_loader(torch.LongTensor(ids),torch.LongTensor(reverse_ids),bs=bs)
    return marked_sentences.values,predict_data_loader

def Predict():
    """Predict a relation for every entity pair of the task type set in ``args``.

    Steps:
      1. load the label table, tokenizer and classifier for ``args.task_type``;
      2. score each marked sentence twice - as given and with the two entity
         markers swapped - keeping the arg-max class and its softmax probability;
      3. decide per sentence which direction to keep (see ``Filter``) and drop
         sentences for which neither direction gives an allowed relation;
      4. join the result back onto ``marked_sentence_df`` and, where the swapped
         direction won, exchange the start / end entity names and types.

    Returns
    -------
    tuple
        ``(label_df, pred_df)``: the label table and a DataFrame with the
        columns of ``marked_sentence_df`` plus 'init_pred', 'init_pred_prob',
        'reverse_pred', 'reverse_pred_prob', 'filter', 'pred' and 'pred_prob'.
    """
    reverse_task_type = args.task_type.split('-')[1] + '-' + args.task_type.split('-')[0]
    allowed_init = conf.relation_type[args.task_type]
    allowed_reverse = conf.relation_type[reverse_task_type]

    def Filter(x):
        # Returns which prediction to keep for one row: 'init_pred', 'reverse_pred',
        # or 'uncorrect' when neither predicted label is allowed for its direction.
        init_ok = x['init_pred'] in allowed_init
        reverse_ok = x['reverse_pred'] in allowed_reverse
        if init_ok and reverse_ok:
            # both directions are plausible: keep the more confident one (ties go to init_pred)
            return 'init_pred' if x['init_pred_prob'] >= x['reverse_pred_prob'] else 'reverse_pred'
        if init_ok:
            return 'init_pred'
        if reverse_ok:
            return 'reverse_pred'
        return 'uncorrect'

    label_df,tokenizer,model = Bert_model(args.task_type,'C:/Users/HP/Desktop/final499b/CSE499B/COVID-19 knowledge graph/%s/'%args.task_type)
    marked_sentences,predict_data_loader = Prepare_predict_data(tokenizer,args.bs)
    model = model.to(device)
    # inference only: make evaluation mode explicit (dropout off)
    model.eval()

    preds = []
    preds_prob = []
    reverse_preds = []
    reverse_preds_prob = []
    # no_grad avoids recording autograd state for every batch, which is pure overhead here
    with torch.no_grad():
        for data in tqdm(predict_data_loader):
            ids,reverse_ids = [t.to(device) for t in data]
            for batch_ids,label_out,prob_out in ((ids,preds,preds_prob),
                                                 (reverse_ids,reverse_preds,reverse_preds_prob)):
                logits = model(input_ids=batch_ids)[0]
                # highest class probability and its class index for each sentence
                batch_prob, batch_pred = torch.max(F.softmax(logits,1), 1)
                label_out.extend(list(batch_pred.cpu().numpy()))
                prob_out.extend(list(batch_prob.cpu().numpy()))

    pred_df = pd.DataFrame({'marked_sentence':marked_sentences,'init_pred':preds,'init_pred_prob':preds_prob,'reverse_pred':reverse_preds,'reverse_pred_prob':reverse_preds_prob})
    # map label(0, 1, 2...) to raw label(T, C, Sa...)
    label_to_raw = dict(label_df.set_index(['label'])['label_raw'])
    pred_df['init_pred'] = pred_df['init_pred'].replace(label_to_raw)
    pred_df['reverse_pred'] = pred_df['reverse_pred'].replace(label_to_raw)

    # judge the order of a pair of entities
    pred_df['filter'] = pred_df.apply(lambda x:Filter(x), axis=1)
    keep_reverse = pred_df['filter']=='reverse_pred'
    pred_df['pred'] = pred_df['init_pred']
    pred_df['pred_prob'] = pred_df['init_pred_prob']
    pred_df.loc[keep_reverse,'pred'] = pred_df.loc[keep_reverse,'reverse_pred']
    pred_df.loc[keep_reverse,'pred_prob'] = pred_df.loc[keep_reverse,'reverse_pred_prob']
    pred_df = pred_df.loc[pred_df['filter']!='uncorrect']

    # NOTE(review): the join key is the sentence text, so a marked_sentence that occurs
    # in more than one row of marked_sentence_df is matched against every such row.
    pred_df = marked_sentence_df.merge(pred_df,how='inner',on='marked_sentence')

    # where the swapped direction won, exchange start/end entity name and type
    swapped = pred_df['filter']=='reverse_pred'
    init_start = pred_df.loc[swapped,['start_entity','start_entity_type']].copy()
    pred_df.loc[swapped,'start_entity'] = pred_df.loc[swapped,'end_entity']
    pred_df.loc[swapped,'start_entity_type'] = pred_df.loc[swapped,'end_entity_type']
    pred_df.loc[swapped,'end_entity'] = init_start['start_entity']
    pred_df.loc[swapped,'end_entity_type'] = init_start['start_entity_type']

    torch.cuda.empty_cache()
    return label_df,pred_df

In [8]:
# chemical-disease relation prediction
args.task_type = 'chemical-disease'
c_d_label_df,c_d_pred_df = Predict()

  0%|          | 0/15 [00:00<?, ?it/s]

In [9]:
# chemical-disease relation theme
c_d_label_df

Unnamed: 0,label_raw,label,theme
0,C,0,Inhibits cell growth (esp. cancers)
1,J,1,Role in pathogenesis
2,Mp,2,Biomarkers (progression)
3,Pa,3,"Alleviates, reduces"
4,Pr,4,"Prevents, suppresses"
5,Sa,5,Side effect/adverse event
6,T,6,Treatment/therapy (incl. investigatory)


In [10]:
# chemical-disease classification results
c_d_pred_df.sample(5)

Unnamed: 0,start_entity,end_entity,start_entity_type,end_entity_type,marked_sentence,init_pred,init_pred_prob,reverse_pred,reverse_pred_prob,filter,pred,pred_prob
693,Omalizumab,Egg Hypersensitivity,Chemical,Disease,"Ehlayel, Abdul Bari Bener A607 Strong Associat...",T,0.580928,T,0.580928,init_pred,T,0.580928
204,Chlorhexidine,"Rhinitis, Allergic",Chemical,Disease,Pieters A151 Clinical significance between the...,Sa,0.997375,Sa,0.997375,init_pred,Sa,0.997375
390,Anaphylaxis,Cyclosporine,Disease,Chemical,1 Million South Korean Health-Care Records Gwa...,Mp,0.992673,Mp,0.992673,reverse_pred,Mp,0.992673
502,Infection,Intravenous Immunoglobulin,Disease,Chemical,"Akio Tanaka, Satoshi Morioke, Yukihiro Ohya, N...",Mp,0.586789,Mp,0.586789,reverse_pred,Mp,0.586789
190,Rhinitis,Toluene,Disease,Chemical,"Leung, Ka Hou Chu A67 The relationship between...",Mp,0.913594,Mp,0.913594,reverse_pred,Mp,0.913594


In [11]:
# chemical-gene relation prediction
args.task_type = 'chemical-gene'
c_g_label_df,c_g_pred_df = Predict()

  0%|          | 0/8 [00:00<?, ?it/s]

In [12]:
# chemical-gene relation theme
c_g_label_df

Unnamed: 0,label_raw,label,theme
0,A+,0,"Agonism, activation"
1,A-,1,"Antagonism, blocking"
2,B,2,"Binding, ligand (esp. receptors)"
3,E,3,Affects expression/production (neutral)
4,E-,4,Decreases expression/production
5,K,5,"Metabolism, pharmacokinetics"
6,N,6,Inhibits
7,O,7,"Transport, channels"
8,Z,8,Enzyme activity


In [13]:
# chemical-gene classification results
c_g_pred_df.sample(5)

Unnamed: 0,start_entity,end_entity,start_entity_type,end_entity_type,marked_sentence,init_pred,init_pred_prob,reverse_pred,reverse_pred_prob,filter,pred,pred_prob
329,ventilation,alanine aminotransferase,Chemical,Gene,"Demographics, rate of mechanical start_entity ...",E,0.960856,E,0.989686,init_pred,E,0.960856
371,Oseltamivir,RD,Chemical,Gene,start_entity increased the risk of headaches w...,E-,0.987741,E,0.813697,init_pred,E-,0.987741
167,Toluene,AR,Chemical,Gene,"Leung, Ka Hou Chu A67 The relationship between...",B,0.729219,B,0.729219,init_pred,B,0.729219
324,rhMBL,MBL,Chemical,Gene,Although recombinant human end_entity start_en...,N,0.626527,E,0.955956,init_pred,N,0.626527
320,mesenchymal stem cells,MTOR,Chemical,Gene,"In addition, many intracellular signaling path...",N,0.755158,N,0.755158,init_pred,N,0.755158


In [14]:
# gene-disease relation prediction
args.task_type = 'gene-disease'
g_d_label_df,g_d_pred_df = Predict()

  0%|          | 0/142 [00:00<?, ?it/s]

In [15]:
# gene-disease relation theme
g_d_label_df

Unnamed: 0,label_raw,label,theme
0,D,0,Drug targets
1,G,1,Promotes progression
2,J,2,Role in pathogenesis
3,L,3,Improper regulation linked to disease
4,Md,4,Biomarkers (diagnostic)
5,Te,5,Possible therapeutic effect
6,U,6,Causal mutations
7,Ud,7,Mutations affect disease course
8,X,8,Overexpression in disease
9,Y,9,Polymorphisms alter risk


In [16]:
# gene-disease classification results
g_d_pred_df.sample(5)

Unnamed: 0,start_entity,end_entity,start_entity_type,end_entity_type,marked_sentence,init_pred,init_pred_prob,reverse_pred,reverse_pred_prob,filter,pred,pred_prob
3924,MAPK8,"Rhinitis, Allergic",Gene,Disease,"Ehlayel, Abdul Bari Bener A607 Strong Associat...",J,0.737792,J,0.737792,init_pred,J,0.737792
6998,NCR1,Melanoma,Gene,Disease,"nnAbbreviationsAHR, airway hyperresponsiveness...",J,0.999631,J,0.999631,init_pred,J,0.999631
2651,ADAM17,Sleep Wake Disorders,Gene,Disease,"Leung, Ka Hou Chu A67 The relationship between...",Te,0.736633,Te,0.736633,init_pred,Te,0.736633
4699,CRP,Porphyria Cutanea Tarda,Gene,Disease,The peripheral blood white blood cell WBC coun...,J,0.999971,J,0.999978,init_pred,J,0.999971
517,IKBKG,diarrhea,Gene,Disease,The PDCoV nsp5 cleavage site in the start_enti...,U,0.99978,U,0.999422,init_pred,U,0.99978


In [17]:
# gene-gene relation prediction
args.task_type = 'gene-gene'
g_g_label_df,g_g_pred_df = Predict()

  0%|          | 0/340 [00:00<?, ?it/s]

In [18]:
# gene-gene relation theme
g_g_label_df

Unnamed: 0,label_raw,label,theme
0,B,0,"Binding, ligand (esp. receptors)"
1,E,1,Affects expression/production (neutral)
2,E+,2,Increases expression/production
3,H,3,Same protein or complex
4,I,4,Signaling pathway
5,Q,5,Production by cell population
6,Rg,6,Regulation
7,V+,7,"Activates, stimulates"
8,W,8,Enhances response


In [19]:
# gene-gene classification results
g_g_pred_df.sample(5)

Unnamed: 0,start_entity,end_entity,start_entity_type,end_entity_type,marked_sentence,init_pred,init_pred_prob,reverse_pred,reverse_pred_prob,filter,pred,pred_prob
8514,IFNA1,MX1,Gene,Gene,CONCLUSIONS: This study has identified various...,E+,0.99899,E+,0.989102,init_pred,E+,0.99899
2083,p38,MAPK8,Gene,Gene,We specifically identified end_entity and star...,Rg,0.541112,Rg,0.586573,reverse_pred,Rg,0.586573
20807,d-dimer,NDUFB6,Gene,Gene,Multivariable regression showed increasing odd...,V+,0.964366,V+,0.979759,reverse_pred,V+,0.979759
3955,NPIPB3,NPIPA1,Gene,Gene,Results The highest affinity clone to ORF6 dis...,H,0.978994,H,0.993285,reverse_pred,H,0.993285
6989,CCL5,CCR1,Gene,Gene,"We also evaluated protein levels of CCL2, end_...",B,0.568204,I,0.312748,init_pred,B,0.568204


In [20]:
# chemical-COVID-19 relations
g = Build_graph(c_d_pred_df.loc[(c_d_pred_df['start_entity']=='COVID-19')|(c_d_pred_df['end_entity']=='COVID-19')],relation=True,repulsion=800,title='chemical-COVID-19 knowledge graph',labelShow=True)
g

In [21]:
# gene-COVID-19 relations
g = Build_graph(g_d_pred_df.loc[(g_d_pred_df['start_entity']=='COVID-19')|(g_d_pred_df['end_entity']=='COVID-19')],relation=True,repulsion=60,title='gene-COVID-19 knowledge graph',labelShow=False)
g

In [22]:
# disease-COVID-19 relations
g = Build_graph(marked_sentence_df.loc[(marked_sentence_df['start_entity']=='COVID-19')&(marked_sentence_df['end_entity_type']=='Disease')|(marked_sentence_df['start_entity_type']=='Disease')&(marked_sentence_df['end_entity']=='COVID-19')],relation=False,repulsion=60,title='disease-COVID-19  graph',labelShow=False)
g

In [23]:
# merge all relation prediction and save results
cols = ['start_entity','end_entity','start_entity_type','end_entity_type','marked_sentence','pred','pred_prob']
# Chemical-Chemical and Disease-Disease pairs are taken straight from marked_sentence_df
# (they carry no 'pred'); they are labelled 'CC' / 'DD' below.
same_type_df = marked_sentence_df.loc[(marked_sentence_df['start_entity_type'].isin(['Chemical','Disease'])&(marked_sentence_df['start_entity_type']==marked_sentence_df['end_entity_type']))]
# DataFrame.append was removed in pandas 2.0; a single concat over all frames gives the same rows in the same order
relation_df = pd.concat([c_d_pred_df[cols],c_g_pred_df[cols],g_d_pred_df[cols],g_g_pred_df[cols],same_type_df]).reset_index(drop=True)
relation_df.loc[(relation_df['pred'].isna())&(relation_df['start_entity_type']=='Chemical'),'pred'] = 'CC'
relation_df.loc[(relation_df['pred'].isna())&(relation_df['start_entity_type']=='Disease'),'pred'] = 'DD'
relation_df = relation_df[cols]
relation_df.to_csv('relation.csv',index=False)

In [24]:
relation_df

Unnamed: 0,start_entity,end_entity,start_entity_type,end_entity_type,marked_sentence,pred,pred_prob
0,keyhole limpet hemocyanin,"Lupus Erythematosus, Systemic",Chemical,Disease,"Middle-aged female identical twins, one of who...",Pa,0.969854
1,Cocaine,Incontinentia Pigmenti,Chemical,Disease,"start_entity 10 mgkg, administered intraperito...",Pa,0.832488
2,Daptomycin,COVID-19,Chemical,Disease,"start_entity is not indicated for end_entity, ...",T,0.999757
3,Zinc,diarrhea,Chemical,Disease,start_entity is being used for the treatment o...,Pa,0.769982
4,Zinc,Glycogen Storage Disease Type II,Chemical,Disease,start_entity was effective in decreasing the i...,Pa,0.423437
...,...,...,...,...,...,...,...
39577,Coronavirus Infections,Severe Acute Respiratory Syndrome,Disease,Disease,In a recent review on the persistence of human...,DD,
39578,Coronavirus Infections,Syndrome,Disease,Disease,In a recent review on the persistence of human...,DD,
39579,MERS,Severe Acute Respiratory Syndrome,Disease,Disease,In a recent review on the persistence of human...,DD,
39580,MERS,Syndrome,Disease,Disease,In a recent review on the persistence of human...,DD,


In [25]:
# subsample of knowledge graph
g = Build_graph(relation_df.sample(1000),relation=True,repulsion=15,title='subsample of COVID-19 knowledge graph',labelShow=False)
g

In [26]:
# 1. Which chemicals have an Sa (Side effect/adverse event) relation to COVID-19 and relevant diseases?

# due to a large number of negative words, these predictions may be wrong
c_d_pred_df.loc[c_d_pred_df['pred']=='Sa']

Unnamed: 0,start_entity,end_entity,start_entity_type,end_entity_type,marked_sentence,init_pred,init_pred_prob,reverse_pred,reverse_pred_prob,filter,pred,pred_prob
5,ventilation,"Respiratory Distress Syndrome, Adult",Chemical,Disease,Hospital-acquired end_entity may result from a...,Sa,0.954767,Sa,0.997769,init_pred,Sa,0.954767
7,Nelfinavir,Severe Acute Respiratory Syndrome,Chemical,Disease,start_entity inhibited the cytopathic effect i...,Sa,0.559930,C,0.601428,init_pred,Sa,0.559930
37,ventilation,"Corneal Dystrophy, Fleck",Chemical,Disease,Findings end_entity simulation revealed that t...,Sa,0.747989,Pr,0.768367,init_pred,Sa,0.747989
42,ventilation,COVID-19,Chemical,Disease,The main complications reported in hospitalize...,Sa,0.976008,Sa,0.957273,init_pred,Sa,0.976008
44,glucocorticoids,Lymphopenia,Chemical,Disease,"end_entity can be caused by start_entity, and ...",Sa,0.981520,Pr,0.482006,init_pred,Sa,0.981520
...,...,...,...,...,...,...,...,...,...,...,...,...
862,ventilation,COVID-19,Chemical,Disease,Severe_Acute_Respiratory_Syndrome end_entity d...,Sa,0.748148,C,0.995705,init_pred,Sa,0.748148
864,mesenchymal stem cells,BMD,Chemical,Disease,The injection of Ci-I hydrogels encapsulating ...,Sa,0.765685,T,0.818395,init_pred,Sa,0.765685
880,Hexachlorophene,inhibitory concentration 50,Chemical,Disease,"start_entity end_entity: 6M, tannic acid end_e...",Sa,0.816925,Pa,0.654468,init_pred,Sa,0.816925
882,ventilation,"Respiratory Distress Syndrome, Adult",Chemical,Disease,"Compared with survivors, non-survivors were ol...",Sa,0.824085,J,0.619180,init_pred,Sa,0.824085


In [27]:
# 2. What genes have U(Causal mutations), Ud(Mutations affect disease course), Y(Polymorphisms alter risk) to COVID-19?


g_d_pred_df.loc[(g_d_pred_df['end_entity']=='COVID-19')&(g_d_pred_df['pred'].isin(['U','Ud','Y']))]

Unnamed: 0,start_entity,end_entity,start_entity_type,end_entity_type,marked_sentence,init_pred,init_pred_prob,reverse_pred,reverse_pred_prob,filter,pred,pred_prob
1,p28,COVID-19,Gene,Disease,A high level of similarity was detected betwee...,Ud,0.932619,U,0.975709,init_pred,Ud,0.932619
4,E1,COVID-19,Gene,Disease,The carbohydrate contents of end_entity glycop...,U,0.959629,U,0.977800,init_pred,U,0.959629
5,E2,COVID-19,Gene,Disease,The carbohydrate contents of end_entity glycop...,U,0.935262,J,0.610720,init_pred,U,0.935262
9,ERVW-1,COVID-19,Gene,Disease,Tunicamycin has different effects on the glyco...,Ud,0.687813,Y,0.812220,init_pred,Ud,0.687813
11,E1,COVID-19,Gene,Disease,The end_entity envelope ERVW-1 start_entity ap...,Ud,0.989256,L,0.718454,init_pred,Ud,0.989256
...,...,...,...,...,...,...,...,...,...,...,...,...
7807,S2,COVID-19,Gene,Disease,The severe acute respiratory syndrome end_enti...,Y,0.676697,U,0.480305,init_pred,Y,0.676697
7819,ACE2,COVID-19,Gene,Disease,Molecular modeling revealed that end_entity RB...,U,0.607003,Y,0.998896,init_pred,U,0.607003
7835,ACE2,COVID-19,Gene,Disease,Infection by severe acute respiratory syndrome...,U,0.928476,U,0.836398,init_pred,U,0.928476
7862,MERTK,COVID-19,Gene,Disease,A 12 start_entity peptide derived from end_ent...,Ud,0.558339,J,0.512180,init_pred,Ud,0.558339


In [28]:
# 3. Which diseases are associated with COVID-19 (e.g. as complications)?
# rows that pair COVID-19 with a disease, in either column order
covid_then_disease = (relation_df['start_entity']=='COVID-19')&(relation_df['end_entity_type']=='Disease')
disease_then_covid = (relation_df['start_entity_type']=='Disease')&(relation_df['end_entity']=='COVID-19')
d_d_df = relation_df.loc[covid_then_disease|disease_then_covid]

# total number of those rows each entity appears in: start-column counts plus end-column counts
value_counts_dic = dict(d_d_df['start_entity'].value_counts())
end_entity_value_counts_dic = dict(d_d_df['end_entity'].value_counts())
for key, end_count in end_entity_value_counts_dic.items():
    value_counts_dic[key] = value_counts_dic.get(key, 0) + end_count

# keep entities that occur in more than 10 of those rows
most_relevant_disease = [key for key, total in value_counts_dic.items() if total > 10]

frequent_then_covid = relation_df['start_entity'].isin(most_relevant_disease)&(relation_df['end_entity']=='COVID-19')
covid_then_frequent = (relation_df['start_entity']=='COVID-19')&relation_df['end_entity'].isin(most_relevant_disease)
relation_df.loc[frequent_then_covid|covid_then_frequent]

Unnamed: 0,start_entity,end_entity,start_entity_type,end_entity_type,marked_sentence,pred,pred_prob
30951,COVID-19,Cataract congenital Volkmann type,Disease,Disease,The pathogenesis of canine start_entity end_en...,DD,
30956,COVID-19,Cataract congenital Volkmann type,Disease,Disease,The genomic organization at the 3 end of canin...,DD,
30961,COVID-19,"Influenza, Human",Disease,Disease,Blood was taken from the sows at the beginning...,DD,
30964,COVID-19,Cataract congenital Volkmann type,Disease,Disease,From the reasons that canine start_entity end_...,DD,
30971,COVID-19,Infection,Disease,Disease,end_entity of cell monolayers by murine start_...,DD,
...,...,...,...,...,...,...,...
39569,COVID-19,Severe Acute Respiratory Syndrome,Disease,Disease,end_entity start_entity end_entity-CoV papain-...,DD,
39572,COVID-19,Coronavirus Infections,Disease,Disease,In a recent review on the persistence of human...,DD,
39573,COVID-19,MERS,Disease,Disease,In a recent review on the persistence of human...,DD,
39574,COVID-19,Severe Acute Respiratory Syndrome,Disease,Disease,In a recent review on the persistence of human...,DD,


In [29]:
# 1. Which chemicals have a Pa (Alleviates, reduces), Pr (Prevents, suppresses) or T (Treatment/therapy, incl. investigatory) relation to COVID-19 and relevant diseases?

c_d_pred_df.loc[(c_d_pred_df['end_entity']=='COVID-19')&(c_d_pred_df['pred'].isin(['Pa','Pr','T']))]

Unnamed: 0,start_entity,end_entity,start_entity_type,end_entity_type,marked_sentence,init_pred,init_pred_prob,reverse_pred,reverse_pred_prob,filter,pred,pred_prob
2,Daptomycin,COVID-19,Chemical,Disease,"start_entity is not indicated for end_entity, ...",T,0.999757,T,0.597814,init_pred,T,0.999757
15,ventilation,COVID-19,Chemical,Disease,Study objectives: To report on our experience ...,T,0.999976,Sa,0.998793,init_pred,T,0.999976
47,oseltamivir,COVID-19,Chemical,Disease,During the pandemic H1N1 2009 influenza A viru...,T,0.584085,T,0.932823,init_pred,T,0.584085
55,ventilation,COVID-19,Chemical,Disease,"Bad housing, particularly bad start_entity, wi...",Pr,0.759889,Sa,0.961628,init_pred,Pr,0.759889
57,Ganciclovir,COVID-19,Chemical,Disease,start_entity can be considered for patients wi...,T,0.999927,T,0.999898,init_pred,T,0.999927
59,ventilation,COVID-19,Chemical,Disease,Case report We report a case of acute respirat...,T,0.848479,Sa,0.999983,init_pred,T,0.848479
67,Chloroquine,COVID-19,Chemical,Disease,"start_entity, remdesivir, lopinavir, ribavirin...",T,0.948416,Pr,0.833725,init_pred,T,0.948416
69,ventilation,COVID-19,Chemical,Disease,Three clinical cases of patients presenting se...,T,0.94644,T,0.936319,init_pred,T,0.94644
78,ventilation,COVID-19,Chemical,Disease,The populations of patients with community-acq...,T,0.999792,C,0.627972,init_pred,T,0.999792
79,ventilation,COVID-19,Chemical,Disease,"By contrast, the use of non-invasive start_ent...",T,0.998,T,0.521089,init_pred,T,0.998


In [30]:
# 2. Which genes have a D (Drug targets), G (Promotes progression), J (Role in pathogenesis), Te (Possible therapeutic effect) or X (Overexpression in disease) relation to COVID-19 and relevant diseases?

g_d_pred_df.loc[(g_d_pred_df['end_entity']=='COVID-19')&(g_d_pred_df['pred'].isin(['D','G','J','Te','X']))]

Unnamed: 0,start_entity,end_entity,start_entity_type,end_entity_type,marked_sentence,init_pred,init_pred_prob,reverse_pred,reverse_pred_prob,filter,pred,pred_prob
8,GCNT2,COVID-19,Gene,Disease,"This end_entity, like influenza virus, can rep...",J,0.999964,Ud,0.578845,init_pred,J,0.999964
18,PSMD1,COVID-19,Gene,Disease,The sequences of the peplomeric start_entity p...,J,0.958231,U,0.994972,init_pred,J,0.958231
24,CYTIP,COVID-19,Gene,Disease,"The hemagglutininesterase start_entity, spike ...",D,0.480011,Te,0.834435,init_pred,D,0.480011
27,S2,COVID-19,Gene,Disease,These observations may reflect the biological ...,Te,0.731483,J,0.999688,init_pred,Te,0.731483
33,CYTIP,COVID-19,Gene,Disease,"In contrast to bovine end_entity, which appare...",J,0.993720,L,0.988948,init_pred,J,0.993720
...,...,...,...,...,...,...,...,...,...,...,...,...
7805,PSMD1,COVID-19,Gene,Disease,We investigated the kinetics of serologic resp...,Te,0.744867,J,0.997138,init_pred,Te,0.744867
7808,NDUFB6,COVID-19,Gene,Disease,We fitted a log-normal distribution to pooled ...,D,0.962257,Te,0.909175,init_pred,D,0.962257
7810,IFNA1,COVID-19,Gene,Disease,Poxvirus and end_entity mutants that lacked 2-...,Te,0.784434,J,0.973893,init_pred,Te,0.784434
7834,ARIH1,COVID-19,Gene,Disease,The Human respiratory syncytial virus RSV is o...,Te,0.914886,J,0.916927,init_pred,Te,0.914886
