In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from tqdm.notebook import tqdm
import scipy.stats as stats
import matplotlib.pyplot as plt
import math
import os

# Plot defaults: figure size (in inches) and a larger font scale, applied in
# a single seaborn call.
sns.set(font_scale=1.5, rc={'figure.figsize': (11.7, 8.27)})

# Raise pandas' display caps so long/wide frames are shown without truncation.
pd.set_option("display.max_rows", 10000)
pd.set_option("display.max_columns", 10000)

# Set up Entrez
from Bio import Entrez

# Credentials are read from the environment so a real key never has to be
# written into the notebook (or its version history).
#   NCBI_API_KEY - personal NCBI API key; when unset, api_key is None and
#                  requests are sent without a key.
#   NCBI_EMAIL   - contact address passed along with each request; falls back
#                  to the placeholder address.
Entrez.api_key = os.environ.get("NCBI_API_KEY")
Entrez.email = os.environ.get("NCBI_EMAIL", "yyyy@zzzz.com")

# Load our lists of search terms (diseases and pathogens)

In [None]:
# Root directory holding the inputs (misc/) and outputs (results/).
# Set the PATHOGEN_NCD_HOME environment variable to run against another
# location; the default is the original hard-coded path.
HOME_DIR = os.environ.get("PATHOGEN_NCD_HOME", "/data/pathogen_ncd")

In [None]:
def _read_tsv(path):
    """Read one tab-separated, ISO-8859-1 encoded text file into a DataFrame."""
    return pd.read_csv(path, sep = '\t', encoding = "ISO-8859-1")


# Two disease tables (concatenated later) and one pathogen/organism table.
dis_ab = _read_tsv(f"{HOME_DIR}/misc/a_and_b_disease_data.txt")
dis_other = _read_tsv(f"{HOME_DIR}/misc/all_other_disease_data.txt")
orgs = _read_tsv(f"{HOME_DIR}/misc/org_data.txt")

# Disease-only search

## Combine all diseases into a single list

In [None]:
# Stack both disease tables into one frame with a fresh 0..n-1 index.
dis = pd.concat([dis_ab, dis_other], ignore_index = True)

# Replace every square bracket with a space: for some reason brackets make
# esearch drop part of the query.
# regex=True must be explicit. Since pandas 2.0, str.replace treats the
# pattern as a literal string by default, so the alternation below would
# never match and the brackets would silently stay in. The raw string also
# avoids Python's invalid-escape-sequence warning for the backslashes.
dis['disease'] = dis['disease'].str.replace(r'\[|\]', " ", regex = True)

In [None]:
# Run one PubMed search per disease and record, for each row: the disease
# fields, the hit count, the query as PubMed translated it, and the PMIDs.
# NOTE(review): NCBI documents a 10,000-UID ceiling for PubMed esearch
# results, so IdList may be shorter than Count for broad terms - confirm
# before relying on the PMID lists being complete.
dis_res = []

for x, curr_row in tqdm(dis.iterrows(), total = dis.shape[0]):
    curr_dis = curr_row['disease']

    q = f"({curr_dis})"

    handle = Entrez.esearch(db = "pubmed", retmax = "100000", retmode = "xml",
                            term = q)
    try:
        res = Entrez.read(handle)
    finally:
        # Release the connection even if parsing fails; otherwise one open
        # handle is leaked per disease.
        handle.close()

    # Number of articles found
    dis_cnt = res.get('Count')

    dis_ids = res.get('IdList')

    # Translated query (what actually got searched)
    trans_q = res.get('QueryTranslation')

    dis_res.append([curr_row['disease'], curr_row['icd'],
                    curr_row['icd_cat'], curr_row['icd_site'], dis_cnt,
                    trans_q, dis_ids])

In [None]:
# Column labels follow the field order of each record appended to dis_res.
dis_cols = ['Disease', 'icd', 'icd_cat', 'icd_site', 'count', 'query', 'PMIDs']
dis_res_df = pd.DataFrame(dis_res, columns = dis_cols)

# Render icd_site as text, left-padded with zeros to at least two characters.
dis_res_df['icd_site'] = dis_res_df['icd_site'].map(str).str.zfill(2)

In [None]:
# Persist the disease-only results as a tab-separated file (no index column).
dis_out_path = f"{HOME_DIR}/results/other/dis_only_py_pubmed_search.tsv"
dis_res_df.to_csv(dis_out_path, sep = '\t', index = False)

# Pathogen-only search

In [None]:
# Run one PubMed search per pathogen, matching on its name, abbreviation or
# MeSH id, and record the hit count, translated query and PMIDs.
# NOTE(review): NCBI documents a 10,000-UID ceiling for PubMed esearch
# results, so IdList may be shorter than Count for broad terms - confirm
# before relying on the PMID lists being complete.
org_res = []

for x, curr_row in tqdm(orgs.iterrows(), total = orgs.shape[0]):
    curr_org_name = curr_row['org_name']
    curr_org_abbrev = curr_row['abbrev']
    curr_org_mesh = curr_row['mesh_id']

    q = f"(({curr_org_name}) OR ({curr_org_abbrev}) OR ({curr_org_mesh}))"

    handle = Entrez.esearch(db = "pubmed", retmax = "100000", retmode = "xml",
                            term = q)
    try:
        res = Entrez.read(handle)
    finally:
        # Release the connection even if parsing fails; otherwise one open
        # handle is leaked per pathogen.
        handle.close()

    # Number of articles found
    org_cnt = res.get('Count')

    org_ids = res.get('IdList')

    # Translated query (what actually got searched)
    trans_q = res.get('QueryTranslation')

    org_res.append([curr_org_name, curr_org_abbrev, curr_org_mesh, org_cnt,
                    trans_q, org_ids])

In [None]:
# Column labels follow the field order of each record appended to org_res.
org_cols = ['org_name', 'abbrev', 'mesh_id', 'count', 'query', 'PMIDs']
org_res_df = pd.DataFrame(org_res, columns = org_cols)

In [None]:
# Persist the pathogen-only results as a tab-separated file (no index column).
org_out_path = f"{HOME_DIR}/results/other/path_only_py_pubmed_search.tsv"
org_res_df.to_csv(org_out_path, sep = '\t', index = False)

# Disease-pathogen pair search

In [None]:
# Run one PubMed search for every (disease, pathogen) combination: the
# disease term AND any of the pathogen's name / abbreviation / MeSH id.
# This issues len(dis) * len(orgs) requests, so the progress bar tracks the
# outer (disease) loop only.
# NOTE(review): NCBI documents a 10,000-UID ceiling for PubMed esearch
# results, so IdList may be shorter than Count for broad terms - confirm
# before relying on the PMID lists being complete.
pair_res = []
for x, curr_dis_row in tqdm(dis.iterrows(), total = dis.shape[0]):
    curr_dis = curr_dis_row['disease']

    for y, curr_org_row in orgs.iterrows():
        curr_org_name = curr_org_row['org_name']
        curr_org_abbrev = curr_org_row['abbrev']
        curr_org_mesh = curr_org_row['mesh_id']

        q = f"(({curr_dis}) AND (({curr_org_name}) OR ({curr_org_abbrev}) OR ({curr_org_mesh})))"

        handle = Entrez.esearch(db = "pubmed", retmax = "100000",
                                retmode = "xml", term = q)
        try:
            res = Entrez.read(handle)
        finally:
            # Release the connection even if parsing fails; with one request
            # per pair, leaked handles would accumulate quickly.
            handle.close()

        # Number of articles found
        pair_cnt = res.get('Count')

        pair_ids = res.get('IdList')

        # Translated query (what actually got searched)
        trans_q = res.get('QueryTranslation')

        pair_res.append([curr_dis, curr_dis_row['icd'],
                         curr_dis_row['icd_cat'], curr_dis_row['icd_site'],
                         curr_org_name, curr_org_abbrev, curr_org_mesh,
                         pair_cnt, trans_q, pair_ids])

In [None]:
# Column labels follow the field order of each record appended to pair_res.
pair_cols = ['Disease', 'icd', 'icd_cat', 'icd_site',
             'org_name', 'abbrev', 'mesh_id', 'count',
             'query', 'PMIDs']
pair_res_df = pd.DataFrame(pair_res, columns = pair_cols)

# Render icd_site as text, left-padded with zeros to at least two characters.
pair_res_df['icd_site'] = pair_res_df['icd_site'].map(str).str.zfill(2)

In [None]:
# Persist the disease-pathogen pair results as a tab-separated file
# (no index column).
pair_out_path = f"{HOME_DIR}/results/other/pairs_py_pubmed_search.tsv"
pair_res_df.to_csv(pair_out_path, sep = '\t', index = False)

# Calculate LPF

In [None]:
# Attach each pathogen's own hit count and query to every pair row.
# Both sides are given their final column names before the merge, so the
# result needs no positional relabelling afterwards.
pair_part = pair_res_df.rename(columns = {'count': 'pair_count',
                                          'query': 'pair_query',
                                          'PMIDs': 'pair_PMIDs'})
org_part = org_res_df[['abbrev', 'count', 'query']].rename(
    columns = {'count': 'org_count', 'query': 'org_query'})

final = pd.merge(left = pair_part, right = org_part, on = "abbrev")

In [None]:
# Attach each disease's own hit count and query to every pair row; the right
# side is renamed up front so the merged frame already has its final labels.
dis_part = dis_res_df[['Disease', 'count', 'query']].rename(
    columns = {'count': 'dis_count', 'query': 'dis_query'})

final = pd.merge(left = final, right = dis_part, on = "Disease")

# The three hit counts are converted to Python ints for the arithmetic below.
for cnt_col in ('pair_count', 'org_count', 'dis_count'):
    final[cnt_col] = final[cnt_col].map(int)

In [None]:
# Negated form of LPF
# Pair hits as a fraction of the disease's hits and of the pathogen's hits.
dis_norm = final['pair_count'].div(final['dis_count'])
path_norm = final['pair_count'].div(final['org_count'])

# lpf = -log10(product of the two fractions)
final['lpf'] = -np.log10(dis_norm.mul(path_norm))

In [None]:
# Write the merged table, including the lpf column, to an Excel workbook
# (no index column).
# NOTE(review): pair_PMIDs holds whole ID lists and Excel caps a cell at
# 32,767 characters - confirm long lists survive the export.
final_out_path = f"{HOME_DIR}/results/pubmed_search.xlsx"
final.to_excel(final_out_path, index = False)