## data preparation

In [None]:
import json
import pickle

from tqdm import tqdm

# Load the pre-computed lookup tables used by every indicator below.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load these .dat files if they come from a trusted source.
# The `with` statement closes each file on exit, so no explicit close() is needed.

# mesh_dict: key: pmid, value: list of mesh terms for each paper
with open('mesh_dict.dat', 'rb') as f:
    mesh_dict = pickle.load(f)

# mesh_rels_dict: key: pmid, value: list of combinations of mesh terms for each paper
with open('mesh_rels_dict.dat', 'rb') as f2:
    mesh_rels_dict = pickle.load(f2)

# cits_dict: key: pmid, value: list of citing pmids for each paper
with open('cits_dict.dat', 'rb') as f3:
    cits_dict = pickle.load(f3)

# refs_dict: key: pmid, value: list of reference pmids for each paper
with open('refs_dict.dat', 'rb') as f4:
    refs_dict = pickle.load(f4)

# pmid_year_dict: JSON object; later cells index it with str(pmid) to get the publication year.
# Opened in a context manager so the handle is released even if parsing fails.
with open('pmid_pub_years.txt', 'r') as year_file:
    pmid_year_dict = json.load(year_file)

## mED(rel)

In [None]:
import numpy as np
from itertools import combinations

# Unweighted ED(rel) for every focal paper, computed on MeSH-term relations.
#   ED_s : (relations of the focal paper absent from its references
#           - relations shared with its references) / all relations of the focal paper
#   ED_p : mean, over citing papers that have at least one relation, of
#          (n_gi + n_gn - n_gj - n_gk) / n_g   (the four counts are defined in the loop)
#   ED_rel = 0.5 * ED_s + 0.5 * ED_p
m_dict = {}    # pmid -> number of citing papers sharing at least one relation with the focal paper
ED_rels = {}   # pmid -> ED_rel

# NOTE(review): `pmids` is not defined in the visible part of the notebook; it
# must be bound (a mapping keyed by pmid) before this cell runs.
focal_nodes = list(pmids.keys())
pbar = tqdm(focal_nodes)

for node in pbar:
    cits = cits_dict[node]
    refs = refs_dict[node]

    # Coerce to a set so the set algebra below works whether the stored value is
    # a list or a set (presumably the elements are hashable tuples -- TODO confirm).
    s_rel = set(mesh_rels_dict[node])
    n_s = len(s_rel)
    if n_s == 0:
        # ED_s divides by n_s; a paper without any relation has no defined score.
        continue

    # ED_s calculation: sj_rel is the union of the relations of all references.
    sj_rel = set()
    for ref in refs:
        sj_rel.update(mesh_rels_dict[ref])
    n_si = len(s_rel - sj_rel)   # relations not present in any reference
    n_sj = len(s_rel & sj_rel)   # relations already present in the references
    ED_s = round((n_si - n_sj) / n_s, 5)

    # ED_p calculation
    N = 0          # citing papers that have at least one relation
    m = 0          # citing papers that share at least one relation with the focal paper
    ED_p_sum = 0   # running sum of the per-citing-paper scores
    for c in cits:
        g_rel = set(mesh_rels_dict[c])
        n_g = len(g_rel)
        if n_g > 0:
            N += 1
            n_gi = len((s_rel & g_rel) - sj_rel)   # shared with the focal paper only
            n_gj = len(s_rel & g_rel & sj_rel)     # shared with focal paper and references
            n_gk = len((g_rel & sj_rel) - s_rel)   # shared with the references only
            n_gn = len(g_rel - sj_rel - s_rel)     # present in the citing paper only
            ED_p_sum += round((n_gi + n_gn - n_gj - n_gk) / n_g, 5)
        if len(s_rel & g_rel) > 0:
            m += 1
    m_dict[node] = m

    # Without any usable citing paper ED_p stays 0, so ED_rel reduces to 0.5 * ED_s.
    ED_p = round(ED_p_sum / N, 5) if N > 0 else 0
    ED_rel = 0.5 * ED_s + 0.5 * ED_p
    ED_rels[node] = ED_rel


In [None]:
import pandas as pd

# Persist the unweighted ED(rel) scores; the dict keys (pmids) become the frame index.
# The frame is built from ED_rels (the per-paper result dict), not ED_rel, which only
# holds a single scalar score.
# NOTE(review): `engine` is not defined in the visible part of the notebook.
# NOTE(review): if_exists='append' adds the rows again every time this cell is re-run.
df_new = pd.DataFrame.from_dict(ED_rels, orient = 'index')
df_new.to_sql('pubmed_ED_rel_noweight', con = engine, if_exists = 'append')

In [None]:
import pandas as pd

# Write the m counts (one row per pmid, pmid as index) to the database,
# appending to the table if it already exists.
mdf_new = pd.DataFrame.from_dict(
    data=m_dict,
    orient='index',
)
mdf_new.to_sql(
    name='pubmed_ED_rel_m',
    con=engine,
    if_exists='append',
)

In [None]:
import numpy as np
import pandas as pd

# Collect, for every publication year 1991..2014, the largest and smallest m value.
# NOTE(review): this reads 'pubmed_ED_noweight', whereas the cell above writes
# 'pubmed_ED_rel_noweight'; the table read here must provide the columns
# pmid, pub_year, m and ED_rel used below -- confirm the table name.
sql = 'select * from pubmed_ED_noweight'
df_new = pd.read_sql(sql, con = engine)

max_m_years = {}   # year -> max m among papers published that year
min_m_years = {}   # year -> min m among papers published that year
for y in range(1991, 2015):
    m_y = np.array(df_new[df_new['pub_year'] == y]['m'].values)
    if m_y.size == 0:
        # np.max / np.min raise ValueError on an empty array; a year without
        # any paper simply gets no entry.
        continue
    max_m_years[y] = np.max(m_y)
    min_m_years[y] = np.min(m_y)

In [None]:
# mED(rel): ED_rel weighted by m, min-max normalised within the publication year.
mED_dict = {}         # pmid -> mED
n_skipped_rows = 0    # rows for which no weight could be computed

for i, item in df_new.iterrows():
    pmid = item['pmid']
    ED_rel = item['ED_rel']
    year = item['pub_year']
    m = item['m']
    try:
        m_span = max_m_years[year] - min_m_years[year]
        if m_span == 0:
            # All papers of that year share the same m, so the min-max weight is
            # undefined. numpy scalars would yield NaN here instead of raising,
            # hence the explicit check.
            raise ZeroDivisionError
        m_weight = round((m - min_m_years[year]) / m_span, 4)
        mED = round(ED_rel * m_weight, 5)
    except (KeyError, ZeroDivisionError, TypeError):
        # Skip rows with an unknown year, a degenerate year range or non-numeric
        # values. Only these errors are caught, so KeyboardInterrupt and genuine
        # bugs are no longer swallowed.
        n_skipped_rows += 1
        continue
    mED_dict[pmid] = mED

## mED(ent)

In [None]:
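## Direct computation of new indicator 2
# M_s: novelty relative to the knowledge sources
# M_p: impact on the growth path
# w: relative impact of the focal paper compared with papers published in the same year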
import numpy as np
from itertools import combinations

# Unweighted ED(ent) for every focal paper, computed on individual MeSH terms.
#   ED_s : (terms of the focal paper absent from its references
#           - terms shared with its references) / all terms of the focal paper
#   ED_p : mean, over citing papers that have at least one term, of
#          (n_gi + n_gn - n_gj - n_gk) / n_g   (the four counts are defined in the loop)
#   ED_ent = 0.5 * ED_s + 0.5 * ED_p
m_dict = {}        # pmid -> number of citing papers sharing at least one term with the focal paper
ED_s_dict = {}     # pmid -> ED_s
ED_p_dict = {}     # pmid -> {citing pmid: [n_gi, n_gj, n_gk, n_gn]}
ED_ent_dict = {}   # pmid -> ED_ent

# NOTE(review): `pmids` is not defined in the visible part of the notebook; it
# must be bound (a mapping keyed by pmid) before this cell runs.
focal_nodes = list(pmids.keys())           # the list of pmids of papers for calculation
pbar = tqdm(focal_nodes)

for node in pbar:
    cits = cits_dict[node]
    refs = refs_dict[node]

    s_rel = set(mesh_dict[node])
    n_s = len(s_rel)
    if n_s == 0:
        # ED_s divides by n_s; a paper without any MeSH term has no defined score.
        continue

    # ED_s calculation: sj_rel is the union of the terms of all references.
    sj_rel = set()
    for ref in refs:
        sj_rel.update(mesh_dict[ref])
    n_si = len(s_rel - sj_rel)   # terms not present in any reference
    n_sj = len(s_rel & sj_rel)   # terms already present in the references
    ED_s = round((n_si - n_sj) / n_s, 5)
    ED_s_dict[node] = ED_s

    # ED_p calculation
    cits_rels_dict = {}
    N = 0          # citing papers that have at least one term
    m = 0          # citing papers that share at least one term with the focal paper
    ED_p_sum = 0   # running sum of the per-citing-paper scores
    for c in cits:
        g_rel = set(mesh_dict[c])
        n_g = len(g_rel)
        if n_g > 0:
            N += 1
            n_gi = len((s_rel & g_rel) - sj_rel)   # shared with the focal paper only
            n_gj = len(s_rel & g_rel & sj_rel)     # shared with focal paper and references
            n_gk = len((g_rel & sj_rel) - s_rel)   # shared with the references only
            n_gn = len(g_rel - sj_rel - s_rel)     # present in the citing paper only
            cits_rels_dict[c] = [n_gi, n_gj, n_gk, n_gn]
            ED_p_sum += round((n_gi + n_gn - n_gj - n_gk) / n_g, 5)
        if len(s_rel & g_rel) > 0:
            m += 1
    m_dict[node] = m
    ED_p_dict[node] = cits_rels_dict

    # Without any usable citing paper ED_p stays 0, so ED_ent reduces to 0.5 * ED_s.
    # (The original fallback referenced the undefined names M_s / M_p.)
    ED_p = round(ED_p_sum / N, 5) if N > 0 else 0
    ED_ent = 0.5 * ED_s + 0.5 * ED_p
    ED_ent_dict[node] = ED_ent


In [None]:
import pandas as pd

# Write the unweighted ED(ent) scores (one row per pmid, pmid as index) to the
# database, appending to the table if it already exists.
df_new = pd.DataFrame.from_dict(
    data=ED_ent_dict,
    orient='index',
)
df_new.to_sql(
    name='pubmed_ED_ent_noweight',
    con=engine,
    if_exists='append',
)

In [None]:
import pandas as pd

# Write the m counts of the ED(ent) run (one row per pmid, pmid as index) to the
# database, appending to the table if it already exists.
mdf_new = pd.DataFrame.from_dict(
    data=m_dict,
    orient='index',
)
mdf_new.to_sql(
    name='pubmed_ED_ent_m',
    con=engine,
    if_exists='append',
)

In [None]:
import numpy as np
import pandas as pd

# Collect, for every publication year 1991..2014, the largest and smallest m value.
# The table read here must provide the columns pmid, pub_year, m and ED_ent used below.
sql = 'select * from pubmed_ED_ent_noweight'
df_new = pd.read_sql(sql, con = engine)

max_m_years = {}   # year -> max m among papers published that year
min_m_years = {}   # year -> min m among papers published that year
for y in range(1991, 2015):
    m_y = np.array(df_new[df_new['pub_year'] == y]['m'].values)
    if m_y.size == 0:
        # np.max / np.min raise ValueError on an empty array; a year without
        # any paper simply gets no entry.
        continue
    max_m_years[y] = np.max(m_y)
    min_m_years[y] = np.min(m_y)

In [None]:
# mED(ent): ED_ent weighted by m, min-max normalised within the publication year.
mED_ent_dict = {}     # pmid -> mED_ent
n_skipped_rows = 0    # rows for which no weight could be computed

for i, item in df_new.iterrows():
    pmid = item['pmid']
    ED_ent = item['ED_ent']
    year = item['pub_year']
    m = item['m']
    try:
        m_span = max_m_years[year] - min_m_years[year]
        if m_span == 0:
            # All papers of that year share the same m, so the min-max weight is
            # undefined. numpy scalars would yield NaN here instead of raising,
            # hence the explicit check.
            raise ZeroDivisionError
        m_weight = round((m - min_m_years[year]) / m_span, 4)
        mED_ent = round(ED_ent * m_weight, 5)
    except (KeyError, ZeroDivisionError, TypeError):
        # Skip rows with an unknown year, a degenerate year range or non-numeric
        # values. Only these errors are caught, so KeyboardInterrupt and genuine
        # bugs are no longer swallowed.
        n_skipped_rows += 1
        continue
    mED_ent_dict[pmid] = mED_ent

# Persist the weighted scores; the dict keys (pmids) become the frame index.
mED_ent_df = pd.DataFrame.from_dict(mED_ent_dict, orient = 'index')
mED_ent_df.to_sql('pubmed_mED_ent', con = engine, if_exists = 'append')

## DI5

In [None]:
# DI5, citation window not controlled
# fp: focus paper
# sp: source paper (a reference of the focus paper)
#
# DI5 = (n_i - n_j) / (n_all - l_distance), where
#   n_i        : papers citing the focus paper but none of its references
#   n_j        : papers citing the focus paper and sharing at least 5 references with it
#   n_all      : papers citing the focus paper and/or any of its references
#   l_distance : papers citing both but sharing fewer than 5 references with the focus paper

DI5_nodes = {}   # pmid -> DI5

# NOTE(review): `pmids` is not defined in the visible part of the notebook.
focal_nodes = pmids
pbar = tqdm(focal_nodes)

for node in pbar:
    pub_year = pmid_year_dict[str(node)]

    citing_nodes_fp = cits_dict[node]
    refs = refs_dict[node]

    # Skip papers without citations or without references. The guard must look at
    # this paper's own citing list; the original tested `cits`, a variable left
    # over from another cell.
    if len(citing_nodes_fp) <= 0 or len(refs) <= 0:
        continue

    # Everything that cites at least one reference of the focus paper ...
    citing_nodes_sp = []
    for s in refs:
        citing_nodes_sp.extend(cits_dict[s])
    # ... restricted to papers published in or after the focus paper's year,
    # excluding the focus paper itself.
    citing_nodes_sp2 = [s2 for s2 in citing_nodes_sp if pmid_year_dict[str(s2)] >= pub_year and s2 != node]

    # Build each set once instead of on every use.
    fp_set = set(citing_nodes_fp)
    sp_set = set(citing_nodes_sp2)
    refs_set = set(refs)

    n_i = len(fp_set - sp_set)
    nodes_j = fp_set & sp_set
    n_j_1 = len(nodes_j)
    n_j = 0
    for j in nodes_j:
        # Count j only if it shares at least 5 references with the focus paper.
        if len(refs_set & set(refs_dict[j])) >= 5:
            n_j += 1
    n_all = len(fp_set | sp_set)
    l_distance = n_j_1 - n_j

    denominator = n_all - l_distance
    if denominator == 0:
        # DI5 is undefined for this paper; skip it explicitly instead of hiding
        # the division error behind a bare except.
        continue

    DI5 = round((n_i - n_j) / denominator, 5)
    DI5_nodes[node] = DI5

pbar.close()

## DI1

In [None]:
# DI1, citation window not controlled
# fp: focus paper
# sp: source paper

import pandas as pd
import time

# DI1 = (n_i - n_j) / n_all for every focal paper, where
#   n_i   : papers citing the focal paper but none of its references
#   n_j   : papers citing the focal paper and at least one of its references
#   n_all : papers citing the focal paper and/or any of its references
DI1_nodes = {}

t_start = time.time()
focal_nodes = pmids
pbar = tqdm(focal_nodes)

print(len(pmids))

for node in pbar:
    citing_nodes_fp = cits_dict[node]
    cited_nodes = refs_dict[node]
    pub_year = pmid_year_dict[str(node)]

    # Skip this iteration when the paper has no references or no citing papers.
    if len(citing_nodes_fp) == 0 or len(cited_nodes) == 0:
        continue

    # Papers citing any reference of the focal paper, kept only when published
    # in or after the focal paper's year and different from the focal paper.
    citing_nodes_sp = [citer for s in cited_nodes for citer in cits_dict[s]]
    citing_nodes_sp_filter = [
        s2
        for s2 in citing_nodes_sp
        if pmid_year_dict[str(s2)] >= pub_year and s2 != node
    ]

    fp_citers = set(citing_nodes_fp)
    sp_citers = set(citing_nodes_sp_filter)

    n_i = len(fp_citers - sp_citers)
    n_j = len(fp_citers & sp_citers)
    n_all = len(fp_citers | sp_citers)
    DI1 = round((n_i - n_j) / n_all, 5)
    DI1_nodes[node] = DI1

pbar.close()

In [None]:
import pandas as pd

# Write the DI1 scores (one row per pmid, pmid as index) to the database.
df = pd.DataFrame.from_dict(
    data=DI1_nodes,
    orient='index',
)
df.to_sql(
    name='pubmed_di1',
    con=engine,
)

## mCD

In [None]:
import pandas as pd
# Read the complete DI1 table back from the database.
sql = 'select * from pubmed_di1'
df_di1 = pd.read_sql(
    sql=sql,
    con=engine,
)

In [None]:
import pymongo
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
import datetime
import time
from tqdm import tqdm

# mCD = DI1 * (number of citing papers), with the citing list read from MongoDB.
# NOTE(review): the server address is hard-coded; consider moving it to configuration.
myclient = pymongo.MongoClient('mongodb://192.168.24.233:27017')   # Mongodb Server
mydb = myclient['PubMed']
mycol = mydb['document']

mCD = {}             # pmid -> mCD
missing_pmids = []   # pmids from df_di1 that have no document in the collection

for i, item in df_di1.iterrows():
    pmid = item['pmid']
    di1 = item['DI1']
    # find_one fetches a single document and returns None when nothing matches,
    # so a missing pmid no longer ends the loop with an IndexError.
    item_find = mycol.find_one({'pmid': pmid}, {'_id': 0, 'pmid': 1, 'cits': 1})
    if item_find is None:
        missing_pmids.append(pmid)
        continue
    cits = item_find['cits']
    mCD[pmid] = round(di1 * len(cits), 6)

In [None]:
import pandas as pd

# Write the mCD scores (one row per pmid, pmid as index) to the database.
df = pd.DataFrame.from_dict(
    data=mCD,
    orient='index',
)
df.to_sql(
    name='pubmed_mCD',
    con=engine,
)