# Figure 6: Distributions of ranks of scientific breakthroughs in the PubMed dataset for the five disruption indicators

### mED(rel)

In [None]:
# Science breakthrough papers

import pandas as pd

test = pd.read_csv(r'data/test_dataset_science.csv', header = 0)    # including pmid, pub_year
pubmed_mED_rel = pd.read_csv(r'data/pubmed_mED(rel).csv', header = 0)    # including pmid, pub_year, mED_rel

# 1-based within-year ranks of the test papers; rank 1 is the largest mED_rel of that year.
ranks_mED_rel_science = []

for y in range(1991, 2015):    # publication years 1991-2014
    # Sort a new frame rather than sorting the filtered slice in place, which
    # triggers pandas' SettingWithCopyWarning; the resulting order is the same.
    pubmed_year = pubmed_mED_rel[pubmed_mED_rel['pub_year'] == y].sort_values(
        'mED_rel', ascending = False, ignore_index = True)

    # With ignore_index=True the row label is the 0-based position in the ranking.
    test_year = test.loc[test['pub_year'] == y, 'pmid']
    hit_papers_df = pubmed_year[pubmed_year['pmid'].isin(test_year)]
    ranks_mED_rel_science.extend((hit_papers_df.index + 1).tolist())

print(len(ranks_mED_rel_science))

In [None]:
# Prize papers

import pandas as pd

test = pd.read_csv(r'data/test_dataset_prize.csv', header = 0)    # columns used: pmid, pub_year
pubmed_mED_rel = pd.read_csv(r'data/pubmed_mED(rel).csv', header = 0)    # columns used: pmid, pub_year, mED_rel

# 1-based within-year ranks of the prize papers; rank 1 is the largest mED_rel of that year.
ranks_mED_rel_prize = []

for y in range(1991, 2015):    # publication years 1991-2014
    # Non-mutating sort: sorting the filtered slice in place raises
    # SettingWithCopyWarning, while this yields the same ordering without it.
    pubmed_year = pubmed_mED_rel[pubmed_mED_rel['pub_year'] == y].sort_values(
        'mED_rel', ascending = False, ignore_index = True)

    # Row labels were reset by ignore_index=True, so label + 1 is the rank.
    test_year = test.loc[test['pub_year'] == y, 'pmid']
    hit_papers_df = pubmed_year[pubmed_year['pmid'].isin(test_year)]
    ranks_mED_rel_prize.extend((hit_papers_df.index + 1).tolist())

print(len(ranks_mED_rel_prize))

In [None]:
# Highly cited papers

import pandas as pd

test = pd.read_csv(r'data/test_dataset_highly_cited.csv', header = 0)    # columns used: pmid, pub_year
pubmed_mED_rel = pd.read_csv(r'data/pubmed_mED(rel).csv', header = 0)    # columns used: pmid, pub_year, mED_rel

# 1-based within-year ranks of the highly cited papers; rank 1 is the largest mED_rel of that year.
ranks_mED_rel_highly_cited = []

for y in range(1991, 2015):    # publication years 1991-2014
    # Non-mutating sort: sorting the filtered slice in place raises
    # SettingWithCopyWarning, while this yields the same ordering without it.
    pubmed_year = pubmed_mED_rel[pubmed_mED_rel['pub_year'] == y].sort_values(
        'mED_rel', ascending = False, ignore_index = True)

    # Row labels were reset by ignore_index=True, so label + 1 is the rank.
    test_year = test.loc[test['pub_year'] == y, 'pmid']
    hit_papers_df = pubmed_year[pubmed_year['pmid'].isin(test_year)]
    ranks_mED_rel_highly_cited.extend((hit_papers_df.index + 1).tolist())

print(len(ranks_mED_rel_highly_cited))

In [None]:
# Faculty papers

import pandas as pd

test = pd.read_csv(r'data/test_dataset_faculty.csv', header = 0)    # columns used: pmid, pub_year
pubmed_mED_rel = pd.read_csv(r'data/pubmed_mED(rel).csv', header = 0)    # columns used: pmid, pub_year, mED_rel

# 1-based within-year ranks of the faculty papers; rank 1 is the largest mED_rel of that year.
ranks_mED_rel_faculty = []

for y in range(1991, 2015):    # publication years 1991-2014
    # Non-mutating sort: sorting the filtered slice in place raises
    # SettingWithCopyWarning, while this yields the same ordering without it.
    pubmed_year = pubmed_mED_rel[pubmed_mED_rel['pub_year'] == y].sort_values(
        'mED_rel', ascending = False, ignore_index = True)

    # Row labels were reset by ignore_index=True, so label + 1 is the rank.
    test_year = test.loc[test['pub_year'] == y, 'pmid']
    hit_papers_df = pubmed_year[pubmed_year['pmid'].isin(test_year)]
    ranks_mED_rel_faculty.extend((hit_papers_df.index + 1).tolist())

print(len(ranks_mED_rel_faculty))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

bins = 100    # defined as in the original cell; the KDE plots below do not use it
fig, ax = plt.subplots(1,4, figsize = (18, 4.5))

data = [ranks_mED_rel_science, ranks_mED_rel_prize, ranks_mED_rel_highly_cited, ranks_mED_rel_faculty]

color_list = ['#8EA3C2', '#A3B38C', '#EDB17F', '#b3b8bc']
titles = ['Science breakthrough papers', 'Prize papers', 'Highly cited papers', 'Faculty Opinions papers']

# Horizontal positions (axes fraction) of the 'higher' / 'lower' annotations, one pair per panel.
annotation_x = [(0.18, 0.6), (0.18, 0.6), (0.18, 0.6), (0.18, 0.6)]
annotation_text = ('higher\ndisruption', 'lower\ndisruption')

for k in range(4):
    panel = ax[k]
    # Filled kernel density estimate of the ranks. `fill` replaces the `shade`
    # keyword, which seaborn has deprecated.
    sns.kdeplot(data[k], ax = panel, fill = True, color = color_list[k], alpha = 0.5)
    # Dashed vertical line at the mean rank of the group.
    panel.axvline(x = np.mean(data[k]), linestyle = '--', color = 'gray')
    for x_pos, label in zip(annotation_x[k], annotation_text):
        panel.text(x = x_pos, y = 0.7, s = label, fontsize = 12, horizontalalignment = 'center', verticalalignment = 'center', transform = panel.transAxes)
    panel.set_xlabel('Rankings', fontsize = 12)
    panel.tick_params(axis = 'x',labelsize = 12, labelrotation = 20)
    panel.tick_params(axis = 'y', labelsize = 12)
    # Thin black frame on the two visible sides; the top and right sides are hidden.
    for side in ('bottom', 'left'):
        panel.spines[side].set_linewidth(0.5)
        panel.spines[side].set_color('black')
    for side in ('top', 'right'):
        panel.spines[side].set_visible(False)
    panel.set_title(titles[k], fontsize = 12, fontweight = 'medium')

plt.tight_layout()
plt.suptitle('(a) mED(rel)', fontsize = 14, ha = 'left',va = 'top', fontweight = 'bold', x = 0.002, y = 0.9999)
plt.show()

### mED(ent)

In [None]:
# Science breakthrough papers

import pandas as pd

test = pd.read_csv(r'data/test_dataset_science.csv', header = 0)    # columns used: pmid, pub_year
pubmed_mED_ent = pd.read_csv(r'data/pubmed_mED(ent).csv', header = 0)    # columns used: pmid, pub_year, mED_ent

# 1-based within-year ranks of the Science breakthrough papers; rank 1 is the largest mED_ent of that year.
ranks_mED_ent_science = []

for y in range(1991, 2015):    # publication years 1991-2014
    # Non-mutating sort: sorting the filtered slice in place raises
    # SettingWithCopyWarning, while this yields the same ordering without it.
    pubmed_year = pubmed_mED_ent[pubmed_mED_ent['pub_year'] == y].sort_values(
        'mED_ent', ascending = False, ignore_index = True)

    # Row labels were reset by ignore_index=True, so label + 1 is the rank.
    test_year = test.loc[test['pub_year'] == y, 'pmid']
    hit_papers_df = pubmed_year[pubmed_year['pmid'].isin(test_year)]
    ranks_mED_ent_science.extend((hit_papers_df.index + 1).tolist())

print(len(ranks_mED_ent_science))

In [None]:
# Prize papers

import pandas as pd

test = pd.read_csv(r'data/test_dataset_prize.csv', header = 0)    # columns used: pmid, pub_year
pubmed_mED_ent = pd.read_csv(r'data/pubmed_mED(ent).csv', header = 0)    # columns used: pmid, pub_year, mED_ent

# 1-based within-year ranks of the prize papers; rank 1 is the largest mED_ent of that year.
ranks_mED_ent_prize = []

for y in range(1991, 2015):    # publication years 1991-2014
    # Non-mutating sort: sorting the filtered slice in place raises
    # SettingWithCopyWarning, while this yields the same ordering without it.
    pubmed_year = pubmed_mED_ent[pubmed_mED_ent['pub_year'] == y].sort_values(
        'mED_ent', ascending = False, ignore_index = True)

    # Row labels were reset by ignore_index=True, so label + 1 is the rank.
    test_year = test.loc[test['pub_year'] == y, 'pmid']
    hit_papers_df = pubmed_year[pubmed_year['pmid'].isin(test_year)]
    ranks_mED_ent_prize.extend((hit_papers_df.index + 1).tolist())

print(len(ranks_mED_ent_prize))

In [None]:
# Highly cited papers

import pandas as pd

test = pd.read_csv(r'data/test_dataset_highly_cited.csv', header = 0)    # columns used: pmid, pub_year
pubmed_mED_ent = pd.read_csv(r'data/pubmed_mED(ent).csv', header = 0)    # columns used: pmid, pub_year, mED_ent

# 1-based within-year ranks of the highly cited papers; rank 1 is the largest mED_ent of that year.
ranks_mED_ent_highly_cited = []

for y in range(1991, 2015):    # publication years 1991-2014
    # Non-mutating sort: sorting the filtered slice in place raises
    # SettingWithCopyWarning, while this yields the same ordering without it.
    pubmed_year = pubmed_mED_ent[pubmed_mED_ent['pub_year'] == y].sort_values(
        'mED_ent', ascending = False, ignore_index = True)

    # Row labels were reset by ignore_index=True, so label + 1 is the rank.
    test_year = test.loc[test['pub_year'] == y, 'pmid']
    hit_papers_df = pubmed_year[pubmed_year['pmid'].isin(test_year)]
    ranks_mED_ent_highly_cited.extend((hit_papers_df.index + 1).tolist())

print(len(ranks_mED_ent_highly_cited))

In [None]:
# Faculty papers

import pandas as pd

test = pd.read_csv(r'data/test_dataset_faculty.csv', header = 0)    # columns used: pmid, pub_year
pubmed_mED_ent = pd.read_csv(r'data/pubmed_mED(ent).csv', header = 0)    # columns used: pmid, pub_year, mED_ent

# 1-based within-year ranks of the faculty papers; rank 1 is the largest mED_ent of that year.
ranks_mED_ent_faculty = []

for y in range(1991, 2015):    # publication years 1991-2014
    # Non-mutating sort: sorting the filtered slice in place raises
    # SettingWithCopyWarning, while this yields the same ordering without it.
    pubmed_year = pubmed_mED_ent[pubmed_mED_ent['pub_year'] == y].sort_values(
        'mED_ent', ascending = False, ignore_index = True)

    # Row labels were reset by ignore_index=True, so label + 1 is the rank.
    test_year = test.loc[test['pub_year'] == y, 'pmid']
    hit_papers_df = pubmed_year[pubmed_year['pmid'].isin(test_year)]
    ranks_mED_ent_faculty.extend((hit_papers_df.index + 1).tolist())

print(len(ranks_mED_ent_faculty))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

bins = 100    # defined as in the original cell; the KDE plots below do not use it
fig, ax = plt.subplots(1,4, figsize = (18, 4.5))

data = [ranks_mED_ent_science, ranks_mED_ent_prize, ranks_mED_ent_highly_cited, ranks_mED_ent_faculty]

color_list = ['#8EA3C2', '#A3B38C', '#EDB17F', '#b3b8bc']
titles = ['Science breakthrough papers', 'Prize papers', 'Highly cited papers', 'Faculty Opinions papers']

# Horizontal positions (axes fraction) of the 'higher' / 'lower' annotations, one pair per panel.
# The last panel uses its own positions.
annotation_x = [(0.3, 0.8), (0.3, 0.8), (0.3, 0.8), (0.4, 0.9)]
annotation_text = ('higher\ndisruption', 'lower\ndisruption')

for k in range(4):
    panel = ax[k]
    # Filled kernel density estimate of the ranks. `fill` replaces the `shade`
    # keyword, which seaborn has deprecated.
    sns.kdeplot(data[k], ax = panel, fill = True, color = color_list[k], alpha = 0.5)
    # Dashed vertical line at the mean rank of the group.
    panel.axvline(x = np.mean(data[k]), linestyle = '--', color = 'gray')
    for x_pos, label in zip(annotation_x[k], annotation_text):
        panel.text(x = x_pos, y = 0.7, s = label, fontsize = 12, horizontalalignment = 'center', verticalalignment = 'center', transform = panel.transAxes)
    panel.set_xlabel('Rankings', fontsize = 12)
    panel.tick_params(axis = 'x',labelsize = 12, labelrotation = 20)
    panel.tick_params(axis = 'y', labelsize = 12)
    # Thin black frame on the two visible sides; the top and right sides are hidden.
    for side in ('bottom', 'left'):
        panel.spines[side].set_linewidth(0.5)
        panel.spines[side].set_color('black')
    for side in ('top', 'right'):
        panel.spines[side].set_visible(False)
    panel.set_title(titles[k], fontsize = 12, fontweight = 'medium')

plt.tight_layout()
plt.suptitle('(b) mED(ent)', fontsize = 14, ha = 'left',va = 'top', fontweight = 'bold', x = 0.002, y = 0.9999)
plt.show()

### mCD

In [None]:
# Science breakthrough papers

import pandas as pd

test = pd.read_csv(r'data/test_dataset_science.csv', header = 0)    # columns used: pmid, pub_year
pubmed_mCD = pd.read_csv(r'data/pubmed_mCD.csv', header = 0)    # columns used: pmid, pub_year, mCD

# 1-based within-year ranks of the Science breakthrough papers; rank 1 is the largest mCD of that year.
ranks_mCD_science = []

for y in range(1991, 2015):    # publication years 1991-2014
    # Non-mutating sort: sorting the filtered slice in place raises
    # SettingWithCopyWarning, while this yields the same ordering without it.
    pubmed_year = pubmed_mCD[pubmed_mCD['pub_year'] == y].sort_values(
        'mCD', ascending = False, ignore_index = True)

    # Row labels were reset by ignore_index=True, so label + 1 is the rank.
    test_year = test.loc[test['pub_year'] == y, 'pmid']
    hit_papers_df = pubmed_year[pubmed_year['pmid'].isin(test_year)]
    ranks_mCD_science.extend((hit_papers_df.index + 1).tolist())

print(len(ranks_mCD_science))

In [None]:
# Prize papers

import pandas as pd

test = pd.read_csv(r'data/test_dataset_prize.csv', header = 0)    # columns used: pmid, pub_year
pubmed_mCD = pd.read_csv(r'data/pubmed_mCD.csv', header = 0)    # columns used: pmid, pub_year, mCD

# 1-based within-year ranks of the prize papers; rank 1 is the largest mCD of that year.
ranks_mCD_prize = []

for y in range(1991, 2015):    # publication years 1991-2014
    # Non-mutating sort: sorting the filtered slice in place raises
    # SettingWithCopyWarning, while this yields the same ordering without it.
    pubmed_year = pubmed_mCD[pubmed_mCD['pub_year'] == y].sort_values(
        'mCD', ascending = False, ignore_index = True)

    # Row labels were reset by ignore_index=True, so label + 1 is the rank.
    test_year = test.loc[test['pub_year'] == y, 'pmid']
    hit_papers_df = pubmed_year[pubmed_year['pmid'].isin(test_year)]
    ranks_mCD_prize.extend((hit_papers_df.index + 1).tolist())

print(len(ranks_mCD_prize))

In [None]:
# Highly cited papers

import pandas as pd

test = pd.read_csv(r'data/test_dataset_highly_cited.csv', header = 0)    # columns used: pmid, pub_year
pubmed_mCD = pd.read_csv(r'data/pubmed_mCD.csv', header = 0)    # columns used: pmid, pub_year, mCD

# 1-based within-year ranks of the highly cited papers; rank 1 is the largest mCD of that year.
ranks_mCD_highly_cited = []

for y in range(1991, 2015):    # publication years 1991-2014
    # Non-mutating sort: sorting the filtered slice in place raises
    # SettingWithCopyWarning, while this yields the same ordering without it.
    pubmed_year = pubmed_mCD[pubmed_mCD['pub_year'] == y].sort_values(
        'mCD', ascending = False, ignore_index = True)

    # Row labels were reset by ignore_index=True, so label + 1 is the rank.
    test_year = test.loc[test['pub_year'] == y, 'pmid']
    hit_papers_df = pubmed_year[pubmed_year['pmid'].isin(test_year)]
    ranks_mCD_highly_cited.extend((hit_papers_df.index + 1).tolist())

print(len(ranks_mCD_highly_cited))

In [None]:
# Faculty papers

import pandas as pd

test = pd.read_csv(r'data/test_dataset_faculty.csv', header = 0)    # columns used: pmid, pub_year
pubmed_mCD = pd.read_csv(r'data/pubmed_mCD.csv', header = 0)    # columns used: pmid, pub_year, mCD

# 1-based within-year ranks of the faculty papers; rank 1 is the largest mCD of that year.
ranks_mCD_faculty = []

for y in range(1991, 2015):    # publication years 1991-2014
    # Non-mutating sort: sorting the filtered slice in place raises
    # SettingWithCopyWarning, while this yields the same ordering without it.
    pubmed_year = pubmed_mCD[pubmed_mCD['pub_year'] == y].sort_values(
        'mCD', ascending = False, ignore_index = True)

    # Row labels were reset by ignore_index=True, so label + 1 is the rank.
    test_year = test.loc[test['pub_year'] == y, 'pmid']
    hit_papers_df = pubmed_year[pubmed_year['pmid'].isin(test_year)]
    ranks_mCD_faculty.extend((hit_papers_df.index + 1).tolist())

print(len(ranks_mCD_faculty))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

bins = 100    # defined as in the original cell; the KDE plots below do not use it
fig, ax = plt.subplots(1,4, figsize = (18, 4.5))

data = [ranks_mCD_science, ranks_mCD_prize, ranks_mCD_highly_cited, ranks_mCD_faculty]

color_list = ['#8EA3C2', '#A3B38C', '#EDB17F', '#b3b8bc']
titles = ['Science breakthrough papers', 'Prize papers', 'Highly cited papers', 'Faculty Opinions papers']

# Horizontal positions (axes fraction) of the 'higher' / 'lower' annotations, one pair per panel.
# The last panel uses its own positions.
annotation_x = [(0.25, 0.7), (0.25, 0.7), (0.25, 0.7), (0.3, 0.85)]
annotation_text = ('higher\ndisruption', 'lower\ndisruption')

for k in range(4):
    panel = ax[k]
    # Filled kernel density estimate of the ranks. `fill` replaces the `shade`
    # keyword, which seaborn has deprecated.
    sns.kdeplot(data[k], ax = panel, fill = True, color = color_list[k], alpha = 0.5)
    # Dashed vertical line at the mean rank of the group.
    panel.axvline(x = np.mean(data[k]), linestyle = '--', color = 'gray')
    for x_pos, label in zip(annotation_x[k], annotation_text):
        panel.text(x = x_pos, y = 0.7, s = label, fontsize = 12, horizontalalignment = 'center', verticalalignment = 'center', transform = panel.transAxes)
    panel.set_xlabel('Rankings', fontsize = 12)
    panel.tick_params(axis = 'x',labelsize = 12, labelrotation = 20)
    panel.tick_params(axis = 'y', labelsize = 12)
    # Thin black frame on the two visible sides; the top and right sides are hidden.
    for side in ('bottom', 'left'):
        panel.spines[side].set_linewidth(0.5)
        panel.spines[side].set_color('black')
    for side in ('top', 'right'):
        panel.spines[side].set_visible(False)
    panel.set_title(titles[k], fontsize = 12, fontweight = 'medium')

plt.tight_layout()
plt.suptitle('(c) mCD', fontsize = 14, ha = 'left',va = 'top', fontweight = 'bold', x = 0.002, y = 0.9999)
plt.show()

### DI5 

In [None]:
# Science breakthrough papers

import pandas as pd

test = pd.read_csv(r'data/test_dataset_science.csv', header = 0)    # columns used: pmid, pub_year
pubmed_di5 = pd.read_csv(r'data/pubmed_di5.csv', header = 0)    # columns used: pmid, pub_year, DI5

# 1-based within-year ranks of the Science breakthrough papers; rank 1 is the largest DI5 of that year.
ranks_di5_science = []

for y in range(1991, 2015):    # publication years 1991-2014
    # Non-mutating sort: sorting the filtered slice in place raises
    # SettingWithCopyWarning, while this yields the same ordering without it.
    pubmed_year = pubmed_di5[pubmed_di5['pub_year'] == y].sort_values(
        'DI5', ascending = False, ignore_index = True)

    # Row labels were reset by ignore_index=True, so label + 1 is the rank.
    test_year = test.loc[test['pub_year'] == y, 'pmid']
    hit_papers_df = pubmed_year[pubmed_year['pmid'].isin(test_year)]
    ranks_di5_science.extend((hit_papers_df.index + 1).tolist())

print(len(ranks_di5_science))

In [None]:
# Prize papers

import pandas as pd

test = pd.read_csv(r'data/test_dataset_prize.csv', header = 0)    # columns used: pmid, pub_year
pubmed_di5 = pd.read_csv(r'data/pubmed_di5.csv', header = 0)    # columns used: pmid, pub_year, DI5

# 1-based within-year ranks of the prize papers; rank 1 is the largest DI5 of that year.
ranks_di5_prize = []

for y in range(1991, 2015):    # publication years 1991-2014
    # Non-mutating sort: sorting the filtered slice in place raises
    # SettingWithCopyWarning, while this yields the same ordering without it.
    pubmed_year = pubmed_di5[pubmed_di5['pub_year'] == y].sort_values(
        'DI5', ascending = False, ignore_index = True)

    # Row labels were reset by ignore_index=True, so label + 1 is the rank.
    test_year = test.loc[test['pub_year'] == y, 'pmid']
    hit_papers_df = pubmed_year[pubmed_year['pmid'].isin(test_year)]
    ranks_di5_prize.extend((hit_papers_df.index + 1).tolist())

print(len(ranks_di5_prize))

In [None]:
# Highly cited papers

import pandas as pd

test = pd.read_csv(r'data/test_dataset_highly_cited.csv', header = 0)    # columns used: pmid, pub_year
pubmed_di5 = pd.read_csv(r'data/pubmed_di5.csv', header = 0)    # columns used: pmid, pub_year, DI5

# 1-based within-year ranks of the highly cited papers; rank 1 is the largest DI5 of that year.
ranks_di5_highly_cited = []

for y in range(1991, 2015):    # publication years 1991-2014
    # Non-mutating sort: sorting the filtered slice in place raises
    # SettingWithCopyWarning, while this yields the same ordering without it.
    pubmed_year = pubmed_di5[pubmed_di5['pub_year'] == y].sort_values(
        'DI5', ascending = False, ignore_index = True)

    # Row labels were reset by ignore_index=True, so label + 1 is the rank.
    test_year = test.loc[test['pub_year'] == y, 'pmid']
    hit_papers_df = pubmed_year[pubmed_year['pmid'].isin(test_year)]
    ranks_di5_highly_cited.extend((hit_papers_df.index + 1).tolist())

print(len(ranks_di5_highly_cited))

In [None]:
# Faculty papers

import pandas as pd

test = pd.read_csv(r'data/test_dataset_faculty.csv', header = 0)    # columns used: pmid, pub_year
pubmed_di5 = pd.read_csv(r'data/pubmed_di5.csv', header = 0)    # columns used: pmid, pub_year, DI5

# 1-based within-year ranks of the faculty papers; rank 1 is the largest DI5 of that year.
ranks_di5_faculty = []

for y in range(1991, 2015):    # publication years 1991-2014
    # Non-mutating sort: sorting the filtered slice in place raises
    # SettingWithCopyWarning, while this yields the same ordering without it.
    pubmed_year = pubmed_di5[pubmed_di5['pub_year'] == y].sort_values(
        'DI5', ascending = False, ignore_index = True)

    # Row labels were reset by ignore_index=True, so label + 1 is the rank.
    test_year = test.loc[test['pub_year'] == y, 'pmid']
    hit_papers_df = pubmed_year[pubmed_year['pmid'].isin(test_year)]
    ranks_di5_faculty.extend((hit_papers_df.index + 1).tolist())

print(len(ranks_di5_faculty))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

bins = 100    # defined as in the original cell; the KDE plots below do not use it
fig, ax = plt.subplots(1,4, figsize = (18, 4.5))

data = [ranks_di5_science, ranks_di5_prize, ranks_di5_highly_cited, ranks_di5_faculty]

color_list = ['#8EA3C2', '#A3B38C', '#EDB17F', '#b3b8bc']
titles = ['Science breakthrough papers', 'Prize papers', 'Highly cited papers', 'Faculty Opinions papers']

# Horizontal positions (axes fraction) of the 'higher' / 'lower' annotations, one pair per panel.
# The third panel (highly cited papers) uses its own 'higher' position.
annotation_x = [(0.2, 0.65), (0.2, 0.65), (0.12, 0.65), (0.2, 0.65)]
annotation_text = ('higher\ndisruption', 'lower\ndisruption')

for k in range(4):
    panel = ax[k]
    # Filled kernel density estimate of the ranks. `fill` replaces the `shade`
    # keyword, which seaborn has deprecated.
    sns.kdeplot(data[k], ax = panel, fill = True, color = color_list[k], alpha = 0.5)
    # Dashed vertical line at the mean rank of the group.
    panel.axvline(x = np.mean(data[k]), linestyle = '--', color = 'gray')
    for x_pos, label in zip(annotation_x[k], annotation_text):
        panel.text(x = x_pos, y = 0.7, s = label, fontsize = 12, horizontalalignment = 'center', verticalalignment = 'center', transform = panel.transAxes)
    panel.set_xlabel('Rankings', fontsize = 12)
    panel.tick_params(axis = 'x',labelsize = 12, labelrotation = 20)
    panel.tick_params(axis = 'y', labelsize = 12)
    # Thin black frame on the two visible sides; the top and right sides are hidden.
    for side in ('bottom', 'left'):
        panel.spines[side].set_linewidth(0.5)
        panel.spines[side].set_color('black')
    for side in ('top', 'right'):
        panel.spines[side].set_visible(False)
    panel.set_title(titles[k], fontsize = 12, fontweight = 'medium')

plt.tight_layout()
plt.suptitle('(d) DI5', fontsize = 14, ha = 'left',va = 'top', fontweight = 'bold', x = 0.002, y = 0.9999)
plt.show()

### DI1

In [None]:
# Science breakthrough papers

import pandas as pd

test = pd.read_csv(r'data/test_dataset_science.csv', header = 0)    # columns used: pmid, pub_year
pubmed_di1 = pd.read_csv(r'data/pubmed_di1.csv', header = 0)    # columns used: pmid, pub_year, DI1

# 1-based within-year ranks of the Science breakthrough papers; rank 1 is the largest DI1 of that year.
ranks_di1_science = []

for y in range(1991, 2015):    # publication years 1991-2014
    # Non-mutating sort: sorting the filtered slice in place raises
    # SettingWithCopyWarning, while this yields the same ordering without it.
    pubmed_year = pubmed_di1[pubmed_di1['pub_year'] == y].sort_values(
        'DI1', ascending = False, ignore_index = True)

    # Row labels were reset by ignore_index=True, so label + 1 is the rank.
    test_year = test.loc[test['pub_year'] == y, 'pmid']
    hit_papers_df = pubmed_year[pubmed_year['pmid'].isin(test_year)]
    ranks_di1_science.extend((hit_papers_df.index + 1).tolist())

print(len(ranks_di1_science))

In [None]:
# Prize papers

import pandas as pd

test = pd.read_csv(r'data/test_dataset_prize.csv', header = 0)    # columns used: pmid, pub_year
pubmed_di1 = pd.read_csv(r'data/pubmed_di1.csv', header = 0)    # columns used: pmid, pub_year, DI1

# 1-based within-year ranks of the prize papers; rank 1 is the largest DI1 of that year.
ranks_di1_prize = []

for y in range(1991, 2015):    # publication years 1991-2014
    # Non-mutating sort: sorting the filtered slice in place raises
    # SettingWithCopyWarning, while this yields the same ordering without it.
    pubmed_year = pubmed_di1[pubmed_di1['pub_year'] == y].sort_values(
        'DI1', ascending = False, ignore_index = True)

    # Row labels were reset by ignore_index=True, so label + 1 is the rank.
    test_year = test.loc[test['pub_year'] == y, 'pmid']
    hit_papers_df = pubmed_year[pubmed_year['pmid'].isin(test_year)]
    ranks_di1_prize.extend((hit_papers_df.index + 1).tolist())

print(len(ranks_di1_prize))

In [None]:
# Highly cited papers

import pandas as pd

test = pd.read_csv(r'data/test_dataset_highly_cited.csv', header = 0)    # columns used: pmid, pub_year
pubmed_di1 = pd.read_csv(r'data/pubmed_di1.csv', header = 0)    # columns used: pmid, pub_year, DI1

# 1-based within-year ranks of the highly cited papers; rank 1 is the largest DI1 of that year.
ranks_di1_highly_cited = []

for y in range(1991, 2015):    # publication years 1991-2014
    # Non-mutating sort: sorting the filtered slice in place raises
    # SettingWithCopyWarning, while this yields the same ordering without it.
    pubmed_year = pubmed_di1[pubmed_di1['pub_year'] == y].sort_values(
        'DI1', ascending = False, ignore_index = True)

    # Row labels were reset by ignore_index=True, so label + 1 is the rank.
    test_year = test.loc[test['pub_year'] == y, 'pmid']
    hit_papers_df = pubmed_year[pubmed_year['pmid'].isin(test_year)]
    ranks_di1_highly_cited.extend((hit_papers_df.index + 1).tolist())

print(len(ranks_di1_highly_cited))

In [None]:
# Faculty papers

import pandas as pd

test = pd.read_csv(r'data/test_dataset_faculty.csv', header = 0)    # columns used: pmid, pub_year
pubmed_di1 = pd.read_csv(r'data/pubmed_di1.csv', header = 0)    # columns used: pmid, pub_year, DI1

# 1-based within-year ranks of the faculty papers; rank 1 is the largest DI1 of that year.
ranks_di1_faculty = []

for y in range(1991, 2015):    # publication years 1991-2014
    # Non-mutating sort: sorting the filtered slice in place raises
    # SettingWithCopyWarning, while this yields the same ordering without it.
    pubmed_year = pubmed_di1[pubmed_di1['pub_year'] == y].sort_values(
        'DI1', ascending = False, ignore_index = True)

    # Row labels were reset by ignore_index=True, so label + 1 is the rank.
    test_year = test.loc[test['pub_year'] == y, 'pmid']
    hit_papers_df = pubmed_year[pubmed_year['pmid'].isin(test_year)]
    ranks_di1_faculty.extend((hit_papers_df.index + 1).tolist())

print(len(ranks_di1_faculty))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

bins = 100    # defined as in the original cell; the KDE plots below do not use it
fig, ax = plt.subplots(1,4, figsize = (18, 4.5))

data = [ranks_di1_science, ranks_di1_prize, ranks_di1_highly_cited, ranks_di1_faculty]

color_list = ['#8EA3C2', '#A3B38C', '#EDB17F', '#b3b8bc']
titles = ['Science breakthrough papers', 'Prize papers', 'Highly cited papers', 'Faculty Opinions papers']

# Horizontal positions (axes fraction) of the 'higher' / 'lower' annotations, one pair per panel.
annotation_x = [(0.25, 0.7), (0.25, 0.7), (0.25, 0.7), (0.25, 0.7)]
annotation_text = ('higher\ndisruption', 'lower\ndisruption')

for k in range(4):
    panel = ax[k]
    # Filled kernel density estimate of the ranks. `fill` replaces the `shade`
    # keyword, which seaborn has deprecated.
    sns.kdeplot(data[k], ax = panel, fill = True, color = color_list[k], alpha = 0.5)
    # Dashed vertical line at the mean rank of the group.
    panel.axvline(x = np.mean(data[k]), linestyle = '--', color = 'gray')
    for x_pos, label in zip(annotation_x[k], annotation_text):
        panel.text(x = x_pos, y = 0.7, s = label, fontsize = 12, horizontalalignment = 'center', verticalalignment = 'center', transform = panel.transAxes)
    panel.set_xlabel('Rankings', fontsize = 12)
    panel.tick_params(axis = 'x',labelsize = 12, labelrotation = 20)
    panel.tick_params(axis = 'y', labelsize = 12)
    # Thin black frame on the two visible sides; the top and right sides are hidden.
    for side in ('bottom', 'left'):
        panel.spines[side].set_linewidth(0.5)
        panel.spines[side].set_color('black')
    for side in ('top', 'right'):
        panel.spines[side].set_visible(False)
    panel.set_title(titles[k], fontsize = 12, fontweight = 'medium')

plt.tight_layout()
plt.suptitle('(e) DI1', fontsize = 14, ha = 'left',va = 'top', fontweight = 'bold', x = 0.002, y = 0.9999)
plt.show()

# Figure 7: The number of scientific breakthroughs ranked among the annual top 0.1% disruptive papers in PubMed over the five disruption indicators

In [None]:
import pandas as pd
import numpy as np

test_dataset_list = ['test_dataset_science', 'test_dataset_prize','test_dataset_highly_cited', 'test_dataset_faculty']

pubmed_dataset_list = ['pubmed_mED(rel)', 'pubmed_mED(ent)', 'pubmed_mCD', 'pubmed_di5', 'pubmed_di1']
indicators = ['mED_rel', 'mED_ent', 'mCD', 'DI5', 'DI1']

years = range(1991, 2015)    # publication years 1991-2014

# The four test sets are read once up front and reused for every indicator.
test_frames = [pd.read_csv(r'data/%s.csv'%test_dataset, header = 0) for test_dataset in test_dataset_list]

# hits_list[i][j][k]: number of papers of test set i that are among the annual
# top 0.1% disruptive PubMed papers of year 1991 + k according to indicator j.
hits_list = [[None] * len(pubmed_dataset_list) for _ in test_dataset_list]

for j, (dataset, indicator) in enumerate(zip(pubmed_dataset_list, indicators)):
    # Each PubMed table is read a single time (the indicator loop is the outer
    # loop), instead of once per test set, and only one table is held at a time.
    pubmed = pd.read_csv(r'data/%s.csv'%dataset, header = 0)

    # pmids at or above the 99.9th percentile of the indicator, per year.
    # No sorting is needed: the percentile and the threshold filter do not
    # depend on row order.
    top_pmids_by_year = {}
    for y in years:
        pubmed_year = pubmed[pubmed['pub_year'] == y]
        dvalue_year_top = np.percentile(pubmed_year[indicator].values, 99.9)
        top_pmids_by_year[y] = set(pubmed_year.loc[pubmed_year[indicator] >= dvalue_year_top, 'pmid'])

    for i, test in enumerate(test_frames):
        hits_list[i][j] = [
            len(top_pmids_by_year[y] & set(test.loc[test['pub_year'] == y, 'pmid']))
            for y in years
        ]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl

x = np.arange(4)
labels = ['mED(rel)', 'mED(ent)', 'mCD', 'DI5', 'DI1']

# One dict per test dataset: year offset (0-23) -> list of the hit counts of
# that year under the five indicators, taken in indicator order 0..4.
hit_year_dict_list = [
    {
        year_idx: [hits_list[dataset_idx][indicator_idx][year_idx] for indicator_idx in range(5)]
        for year_idx in range(24)
    }
    for dataset_idx in range(len(x))    # four test datasets
]

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
from matplotlib.colors import ListedColormap
 
fig, ax = plt.subplots(1,4,figsize = (18,4.5))
fig.subplots_adjust(top=0.8)

cmap = sns.diverging_palette(220, 20, as_cmap=True)

x = np.arange(5)     # x-axis scale: five indicators
title_list = ['Science breakthrough papers', 'Prize papers', 'Highly cited papers', 'Faculty Opinions papers']
labels = ['mED(rel)', 'mED(ent)', 'mCD', 'DI5', 'DI1']
width = 0.4

# One colour per year offset (24 in total), sampled evenly from the diverging palette.
colors = cmap(np.linspace(0,1,24))

for a, panel in enumerate(ax):
    # Stack the yearly hit counts on top of each other, one bar per indicator.
    bottom = np.zeros(5, dtype = int)
    for y in range(24):
        heights = hit_year_dict_list[a][y]
        panel.bar(x, heights, width = width, bottom = bottom, color = colors[y], alpha = 0.8)
        bottom = bottom + np.array(heights)

    panel.set_xticks(x)
    panel.set_xticklabels(labels, rotation = 30)
    panel.set_title(str(title_list[a]), fontsize = 14)
    panel.yaxis.grid(linewidth=0.3,color='gray')
    panel.set_axisbelow(True)
    panel.tick_params(labelsize = 14)
    # Hide the top and right frame lines and thin out the two remaining ones.
    for side in ('top', 'right'):
        panel.spines[side].set_visible(False)
    for side in ('bottom', 'left'):
        panel.spines[side].set_linewidth(0.5)

ax[0].set_ylabel('Number of hit breakthrough papers', fontsize = 14)

plt.tight_layout()
plt.show()

# Table 3: The AUC scores of the five disruption indicators

## AUC scores for control groups

In [None]:
# Science breakthrough papers
import numpy as np
import pickle
import pandas as pd

with open(r'data/control_groups/control_pair_dict_year_authors_and_cits_1to5_science.dat', 'rb') as handle:
    # NOTE: pickle.load can execute arbitrary code stored in the file; only load trusted project data.
    control_dict = pickle.load(handle)    # control_dict: key: pmids for Science breakthrough papers, values: list of control papers corresponding to each Science breakthrough paper
# print(len(control_dict.keys()))

# To evaluate another indicator, change the path below to "data/pubmed_mED(ent).csv",
# "data/pubmed_mCD.csv", "data/pubmed_di5.csv" or "data/pubmed_di1.csv" and set
# indicator_col to the matching column ('mED_ent', 'mCD', 'DI5' or 'DI1').
indicator_col = 'mED_rel'
df_pubmed = pd.read_csv(r'data/pubmed_mED(rel).csv', header = 0)
df_test = df_pubmed[df_pubmed.pmid.isin(list(control_dict.keys()))]

# Restrict the lookups to the test and control papers so that the per-paper
# filters below scan a small frame instead of the whole PubMed table.
needed_pmids = set(control_dict.keys())
for control_pmids in control_dict.values():
    needed_pmids.update(control_pmids)
df_scores = df_pubmed[df_pubmed.pmid.isin(needed_pmids)]

n1 = 0         # pairs in which the test paper scores strictly higher than the control
n2 = 0         # pairs in which both papers have the same score
n_pairs = 0    # pairs actually compared

for test_pmid in df_test['pmid']:
    control_pmid = control_dict[test_pmid]
    D_test_i = df_scores.loc[df_scores['pmid'] == test_pmid, indicator_col].values[0]
    D_control = df_scores.loc[df_scores.pmid.isin(control_pmid), indicator_col].values
    n1 += int((D_test_i > D_control).sum())
    n2 += int((D_test_i == D_control).sum())
    n_pairs += len(D_control)

# The denominator is the number of compared pairs rather than a fixed
# len(df_test) * 5, so control papers missing from the PubMed table do not
# deflate the score; both agree when every test paper has five controls present.
AUC_science = round((n1 + 0.5*n2) / n_pairs, 8) if n_pairs else float('nan')
print(n_pairs)
print(n1)
print(n2)
print(AUC_science)

In [None]:
#  Prize papers
import numpy as np
import pickle
import pandas as pd

with open(r'data/control_groups/control_pair_dict_year_authors_and_cits_1to5_prize.dat', 'rb') as handle:
    # NOTE: pickle.load can execute arbitrary code stored in the file; only load trusted project data.
    control_dict2 = pickle.load(handle)    # key: pmid of a prize paper; value: pmids of its control papers
print(len(control_dict2.keys()))

# Column of df_pubmed holding the indicator; keep it in sync with the file read below.
indicator_col = 'mED_rel'
df_pubmed = pd.read_csv(r'data/pubmed_mED(rel).csv', header = 0)
df_test = df_pubmed[df_pubmed.pmid.isin(list(control_dict2.keys()))]

# Restrict the lookups to the test and control papers so that the per-paper
# filters below scan a small frame instead of the whole PubMed table.
needed_pmids = set(control_dict2.keys())
for control_pmids in control_dict2.values():
    needed_pmids.update(control_pmids)
df_scores = df_pubmed[df_pubmed.pmid.isin(needed_pmids)]

n1 = 0         # pairs in which the test paper scores strictly higher than the control
n2 = 0         # pairs in which both papers have the same score
n_pairs = 0    # pairs actually compared

for test_pmid in df_test['pmid']:
    control_pmid = control_dict2[test_pmid]
    D_test_i = df_scores.loc[df_scores['pmid'] == test_pmid, indicator_col].values[0]
    D_control = df_scores.loc[df_scores.pmid.isin(control_pmid), indicator_col].values
    n1 += int((D_test_i > D_control).sum())
    n2 += int((D_test_i == D_control).sum())
    n_pairs += len(D_control)

# The denominator is the number of compared pairs rather than a fixed
# len(df_test) * 5, so control papers missing from the PubMed table do not
# deflate the score; both agree when every test paper has five controls present.
AUC_prize = round((n1 + 0.5*n2) / n_pairs, 8) if n_pairs else float('nan')
print(n_pairs)
print(n1)
print(n2)
print(AUC_prize)

In [None]:
#  Highly cited papers
import numpy as np
import pickle
import pandas as pd

# NOTE(review): this file name lacks the `cits_` part present in the other three
# control-group file names — confirm it matches the file on disk.
with open(r'data/control_groups/control_pair_dict_year_authors_and_1to5_highly_cited.dat', 'rb') as handle:
    # NOTE: pickle.load can execute arbitrary code stored in the file; only load trusted project data.
    control_dict3 = pickle.load(handle)    # key: pmid of a highly cited paper; value: pmids of its control papers
print(len(control_dict3.keys()))

# Column of df_pubmed holding the indicator; keep it in sync with the file read below.
indicator_col = 'mED_rel'
df_pubmed = pd.read_csv(r'data/pubmed_mED(rel).csv', header = 0)
df_test = df_pubmed[df_pubmed.pmid.isin(list(control_dict3.keys()))]

# Restrict the lookups to the test and control papers so that the per-paper
# filters below scan a small frame instead of the whole PubMed table.
needed_pmids = set(control_dict3.keys())
for control_pmids in control_dict3.values():
    needed_pmids.update(control_pmids)
df_scores = df_pubmed[df_pubmed.pmid.isin(needed_pmids)]

n1 = 0         # pairs in which the test paper scores strictly higher than the control
n2 = 0         # pairs in which both papers have the same score
n_pairs = 0    # pairs actually compared

for test_pmid in df_test['pmid']:
    control_pmid = control_dict3[test_pmid]
    D_test_i = df_scores.loc[df_scores['pmid'] == test_pmid, indicator_col].values[0]
    D_control = df_scores.loc[df_scores.pmid.isin(control_pmid), indicator_col].values
    n1 += int((D_test_i > D_control).sum())
    n2 += int((D_test_i == D_control).sum())
    n_pairs += len(D_control)

# The denominator is the number of compared pairs rather than a fixed
# len(df_test) * 5, so control papers missing from the PubMed table do not
# deflate the score; both agree when every test paper has five controls present.
AUC_highly_cited = round((n1 + 0.5*n2) / n_pairs, 8) if n_pairs else float('nan')
print(n_pairs)
print(n1)
print(n2)
print(AUC_highly_cited)

In [None]:
#  Faculty papers
import numpy as np
import pickle
import pandas as pd

# AUC of mED(rel) for the faculty papers against their matched control papers.
# The dict is used below as: test pmid -> collection of control pmids.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open(r'data/control_groups/control_pair_dict_year_authors_and_cits_1to5_faculty.dat', 'rb') as handle:
    control_dict4 = pickle.load(handle)
print(len(control_dict4.keys()))

df_pubmed = pd.read_csv(r'data/pubmed_mED(rel).csv', header = 0)
df_test = df_pubmed[df_pubmed.pmid.isin(list(control_dict4.keys()))]

n1 = 0         # comparisons in which the test paper scores strictly higher than the control
n2 = 0         # comparisons in which test and control scores are equal
n_pairs = 0    # comparisons actually performed

for test_pmid in df_test['pmid']:
    # df_test holds every df_pubmed row of this pmid in the original order, so the
    # first match here is the same row a scan of the full table would return.
    D_test_i = df_test.loc[df_test['pmid'] == test_pmid, 'mED_rel'].iloc[0]
    control_scores = df_pubmed.loc[df_pubmed['pmid'].isin(control_dict4[test_pmid]), 'mED_rel'].to_numpy()
    n_pairs += len(control_scores)
    # Comparisons involving NaN are False, so such pairs count as neither a win nor a tie.
    n1 += int((D_test_i > control_scores).sum())
    n2 += int((D_test_i == control_scores).sum())

# Normalise by the number of comparisons that were really made instead of assuming
# exactly five controls per test paper: controls missing from df_pubmed would
# otherwise inflate the denominator. An empty comparison set yields NaN.
AUC_faculty = round((n1 + 0.5*n2) / n_pairs, 8) if n_pairs else float('nan')
print(n_pairs)
print(n1)
print(n2)
print(AUC_faculty)

## AUC scores for random groups

In [None]:
import pickle

def _load_pickle(path):
    """Return the object unpickled from the binary file at `path`."""
    # NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
    with open(path, 'rb') as f:
        return pickle.load(f)


# Randomly drawn control groups, one pickled object per test dataset.
science_control_pmids_list = _load_pickle(r'data/control_groups/science_control_pmids_list_random.pkl')
prize_control_pmids_list = _load_pickle(r'data/control_groups/prize_control_pmids_list_random.pkl')
highly_cited_control_pmids_list = _load_pickle(r'data/control_groups/highly_cited_control_pmids_list_random.pkl')
faculty_control_pmids_list = _load_pickle(r'data/control_groups/faculty_control_pmids_list_random.pkl')

In [None]:
#  Science breakthrough papers
import numpy as np
import pickle
import pandas as pd

# AUC of mED(rel) for the Science breakthrough papers against random control papers.
# Other indicators can be evaluated by pointing the path at e.g. "data/pubmed_mCD.csv",
# "data/pubmed_di5.csv" or "data/pubmed_di1.csv".
# NOTE(review): the 'mED_rel' column name used below presumably has to change with the file -- confirm.
df_pubmed = pd.read_csv(r'data/pubmed_mED(rel).csv', header = 0)

# Each entry of the list is indexed as item[0] (test pmid) and item[1] (control pmid).
# Build one pmid -> score lookup restricted to the pmids that are needed, instead of
# scanning the whole table twice per pair. Keeping the first row per pmid matches
# taking element [0] of the filtered frame.
needed_pmids = {pmid for item in science_control_pmids_list for pmid in (item[0], item[1])}
first_rows = df_pubmed[df_pubmed['pmid'].isin(list(needed_pmids))].drop_duplicates(subset = 'pmid')
score_by_pmid = dict(zip(first_rows['pmid'].tolist(), first_rows['mED_rel'].tolist()))

n1 = 0    # pairs in which the test paper scores strictly higher than the control
n2 = 0    # pairs in which both scores are equal

for item in science_control_pmids_list:
    # A pmid that is absent from the table raises KeyError here.
    D_test = score_by_pmid[item[0]]
    D_control = score_by_pmid[item[1]]
    if D_test > D_control:
        n1 += 1
    elif D_test == D_control:
        n2 += 1

# Normalise by the real number of pairs rather than a hard-coded 10000; an empty list yields NaN.
n_pairs = len(science_control_pmids_list)
AUC_science_random = round((n1 + 0.5*n2) / n_pairs, 8) if n_pairs else float('nan')
print(n1)
print(n2)
print(AUC_science_random)

In [None]:
#  Prize papers
import numpy as np
import pickle
import pandas as pd

# AUC of mED(rel) for the prize papers against random control papers.
df_pubmed = pd.read_csv(r'data/pubmed_mED(rel).csv', header = 0)

# Each entry of the list is indexed as item[0] (test pmid) and item[1] (control pmid).
# Build one pmid -> score lookup restricted to the pmids that are needed, instead of
# scanning the whole table twice per pair. Keeping the first row per pmid matches
# taking element [0] of the filtered frame.
needed_pmids = {pmid for item in prize_control_pmids_list for pmid in (item[0], item[1])}
first_rows = df_pubmed[df_pubmed['pmid'].isin(list(needed_pmids))].drop_duplicates(subset = 'pmid')
score_by_pmid = dict(zip(first_rows['pmid'].tolist(), first_rows['mED_rel'].tolist()))

n1 = 0    # pairs in which the test paper scores strictly higher than the control
n2 = 0    # pairs in which both scores are equal

for item in prize_control_pmids_list:
    # A pmid that is absent from the table raises KeyError here.
    D_test = score_by_pmid[item[0]]
    D_control = score_by_pmid[item[1]]
    if D_test > D_control:
        n1 += 1
    elif D_test == D_control:
        n2 += 1

# Normalise by the real number of pairs rather than a hard-coded 10000; an empty list yields NaN.
n_pairs = len(prize_control_pmids_list)
AUC_prize_random = round((n1 + 0.5*n2) / n_pairs, 8) if n_pairs else float('nan')
print(n1)
print(n2)
print(AUC_prize_random)

In [None]:
#  Highly cited papers
import numpy as np
import pickle
import pandas as pd

# AUC of mED(rel) for the highly cited papers against random control papers.
df_pubmed = pd.read_csv(r'data/pubmed_mED(rel).csv', header = 0)

# Each entry of the list is indexed as item[0] (test pmid) and item[1] (control pmid).
# Build one pmid -> score lookup restricted to the pmids that are needed, instead of
# scanning the whole table twice per pair. Keeping the first row per pmid matches
# taking element [0] of the filtered frame.
needed_pmids = {pmid for item in highly_cited_control_pmids_list for pmid in (item[0], item[1])}
first_rows = df_pubmed[df_pubmed['pmid'].isin(list(needed_pmids))].drop_duplicates(subset = 'pmid')
score_by_pmid = dict(zip(first_rows['pmid'].tolist(), first_rows['mED_rel'].tolist()))

n1 = 0    # pairs in which the test paper scores strictly higher than the control
n2 = 0    # pairs in which both scores are equal

for item in highly_cited_control_pmids_list:
    # A pmid that is absent from the table raises KeyError here.
    D_test = score_by_pmid[item[0]]
    D_control = score_by_pmid[item[1]]
    if D_test > D_control:
        n1 += 1
    elif D_test == D_control:
        n2 += 1

# Normalise by the real number of pairs rather than a hard-coded 10000; an empty list yields NaN.
n_pairs = len(highly_cited_control_pmids_list)
AUC_highly_cited_random = round((n1 + 0.5*n2) / n_pairs, 8) if n_pairs else float('nan')
print(n1)
print(n2)
print(AUC_highly_cited_random)

In [None]:
#  Faculty papers
import numpy as np
import pickle
import pandas as pd

# AUC of mED(rel) for the faculty papers against random control papers.
df_pubmed = pd.read_csv(r'data/pubmed_mED(rel).csv', header = 0)

# Each entry of the list is indexed as item[0] (test pmid) and item[1] (control pmid).
# Build one pmid -> score lookup restricted to the pmids that are needed, instead of
# scanning the whole table twice per pair. Keeping the first row per pmid matches
# taking element [0] of the filtered frame.
needed_pmids = {pmid for item in faculty_control_pmids_list for pmid in (item[0], item[1])}
first_rows = df_pubmed[df_pubmed['pmid'].isin(list(needed_pmids))].drop_duplicates(subset = 'pmid')
score_by_pmid = dict(zip(first_rows['pmid'].tolist(), first_rows['mED_rel'].tolist()))

n1 = 0    # pairs in which the test paper scores strictly higher than the control
n2 = 0    # pairs in which both scores are equal

for item in faculty_control_pmids_list:
    # A pmid that is absent from the table raises KeyError here.
    D_test = score_by_pmid[item[0]]
    D_control = score_by_pmid[item[1]]
    if D_test > D_control:
        n1 += 1
    elif D_test == D_control:
        n2 += 1

# Normalise by the real number of pairs rather than a hard-coded 10000; an empty list yields NaN.
n_pairs = len(faculty_control_pmids_list)
AUC_faculty_random = round((n1 + 0.5*n2) / n_pairs, 8) if n_pairs else float('nan')
print(n1)
print(n2)
print(AUC_faculty_random)

# Figure 8: Distributions of disruption scores of scientific breakthroughs and control papers for mED(rel)

In [None]:
# Science Breakthrough papers
import numpy as np
import pickle
import pandas as pd

# Collect, for every Science breakthrough paper, its own mED(rel) score and the mean
# score of its control papers. The dict is used as: test pmid -> collection of control pmids.
# The path carries the same 'data/' prefix as every other data file of this notebook.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open(r'data/control_groups/control_pair_dict_year_authors_and_cits_1to5_science.dat', 'rb') as handle:
    control_dict = pickle.load(handle)
print(len(control_dict.keys()))

df_pubmed = pd.read_csv(r'data/pubmed_mED(rel).csv', header = 0)
df_test = df_pubmed[df_pubmed.pmid.isin(list(control_dict.keys()))]

D_test = []       # score of each retained test paper
D_control = []    # mean control score, aligned with D_test
for test_pmid in df_test['pmid']:
    # df_test holds every df_pubmed row of this pmid in the original order, so the
    # first match here is the same row a scan of the full table would return.
    test_score = df_test.loc[df_test['pmid'] == test_pmid, 'mED_rel'].iloc[0]
    control_scores = df_pubmed.loc[df_pubmed['pmid'].isin(control_dict[test_pmid]), 'mED_rel'].to_numpy()
    # Missing scores arrive from the CSV as NaN, never as None, so test with pd.notna;
    # an `is not None` check lets NaN through and turns the means below into NaN.
    if pd.notna(test_score) and len(control_scores) >= 5:
        D_test.append(test_score)
        D_control.append(np.mean(control_scores))

print(len(D_control))
print(len(D_test))
print(np.mean(D_control))
print(np.mean(D_test))

In [None]:
from scipy import stats

# Levene's test checks whether both groups have equal variances. A p-value well above
# 0.05 indicates homogeneous variances (use equal_var = True); otherwise the variances
# are heteroscedastic (use equal_var = False).
levene_result = stats.levene(D_test, D_control)
print(levene_result)
ttest_result = stats.ttest_ind(D_test, D_control, equal_var = False)
print(ttest_result)

In [None]:
# Prize papers
import numpy as np
import pickle
import pandas as pd

# Collect, for every prize paper, its own mED(rel) score and the mean score of its
# control papers. The dict is used as: test pmid -> collection of control pmids.
# The path carries the same 'data/' prefix as every other data file of this notebook.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open(r'data/control_groups/control_pair_dict_year_authors_and_cits_1to5_prize.dat', 'rb') as handle:
    control_dict2 = pickle.load(handle)
print(len(control_dict2.keys()))

df_pubmed = pd.read_csv(r'data/pubmed_mED(rel).csv', header = 0)
df_test = df_pubmed[df_pubmed.pmid.isin(list(control_dict2.keys()))]

D_test2 = []       # score of each retained test paper
D_control2 = []    # mean control score, aligned with D_test2
for test_pmid in df_test['pmid']:
    # df_test holds every df_pubmed row of this pmid in the original order, so the
    # first match here is the same row a scan of the full table would return.
    test_score = df_test.loc[df_test['pmid'] == test_pmid, 'mED_rel'].iloc[0]
    control_scores = df_pubmed.loc[df_pubmed['pmid'].isin(control_dict2[test_pmid]), 'mED_rel'].to_numpy()
    # Missing scores arrive from the CSV as NaN, never as None, so test with pd.notna;
    # an `is not None` check lets NaN through and turns the means below into NaN.
    if pd.notna(test_score) and len(control_scores) >= 5:
        D_test2.append(test_score)
        D_control2.append(np.mean(control_scores))

print(len(D_control2))
print(len(D_test2))
print(np.mean(D_control2))
print(np.mean(D_test2))

In [None]:
from scipy import stats

# Levene's test checks whether both groups have equal variances. A p-value well above
# 0.05 indicates homogeneous variances (use equal_var = True); otherwise the variances
# are heteroscedastic (use equal_var = False).
levene_result2 = stats.levene(D_test2, D_control2)
print(levene_result2)
ttest_result2 = stats.ttest_ind(D_test2, D_control2, equal_var = True)
print(ttest_result2)

In [None]:
# Highly cited papers
import numpy as np
import pickle
import pandas as pd

# Collect, for every highly cited paper, its own mED(rel) score and the mean score of
# its control papers. The dict is used as: test pmid -> collection of control pmids.
# The path carries the same 'data/' prefix as every other data file of this notebook.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open(r'data/control_groups/control_pair_dict_year_authors_and_1to5_highly_cited.dat', 'rb') as handle:
    control_dict3 = pickle.load(handle)
print(len(control_dict3.keys()))

df_pubmed = pd.read_csv(r'data/pubmed_mED(rel).csv', header = 0)
df_test = df_pubmed[df_pubmed.pmid.isin(list(control_dict3.keys()))]

D_test3 = []       # score of each retained test paper
D_control3 = []    # mean control score, aligned with D_test3
for test_pmid in df_test['pmid']:
    # df_test holds every df_pubmed row of this pmid in the original order, so the
    # first match here is the same row a scan of the full table would return.
    test_score = df_test.loc[df_test['pmid'] == test_pmid, 'mED_rel'].iloc[0]
    control_scores = df_pubmed.loc[df_pubmed['pmid'].isin(control_dict3[test_pmid]), 'mED_rel'].to_numpy()
    # Missing scores arrive from the CSV as NaN, never as None, so test with pd.notna;
    # an `is not None` check lets NaN through and turns the means below into NaN.
    if pd.notna(test_score) and len(control_scores) >= 5:
        D_test3.append(test_score)
        D_control3.append(np.mean(control_scores))

print(len(D_control3))
print(len(D_test3))
print(np.mean(D_control3))
print(np.mean(D_test3))

In [None]:
from scipy import stats

# Levene's test checks whether both groups have equal variances. A p-value well above
# 0.05 indicates homogeneous variances (use equal_var = True); otherwise the variances
# are heteroscedastic (use equal_var = False).
levene_result3 = stats.levene(D_test3, D_control3)
print(levene_result3)
ttest_result3 = stats.ttest_ind(D_test3, D_control3, equal_var = False)
print(ttest_result3)

In [None]:
# Faculty papers
import numpy as np
import pickle
import pandas as pd

# Collect, for every faculty paper, its own mED(rel) score and the mean score of its
# control papers. The dict is used as: test pmid -> collection of control pmids.
# The path carries the same 'data/' prefix as every other data file of this notebook.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open(r'data/control_groups/control_pair_dict_year_authors_and_cits_1to5_faculty.dat', 'rb') as handle:
    control_dict4 = pickle.load(handle)
print(len(control_dict4.keys()))

df_pubmed = pd.read_csv(r'data/pubmed_mED(rel).csv', header = 0)
df_test = df_pubmed[df_pubmed.pmid.isin(list(control_dict4.keys()))]

D_test4 = []       # score of each retained test paper
D_control4 = []    # mean control score, aligned with D_test4
for test_pmid in df_test['pmid']:
    # df_test holds every df_pubmed row of this pmid in the original order, so the
    # first match here is the same row a scan of the full table would return.
    test_score = df_test.loc[df_test['pmid'] == test_pmid, 'mED_rel'].iloc[0]
    control_scores = df_pubmed.loc[df_pubmed['pmid'].isin(control_dict4[test_pmid]), 'mED_rel'].to_numpy()
    # Missing scores arrive from the CSV as NaN, never as None, so test with pd.notna;
    # an `is not None` check lets NaN through and turns the means below into NaN.
    if pd.notna(test_score) and len(control_scores) >= 5:
        D_test4.append(test_score)
        D_control4.append(np.mean(control_scores))

print(len(D_control4))
print(len(D_test4))
print(np.mean(D_control4))
print(np.mean(D_test4))

In [None]:
from scipy import stats

# Levene's test checks whether both groups have equal variances. A p-value well above
# 0.05 indicates homogeneous variances (use equal_var = True); otherwise the variances
# are heteroscedastic (use equal_var = False).
levene_result4 = stats.levene(D_test4, D_control4)
print(levene_result4)
ttest_result4 = stats.ttest_ind(D_test4, D_control4, equal_var = False)
print(ttest_result4)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.axes_grid1.inset_locator import inset_axes

TEST_COLOR = '#9AC9DB'       # colour of the test-group distribution and mean line
CONTROL_COLOR = '#F8AC8C'    # colour of the control-group distribution and mean line


def _thin_gray_spines(axis):
    """Draw all four spines of `axis` as gray lines of width 0.5."""
    for side in ('bottom', 'left', 'top', 'right'):
        axis.spines[side].set_linewidth(0.5)
        axis.spines[side].set_color('gray')


def _draw_panel(axis, test_scores, control_scores, title, p_text, text_x, legend_loc, xlim = None):
    """Draw one test-vs-control panel on `axis`.

    Plots the histogram + KDE of both score lists, a dashed vertical line at each
    mean, the legend, the p-value annotation `p_text` (placed at axes fraction
    (`text_x`, 0.75)) and the panel title. `xlim`, when given, is an
    (xmin, xmax) pair applied to the x axis.
    """
    sns.distplot(test_scores, kde = True, ax = axis, color = TEST_COLOR)
    sns.distplot(control_scores, kde = True, ax = axis, color = CONTROL_COLOR)
    axis.axvline(np.mean(test_scores), color = TEST_COLOR, linestyle = 'dashed', linewidth = 1.5)
    axis.axvline(np.mean(control_scores), color = CONTROL_COLOR, linestyle = 'dashed', linewidth = 1.5)
    axis.set_xlabel('disruption value', fontsize = 10)
    if xlim is not None:
        axis.set_xlim(*xlim)
    axis.legend(['test group', 'control group'], loc = legend_loc)
    axis.text(text_x, 0.75, p_text, transform = axis.transAxes, fontdict = {'size': '10', 'color': 'black'})
    _thin_gray_spines(axis)
    axis.set_title(title, fontsize = 12, loc = 'center')


fig, ax = plt.subplots(2,2,figsize = (12,10))

# NOTE(review): the p-values in the annotations are hard-coded strings; they must be
# updated by hand from the t-test cells whenever the underlying data changes.
_draw_panel(ax[0,0], D_test, D_control, 'Science breakthrough papers',
            'T-test, p-value = 2.369e-05', 0.55, 'upper right')
_draw_panel(ax[0,1], D_test2, D_control2, 'Prize papers',
            'T-test, p-value = 0.480', 0.6, 'upper right')
_draw_panel(ax[1,0], D_test3, D_control3, 'Highly cited papers',
            'T-test, p-value = 3.415e-66', 0.02, 'upper left', xlim = (-0.02, 0.02))

# Inset on the highly cited panel: the test-group distribution on a ten times wider x range.
axin = ax[1,0].inset_axes([0.12, 0.22, 0.3, 0.35])
sns.distplot(D_test3, kde = True, ax = axin, color = TEST_COLOR)
axin.set_xlim(-0.2, 0.2)
axin.set(ylabel = None)
_thin_gray_spines(axin)

_draw_panel(ax[1,1], D_test4, D_control4, 'Faculty Opinions papers',
            'T-test, p-value = 1.574e-59', 0.55, 'upper right')

plt.show()