# Figure plotting

Code for reproducing the main figures from the SciSciNet data. All figures use the same dataframe, merged from the processed data. With the default parameters, this data is located at "data/processed/PaperID_KI2-Dopen_nok_merged.pickle".

## Figure 1B-C
Loading data

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
# Parent of the current working directory; every data/result path below is built from it.
pre_path = os.path.abspath(r"..")


# --- Parameters ---
DC_type = 'Dopen_nok'  # one of: 'Dopen_nok', 'Dopen', 'D5_nok', 'D5'
KI_type = 'KI2'        # one of: 'KI2', 'KI2_frac', 'KI2_adj', 'KI2_adj_frac'
file_path = '%s/data/processed/PaperID_%s-%s_merged.pickle' % (pre_path, KI_type, DC_type)

# --- Load the merged dataframe ---
df = pd.read_pickle(file_path)

# --- Selection criteria ---
# all doctypes: ['Journal','Thesis','Conference','Book','BookChapter','Repository','Dataset']
selected_doctypes = ['Journal', 'Conference']
# 1950-2021 inclusive, to ensure that the latest papers have at least 1 year of citation history
selected_pubyears = list(range(1950, 2022))
doctype_mask = df['DocType'].isin(selected_doctypes)
pubyear_mask = df['Year'].isin(selected_pubyears)
selected_df = df[doctype_mask & pubyear_mask]

# Figure 1B input: ['PaperID', KI_type], rows with missing values removed
df_B = selected_df[['PaperID', KI_type]].dropna()
# Figure 1C input: [KI_type, 'Year', 'Field'], rows with missing values removed
df_C = selected_df[[KI_type, 'Year', 'Field']].dropna()

Plotting

In [None]:
# Colour palette for the 19 fields: 10 "deep" colours followed by 9 "vlag" colours
palette_a = sns.color_palette("deep", 10) + sns.color_palette("vlag", 9)

nrows_base, ncols_base = 1, 2
fig, [ax1, ax2] = plt.subplots(nrows=nrows_base, ncols=ncols_base, figsize=(10, 4))
fig.subplots_adjust(hspace=.35, wspace=.35) # hspace,wspace


# Plot Figure 1B: Distribution of KI values
sns.histplot(data=df_B, x=KI_type, bins=50, kde=False, stat='probability', color='#96B6D8', ax=ax1)
ax1.set_ylabel("Fraction")
ax1.set_xlabel("KI")
ax1.set_xlim(-1.05, 1.05)
ax1.set_xticks(np.arange(-1, 1.1, 0.5))


# Plot Figure 1C: Temporal pattern of KI values across fields
field_list = ['Physics', 'Biology', 'Medicine', 'Chemistry', 'Psychology', 'Engineering', 'Mathematics', 'Sociology', 'Economics', 'Philosophy', 'Geography', 
              'Art', 'History', 'Geology', 'Business', 'Political science', 'Materials science', 'Computer science', 'Environmental science']
sns.lineplot(data=df_C, x='Year', y=KI_type, hue='Field', hue_order=field_list, palette=palette_a,
             dashes=False, legend='full', ax=ax2)
ax2.set_ylabel("KI")
ax2.set_xlabel("Year")
ax2.set_xlim(1949, 2021+2)
ax2.set_xticks(range(1950, 2021, 10))

# Set the legend for Figure 1C: split the entries into two columns placed side by side.
# The handles come from the plotted artists, so no intermediate legend has to be created first.
handles, labels = ax2.get_legend_handles_labels()
cutoff = 11  # number of entries that go into the first (left) legend
# Create the first legend and register it as a separate artist; otherwise the next
# ax2.legend(...) call would replace it.
first_legend = ax2.legend(handles[:cutoff], labels[:cutoff], loc='lower left', title_fontsize=10, ncol=1, fontsize=6, 
            shadow=False, frameon=False, fancybox=False, handlelength=1, framealpha=0.8)
ax2.add_artist(first_legend)
# Create the second legend with the remaining entries
second_legend = ax2.legend(handles[cutoff:], labels[cutoff:], loc='lower center', bbox_to_anchor=(0.42, 0), title_fontsize=10, ncol=1, fontsize=6, 
            shadow=False, frameon=False, fancybox=False, handlelength=1, framealpha=0.8)


# Add panel letters ("B", "C") above the top-left corner of each subplot
note_size = 25
x_pos, y_pos = -0.2, 1.16 # position of the annotation in axes fraction
for panel_ax, panel_letter in ((ax1, 'B'), (ax2, 'C')):
    panel_ax.annotate(panel_letter, xy=(x_pos, y_pos), xycoords='axes fraction',
                xytext=(5, -5), textcoords='offset points', ha='left', va='top',
                fontsize=note_size, fontfamily='Arial', color='black', fontweight='bold')

# Save before showing. The output path must be interpolated with pre_path; the bare
# '%s/...' string would otherwise be used literally as a directory name.
fig.savefig('%s/results/figures/Fig1b_c.pdf' % pre_path, bbox_inches='tight', dpi=600, pad_inches=0.0)
plt.show()

## Figure 2

#### Loading data

In [None]:
import sys
import os
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns

# Parent of the current working directory; every data/result path below is built from it.
pre_path = os.path.abspath(r"..")
# Put <pre_path>/src on the module search path so that `utils` can be imported.
sys.path.insert(1, os.path.join(pre_path, 'src'))
# Project helpers used below for percentile ranks, binning, binary tags and decade labels.
from utils import rank, rank_bin, binary_check, convert_to_decade

In [None]:
# Set parameters
DC_type = 'Dopen_nok'  # 'Dopen_nok', 'Dopen', 'D5_nok', 'D5'
KI_type = 'KI2'        # 'KI2', 'KI2_frac', 'KI2_adj', 'KI2_adj_frac'
file_path = '%s/data/processed/PaperID_%s-%s_merged.pickle'%(pre_path,KI_type,DC_type)

# Load the data
df = pd.read_pickle(file_path)

# setup the selection criteria
selected_doctypes = ['Journal', 'Conference'] # all doctypes: ['Journal','Thesis','Conference','Book','BookChapter','Repository','Dataset']
selected_pubyears = list(range(1950,2022))    # to ensure that the latest papers have at least 1 year of citation history
selected_df = df[(df['DocType'].isin(selected_doctypes)) & (df['Year'].isin(selected_pubyears))]

# drop rows with NaN values in the columns [KI_type, DC_type].
# The explicit copy detaches the result from `df`, so the column assignments below
# operate on an independent frame and do not trigger SettingWithCopyWarning.
selected_df = selected_df.dropna(subset=[KI_type, DC_type]).copy()

# calculate the percentile ranks and binary tags of KI and DC
# 20 bins of 5 percentile points each, labelled by their mid-points
bin_rank_list = [0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95,100]
label_rank_list = [2.5,7.5,12.5,17.5,22.5,27.5,32.5,37.5,42.5,47.5,52.5,57.5,62.5,67.5,72.5,77.5,82.5,87.5,92.5,97.5]
selected_df['KI_percentile_bin'] = rank_bin(list(selected_df[KI_type]),bin_rank_list,label_rank_list)
selected_df['DC_percentile'] = rank(list(selected_df[DC_type]))
selected_df['DC_positive_tag'] = binary_check(list(selected_df[DC_type]))
# split C5 into five quantile-based bins
selected_df['C5_percentile_bin'] = pd.qcut(x = selected_df['C5'],q=5,labels=['0-20%','20-40%','40-60%','60-80%','80-100%'])
# convert the 'Year' column to decade format
selected_df['Decade'] = selected_df['Year'].apply(convert_to_decade)

#### Figure 2A

In [None]:
# Plot Figure 2A: relationship between Disruption and Knowledge Independence.
# Left y-axis: disruption percentile; right y-axis: DC_positive_tag ("Disruption positive ratio").
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 3))
fig.subplots_adjust(hspace=.3, wspace=.3) # hspace,wspace

color_a, color_b = '#245297', '#62AA67'  # left-axis colour, right-axis colour
sns.lineplot(data=selected_df, x='KI_percentile_bin', y="DC_percentile", color=color_a, ax=ax)
ax.set_xlabel('KI percentile')
ax.set_ylabel('Disruption percentile', color=color_a)
ax.tick_params(axis='y', labelcolor=color_a)

# Setup the right y axis for Figure 2A (shares the x axis with `ax`)
ax_1 = ax.twinx()
sns.lineplot(data=selected_df, x='KI_percentile_bin', y='DC_positive_tag', color=color_b, ax=ax_1)
ax_1.set_ylabel('Disruption positive ratio', color=color_b)
ax_1.tick_params(axis='y', labelcolor=color_b)

# Save before showing. The output path must be interpolated with pre_path; the bare
# '%s/...' string would otherwise be used literally as a directory name.
fig.savefig('%s/results/figures/Fig2A.pdf' % pre_path, bbox_inches='tight', dpi=600, pad_inches=0.0)
plt.show()
plt.close(fig)  # Close the figure to avoid display issues in some environments

#### Figure 2B-D

In [None]:
# Iteratively plot Figure 2B, 2C, 2D: Conditioning on paper's impact C5, publication decade, and field
condition_list = ['C5_percentile_bin', 'Decade', 'Field'] # Conditions for Figure 2B, 2C, 2D
condition_labels_list = [['0-20%','20-40%','40-60%','60-80%','80-100%'], 
                        ['1950s','1960s','1970s','1980s','1990s','2000s','2010s'],
                        ['Physics', 'Biology', 'Medicine', 'Chemistry', 'Psychology', 'Engineering', 'Mathematics', 
                         'Sociology', 'Economics', 'Philosophy', 'Geography', 'Art', 'History', 'Geology', 'Business', 
                         'Political science', 'Materials science', 'Computer science', 'Environmental science']]
palette_list = [sns.color_palette('Blues', 5),
                sns.color_palette('OrRd', 7),
                sns.color_palette("deep", 10) + sns.color_palette("vlag", 9)]
legend_title_list = ['Impact $C_{5}$ percentile', '', '']
annotations_list = ['B', 'C', 'D'] # panel letters, used in the output file names

for idx in range(len(condition_list)):
    condition = condition_list[idx]
    condition_labels = condition_labels_list[idx]
    palette_ = palette_list[idx]
    legend_title = legend_title_list[idx]

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 3))
    fig.subplots_adjust(hspace=.3, wspace=.3) # hspace,wspace

    # Keep exactly the columns the plot reads (x, y and hue), then drop incomplete rows.
    df_condition = selected_df[['KI_percentile_bin', 'DC_percentile', condition]]
    df_condition = df_condition.dropna() # drop rows with NaN values in the condition column

    # Plot the current panel: disruption percentile vs. KI percentile, one line per condition level
    sns.lineplot(data=df_condition, x='KI_percentile_bin', y="DC_percentile", hue=condition, 
                 hue_order=condition_labels, palette=palette_, dashes=False, legend='full', ax=ax)
    ax.set_xlabel('KI percentile')
    ax.set_ylabel("Disruption percentile")

    # set the legend for Figure 2B, 2C
    if condition != 'Field':
        ax.legend(title=legend_title, title_fontsize=10, loc='best', ncol=1, fontsize=8, 
                    shadow=False, frameon=False, fancybox=False, handlelength=1, framealpha=0.8)
    # set custom legend for Figure 2D: split the field entries into two side-by-side legends
    else:
        handles, labels = ax.get_legend_handles_labels()
        cutoff = 11  # number of entries that go into the first legend
        # Create the first legend and keep it as a separate artist, because the next
        # ax.legend(...) call would otherwise replace it.
        first_legend = ax.legend(handles[:cutoff], labels[:cutoff], loc='upper left', title_fontsize=10, ncol=1, fontsize=7, 
                    shadow=False, frameon=False, fancybox=False, handlelength=1, framealpha=0.8)
        ax.add_artist(first_legend)
        # Create the second legend with the remaining entries
        second_legend = ax.legend(handles[cutoff:], labels[cutoff:], loc='upper center', bbox_to_anchor=(0.5, 1), title_fontsize=10, ncol=1, fontsize=7, 
                    shadow=False, frameon=False, fancybox=False, handlelength=1, framealpha=0.8)

    # The format string has two placeholders: the project root and the panel letter.
    fig.savefig('%s/results/figures/Fig2%s.pdf' % (pre_path, annotations_list[idx]), bbox_inches='tight', dpi=600, pad_inches=0.0)
    plt.show()
    plt.close(fig)  # Close the figure to avoid display issues in some environments

#### Figure 2E

In [None]:
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns


def plot_p(DF_data,DF_p_value,KI_size_bins,ax,width=16,heigth=12,widthx = 0, widthy = -0.15, annot_fontsize=5.5):
    """
    Plot a heatmap of the data in DF_data with annotations and significance markers.

    Parameters
    ----------
    DF_data : pandas.DataFrame
        Values shown in the heatmap cells (rows on the y axis, columns on the x axis).
        The colour scale is symmetric around 0 and spans the largest absolute value.
    DF_p_value : pandas.DataFrame
        P-values with the same layout as DF_data; only positions are used, not labels.
    KI_size_bins : sequence
        Tick labels written on both axes (one label per row / column).
    ax : matplotlib.axes.Axes
        Axes to draw on.
    width, heigth : unused
        Kept only so that existing calls with these arguments keep working.
    widthx, widthy : float
        Offsets (in cell units) of the significance marker from the cell centre.
    annot_fontsize : float
        Font size of the cell values and of the significance markers.

    Off-diagonal cells are marked with '*' (0.01 <= p < 0.05), '**' (0.001 <= p < 0.01)
    or '***' (p < 0.001). Diagonal cells are never marked.
    """
    # Symmetric colour limits so that 0 sits at the centre of the diverging colormap
    vmax = DF_data.abs().max().max()
    # xticklabels/yticklabels=True forces one tick per column/row. With seaborn's default
    # 'auto' setting ticks may be thinned out, and the explicit labels set below would
    # then no longer match the number of ticks.
    sns.heatmap(DF_data, annot=True, cmap='vlag', center=0 , vmax=vmax, vmin=-vmax, square=True, 
                fmt='.2f', ax = ax, cbar=False, xticklabels=True, yticklabels=True,
                annot_kws={"color": "k", "fontsize": annot_fontsize})
    
    ax.set_title('ATT on Disruption percentile')
    ax.set_xlabel('Controlled KI percentile')
    ax.set_ylabel('Treated KI percentile')
    ax.set_xticklabels(KI_size_bins, rotation = 0, horizontalalignment='center', fontsize = 'x-small')
    ax.set_yticklabels(KI_size_bins, rotation = 0, horizontalalignment='right', fontsize = 'x-small')

    # Add significance markers. Iterate over the matrix itself rather than over tick
    # positions: the cell of row r / column c is centred at (c + 0.5, r + 0.5).
    p_values = DF_p_value.values
    n_rows, n_cols = p_values.shape
    for row in range(n_rows):
        for col in range(n_cols):
            if row == col:
                continue
            pv = p_values[row, col]
            if pv < 0.001:
                marker = '***'
            elif pv < 0.01:
                marker = '**'
            elif pv < 0.05:
                marker = '*'
            else:
                # not significant (or NaN): no marker
                continue
            ax.text(col + 0.5 + widthx, row + 0.5 + widthy, marker, ha = 'center', color = 'k', fontsize = annot_fontsize)
    plt.tight_layout()

In [None]:
# Define the regression type and treatment effect type
data_type = 'raw'       # data_type: raw or normalized (not used in this cell)
regression_type = 'glm' # regression_type: glm or logit
treat_effect_type='ATT' # treat_effect_type: ATT or ATC

# Load the data: one result file per control bin. Each file fills one column, so
# columns are the control bins; rows follow the order of the file and are labelled
# with KI_size_bins.
KI_size_bins = [2.5,7.5,12.5,17.5,22.5,27.5,32.5,37.5,42.5,47.5,52.5,57.5,62.5,67.5,72.5,77.5,82.5,87.5,92.5,97.5]
DF_data = pd.DataFrame(columns=KI_size_bins, index=KI_size_bins, dtype=float)    # DataFrame to store effect sizes
DF_p_value = pd.DataFrame(columns=KI_size_bins, index=KI_size_bins, dtype=float) # DataFrame to store p-values
for control_size in KI_size_bins:
    df_temp = pd.read_csv('%s/results/results_for_tables/PSM_Analysis_Results/ALL_%s_%s_%s.csv'%(pre_path,control_size,treat_effect_type,regression_type))
    DF_data[control_size] = df_temp['Effect Size'].tolist()
    DF_p_value[control_size] = df_temp['P value'].tolist()

# Plot the heatmap
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(10*.75, 8*.75))
fig.subplots_adjust(hspace=.3, wspace=.3) # hspace,wspace
plot_p(DF_data, DF_p_value, KI_size_bins, ax)

# Add the panel letter ("E") above the top-left corner of the heatmap
note_size = 25
x_pos, y_pos = -0.2*.65, 1+.16*.5 # position of the annotation in axes fraction
ax.annotate('E', xy=(x_pos, y_pos), xycoords='axes fraction',
            xytext=(5, -5), textcoords='offset points', ha='left', va='top',
            fontsize=note_size*.75, fontfamily='Arial', color='black', fontweight='bold')

# Save before showing. The output path must be interpolated with pre_path; the bare
# '%s/...' string would otherwise be used literally as a directory name.
fig.savefig('%s/results/figures/Fig2E.pdf' % pre_path, bbox_inches='tight', dpi=600, pad_inches=0.0)
plt.show()

## Figure 3

#### Loading data

In [None]:
import os   # needed for os.path below; previously only available if an earlier section had been run
import sys
import pandas as pd
import numpy as np
from scipy.stats import linregress
import matplotlib.pylab as plt
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
import seaborn as sns

# Parent of the current working directory; every data/result path below is built from it.
pre_path = os.path.abspath(r"..")
# Put <pre_path>/src on the module search path so that `utils` can be imported.
sys.path.insert(1, os.path.join(pre_path, 'src'))
from utils import rank, rank_bin, convert_to_decade

In [None]:
# Set parameters
DC_type = 'Dopen_nok'  # 'Dopen_nok', 'Dopen', 'D5_nok', 'D5'
KI_type = 'KI2'        # 'KI2', 'KI2_frac', 'KI2_adj', 'KI2_adj_frac'
file_path = '%s/data/processed/PaperID_%s-%s_merged.pickle'%(pre_path,KI_type,DC_type)

# Load the data
df = pd.read_pickle(file_path)

# setup the selection criteria
selected_doctypes = ['Journal', 'Conference'] # all doctypes: ['Journal','Thesis','Conference','Book','BookChapter','Repository','Dataset']
selected_pubyears = list(range(1950,2022))    # to ensure that the latest papers have at least 1 year of citation history
selected_df = df[(df['DocType'].isin(selected_doctypes)) & (df['Year'].isin(selected_pubyears))]

# drop rows with NaN values in the columns [KI_type, DC_type].
# The explicit copy detaches the result from `df`, so the column assignments below
# operate on an independent frame and do not trigger SettingWithCopyWarning.
selected_df = selected_df.dropna(subset=[KI_type, DC_type]).copy()

# calculate the percentile ranks of KI and DC, plus a 5-level KI percentile bin
selected_df['KI_percentile'] = rank(list(selected_df[KI_type]))
selected_df['DC_percentile'] = rank(list(selected_df[DC_type]))
selected_df['KI_percentile_bin5'] = rank_bin(list(selected_df[KI_type]),[0,20,40,60,80,100],['0-20%','20-40%','40-60%','60-80%','80-100%'])

# convert other variables to proper formats
selected_df['Decade'] = selected_df['Year'].apply(convert_to_decade)
# rescale the KI_percentile to relative values by dividing by the mean of KI_percentile in the same decade
selected_df['Relative_KI_percentile'] = selected_df.groupby('Decade')['KI_percentile'].transform(lambda x: x / x.mean())

# cap the team size: every team with more than 10 members is mapped to 11
selected_df['Team_Size'] = selected_df['Team_Size'].apply(lambda x: int(11) if x > 10 else int(x))
# The fine-grained distance tag must be derived from the raw distances, i.e. BEFORE
# 'Team_Distance' is overwritten with the binary onsite/remote code below.
# 0 (onsite): within 100km, 200: 100-300km, 400: 300-500km, 600: 500-700km, 800: 700-900km, 1000: 900-1100km, 1200: beyond 1100km
selected_df['Team_Distance_tag'] = pd.cut(x = selected_df['Team_Distance'], bins=[0, 100, 300, 500, 700, 900, 1100, float('inf')], labels=[0, 200, 400, 600, 800, 1000, 1200], right=True, include_lowest=True)
# 0 (onsite): within 100km, 1 (remote): beyond 100km
selected_df['Team_Distance'] = pd.cut(x = selected_df['Team_Distance'], bins=[0, 100, float('inf')], labels=[0,1], right=True, include_lowest=True)
# NOTE(review): astype(int) assumes every row received a bin (no missing or negative distances) - confirm against the data
selected_df['Team_Distance'] = selected_df['Team_Distance'].astype(int)
selected_df['Team_Distance_tag'] = selected_df['Team_Distance_tag'].astype(int)


#### Figure 3A, 3B, 3C

In [None]:
# Iteratively plot Figure 3A, 3B, 3C: Conditioning on paper's team size, team distance, and team freshness.
# Each saved figure stacks two panels: relative KI percentile (upper, letters A-C) and
# disruption percentile (lower, letters D-F) against the same team variable.
# setup the annotation properties in axes fraction
note_size = 25
x_pos, y_pos = -0.2, 1.16 # position of the annotation in axes fraction

condition_list = ['Team_Size','Team_Distance','Team_Freshness']
xlabel_list = ['Team size', 'Team distance (km)', 'Team freshness']
xticks_list = [np.arange(1, 12, 2), range(2), np.arange(0, 4, 1)]
xticklabels_list = [['1', '3', '5', '7', '9', '10+'], ['Onsite','Remote'], np.arange(0, 4, 1)]
legend_tag = ['full',False,False]   # only the first figure carries legends
annotation_list = ['A', 'B', 'C']   # used in the output file names
decade_labels = ['1950s','1960s','1970s','1980s','1990s','2000s','2010s']
# Levels of 'KI_percentile_bin5'. Defined here so that this cell does not depend on a
# `label_rank_list` left over from another section (which holds 20 numeric labels).
ki_bin5_labels = ['0-20%','20-40%','40-60%','60-80%','80-100%']
overall_color = "#245297"

for idx in range(len(condition_list)):
    condition = condition_list[idx]

    df_condition = selected_df[['KI_percentile_bin5','Relative_KI_percentile','DC_percentile',condition,'Decade']]
    df_condition = df_condition.dropna() # drop rows with NaN values in the condition column

    # With ncols=1 plt.subplots returns a 1-D array holding the two Axes
    fig, (ax_upper, ax_lower) = plt.subplots(nrows=2, ncols=1, figsize=(4, 7))
    fig.subplots_adjust(hspace=.3, wspace=.3) # hspace,wspace

    # Plot the upper panel: relative KI percentile per decade
    palette_upper = sns.color_palette('OrRd', 7)
    sns.lineplot(data=df_condition, x=condition, y="Relative_KI_percentile", hue='Decade', 
                hue_order=decade_labels, 
                palette=palette_upper, dashes=False, legend=legend_tag[idx], ax=ax_upper)
    
    # For team distance, we need to add an inset plot to display the detailed distance distribution
    if condition == 'Team_Distance':
        ax_inset = inset_axes(ax_upper, width="76%", height="76%", bbox_to_anchor=(0.08, 0.06, 0.5, 0.5), bbox_transform=ax_upper.transAxes, loc='lower left')
        # From here on df_condition also carries 'Team_Distance_tag'; the lower panel reuses this frame
        df_condition = selected_df[['KI_percentile_bin5','Relative_KI_percentile','DC_percentile',condition,'Team_Distance_tag','Decade']]
        df_condition = df_condition.dropna() # drop rows with NaN values in the condition column        
        sns.lineplot(data=df_condition, x='Team_Distance_tag', y="Relative_KI_percentile", hue='Decade', 
                    hue_order=decade_labels, 
                    palette=palette_upper, dashes=False, legend=legend_tag[idx], ax=ax_inset)
        # Add a horizontal line at y=1
        ax_inset.axhline(y=1, color='gray', linestyle='--', alpha=0.5)
        ax_inset.set_ylabel('')
        ax_inset.set_xlabel('')
        ax_inset.set_xticks(np.arange(0, 1300, 200))
        ax_inset.set_xticklabels(['0', '', '', '600', '', '', '1100+'], horizontalalignment='center')
        
        plt.setp(ax_inset.get_xticklabels(), fontsize='x-small')
        ax_inset.tick_params(axis='x', which='both', pad=2)
        plt.setp(ax_inset.get_yticklabels(), fontsize='x-small')
        ax_inset.tick_params(axis='y', which='both', pad=2)
    
    # Add a horizontal line at y=1 (relative value 1 = decade mean)
    ax_upper.axhline(y=1, color='gray', linestyle='--', alpha=0.5)
    ax_upper.set_xlabel(xlabel_list[idx])
    ax_upper.set_xticks(xticks_list[idx])
    ax_upper.set_xticklabels(xticklabels_list[idx], horizontalalignment='center')
    ax_upper.set_ylabel("Relative KI percentile")
    if legend_tag[idx] == 'full':
        ax_upper.legend(title='', title_fontsize=10, loc='best', ncol=1, fontsize=7, 
                shadow=False, frameon=False, fancybox=False, handlelength=1, framealpha=0.8)
    
    # Add the panel letter (A, B, C) for the upper panel
    ax_upper.annotate(chr(97 + idx).upper(), xy=(x_pos, y_pos), xycoords='axes fraction',
                xytext=(5, -5), textcoords='offset points', ha='left', va='top',
                fontsize=note_size, fontfamily='Arial', color='black', fontweight='bold')
    

    # Plot the lower panel: disruption percentile per KI percentile bin
    palette_lower = sns.color_palette('Blues', 5)
    sns.lineplot(data=df_condition, x=condition, y="DC_percentile", hue='KI_percentile_bin5', hue_order=ki_bin5_labels, 
                palette=palette_lower, dashes=False, legend=legend_tag[idx], ax=ax_lower)
    # add fit line for each category
    for category_idx, category in enumerate(ki_bin5_labels):
        subset = df_condition[df_condition['KI_percentile_bin5'] == category]
        # a regression needs at least two distinct x values
        if subset[condition].nunique() < 2:
            continue
        slope, intercept, r_value, p_value, std_err = linregress(subset[condition], subset['DC_percentile'])
        x_min, x_max = subset[condition].min(), subset[condition].max()
        # A straight line is fully described by its two end points; drawing it through
        # every data row would only bloat the figure.
        x_fit = np.array([x_min, x_max])
        ax_lower.plot(x_fit, intercept + slope * x_fit, linestyle='--', color=palette_lower[category_idx])
        text_pos = x_min + x_max*0.2
        ax_lower.text(text_pos, intercept + slope*text_pos + 1, f's={slope:.2f}', color=palette_lower[category_idx], fontsize=8)
    
    ax_lower.set_xlabel(xlabel_list[idx])
    ax_lower.set_xticks(xticks_list[idx])
    ax_lower.set_xticklabels(xticklabels_list[idx], horizontalalignment='center')
    ax_lower.set_ylabel("Disruption percentile")
    if legend_tag[idx] == 'full':
        ax_lower.legend(title="KI percentile", title_fontsize=8, loc='best', ncol=1, fontsize=7, 
                shadow=False, frameon=False, fancybox=False, handlelength=1, framealpha=0.8)

    # add inset plot for the lower panel: the overall trend without splitting by KI bin
    ax_inset = inset_axes(ax_lower, width="50%", height="50%", bbox_to_anchor=(0.5, 0.005, 0.5, 0.5), bbox_transform=ax_lower.transAxes, loc='lower right')
    sns.lineplot(data=df_condition, x=condition, y="DC_percentile", color=overall_color, 
                dashes=False, legend=False, ax=ax_inset)
    # add fit line for the overall categories
    slope, intercept, r_value, p_value, std_err = linregress(df_condition[condition], df_condition['DC_percentile'])
    x_min, x_max = df_condition[condition].min(), df_condition[condition].max()
    x_fit = np.array([x_min, x_max])
    ax_inset.plot(x_fit, intercept + slope * x_fit, linestyle='--')
    text_pos = x_min + x_max*0.2
    # the position of the slope label is tuned separately for each team variable
    if condition == 'Team_Size':
        ax_inset.text(text_pos, intercept + slope*text_pos + 1, f's={slope:.2f}', color=overall_color, fontsize=6)
    elif condition == 'Team_Distance':
        ax_inset.text(text_pos+0.1, intercept + slope*text_pos + 0.5, f's={slope:.2f}', color=overall_color, fontsize=6)
    elif condition == 'Team_Freshness':
        ax_inset.text(x_max*0.1, intercept + slope*(x_max*0.8) + 1, f's={slope:.2f}', color=overall_color, fontsize=6)
        
    ax_inset.set_xlabel('')
    ax_inset.set_xlim(x_min-.25,x_max+.25)
    ax_inset.set_xticks([])
    plt.setp(ax_inset.get_xticklabels(), fontsize='x-small')
    ax_inset.tick_params(axis='x', which='both', pad=2)

    ax_inset.set_ylabel('')
    plt.setp(ax_inset.get_yticklabels(), fontsize='x-small')
    ax_inset.tick_params(axis='y', which='both', pad=2)
    
    # Add the panel letter (D, E, F) for the lower panel
    ax_lower.annotate(chr(97 + 3+idx).upper(), xy=(x_pos, y_pos), xycoords='axes fraction',
                xytext=(5, -5), textcoords='offset points', ha='left', va='top',
                fontsize=note_size, fontfamily='Arial', color='black', fontweight='bold')

    # The format string has two placeholders: the project root and the figure letter.
    fig.savefig('%s/results/figures/Fig3%s.pdf' % (pre_path, annotation_list[idx]), bbox_inches='tight', dpi=600, pad_inches=0.0)
    plt.show()
    plt.close(fig)  # Close the figure to avoid display issues in some environments

## Figure 4

#### Loading data

In [None]:
import sys
import os
import pickle
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import linregress
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
from matplotlib.colors import Normalize
from matplotlib.lines import Line2D
from matplotlib.cm import ScalarMappable

# Parent of the current working directory; every data/result path below is built from it.
pre_path = os.path.abspath(r"..")
# Put <pre_path>/src on the module search path so that `utils` can be imported.
sys.path.insert(1, os.path.join(pre_path, 'src'))
# Project helpers used below for chunked CSV reading, percentile ranks, binning and decade labels.
from utils import read_big_csv, rank, rank_bin, convert_to_decade

In [None]:
# Set parameters
DC_type = 'Dopen_nok'  # 'Dopen_nok', 'Dopen', 'D5_nok', 'D5'
KI_type = 'KI2'        # 'KI2', 'KI2_frac', 'KI2_adj', 'KI2_adj_frac'
file_path = '%s/data/processed/PaperID_%s-%s_merged.pickle'%(pre_path,KI_type,DC_type)

# Load the data
df = pd.read_pickle(file_path)

# setup the selection criteria
selected_doctypes = ['Journal', 'Conference'] # all doctypes: ['Journal','Thesis','Conference','Book','BookChapter','Repository','Dataset']
selected_pubyears = list(range(1950,2022))    # to ensure that the latest papers have at least 1 year of citation history
selected_df = df[(df['DocType'].isin(selected_doctypes)) & (df['Year'].isin(selected_pubyears))]

# drop rows with NaN values in the columns ['Copen', DC_type, 'Year', 'Field'].
# The explicit copy detaches the result from `df`, so the column assignments below
# operate on an independent frame and do not trigger SettingWithCopyWarning.
selected_df = selected_df.dropna(subset=['Copen', DC_type, 'Year', 'Field']).copy()

# set bins and labels for the values and percentiles (labels are the bin mid-points)
bin_list = [-1,-0.9,-0.8,-0.7,-0.6,-0.5,-0.4,-0.3,-0.2,-0.1,0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]
label_list = [-0.95,-0.85,-0.75,-0.65,-0.55,-0.45,-0.35,-0.25,-0.15,-0.05,0.05,0.15,0.25,0.35,0.45,0.55,0.65,0.75,0.85,0.95]
bin_rank_list = [0,5,10,15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95,100]
label_rank_list = [2.5,7.5,12.5,17.5,22.5,27.5,32.5,37.5,42.5,47.5,52.5,57.5,62.5,67.5,72.5,77.5,82.5,87.5,92.5,97.5]

# calculate the percentile ranks of Copen and DC
selected_df['Citation_percentile'] = selected_df.groupby(['Year', 'Field'])['Copen'].transform(rank) # calculate the citation percentile within each year and field
selected_df['Citation_percentile_bin'] = pd.cut(x = selected_df['Citation_percentile'], bins = bin_rank_list, labels = label_rank_list, include_lowest = True)
# numeric bin mid-points, so the bins can serve as a continuous x axis and as regression input
selected_df['Citation_percentile_bin'] = selected_df['Citation_percentile_bin'].astype(float)
selected_df['DC_percentile'] = rank(list(selected_df[DC_type]))

# convert other variables to proper formats
selected_df['Decade'] = selected_df['Year'].apply(convert_to_decade)
selected_df['KI_bin'] = pd.cut(x = selected_df[KI_type], bins = bin_list, labels = label_list, include_lowest = True) # 20 bins of KI values for X-axis
selected_df['KI_percentile_bin'] = rank_bin(list(selected_df[KI_type]),bin_rank_list,label_rank_list) # 20 bins of KI percentiles for X-axis
selected_df['KI_percentile_bin5'] = rank_bin(list(selected_df[KI_type]),[0,20,40,60,80,100],['0-20%','20-40%','40-60%','60-80%','80-100%']) # 5 bins of KI percentiles for controls


#### Figure 4A, 4B, 4C, 4G

In [None]:
# Iteratively plot Figure 4A, 4B, 4C, 4G: Conditioning on paper's journal rank, publication decade, field, and KI percentile.
# Every panel shows disruption percentile against citation percentile, one line per condition level.
# setup the subplots
condition_list = ['SJR','Decade','Field','KI_percentile_bin5']
condition_labels_list = [['Q1','Q2','Q3','Q4'], 
                        ['1950s','1960s','1970s','1980s','1990s','2000s','2010s'],
                        ['Physics', 'Biology', 'Medicine', 'Chemistry', 'Psychology', 'Engineering', 'Mathematics', 
                         'Sociology', 'Economics', 'Philosophy', 'Geography', 'Art', 'History', 'Geology', 'Business', 
                         'Political science', 'Materials science', 'Computer science', 'Environmental science'],
                        ['0-20%','20-40%','40-60%','60-80%','80-100%']]
palette_list = [sns.color_palette('Blues', 4),
                sns.color_palette('OrRd', 7),
                sns.color_palette("deep", 10) + sns.color_palette("vlag", 9),
                sns.color_palette('Blues', 5)]
legend_title_list = ['Journal rank', '', '', 'KI percentile']
annotations_list = ['A', 'B', 'C', 'G'] # panel letters, used in the output file names
x_col = 'Citation_percentile_bin'       # numeric x axis shared by all panels
overall_color = "#245297"               # colour of the overall trend in the 4G inset


for idx in range(len(condition_list)):
    condition = condition_list[idx]
    condition_labels = condition_labels_list[idx]
    palette_ = palette_list[idx]
    legend_title = legend_title_list[idx]

    df_condition = selected_df[[x_col,'DC_percentile',condition]]
    df_condition = df_condition.dropna() # drop rows with NaN values in the condition column

    fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(4, 3))
    fig.subplots_adjust(hspace=.3, wspace=.3) # hspace,wspace
    # Plot the subplot
    sns.lineplot(data=df_condition, x=x_col, y="DC_percentile", hue=condition, hue_order=condition_labels, 
                palette=palette_, dashes=False, legend='full', ax=ax)
       
    # For 4G (KI_percentile_bin5), we need to add the fitted line for each category and an inset plot
    if condition == 'KI_percentile_bin5':
        # add fit line for each category: disruption percentile regressed on the numeric
        # citation percentile (the condition column itself is categorical and cannot be regressed on)
        for category_idx, category in enumerate(condition_labels):
            subset = df_condition[df_condition[condition] == category]
            # a regression needs at least two distinct x values
            if subset[x_col].nunique() < 2:
                continue
            slope, intercept, r_value, p_value, std_err = linregress(subset[x_col], subset['DC_percentile'])
            x_min, x_max = subset[x_col].min(), subset[x_col].max()
            # two end points are enough to draw the fitted straight line;
            # colours come from this panel's own palette
            ax.plot([x_min, x_max], [intercept + slope * x_min, intercept + slope * x_max],
                    linestyle='--', color=palette_[category_idx])
            text_pos = x_min + x_max*0.2
            ax.text(text_pos, intercept + slope*text_pos + 1, f's={slope:.2f}', color=palette_[category_idx], fontsize=8)

        # add inset plot: the overall trend without splitting by KI bin
        ax_inset = inset_axes(ax, width="50%", height="50%", bbox_to_anchor=(0.08, 0.005, 0.5, 0.5), bbox_transform=ax.transAxes, loc='lower left')
        sns.lineplot(data=df_condition, x=x_col, y="DC_percentile", color=overall_color, 
                    dashes=False, legend=False, ax=ax_inset)
        # add fit line for the overall categories
        slope, intercept, r_value, p_value, std_err = linregress(df_condition[x_col], df_condition['DC_percentile'])
        x_min, x_max = df_condition[x_col].min(), df_condition[x_col].max()
        ax_inset.plot([x_min, x_max], [intercept + slope * x_min, intercept + slope * x_max], linestyle='--')
        text_pos = x_max*0.75
        # the slope label uses the colour of the inset trend line
        ax_inset.text(text_pos, intercept + slope*text_pos + 1, f's={slope:.2f}', color=overall_color, fontsize=8)
            
        # set the x and y limits for the inset plot
        ax_inset.set_xlabel('')
        ax_inset.set_xlim(x_min-6,x_max+6)
        ax_inset.set_xticks([])
        plt.setp(ax_inset.get_xticklabels(), fontsize='x-small')
        ax_inset.tick_params(axis='x', which='both', pad=2)

        ax_inset.set_ylabel('')
        plt.setp(ax_inset.get_yticklabels(), fontsize='x-small')
        ax_inset.tick_params(axis='y', which='both', pad=2)
    
    # set the axis labels
    ax.set_xlabel('Citation percentile')
    ax.set_ylabel('Disruption percentile')
    
    # set the legend properties
    if condition != 'Field':    # set legend properties for 4A, 4B, 4G
        ax.legend(title=legend_title, title_fontsize=10, loc='best', ncol=1, fontsize=7, 
                    shadow=False, frameon=False, fancybox=False, handlelength=1, framealpha=0.8)
    else:                       # set custom legend for Figure 4C: two side-by-side legends
        handles, labels = ax.get_legend_handles_labels()
        cutoff = 8  # number of entries that go into the first legend
        # Create the first legend and keep it as a separate artist, because the next
        # ax.legend(...) call would otherwise replace it.
        first_legend = ax.legend(handles[:cutoff], labels[:cutoff], loc='upper center', bbox_to_anchor=(0.45, 1), title_fontsize=10, ncol=1, fontsize=5,
                    shadow=False, frameon=False, fancybox=False, handlelength=1, framealpha=0.8)
        ax.add_artist(first_legend)
        # Create the second legend with the remaining entries
        second_legend = ax.legend(handles[cutoff:], labels[cutoff:], loc='upper right', title_fontsize=10, ncol=1, fontsize=5,
                    shadow=False, frameon=False, fancybox=False, handlelength=1, framealpha=0.8)

    # The format string has two placeholders: the project root and the panel letter.
    fig.savefig('%s/results/figures/Fig4%s.pdf' % (pre_path, annotations_list[idx]), bbox_inches='tight', dpi=600, pad_inches=0.0)
    plt.show()
    plt.close(fig)  # Close the figure to avoid display issues in some environments

#### Figure 4D

In [None]:
# Plot Figure 4D: the lower and delayed citation of higher KI.
# This part builds `combined_df`: one row per citation received by a paper published in
# `cited_pub_year`, carrying the KI bin of the cited paper and its age (in years) when cited.
cited_pub_year = 2000
# .rename() returns a new frame, so the slices of selected_df are never modified in place
df_KI = selected_df[['PaperID','KI_bin']].rename(
    columns={'PaperID':'Cited_PaperID_new', 'KI_bin':'cited_KI_bin'})
df_year = selected_df[['PaperID','Year']]
# cited cohort: only the papers published exactly in cited_pub_year
df_year_select = df_year[df_year['Year']==cited_pub_year].rename(
    columns={'PaperID':'Cited_PaperID_new', 'Year':'cited_pub_year'})
# Publication year of every selected paper, used to date the citing side.
# This must not be restricted to cited_pub_year: otherwise every citing paper would also be
# from that year, all ages would be 0 and the `cited_age > 0` filter below would drop every row.
df_year_citing = df_year.rename(
    columns={'PaperID':'Citing_PaperID_new', 'Year':'citing_pub_year'})

# Load the Paper_newID mapping from the pickle file (its items are used as original ID -> new ID pairs).
# NOTE(review): pickle can execute arbitrary code on load; only open files produced by this project's pipeline.
with open("%s/data/processed/Paper_newID.pickle"%pre_path, 'rb') as f:
    Paper_newID = pickle.load(f)
Paper_newID_df = pd.DataFrame(list(Paper_newID.items()), columns=['Cited_PaperID', 'Cited_PaperID_new'])
# the same mapping, relabelled for the citing side
citing_newID_df = Paper_newID_df.rename(
    columns={'Cited_PaperID':'Citing_PaperID', 'Cited_PaperID_new':'Citing_PaperID_new'})

# Load the Citing_PaperID-Cited_PaperID citation pairs. 
Paper_Reference_df = read_big_csv("%s/data/raw/SciSciNet_PaperReferences.tsv"%pre_path, sep='\t', compression=None, chunksize=1000000, nrows=None, 
                           usecols=['Citing_PaperID', 'Cited_PaperID'])

combined_df = (
    Paper_Reference_df
    # cited side: map to the new ID, keep the cited_pub_year cohort, attach the KI bin
    .merge(Paper_newID_df, how='inner', on='Cited_PaperID')
    .drop(columns=['Cited_PaperID'])       # the original cited ID is no longer needed
    .merge(df_year_select, how='inner', on='Cited_PaperID_new')
    .drop(columns=['cited_pub_year'])      # constant by construction, not needed in the final DataFrame
    .merge(df_KI, how='inner', on='Cited_PaperID_new')
    # citing side: map to the new ID and attach the publication year of the citing paper
    .merge(citing_newID_df, how='inner', on='Citing_PaperID')
    .drop(columns=['Citing_PaperID'])      # the original citing ID is no longer needed
    .merge(df_year_citing, how='inner', on='Citing_PaperID_new')
)

combined_df['cited_age'] = combined_df['citing_pub_year'] - cited_pub_year # calculate the age of the cited paper
combined_df = combined_df[['Citing_PaperID_new','Cited_PaperID_new','cited_KI_bin','cited_age']]
# keep only citations made strictly after the publication year of the cited paper
combined_df = combined_df[combined_df['cited_age']>0]


# Deduplicate on the cited paper ID so that each cited paper counts once; the resulting
# KI-bin shares serve as the baseline distribution.
# (combined_df has no 'cited_id' column; the cited paper is identified by 'Cited_PaperID_new'.)
dedup_df = combined_df.drop_duplicates(subset='Cited_PaperID_new') 
ki_dist_overall = (
    dedup_df['cited_KI_bin']
    .value_counts(normalize=True)
    # name the index and the value column explicitly: the default names produced by
    # value_counts()/reset_index() differ between pandas versions
    .rename_axis('cited_KI_bin')
    .reset_index(name='percentage')
)
print(ki_dist_overall)

# without deduplication, obtain the distribution of KI for each cited_age
# (every citation counts, so highly cited papers weigh more)
ki_dist_yearly_cited = (
    combined_df
    .groupby('cited_age')['cited_KI_bin']
    .value_counts(normalize=True)
    .rename('percentage')
    .reset_index()
)
print(ki_dist_yearly_cited)


# Create a color palette for the lineplot: one shade of purple per cited age.
# The 5 lightest shades are skipped so that the earliest ages stay visible.
unique_ages = combined_df['cited_age'].sort_values().unique()
palette_ = sns.color_palette("Purples", as_cmap=False, n_colors=len(unique_ages)+5)
palette_ = palette_[5:]
age_min, age_max = min(unique_ages), max(unique_ages)

# Create a ScalarMappable for the colorbar (the lines are drawn without a legend)
norm = Normalize(vmin=age_min, vmax=age_max)
cmap = plt.get_cmap("Purples")
sm = ScalarMappable(norm=norm, cmap=cmap)
sm.set_array([])

# Visualize the KI baseline distribution and the distribution of KI for each cited_age 
fig,ax=plt.subplots(nrows=1, ncols=1, figsize=(12, 3))
fig.subplots_adjust(hspace=.3,wspace=.3) # hspace,wspace
# distribution of KI among the individual citable papers (baseline distribution)
sns.lineplot(
    data=ki_dist_overall,
    x='cited_KI_bin',
    y='percentage',
    color='#999A9E',
    ax=ax
)
# distribution of KI for each cited_age
sns.lineplot(
    data=ki_dist_yearly_cited,
    x='cited_KI_bin',
    y="percentage",
    hue='cited_age',
    hue_order=list(unique_ages),
    palette=palette_,
    dashes=False,
    legend=False,
    ax=ax
)

# set the x and y labels
ax.set_ylabel("Proportion")
ax.set_xlabel("KI")
ax.text(0.02, 0.70, '$T$ = %s'%cited_pub_year, transform=ax.transAxes,
        fontsize='large', verticalalignment='top', horizontalalignment='left')
# legend elements: the 'Yearly cited' entry has an invisible handle because its
# colors are explained by the colorbar built below
legend_elements = [
    Line2D([0], [0], color='#999A9E', label='Baseline'),
    Line2D([0], [0], color='none', label='Yearly cited')
]
ax.legend(title='', handles=legend_elements, title_fontsize=10, loc='upper left', ncol=1, fontsize=10, 
            shadow=False, frameon=False, fancybox=False, handlelength=1, framealpha=0.8)

# build the colorbar to the 'Yearly cited' in the legend
cax = inset_axes(ax, width="40%", height="8%", bbox_to_anchor=(0, 0, 1, 1), bbox_transform=ax.transAxes, borderpad=1, loc='upper right') # bbox_to_anchor=(x, y, width, height)
cbar = plt.colorbar(sm, cax=cax, orientation='horizontal')
cbar.set_label('Age $t$', color='purple')
cbar.ax.tick_params(colors='purple')
# Set the ticks and labels for the colorbar: only the two extreme ages are labelled
cbar.set_ticks([age_min, age_max])
cbar.set_ticklabels([str(age_min), str(age_max)])

# Save before showing. The path template has to be filled with pre_path; without the
# formatting the file would be written under a literal '%s' directory.
fig.savefig('%s/results/figures/Fig4D.pdf'%pre_path, bbox_inches='tight',dpi=600,pad_inches=0.0)
plt.show()
plt.close(fig)  # Close the figure to avoid display issues in some environments

#### Figure 4E

In [None]:
# Plot Figure 4E: higher KI associates with lower citation impact.
# drop rows with NaN values in any of the three columns used below
df_condition = selected_df[['KI_percentile_bin','Citation_percentile','Decade']].dropna()
# rescale the Citation_percentile to relative values by dividing by the mean of
# Citation_percentile in the same decade (a value of 1 means "equal to the decade average")
decade_mean = df_condition.groupby('Decade')['Citation_percentile'].transform('mean')
df_condition['Relative_Citation_percentile'] = df_condition['Citation_percentile'] / decade_mean


fig,ax=plt.subplots(nrows=1, ncols=1, figsize=(4, 3))
fig.subplots_adjust(hspace=.3,wspace=.3) # hspace,wspace
# Plot the subplot: one line per decade, colored from light to dark in chronological order
time_periods = ['1950s','1960s','1970s','1980s','1990s','2000s','2010s']
palette_ = sns.color_palette('OrRd', len(time_periods))
sns.lineplot(data=df_condition, x='KI_percentile_bin', y="Relative_Citation_percentile", hue='Decade', hue_order=time_periods, 
             palette=palette_, dashes=False, legend='full', ax=ax)

# Add a horizontal line at y=1 (the decade average)
ax.axhline(y=1, color='gray', linestyle='--', alpha=0.5)
ax.set_xlabel("KI percentile")
ax.set_ylabel("Relative Citation percentile")
ax.legend(title='', title_fontsize=10, loc='best', ncol=1, fontsize=8, 
            shadow=False, frameon=False, fancybox=False, handlelength=1, framealpha=0.8)

# Save before showing. The path template has to be filled with pre_path; without the
# formatting the file would be written under a literal '%s' directory.
fig.savefig('%s/results/figures/Fig4E.pdf'%pre_path, bbox_inches='tight',dpi=600,pad_inches=0.0)
plt.show()
plt.close(fig)  # Close the figure to avoid display issues in some environments

#### Figure 4F

In [None]:
# Plot Figure 4F: higher KI associates with higher likelihood of being sleeping beauty.
# drop rows with NaN values in any of the three columns used below
df_condition = selected_df[['KI_percentile_bin','SB_B','SB_T']].dropna()


fig,ax=plt.subplots(nrows=1, ncols=1, figsize=(4, 3))
fig.subplots_adjust(hspace=.3,wspace=.3) # hspace,wspace

# Left y axis: SB_B, labelled 'SB coefficient'; axis label and ticks share the line color
color_a, color_b = '#8A7197', '#999A9E'
sns.lineplot(data=df_condition, x='KI_percentile_bin', y="SB_B", color=color_a, ax=ax)
ax.set_xlabel('KI percentile')
ax.set_ylabel('SB coefficient', color=color_a)
ax.tick_params(axis='y', labelcolor=color_a)

# Setup the right y axis for Figure 4F: SB_T, labelled 'Awaking time', on a twin axis sharing x
ax_1 = ax.twinx()
sns.lineplot(data=df_condition, x='KI_percentile_bin', y='SB_T', color=color_b, ax=ax_1)
ax_1.set_ylabel('Awaking time', color=color_b)
ax_1.tick_params(axis='y', labelcolor=color_b)

# Save before showing. The path template has to be filled with pre_path; without the
# formatting the file would be written under a literal '%s' directory.
fig.savefig('%s/results/figures/Fig4F.pdf'%pre_path, bbox_inches='tight',dpi=600,pad_inches=0.0)
plt.show()
plt.close(fig)  # Close the figure to avoid display issues in some environments