In [1]:
#Note: robust standard errors are obtained via Stata
#See Stata files
#Python code is for figures and open-source reproducibility
import config_declining_disruption as config
import pandas as pd
import numpy as np
import gc

from sklearn import config_context, linear_model #we use scikit-learn 1.2.0

## Regression results for raw SciSciNet journal data 

Load the data 

In [None]:
# Load the raw SciSciNet journal data.
# Only the columns used further down are parsed (usecols), so the unused
# columns of the CSV are never held in memory.
columns_needed = ["PaperID", "cd_5", "year", "Field_Name", "Team_Size", "references"]

data = (
    pd.read_csv(
        config.DATA_PATH + 'SciSciNet_df_raw_journal_data.csv',
        usecols=columns_needed,
    )
    # usecols keeps the file's own column order; select explicitly to fix the order
    .loc[:, columns_needed]
    # Rename the year to Year for consistency
    .rename(columns={"year": "Year"})
    # Drop rows with missing values in the outcome or the grouping/team columns.
    # NOTE(review): "references" is not part of this subset, and the regressions
    # below run with assume_finite=True, so a missing reference count would not
    # be caught there -- confirm the column has no missing values.
    .dropna(subset=["cd_5", "Year", "Field_Name", "Team_Size"])
    .reset_index(drop=True)
)

Prepare data for regression

In [None]:
# Build the regression inputs:
#   x   - year dummies, field dummies, paper-level reference count and the
#         field-year level controls
#   x_0 - the same columns plus an indicator for papers with zero references
#   y   - the outcome cd_5

# Create dummies. Year_1944 and Field_Biology are dropped so that they act as
# the omitted reference categories.
grant_year_dummies = pd.get_dummies(data["Year"], prefix="Year").drop(columns="Year_1944")
subfield_dummies = pd.get_dummies(data["Field_Name"], prefix="Field").drop(columns="Field_Biology")

# Create control variables: totals per (Year, Field_Name) cell, broadcast back
# to every paper in that cell. The groupby is built once and reused.
by_year_field = data.groupby(["Year", "Field_Name"])
papers_per_cell = by_year_field["PaperID"].transform("size")
references_per_cell = by_year_field["references"].transform("sum")
authors_per_cell = by_year_field["Team_Size"].transform("sum")

data["no_of_papers_subfield_t"] = papers_per_cell
data["no_of_references_subfield_t"] = references_per_cell
data["no_of_authors_subfield_t"] = authors_per_cell

# Cell-level means = cell totals / number of papers in the cell
data["no_of_references_mean_subfield_t"] = data["no_of_references_subfield_t"] / data["no_of_papers_subfield_t"]
data["no_of_authors_mean_subfield_t"] = data["no_of_authors_subfield_t"] / data["no_of_papers_subfield_t"]

# New control: 1 if the paper has exactly zero references, else 0.
# Vectorised comparison instead of a per-row Python lambda; a missing value
# compares unequal to 0 and therefore maps to 0, exactly as before.
data["bin_0"] = (data["references"] == 0).astype("int64")

# Data
control_columns = [
    "references",
    "no_of_authors_mean_subfield_t",
    "no_of_references_mean_subfield_t",
    "no_of_papers_subfield_t",
]

x = pd.concat(
    [grant_year_dummies, subfield_dummies, data[control_columns]],
    axis=1,
)

# Same design matrix with the zero-reference indicator appended as last column
x_0 = pd.concat([x, data["bin_0"]], axis=1)

y = data["cd_5"]

Run the regressions

In [None]:
###Regression - takes 5 mins with these optimizations
#Large data set - use cholesky solver
# https://github.com/scikit-learn/scikit-learn/issues/13923
# https://github.com/scikit-learn/scikit-learn/pull/22940
def _adjusted_r_squared(model, design, target):
    """Return the adjusted R-squared of `model` scored on `design` and `target`.

    Uses n = number of rows and p = number of columns of `design`:
    1 - (1 - R^2) * (n - 1) / (n - p - 1).
    """
    n_obs = len(design)
    n_regressors = len(design.columns)
    return 1-(1-model.score(design, target))*((n_obs-1)/(n_obs-n_regressors-1))


# assume_finite=True skips scikit-learn's finiteness validation of the inputs
with config_context(assume_finite=True):
    # model_1: without the zero-reference dummy
    model_1 = linear_model.Ridge(alpha=1e-9, solver="cholesky").fit(x, y)
    # model_2: with the zero-reference dummy
    model_2 = linear_model.Ridge(alpha=1e-9, solver="cholesky").fit(x_0, y)

###Compute residuals (observed minus fitted) and keep them on the data frame
data["preds_1"] = model_1.predict(x)
data["preds_2"] = model_2.predict(x_0)

data["residuals_1"] = y - data["preds_1"]
data["residuals_2"] = y - data["preds_2"]
residuals_1 = data["residuals_1"].to_numpy()
residuals_2 = data["residuals_2"].to_numpy()

#Print Adjusted R-squared of the models
print(
    "Adjusted R-squared for regression without dummy at zero ref: ",
    _adjusted_r_squared(model_1, x, y),
)
print(
    "Adjusted R-squared for regression with dummy at zero ref: ",
    _adjusted_r_squared(model_2, x_0, y),
)

#Save the residuals
np.save(config.DATA_PATH + "SciSciNet_residuals_1_raw_journal_data.npy", residuals_1)
np.save(config.DATA_PATH + "SciSciNet_residuals_2_raw_journal_data.npy", residuals_2)

Adjusted R-squared for regression without dummy at zero ref:  0.15585605832839922
Adjusted R-squared for regression with dummy at zero ref:  0.9320422587789884


Compute and save the margins 

In [12]:
#Compute margins (average prediction per year, all other regressors as observed)
# https://www.stata.com/meeting/germany13/abstracts/materials/de13_jann.pdf
#See slides 14-15
#
# The margin for year i is the mean prediction over the original data with the
# dummy Year_i set to 1 and every other year dummy set to 0. Both models are
# linear, so that prediction equals the prediction with *all* year dummies at 0
# plus the coefficient of Year_i. One prediction per model is therefore enough;
# the previous version re-predicted the full data set once per year.
# Values can differ from the per-year re-prediction only by float rounding.
BASE_YEAR = 1944  # year whose dummy was dropped; it has no coefficient

# Switch every year dummy off (this overwrites x / x_0, which are deleted below)
x[grant_year_dummies.columns] = 0
x_0[grant_year_dummies.columns] = 0

# Mean prediction with all year dummies at 0 = margin of the base year
baseline_original = model_1.predict(x).mean()
baseline_zero_refs = model_2.predict(x_0).mean()

# Coefficients labelled by design-matrix column (models were fitted on x / x_0)
coef_original = pd.Series(np.ravel(model_1.coef_), index=x.columns)
coef_zero_refs = pd.Series(np.ravel(model_2.coef_), index=x_0.columns)

years = list(range(int(data["Year"].min()), int(data["Year"].max()+1)))


def _year_effect(coefficients, year):
    """Coefficient of the dummy for `year`; 0.0 for years up to the base year.

    Raises KeyError if a later year has no dummy column, i.e. the year does
    not occur in the data.
    """
    if year > BASE_YEAR:
        return coefficients[f"Year_{year}"]
    return 0.0


margins = pd.DataFrame(
    {
        "Years": years,
        "margins_original": [
            baseline_original + _year_effect(coef_original, year) for year in years
        ],
        "margins_zero_refs": [
            baseline_zero_refs + _year_effect(coef_zero_refs, year) for year in years
        ],
    }
)

# Free the models and design matrices before the next regressions
del model_1, model_2, x, x_0
gc.collect()

#Save margins
margins.to_csv(config.DATA_PATH+"SciSciNet_margins_raw_journal_data.csv", index=False)

Compute the adjusted $R^2$ of models with a dummy for exactly 1, 2, 3, 4, or 5 references (one model per dummy)

In [None]:
# Indicator columns bin_1 ... bin_5: 1 if the paper has exactly i references, else 0.
# Vectorised comparison instead of a per-row Python lambda; the result is the
# same int64 0/1 column (missing values compare unequal and map to 0).
for i in range(1, 6):
    data[f"bin_{i}"] = (data["references"] == i).astype("int64")

In [None]:
# One regression per reference-count dummy (bin_1 ... bin_5): same regressors as
# the baseline model plus a single indicator column. The design matrix and the
# model are freed after every fit to keep memory usage down.
for n_refs in range(1, 6):
    indicator_column = "bin_" + str(n_refs)

    regressors = [
        grant_year_dummies,
        subfield_dummies,
        data["references"],
        data["no_of_authors_mean_subfield_t"],
        data["no_of_references_mean_subfield_t"],
        data["no_of_papers_subfield_t"],
        data[indicator_column],
    ]
    x_new = pd.concat(regressors, axis=1)
    # Drop the list so x_new is the only object released by the del below
    del regressors

    with config_context(assume_finite=True):
        model_new = linear_model.Ridge(alpha=1e-9, solver="cholesky").fit(x_new, y)

    #Print Adjusted R-squared of the model
    n_obs = len(x_new)
    n_regressors = len(x_new.columns)
    adjusted_r2 = 1-(1-model_new.score(x_new,y))*((n_obs-1)/(n_obs-n_regressors-1))
    print(f"Adjusted R-squared for regression with dummy at {n_refs}: ", adjusted_r2)

    del model_new, x_new
    gc.collect()

Adjusted R-squared for regression with dummy at 1:  0.15898879717197967
Adjusted R-squared for regression with dummy at 2:  0.1675493711906444
Adjusted R-squared for regression with dummy at 3:  0.17020706232235427
Adjusted R-squared for regression with dummy at 4:  0.17068992814188666
Adjusted R-squared for regression with dummy at 5:  0.17033912842065835
