In [1]:
#Note: robust standard errors are obtained via Stata
#See Stata files
#Python code is for figures and open-source reproducibility
import config_declining_disruption as config
import pandas as pd
import numpy as np
import gc

from sklearn import config_context, linear_model #we use scikit-learn 1.2.0


###Data prep - takes 5 min
#Load the paper-level table from the configured data directory
data = pd.read_csv(config.DATA_PATH+"SciSciNet_df.csv")

#Keep only rows where the outcome (cd_5) and the columns used for the
#dummies / group aggregates are all present
required_columns = ["cd_5", "Year", "Field_Name", "Team_Size"]
data = data.dropna(subset=required_columns)

#One-hot encode year and field. One column of each set is removed so that
#category is the omitted one ("Year_1944.0" and "Field_Art").
grant_year_dummies = pd.get_dummies(data["Year"], prefix="Year").drop(
    columns="Year_1944.0"
)
subfield_dummies = pd.get_dummies(data["Field_Name"], prefix="Field").drop(
    columns="Field_Art"
)

#Create control variables
#All three aggregates are per (Year, Field_Name) cell, so the grouping is built
#once and reused instead of being recomputed for every aggregate.
year_field_groups = data.groupby(["Year", "Field_Name"])
papers_per_cell = year_field_groups["PaperID"].transform("size")
references_per_cell = year_field_groups["references"].transform("sum")
authors_per_cell = year_field_groups["Team_Size"].transform("sum")
del year_field_groups  #drop the cached grouping before the large matrices are built

#Number of papers, total references and total authors in the paper's year x field cell
data["no_of_papers_subfield_t"] = papers_per_cell
data["no_of_references_subfield_t"] = references_per_cell
data["no_of_authors_subfield_t"] = authors_per_cell

#Per-paper means within the year x field cell
data["no_of_references_mean_subfield_t"] = data["no_of_references_subfield_t"] / data["no_of_papers_subfield_t"]
data["no_of_authors_mean_subfield_t"] = data["no_of_authors_subfield_t"] / data["no_of_papers_subfield_t"]

#New control
#Indicator (0/1, int64) for papers with exactly zero references. A vectorised
#comparison replaces the row-by-row Python lambda; values and dtype are the same.
data["bin_0"] = data["references"].eq(0).astype("int64")

#Data
#Regressors shared by both specifications: year dummies, field dummies, the
#paper's own reference count and the three year x field controls.
base_regressors = [
    grant_year_dummies,
    subfield_dummies,
    data["references"],
    data["no_of_authors_mean_subfield_t"],
    data["no_of_references_mean_subfield_t"],
    data["no_of_papers_subfield_t"],
]

#x   : baseline specification
#x_0 : baseline plus the zero-references indicator as the last column
x = pd.concat(base_regressors, axis=1)
x_0 = pd.concat([*base_regressors, data["bin_0"]], axis=1)
del base_regressors

#Outcome
y = data["cd_5"]

###Regression - takes 5 mins with these optimizations
#Large data set - use cholesky solver
# https://github.com/scikit-learn/scikit-learn/issues/13923
# https://github.com/scikit-learn/scikit-learn/pull/22940
#Both specifications are fitted inside one assume_finite context.
#alpha=1e-9 with solver="cholesky" is the setting used for every fit in this notebook.
with config_context(assume_finite=True):
    model_1 = linear_model.Ridge(alpha=1e-9, solver="cholesky").fit(x, y)
    model_2 = linear_model.Ridge(alpha=1e-9, solver="cholesky").fit(x_0, y)

###Compute residuals
#Fitted values first, then outcome minus fitted value, stored on `data`
#and also kept as plain NumPy arrays for saving later.
data["preds_1"] = model_1.predict(x)
data["preds_2"] = model_2.predict(x_0)

data["residuals_1"] = y - data["preds_1"]
residuals_1 = data["residuals_1"].to_numpy()
data["residuals_2"] = y - data["preds_2"]
residuals_2 = data["residuals_2"].to_numpy()

#Print Adjusted R-squared of the models
#adj R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1), n = rows, p = regressor columns
n_obs_1, n_regressors_1 = x.shape
n_obs_2, n_regressors_2 = x_0.shape
r2_1 = model_1.score(x, y)
r2_2 = model_2.score(x_0, y)
print(1 - (1 - r2_1) * ((n_obs_1 - 1) / (n_obs_1 - n_regressors_1 - 1)))
print(1 - (1 - r2_2) * ((n_obs_2 - 1) / (n_obs_2 - n_regressors_2 - 1)))

#Compute margins - takes 120 mins
# https://www.stata.com/meeting/germany13/abstracts/materials/de13_jann.pdf
#See slides 14-15
#Make predictions with original data, but for each year set grant_year_i = 1
#
#For every calendar year between the first and last year in `data`, all rows are
#treated as if they belonged to that year (its dummy set to 1, every other year
#dummy 0) and the mean prediction of each model is recorded.
# NOTE(review): the models are linear, so each margin should equal the
# all-dummies-zero margin plus that year's coefficient; the full predict per year
# is kept here so the saved numbers stay exactly as before.

#Omitted year category; must match the "Year_1944.0" dummy dropped during data prep.
BASELINE_YEAR = 1944
year_dummy_columns = set(grant_year_dummies.columns)

#Start from "no year selected" in both design matrices
x[grant_year_dummies.columns] = 0
x_0[grant_year_dummies.columns] = 0

years = list(range(int(data["Year"].min()), int(data["Year"].max()+1)))
margins_original = []
margins_zero_refs = []

for year in years:
    #Dummy names carry a ".0" suffix, as in "Year_1944.0"
    year_column = f"Year_{year}.0"
    has_dummy = year_column in year_dummy_columns

    if not has_dummy and year != BASELINE_YEAR:
        #This year has no dummy in the design matrices. Assigning to
        #x[year_column] would append a column the fitted models were never
        #trained on, so record a missing margin instead.
        margins_original.append(np.nan)
        margins_zero_refs.append(np.nan)
        continue

    #The baseline year is represented by all year dummies being 0
    if has_dummy:
        x[year_column] = 1
        x_0[year_column] = 1

    margins_original.append(model_1.predict(x).mean())
    margins_zero_refs.append(model_2.predict(x_0).mean())

    #Restore the all-zero state for the next year
    if has_dummy:
        x[year_column] = 0
        x_0[year_column] = 0

#One row per year, same column names and order as before
margins = pd.DataFrame(
    {
        "Years": years,
        "margins_original": margins_original,
        "margins_zero_refs": margins_zero_refs,
    }
)

#The models and design matrices are no longer needed; release their memory
del model_1, model_2, x, x_0
gc.collect()

#Save margins and the residuals
#Writes the per-year margins table as CSV (without the index) and each model's
#residual vector as a .npy file, all under config.DATA_PATH.
# NOTE(review): the margins file is prefixed "SciSciNet_" but the residual files
# are prefixed "patentsview_", although they hold residuals of the models fitted
# on SciSciNet_df.csv in this notebook. Confirm the names are intentional and do
# not collide with files written by another notebook.
margins.to_csv(config.DATA_PATH+"SciSciNet_margins.csv", index=False)
np.save(config.DATA_PATH + "patentsview_residuals_1.npy", residuals_1)
np.save(config.DATA_PATH + "patentsview_residuals_2.npy", residuals_2)

0.15385705571540897
0.9460157668286039


In [3]:
#Re-fit the baseline specification five times, each time adding one indicator
#for papers with exactly k references (k = 1..5), and print the adjusted
#R-squared of every fit.
BIN_VALUES = range(1, 6)

#Indicators (0/1, int64) stored on `data` as bin_1 ... bin_5. A vectorised
#comparison replaces the row-by-row Python lambda; values and dtype are the same.
for bin_value in BIN_VALUES:
    data[f"bin_{bin_value}"] = data["references"].eq(bin_value).astype("int64")


def adjusted_r2_with_indicator(indicator):
    """Fit the baseline regressors plus one extra column and return adjusted R-squared.

    Reads the notebook-level objects ``grant_year_dummies``, ``subfield_dummies``,
    ``data`` and ``y``.

    Parameters
    ----------
    indicator : pandas.Series
        Extra regressor appended as the last column of the design matrix; it
        must share the index of ``data``.

    Returns
    -------
    float
        ``1 - (1 - R2) * (n - 1) / (n - p - 1)`` where ``n`` is the number of
        rows and ``p`` the number of columns of the design matrix.
    """
    design = pd.concat(
        [
            grant_year_dummies,
            subfield_dummies,
            data["references"],
            data["no_of_authors_mean_subfield_t"],
            data["no_of_references_mean_subfield_t"],
            data["no_of_papers_subfield_t"],
            indicator,
        ],
        axis=1,
    )

    #Same estimator settings as the other fits in this notebook
    with config_context(assume_finite=True):
        model = linear_model.Ridge(alpha=1e-9, solver="cholesky").fit(design, y)

    n_obs, n_regressors = design.shape
    #`design` and `model` are local, so they are released when the function returns
    return 1-(1-model.score(design,y))*((n_obs-1)/(n_obs-n_regressors-1))


#Print Adjusted R-squared of the model, one line per indicator (bin_1 first)
for bin_value in BIN_VALUES:
    gc.collect()  #reclaim anything left from the previous fit before building the next matrix
    print(adjusted_r2_with_indicator(data[f"bin_{bin_value}"]))

#Kept as the last expression so the cell still reports the collector's return value
gc.collect()

0.15991098025272588
0.16620440721129714
0.16819010088222153
0.16831851544344134
0.16784256685955745


0