In [1]:
#Note: robust standard errors are obtained via Stata
#See Stata files
#Python code is for figures and open-source reproducibility
import config_declining_disruption as config

import pandas as pd
import numpy as np

from sklearn import linear_model #we use scikit-learn 1.2.0


###Data prep - takes 1 min
#Read the analytical patent table; the NBER category id is parsed as text so
#that the string comparisons below behave as intended
patentsview_df = pd.read_csv(
    config.DATA_PATH + "patentsview_analytical_df.csv.gz",
    dtype={"field_nber_category_id": str},
    low_memory=False,
)

#Sample restrictions carried over from the original Stata code:
#keep rows whose NBER category is present, non-empty and neither "6" nor "7",
#and whose grant year lies in 1980-2010 (both ends included)
nber_category = patentsview_df["field_nber_category_id"]
has_kept_category = nber_category.notnull() & ~nber_category.isin(["", "6", "7"])
in_grant_window = patentsview_df["grant_year"].between(1980, 2010)
patentsview_df = patentsview_df[has_kept_category & in_grant_window]

#The regression needs an observed outcome, so rows without cd_5 are removed
patentsview_df = patentsview_df.dropna(subset=["cd_5"])

#Create dummies - one level per factor (grant year 1980, subfield 11.0) is left
#out as the reference category to avoid multicollinearity
grant_year_dummies = pd.get_dummies(
    patentsview_df["grant_year"], prefix="grant_year"
).drop(columns="grant_year_1980")

subfield_dummies = pd.get_dummies(
    patentsview_df["field_nber_subcategory_id"], prefix="subfield"
).drop(columns="subfield_11.0")

#Additional controls
#Both means are totals divided by the same count column, nsc_upatents_t
works_count = patentsview_df["nsc_upatents_t"]
patentsview_df["no_authors_mean"] = patentsview_df["nsc_upatents_inventors_total_t"] / works_count
patentsview_df["no_cited_mean"] = patentsview_df["nsc_upatents_cited_total_t"] / works_count
patentsview_df["no_of_works"] = works_count

#New controls
#0/1 indicator for rows whose cited_total equals zero
patentsview_df["bin_0"] = patentsview_df["cited_total"].eq(0).astype("int64")

#Data
#Baseline design matrix: year dummies, subfield dummies, then the four
#numeric controls, in that column order
x = pd.concat(
    [
        grant_year_dummies,
        subfield_dummies,
        patentsview_df[["cited_total", "no_authors_mean", "no_cited_mean", "no_of_works"]],
    ],
    axis=1,
)

#Same regressors with the bin_0 indicator appended as the last column
x_0 = pd.concat([x, patentsview_df["bin_0"]], axis=1)

#Outcome variable
y = patentsview_df["cd_5"]

###Regression - takes 2 min
#model_1 is fitted on the baseline regressors, model_2 on the regressors that
#also contain the bin_0 indicator
model_1 = linear_model.LinearRegression()
model_1.fit(x, y)
model_2 = linear_model.LinearRegression()
model_2.fit(x_0, y)

#Compute residuals
#In-sample fitted values are stored on the frame first ...
fitted_1 = model_1.predict(x)
fitted_2 = model_2.predict(x_0)
patentsview_df["preds_1"] = fitted_1
patentsview_df["preds_2"] = fitted_2

#... then the residuals (outcome minus fitted value), which are also kept as
#plain numpy arrays
patentsview_df["residuals_1"] = y - patentsview_df["preds_1"]
patentsview_df["residuals_2"] = y - patentsview_df["preds_2"]
residuals_1 = patentsview_df["residuals_1"].to_numpy()
residuals_2 = patentsview_df["residuals_2"].to_numpy()

#Compute margins
#See https://www.stata.com/meeting/germany13/abstracts/materials/de13_jann.pdf
#Slides 14-15
#For every grant year: predict on the original data with the year dummies
#overwritten so that only that year's dummy is 1, then average the predictions
x_margins_1 = x.copy()
x_margins_2 = x_0.copy()

year_columns = grant_year_dummies.columns
x_margins_1[year_columns] = 0
x_margins_2[year_columns] = 0

margins = pd.DataFrame({"years": list(range(1980, 2011))})

for row, year in enumerate(range(1980, 2011)):
    year_column = "grant_year_" + str(year)
    #The first year (1980) has no dummy column to switch on: all year dummies
    #stay at zero for it
    has_dummy = row > 0

    if has_dummy:
        x_margins_1[year_column] = 1
        x_margins_2[year_column] = 1

    margins.loc[row, "margins_original"] = model_1.predict(x_margins_1).mean()
    margins.loc[row, "margins_zero_refs"] = model_2.predict(x_margins_2).mean()

    #Switch the dummy back off so the next year starts from all zeros
    if has_dummy:
        x_margins_1[year_column] = 0
        x_margins_2[year_column] = 0

#Save margins and the residuals
#The margins table goes to CSV (without the index); each residual vector goes
#to its own .npy file under config.DATA_PATH
residual_outputs = {
    "patentsview_residuals_1.npy": residuals_1,
    "patentsview_residuals_2.npy": residuals_2,
}

margins.to_csv(config.DATA_PATH + "patentsview_margins.csv", index=False)
for file_name, residual_values in residual_outputs.items():
    np.save(config.DATA_PATH + file_name, residual_values)

In [2]:
#R-squared
#Robustness designs: the baseline regressors plus one 0/1 indicator for rows
#whose cited_total equals exactly k, for k = 1..5.

#Indicators bin_1 ... bin_5 (vectorised comparison instead of a row-wise apply;
#the result is the same int64 0/1 column)
for k in range(1, 6):
    patentsview_df["bin_" + str(k)] = patentsview_df["cited_total"].eq(k).astype("int64")

#Regressors shared by every design below: year dummies, subfield dummies and
#the four numeric controls, in this column order
baseline_regressors = pd.concat(
    [
        grant_year_dummies,
        subfield_dummies,
        patentsview_df[["cited_total", "no_authors_mean", "no_cited_mean", "no_of_works"]],
    ],
    axis=1,
)

#One design matrix per indicator; the indicator is always the last column
x_1 = pd.concat([baseline_regressors, patentsview_df["bin_1"]], axis=1)
x_2 = pd.concat([baseline_regressors, patentsview_df["bin_2"]], axis=1)
x_3 = pd.concat([baseline_regressors, patentsview_df["bin_3"]], axis=1)
x_4 = pd.concat([baseline_regressors, patentsview_df["bin_4"]], axis=1)
x_5 = pd.concat([baseline_regressors, patentsview_df["bin_5"]], axis=1)

#Fit one OLS model per robustness design (x_1 ... x_5) against the same outcome
model_3, model_4, model_5, model_6, model_7 = [
    linear_model.LinearRegression().fit(design_matrix, y)
    for design_matrix in (x_1, x_2, x_3, x_4, x_5)
]

def adjusted_r_squared(model, design, outcome):
    """Return the adjusted R-squared of a fitted model on ``design``/``outcome``.

    Computes 1 - (1 - R2) * (n - 1) / (n - p - 1), where R2 is
    ``model.score(design, outcome)``, n is the number of rows of ``design``
    and p is its number of columns. Both n and p are taken from the same
    design matrix so the two cannot get out of sync.
    """
    n_obs = len(design)
    n_regressors = len(design.columns)
    r_squared = model.score(design, outcome)
    return 1 - (1 - r_squared) * ((n_obs - 1) / (n_obs - n_regressors - 1))


#Adjusted R-squared, one line per model, in the order
#model_1 (x), model_2 (x_0), model_3 (x_1), ..., model_7 (x_5)
for fitted_model, design_matrix in (
    (model_1, x),
    (model_2, x_0),
    (model_3, x_1),
    (model_4, x_2),
    (model_5, x_3),
    (model_6, x_4),
    (model_7, x_5),
):
    print(adjusted_r_squared(fitted_model, design_matrix, y))

0.09566384149062512
0.517976914241067
0.10954177538198284
0.09567702084170959
0.09795743173232752
0.09975755822140187
0.10010021622064813
