In [54]:
from uspto import *
import pandas as pd
from tqdm import tqdm

In [61]:
base_url = "https://bulkdata.uspto.gov/data/patent/grant/redbook/fulltext/"
Year = 2001

In [62]:
file_list = get_patent_files_by_year(Year)

In [8]:
for f in file_list:
    if 'r1' in f:
        file_list.remove(f[:9]+'.zip')
        

#### Run the code from here

In [63]:
pub_date_lst =[]
filing_date_lst = []
kind_lst =[]
doc_number_lst =[]
country_lst=[]
company_name_lst=[]

In [64]:
for file in tqdm(file_list):
    url = base_url+str(Year)+'/'+file
    items = ['ASSG']
    response_data = read_and_parse_from_url(url, items)
    pub_date = file[6:14]  #6:14
    
    for data in response_data:
        try:
            var_filing_date = data['bibliographic_information']['Application Filing Date']
            var_kind = data['bibliographic_information']['Series Code']
            var_doc_number =data['bibliographic_information']['Patent Number']
            var_country ='us'
            var_company_name = data['assignees'][0]['inventor name']
            filing_date_lst.append(var_filing_date)
            kind_lst.append(var_kind)
            doc_number_lst.append(var_doc_number)
            country_lst.append(var_country)
            company_name_lst.append(var_company_name) 
            pub_date_lst.append(pub_date)

        except Exception as e:
            e =1

data_dict = {"Filing_date":filing_date_lst,"Publish_date":pub_date_lst,"Classification":kind_lst,
"Patent_Number":doc_number_lst,"Country":country_lst, "Company_Name": company_name_lst}

df = pd.DataFrame(data_dict)
df.to_csv("data/raw_data/"+str(Year)+".csv", index=False)



100%|██████████| 52/52 [24:15<00:00, 27.98s/it]


### END of Code

In [35]:
for file in tqdm(file_list):
    url = base_url+str(Year)+'/'+file
    items = ['ASSG']
    response_data = read_and_parse_from_url(url, items)
    pub_date = file[6:14]
    for data in response_data:
        try:
            var_filing_date = data['bibliographic_information']['date']
            var_kind = data['bibliographic_information']['kind']
            var_doc_number =data['bibliographic_information']['doc-number']
            var_country =data['bibliographic_information']['country']
            var_company_name = data['assignees'][0]['addressbook/orgname']
            filing_date_lst.append(var_filing_date)
            kind_lst.append(var_kind)
            doc_number_lst.append(var_doc_number)
            country_lst.append(var_country)
            company_name_lst.append(var_company_name) 
            pub_date_lst.append(pub_date)

        except Exception as e:
            e =1

data_dict = {"Filing_date":filing_date_lst,"Publish_date":pub_date_lst,"Classification":kind_lst,
"Patent_Number":doc_number_lst,"Country":country_lst, "Company_Name": company_name_lst}

df = pd.DataFrame(data_dict)
df.to_csv("data/raw_data/"+str(Year)+".csv", index=False)


100%|██████████| 52/52 [18:40<00:00, 21.54s/it]


### tetsing Demo Regression

In [None]:
import plotly.express as px
import pandas as pd
import numpy as np
import statsmodels.api as sm


In [31]:
df = pd.read_excel('data/regres_final_data.xlsx')

In [37]:

df = df.dropna()

In [32]:
df = df.drop(columns=['Unnamed: 12','Unnamed: 13','Unnamed: 14'])

In [34]:

df['log_ROA_BASED_ON_BOTTOM_EPS'] = np.sign(df['ROA_BASED_ON_BOTTOM_EPS']) * np.log(np.abs(df['ROA_BASED_ON_BOTTOM_EPS']))


In [41]:
df['log_BS_TOT_ASSET'] = np.log(df['BS_TOT_ASSET'])

In [5]:
fig = px.scatter(df, x='date', y='ROA_BASED_ON_BOTTOM_EPS', title="Scatter Plot of ROA over Years", 
                 labels={'date': 'Year', 'ROA_BASED_ON_BOTTOM_EPS': 'ROA'})

# Show plot
fig.show()

In [29]:
df['no_patents'] = df['no_patents'].fillna(df['no_patents'].mean())


In [42]:


X = df[['no_patents', 'log_BS_TOT_ASSET',
       'CUR_MKT_CAP',
       'TOBIN_Q_RATIO', 'TOT_DEBT_TO_TOT_ASSET', 'TOT_DEBT_TO_TOT_EQY',]]  # independent variables
y = df['log_ROA_BASED_ON_BOTTOM_EPS']  # dependent variable

# Add a constant (intercept) to the model
X = sm.add_constant(X)

# Fit the OLS model
model = sm.OLS(y, X).fit()

# Print the summary of the regression
print(model.summary())

                                 OLS Regression Results                                
Dep. Variable:     log_ROA_BASED_ON_BOTTOM_EPS   R-squared:                       0.038
Model:                                     OLS   Adj. R-squared:                  0.036
Method:                          Least Squares   F-statistic:                     24.00
Date:                         Mon, 21 Oct 2024   Prob (F-statistic):           5.34e-28
Time:                                 20:06:40   Log-Likelihood:                -6427.5
No. Observations:                         3671   AIC:                         1.287e+04
Df Residuals:                             3664   BIC:                         1.291e+04
Df Model:                                    6                                         
Covariance Type:                     nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
------------------------------

In [44]:
from xgboost import XGBRegressor
from sklearn.metrics import r2_score

In [45]:
X = df[['no_patents', 'log_BS_TOT_ASSET', 'CUR_MKT_CAP', 
         'TOBIN_Q_RATIO', 'TOT_DEBT_TO_TOT_ASSET', 'TOT_DEBT_TO_TOT_EQY']]
y = df['log_ROA_BASED_ON_BOTTOM_EPS']

# Step 1: Create and fit the XGBoost model using all the data
model = XGBRegressor(objective='reg:squarederror', random_state=42)
model.fit(X, y)

# Step 2: Make predictions using the same data
y_pred = model.predict(X)

# Step 3: Calculate R-squared value
r2 = r2_score(y, y_pred)

# Print R-squared value
print(f'R-squared: {r2:.4f}')

R-squared: 0.9482
