In [1]:
import pandas as pd
import numpy as np


In [2]:
# Import the relevant data.
# Patent numbers mix purely numeric ids (e.g. 4162014) with design ids
# (e.g. D257752). Every join key is read as a string so both sides of the later
# merges share one dtype; without this, pandas may infer integers for all-numeric
# chunks of a column, and those values never match the string keys.
master = pd.read_csv('data/master.csv', dtype={'patent_number': str})
citations = pd.read_csv('data/citations.csv', dtype={'patent_number': str})
focal_citation_link = pd.read_csv(
    'data/citations_patents_level.csv',
    dtype={'cited_patent_number': str, 'patent_number': str},
)
focal_class = pd.read_csv('data/uspcs_level.csv', dtype={'patent_number': str})

In [3]:
# Preview the first rows of the table loaded from data/master.csv.
master.head()

Unnamed: 0,patent_number,grant_year,app_year,num_inventors,us_inventor,cite_foreign_patent,is_missing,num_design_cited,num_utility_cited,non-pat_refs,num_figures,num_assignees,priority_date
0,D257752,1981,1980,1,1.0,0.0,1,1,2,1.0,6.0,1,1980
1,D257924,1981,1980,1,1.0,0.0,1,1,2,1.0,6.0,1,1980
2,D258382,1981,1980,2,1.0,0.0,1,2,1,0.0,5.0,1,1980
3,D258383,1981,1980,2,1.0,0.0,1,2,1,0.0,5.0,1,1980
4,D258571,1981,1980,1,1.0,0.0,1,1,1,0.0,2.0,1,1980


In [4]:
# Keep only the two master columns used downstream: the patent id and its priority_date.
focal = master.loc[:, ['patent_number', 'priority_date']]
focal.head()

Unnamed: 0,patent_number,priority_date
0,D257752,1980
1,D257924,1980
2,D258382,1980
3,D258383,1980
4,D258571,1980


In [5]:
# Preview the classification rows loaded from data/citations.csv.
citations.head()

Unnamed: 0,uspc_mainclass_id,uspc_subclass_id,patent_number
0,248,248/423,3930272
1,403,403/107,3930272
2,5,5/11,3930272
3,5,5/100,3930272
4,47,47/58.1R,3930335


In [6]:
# Preview the link table pairing each patent_number with a cited_patent_number.
focal_citation_link.head()

Unnamed: 0,cited_patent_number,patent_number
0,4162014,D257752
1,4162014,D257924
2,D253842,D258382
3,D253842,D258383
4,D253842,D258678


In [7]:
# Count missing values per column in the link table.
focal_citation_link.isnull().sum()

cited_patent_number    0
patent_number          0
dtype: int64

In [8]:
# Attach cited patents to every focal patent. The left join keeps focal patents
# that have no row in the link table; their cited_patent_number becomes NaN.
focal_citation = focal.merge(focal_citation_link, how='left', on='patent_number')
focal_citation.head()

Unnamed: 0,patent_number,priority_date,cited_patent_number
0,D257752,1980,4162014
1,D257924,1980,4162014
2,D258382,1980,D253842
3,D258383,1980,D253842
4,D258571,1980,


In [9]:
# Number of distinct focal patents present after the left merge.
focal_citation['patent_number'].nunique()

525490

In [10]:
# Missing values per column; NaN in cited_patent_number marks focal patents
# that found no match in the link table during the left merge.
focal_citation.isnull().sum()

patent_number              0
priority_date              0
cited_patent_number    18363
dtype: int64

Let's see what is null and why

In [11]:
# Show the rows whose cited patent is missing.
missing_cited = focal_citation['cited_patent_number'].isnull()
focal_citation[missing_cited]

Unnamed: 0,patent_number,priority_date,cited_patent_number
4,D258571,1980,
5,D258650,1980,
8,D258760,1980,
21,D259183,1980,
45,D259575,1980,
46,D259610,1980,
56,D259669,1980,
63,D259753,1980,
65,D259820,1980,
75,D259959,1980,


After viewing a sample, these patents either make no references at all (rare) or have only pre-1976 references (more common). Since they are not useful for the analysis, I will drop them.

In [12]:
# Share of merged ROWS (not of patents) taken by each cited_patent_number value.
# dropna=False keeps NaN in the tally, so the NaN entry is the fraction of rows
# that have no cited patent.
focal_citation['cited_patent_number'].value_counts(normalize=True, dropna=False).head()

NaN        0.002822
D504889    0.000077
D337569    0.000071
D341848    0.000068
D561358    0.000059
Name: cited_patent_number, dtype: float64

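Less than 1 percent (about 0.28%) of the rows in the merged table are affected by this. Note that this is a share of rows, not of patents: 18,363 of the 525,490 patents (about 3.5%) have no cited patent.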

In [13]:
# Discard rows containing any missing value, then recount the distinct focal patents.
focal_citation = focal_citation.dropna()
focal_citation['patent_number'].nunique()

507127

# Current number of patents represented: 507,127

In [14]:
# Rename the citations key so it lines up with the link table's column name.
# Only the column is renamed: the row index is left untouched, because the merge
# below builds a fresh index and converting every label to a string is wasted work.
citations.rename(columns={'patent_number': 'cited_patent_number'}, inplace=True)
# Left join keeps every citation row; cited patents with no classification get NaN classes.
focal_citation_citeclass = pd.merge(focal_citation, citations, on='cited_patent_number', how='left')
focal_citation_citeclass.patent_number.nunique()

507127

In [15]:
# Preview the citation rows with the cited patent's classes attached.
focal_citation_citeclass.head()

Unnamed: 0,patent_number,priority_date,cited_patent_number,uspc_mainclass_id,uspc_subclass_id
0,D257752,1980,4162014,211,211/50
1,D257752,1980,4162014,211,211/55
2,D257752,1980,4162014,211,211/126.1
3,D257924,1980,4162014,211,211/50
4,D257924,1980,4162014,211,211/55


In [16]:
# Prefix the cited patent's class columns with "cite_" so they cannot collide with
# the focal patent's class columns merged in later. Only columns are renamed; the
# index is not converted to strings, since nothing downstream reads its labels.
focal_citation_citeclass.rename(
    columns={'uspc_mainclass_id': 'cite_mainclass', 'uspc_subclass_id': 'cite_subclass'},
    inplace=True,
)
focal_citation_citeclass.isnull().sum()

patent_number            0
priority_date            0
cited_patent_number      0
cite_mainclass         530
cite_subclass          530
dtype: int64

Drop nulls. Without a classification, a citation is useless.

In [17]:
# Remove rows with any missing value, then recount the distinct focal patents.
focal_citation_citeclass = focal_citation_citeclass.dropna()
focal_citation_citeclass['patent_number'].nunique()

507126

# Current number of patents represented: 507,126

In [18]:
# Attach the focal patent's own classes; the left join keeps citation rows whose
# focal patent has no entry in focal_class (their class columns become NaN).
final = focal_citation_citeclass.merge(focal_class, how='left', on='patent_number')
final.head()

Unnamed: 0,patent_number,priority_date,cited_patent_number,cite_mainclass,cite_subclass,uspc_mainclass_id,uspc_subclass_id
0,D257752,1980,4162014,211,211/50,D19,D19/75
1,D257752,1980,4162014,211,211/55,D19,D19/75
2,D257752,1980,4162014,211,211/126.1,D19,D19/75
3,D257924,1980,4162014,211,211/50,D06,D06/573
4,D257924,1980,4162014,211,211/55,D06,D06/573


In [19]:
# Label the newly merged class columns as belonging to the focal patent.
# Only columns are renamed: turning the index of this multi-million-row frame into
# strings costs memory and time, and the labels are discarded by a later reset_index.
final.rename(
    columns={'uspc_mainclass_id': 'focal_mainclass', 'uspc_subclass_id': 'focal_subclass'},
    inplace=True,
)

In [20]:
# (rows, columns) of the fully merged table.
final.shape

(21778093, 7)

In [21]:
# Distinct focal patents in the merged table.
final.patent_number.nunique()

507126

In [22]:
# Missing values per column; NaN focal classes come from the left merge with focal_class.
final.isnull().sum()

patent_number               0
priority_date               0
cited_patent_number         0
cite_mainclass              0
cite_subclass               0
focal_mainclass        207651
focal_subclass         207651
dtype: int64

About 4,000 of our patents (507,126 − 503,128 = 3,998) had no focal classification. These are mainly withdrawn patents and newly granted patents. I assume this has to do with PatentsView getting updated information.

In [23]:
# Drop rows with any missing value, then recount the distinct focal patents.
final = final.dropna()
final['patent_number'].nunique()

503128

# Current patent count: 503,128

Let's make this smaller to be easier for memory to deal with

In [24]:
# Fraction of the patents in master that are still present in final.
retained_patents = final['patent_number'].nunique()
original_patents = master['patent_number'].nunique()
retained_patents / original_patents

0.9574454318826239

95.74% of the original dataset is still represented

In [25]:
# Preview the cleaned table before building class combinations.
final.head()

Unnamed: 0,patent_number,priority_date,cited_patent_number,cite_mainclass,cite_subclass,focal_mainclass,focal_subclass
0,D257752,1980,4162014,211,211/50,D19,D19/75
1,D257752,1980,4162014,211,211/55,D19,D19/75
2,D257752,1980,4162014,211,211/126.1,D19,D19/75
3,D257924,1980,4162014,211,211/50,D06,D06/573
4,D257924,1980,4162014,211,211/55,D06,D06/573


# Analysis with subclass granularity

In [26]:
# Build the unique (patent, cited subclass, focal subclass) rows, ordered
# chronologically and then by patent and focal subclass.
unused_columns = ['cited_patent_number', 'cite_mainclass', 'focal_mainclass']
sort_keys = ['priority_date', 'patent_number', 'focal_subclass']
final_combinations = (
    final
    .drop(unused_columns, axis=1)
    .drop_duplicates()
    .sort_values(by=sort_keys)
)
final_combinations.head()

Unnamed: 0,patent_number,priority_date,cite_subclass,focal_subclass
4602315,D466542,1902,D16/311,D16/303
4602316,D466542,1902,2/428,D16/303
4602317,D466542,1902,2/452,D16/303
4602318,D466542,1902,2/442,D16/303
4602320,D466542,1902,2/445,D16/303


In [27]:
# Necessary for accurate indexing: order rows chronologically and relabel them 0..n-1.
# The sort must be stable (mergesort). The default quicksort may reorder rows that
# share a priority_date, which would destroy the within-year ordering by
# patent_number and focal_subclass established when final_combinations was built.
final_combinations = (
    final_combinations
    .sort_values('priority_date', kind='mergesort')
    .reset_index(drop=True)
)
final_combinations.head()

Unnamed: 0,patent_number,priority_date,cite_subclass,focal_subclass
0,D466542,1902,D16/311,D16/303
1,D466542,1902,2/428,D16/303
2,D466542,1902,2/452,D16/303
3,D466542,1902,2/442,D16/303
4,D466542,1902,2/445,D16/303


In [28]:
# (rows, columns) of the de-duplicated subclass-level table.
final_combinations.shape

(11447580, 4)

11,447,580 individual backward citations at the subclass level

In [29]:
# Add a first_seen column initialised to 0 for every row.
# NOTE(review): no visible cell updates this column before the table is saved —
# confirm whether the marking step is meant to happen elsewhere.
final_combinations = final_combinations.assign(first_seen=0)
final_combinations.head()

Unnamed: 0,patent_number,priority_date,cite_subclass,focal_subclass,first_seen
0,D466542,1902,D16/311,D16/303,0
1,D466542,1902,2/428,D16/303,0
2,D466542,1902,2/452,D16/303,0
3,D466542,1902,2/442,D16/303,0
4,D466542,1902,2/445,D16/303,0


Sort class pairs so they are reciprocal, then drop the duplicates. This marks all the first occurrences, since the dataframe is sorted by year. The length of these indices will indicate how many unique class pairs exist.

In [38]:
# Persist the subclass-level table as CSV. index_label=False writes the row index
# without a header field for it.
final_combinations.to_csv('data/final_combinations.csv', index_label=False)

# Analysis with mainclass granularity

In [26]:
# Build the unique (patent, cited mainclass, focal mainclass) rows, ordered
# chronologically and then by patent and focal mainclass.
# NOTE(review): this rebinds final_combinations, replacing the subclass-level table.
unused_columns = ['cited_patent_number', 'cite_subclass', 'focal_subclass']
sort_keys = ['priority_date', 'patent_number', 'focal_mainclass']
final_combinations = (
    final
    .drop(unused_columns, axis=1)
    .drop_duplicates()
    .sort_values(by=sort_keys)
)
final_combinations.head()

Unnamed: 0,patent_number,priority_date,cite_mainclass,focal_mainclass
4602315,D466542,1902,D16,D16
4602316,D466542,1902,2,D16
6144836,D499408,1904,312,D14
6144839,D499408,1904,D14,D14
6362160,D510547,1905,D12,D12


In [27]:
# Necessary for accurate indexing: order rows chronologically and relabel them 0..n-1.
# The sort must be stable (mergesort). The default quicksort may reorder rows that
# share a priority_date, which would destroy the within-year ordering by
# patent_number and focal_mainclass established when final_combinations was built.
final_combinations = (
    final_combinations
    .sort_values('priority_date', kind='mergesort')
    .reset_index(drop=True)
)
final_combinations.head()

Unnamed: 0,patent_number,priority_date,cite_mainclass,focal_mainclass
0,D466542,1902,D16,D16
1,D466542,1902,2,D16
2,D499408,1904,312,D14
3,D499408,1904,D14,D14
4,D510547,1905,D12,D12


In [28]:
# (rows, columns) of the de-duplicated mainclass-level table.
final_combinations.shape

(2371580, 4)

2,371,580 individual backward citation pairs at the mainclass level

In [29]:
# Add a first_seen column initialised to 0 for every row.
# NOTE(review): no visible cell updates this column before the table is saved —
# confirm whether the marking step is meant to happen elsewhere.
final_combinations = final_combinations.assign(first_seen=0)
final_combinations.head()

Unnamed: 0,patent_number,priority_date,cite_mainclass,focal_mainclass,first_seen
0,D466542,1902,D16,D16,0
1,D466542,1902,2,D16,0
2,D499408,1904,312,D14,0
3,D499408,1904,D14,D14,0
4,D510547,1905,D12,D12,0


In [30]:
# Persist the mainclass-level table as CSV. index_label=False writes the row index
# without a header field for it.
final_combinations.to_csv('data/final_combinations_main.csv', index_label=False)

In [31]:
# Show a bounded preview rather than rendering the entire master frame.
master.head(10)

Unnamed: 0,patent_number,grant_year,app_year,num_inventors,us_inventor,cite_foreign_patent,is_missing,num_design_cited,num_utility_cited,non-pat_refs,num_figures,num_assignees,priority_date
0,D257752,1981,1980,1,1.0,0.0,1,1,2,1.0,6.0,1,1980
1,D257924,1981,1980,1,1.0,0.0,1,1,2,1.0,6.0,1,1980
2,D258382,1981,1980,2,1.0,0.0,1,2,1,0.0,5.0,1,1980
3,D258383,1981,1980,2,1.0,0.0,1,2,1,0.0,5.0,1,1980
4,D258571,1981,1980,1,1.0,0.0,1,1,1,0.0,2.0,1,1980
5,D258650,1981,1980,3,1.0,0.0,1,1,1,0.0,6.0,1,1980
6,D258678,1981,1980,2,1.0,0.0,1,2,1,0.0,5.0,1,1980
7,D258755,1981,1980,2,1.0,0.0,1,2,1,0.0,5.0,1,1980
8,D258760,1981,1980,1,1.0,0.0,1,1,1,0.0,6.0,1,1980
9,D258766,1981,1980,1,1.0,0.0,1,1,11,0.0,3.0,1,1980
