In [1]:
import pandas as pd
import numpy as np

import design_class_dictionary

In [2]:
# Import relevant data.
# Patent numbers mix purely numeric utility numbers (e.g. 4162014) with
# letter-prefixed design numbers (e.g. D253842), and class ids mix numeric
# classes (e.g. 248) with design classes (e.g. D25). Read every identifier
# column as a string so that chunked type inference can never produce a
# column holding both ints and strings, which would break the later merges
# on patent number and the .str filters on class ids.
# master = pd.read_csv('data/master.csv')
master = pd.read_csv('data/master_1980_2013.csv', dtype={'patent_number': str})
citations = pd.read_csv(
    'data/citations.csv',
    dtype={'patent_number': str, 'uspc_mainclass_id': str, 'uspc_subclass_id': str},
)
focal_citation_link = pd.read_csv(
    'data/citations_patents_level.csv',
    dtype={'cited_patent_number': str, 'patent_number': str},
)
focal_class = pd.read_csv(
    'data/uspcs_level.csv',
    dtype={'patent_number': str, 'uspc_mainclass_id': str, 'uspc_subclass_id': str},
)

In [3]:
master.head()

Unnamed: 0,patent_number,grant_year,app_year,num_inventors,us_inventor,cite_foreign_patent,is_missing,num_design_cited,num_utility_cited,non-pat_refs,num_figures,num_assignees,missing_data,family_size,country_transformed,priority_date,origin
0,D257752,1981,1980,1,1,0,1,1,2,1,6,1,0,1,US,1980.0,0
1,D257924,1981,1980,1,1,0,1,1,2,1,6,1,0,1,US,1980.0,0
2,D258382,1981,1980,2,1,0,1,2,1,0,5,1,0,1,US,1980.0,0
3,D258383,1981,1980,2,1,0,1,2,1,0,5,1,0,1,US,1980.0,0
4,D258678,1981,1980,2,1,0,1,2,1,0,5,1,0,1,US,1980.0,0


In [4]:
# Keep only the focal patent identifier and its priority year
focal = master.loc[:, ['patent_number', 'priority_date']]
focal.head()

Unnamed: 0,patent_number,priority_date
0,D257752,1980.0
1,D257924,1980.0
2,D258382,1980.0
3,D258383,1980.0
4,D258678,1980.0


In [5]:
citations.head()

Unnamed: 0,uspc_mainclass_id,uspc_subclass_id,patent_number
0,248,248/423,3930272
1,403,403/107,3930272
2,5,5/11,3930272
3,5,5/100,3930272
4,47,47/58.1R,3930335


In [6]:
focal_citation_link.head()

Unnamed: 0,cited_patent_number,patent_number
0,4162014,D257752
1,4162014,D257924
2,D253842,D258382
3,D253842,D258383
4,D253842,D258678


In [7]:
focal_citation_link.isnull().sum()

cited_patent_number    0
patent_number          0
dtype: int64

In [8]:
# Attach each focal patent's cited patents; the left join keeps focal patents with no citation row
focal_citation = focal.merge(focal_citation_link, on='patent_number', how='left')
focal_citation.head()

Unnamed: 0,patent_number,priority_date,cited_patent_number
0,D257752,1980.0,4162014
1,D257924,1980.0,4162014
2,D258382,1980.0,D253842
3,D258383,1980.0,D253842
4,D258678,1980.0,D253842


In [9]:
focal_citation['patent_number'].nunique()

437678

In [10]:
focal_citation.isnull().sum()

patent_number          0
priority_date          0
cited_patent_number    0
dtype: int64

Let's see which rows are null and why.

In [11]:
# Rows where the left merge found no cited patent
missing_citation = focal_citation['cited_patent_number'].isnull()
focal_citation.loc[missing_citation]

Unnamed: 0,patent_number,priority_date,cited_patent_number


After viewing a sample, these patents either make no references at all (rare) or cite only pre-1976 patents (more common). Since they are not useful for the analysis, I will drop them.

In [12]:
# Share of rows per cited patent; dropna=False keeps missing citations as their own bucket
cited_share = focal_citation['cited_patent_number'].value_counts(normalize=True, dropna=False)
cited_share.head()

D337569    0.000071
D341848    0.000067
D504889    0.000063
D561358    0.000063
D292182    0.000061
Name: cited_patent_number, dtype: float64

Less than 1 percent of patents are affected by this

In [13]:
# Discard rows with any missing value, then recount the distinct focal patents
focal_citation = focal_citation.dropna()
focal_citation['patent_number'].nunique()

437678

# Current number of patents represented: 507,127

In [14]:
# Rename the key so `citations` is joined on the cited patent rather than the
# focal patent. Only the column name changes; row labels are left untouched
# (there is no need to convert the whole index to strings).
citations = citations.rename(columns={'patent_number': 'cited_patent_number'})
# Left merge keeps every focal/cited pair even when the cited patent has no class record
focal_citation_citeclass = pd.merge(focal_citation, citations, on='cited_patent_number', how='left')
focal_citation_citeclass.patent_number.nunique()

437678

In [15]:
focal_citation_citeclass.head()

Unnamed: 0,patent_number,priority_date,cited_patent_number,uspc_mainclass_id,uspc_subclass_id
0,D257752,1980.0,4162014,211,211/50
1,D257752,1980.0,4162014,211,211/55
2,D257752,1980.0,4162014,211,211/126.1
3,D257924,1980.0,4162014,211,211/50
4,D257924,1980.0,4162014,211,211/55


In [16]:
# Mark the class columns as belonging to the cited patent. Only column names
# change; row labels are left untouched.
focal_citation_citeclass = focal_citation_citeclass.rename(
    columns={'uspc_mainclass_id': 'cite_mainclass', 'uspc_subclass_id': 'cite_subclass'}
)
# Count citations for which the left merge found no class record
focal_citation_citeclass.isnull().sum()

patent_number            0
priority_date            0
cited_patent_number      0
cite_mainclass         348
cite_subclass          348
dtype: int64

Drop nulls. Without a classification, a citation is useless.

In [17]:
# Remove citation rows lacking a class, then recount the distinct focal patents
focal_citation_citeclass = focal_citation_citeclass.dropna()
focal_citation_citeclass.patent_number.nunique()

437678

In [18]:
# Cited patents whose number has no 'D' but whose class id does
cited_has_d = focal_citation_citeclass['cited_patent_number'].str.contains('D')
class_has_d = focal_citation_citeclass['cite_mainclass'].str.contains('D')
focal_citation_citeclass.loc[class_has_d & ~cited_has_d]

Unnamed: 0,patent_number,priority_date,cited_patent_number,cite_mainclass,cite_subclass
36,D258766,1980.0,4069641,D25,D25/48.7
37,D258766,1980.0,4069641,D25,D25/119
54,D258766,1980.0,4184297,D25,D25/56
55,D258766,1980.0,4184297,D25,D25/123
90,D259281,1980.0,4098036,D25,D25/1
102,D259509,1980.0,3935984,D24,D24/165
116,D259510,1980.0,3952338,D25,D25/2
154,D259740,1980.0,4098036,D25,D25/1
213,D260549,1980.0,4164826,D22,D22/126
220,D260724,1980.0,3942591,D08,D08/7


In [19]:
citations.head()

Unnamed: 0,uspc_mainclass_id,uspc_subclass_id,cited_patent_number
0,248,248/423,3930272
1,403,403/107,3930272
2,5,5/11,3930272
3,5,5/100,3930272
4,47,47/58.1R,3930335


In [20]:
# Look up every class record of one example cited patent
is_example_patent = citations['cited_patent_number'].str.match('4069641')
citations.loc[is_example_patent]

Unnamed: 0,uspc_mainclass_id,uspc_subclass_id,cited_patent_number
56717,52,52/202,4069641
56718,52,52/476,4069641
56719,52,52/717.1,4069641
56720,D25,D25/48.7,4069641
56721,D25,D25/119,4069641


# Current number of patents represented: 507,126

In [21]:
# Attach the focal patent's own classes to every focal/cited/cited-class row
final = focal_citation_citeclass.merge(focal_class, on='patent_number', how='left')
final.head()

Unnamed: 0,patent_number,priority_date,cited_patent_number,cite_mainclass,cite_subclass,uspc_mainclass_id,uspc_subclass_id
0,D257752,1980.0,4162014,211,211/50,D19,D19/75
1,D257752,1980.0,4162014,211,211/55,D19,D19/75
2,D257752,1980.0,4162014,211,211/126.1,D19,D19/75
3,D257924,1980.0,4162014,211,211/50,D06,D06/573
4,D257924,1980.0,4162014,211,211/55,D06,D06/573


In [22]:
# The class columns that came from focal_class describe the focal patent.
# Only column names change; row labels are left untouched, which avoids
# building a string label for every row of this very large frame.
final = final.rename(columns={'uspc_mainclass_id': 'focal_mainclass', 'uspc_subclass_id': 'focal_subclass'})

In [23]:
final.shape

(16731605, 7)

In [24]:
final.patent_number.nunique()

437678

In [25]:
final.isnull().sum()

patent_number          0
priority_date          0
cited_patent_number    0
cite_mainclass         0
cite_subclass          0
focal_mainclass        0
focal_subclass         0
dtype: int64

4000 of our patents had null citations. These are mainly withdrawn patents and newly granted patents. I assume this has to do with PatentsView receiving updated information.

In [26]:
# Drop rows with any missing value, then recount the distinct focal patents
final = final.dropna()
final.patent_number.nunique()

437678

# Current patent count: 503,128

In [27]:
# Checkpoint the merged frame to disk before the later cells set rows aside.
# index_label=False writes the header row without a field for the index name.
final.to_csv('data/final_no_drops.csv', index_label=False)

In [28]:
# Reload the checkpoint. Patent numbers and class ids mix numeric and
# letter-prefixed values (e.g. 4162014 / D253842, 211 / D19), so force them
# to strings; otherwise type inference on this large file may yield columns
# holding both ints and strings, which the later .replace and .str.contains
# steps cannot handle consistently.
final = pd.read_csv(
    'data/final_no_drops.csv',
    dtype={
        'patent_number': str,
        'cited_patent_number': str,
        'cite_mainclass': str,
        'cite_subclass': str,
        'focal_mainclass': str,
        'focal_subclass': str,
    },
)
final.head()

Unnamed: 0,patent_number,priority_date,cited_patent_number,cite_mainclass,cite_subclass,focal_mainclass,focal_subclass
0,D257752,1980.0,4162014,211,211/50,D19,D19/75
1,D257752,1980.0,4162014,211,211/55,D19,D19/75
2,D257752,1980.0,4162014,211,211/126.1,D19,D19/75
3,D257924,1980.0,4162014,211,211/50,D06,D06/573
4,D257924,1980.0,4162014,211,211/55,D06,D06/573


In [29]:
# final.focal_mainclass.sort_values().unique()

In [30]:
# final.cite_mainclass.sort_values().unique()

There are inconsistencies in some of the class names, e.g. D01 and D1. These need to be reconciled.

In [31]:
# Map single-digit design classes (D1..D9) to their zero-padded form (D01..D09)
class_dictionary = {'D{}'.format(digit): 'D0{}'.format(digit) for digit in range(1, 10)}

In [32]:
# Normalise main-class ids on both the cited and the focal side.
# A dict passed to replace swaps whole cell values that equal one of its keys.
for mainclass_col in ('cite_mainclass', 'focal_mainclass'):
    final[mainclass_col] = final[mainclass_col].replace(class_dictionary)

In [33]:
# final.cite_mainclass.sort_values().unique()

In [34]:
# Replacing at the subclass level.
# It is easier to strip the padding zero than to insert one conditionally, and
# D1 and D01 are equivalent anyway. The pattern is anchored to the start of
# the id so only a leading 'D0' class prefix is rewritten, never a 'D0' that
# happens to occur later in the string.
# NOTE(review): main classes are normalised to the padded form ('D01') while
# subclasses end up unpadded ('D1/...') -- confirm downstream code expects this.
final['cite_subclass'] = final['cite_subclass'].replace(r'^D0', 'D', regex=True)
final['focal_subclass'] = final['focal_subclass'].replace(r'^D0', 'D', regex=True)


Not all patents have typical class assignments or citations. I am going to remove the atypical ones from the master dataset and set them aside for further analysis.

In [35]:
# remove cite_mainclass from no longer published, or plant

In [36]:
# Rows whose cited main class is marked 'No longer published' or is a plant ('PLT') class
unpublished_mask = final['cite_mainclass'].str.contains('No longer published')
plant_mask = final['cite_mainclass'].str.contains('PLT')
special = final.loc[unpublished_mask | plant_mask]

In [37]:
# Take the special-class rows out of the working frame (they are kept in `special`)
final = final.drop(index=special.index)

In [38]:
final.patent_number.nunique()

437678

# Current patent count: 503,117

In [39]:
# Remove design patents with a utility class and utility patents with a design class.
# First: rows whose focal main class contains no 'D'.
focal_class_has_d = final['focal_mainclass'].str.contains('D')
design_with_util_class = final.loc[~focal_class_has_d]

In [40]:
# Take those rows out of the working frame (they are kept in `design_with_util_class`)
final = final.drop(index=design_with_util_class.index)

In [41]:
# Second: cited patents whose number has no 'D' but whose main class does
cited_number_has_d = final['cited_patent_number'].str.contains('D')
cited_class_has_d = final['cite_mainclass'].str.contains('D')
utility_with_design_class = final.loc[~cited_number_has_d & cited_class_has_d]
final = final.drop(index=utility_with_design_class.index)

In [42]:
final.patent_number.nunique()

437678

# Current patent count: 503,107

This is with very traditional design and utility classes.

In [43]:
# Collect every set-aside row into one frame for separate analysis
set_aside_frames = [special, design_with_util_class, utility_with_design_class]
outcast = pd.concat(set_aside_frames)

In [44]:
outcast.patent_number.nunique()

86907

102,972 patents are affected by these extraordinary categories, but only 21 patents overall were unusable.

In [45]:
# Save. All writes below are currently commented out, so running this cell
# writes nothing; uncomment the relevant line(s) to persist the frames.
# final.to_csv('data/final.csv', index_label=False)
# outcast.to_csv('data/outcast.csv', index_label=False)

# final.to_csv('data/final_481186.csv', index_label=False)
