In [1]:
import pandas as pd
import numpy as np

In [2]:
# Display names for the design main classes, keyed by zero-padded class code.
# Used below to attach a readable `class_name` to each summary table.
# Codes D31 and D33 have no entry in this table.
class_labels = {
    'D01': 'Edible Products',
    'D02': 'Apparel and Haberdashery',
    'D03': 'Travel Goods, Personal Belongings, and Storage or Carrying Articles',
    'D04': 'Brushware',
    'D05': 'Textile or Paper Yard Goods; Sheet Material',
    'D06': 'Furnishings',
    'D07': 'Equipment for Preparing or Serving Food or Drink Not Elsewhere Specified',
    'D08': 'Tools and Hardware',
    'D09': 'Packages and Containers for Goods',
    'D10': 'Measuring, Testing or Signaling Instruments',
    'D11': 'Jewelry, Symbolic Insignia, and Ornaments',
    'D12': 'Transportation',
    'D13': 'Equipment for Production, Distribution, or Transformation of Energy',
    'D14': 'Recording, Communication, or Information Retrieval Equipment',
    'D15': 'Machines Not Elsewhere Specified',
    'D16': 'Photography and Optical Equipment',
    'D17': 'Musical Instruments',
    'D18': 'Printing and Office Machinery',
    'D19': "Office Supplies; Artists' and Teachers' Materials",
    'D20': 'Sales and Advertising Equipment',
    'D21': 'Games, Toys and Sports Goods',
    'D22': 'Arms, Pyrotechnics, Hunting and Fishing Equipment',
    'D23': 'Environmental Heating and Cooling, Fluid Handling and Sanitary Equipment',
    'D24': 'Medical and Laboratory Equipment',
    'D25': 'Building Units and Construction Elements',
    'D26': 'Lighting',
    'D27': "Tobacco and Smokers' Supplies",
    'D28': 'Cosmetic Products and Toilet Articles',
    'D29': 'Equipment for Safety, Protection and Rescue',
    'D30': 'Animal Husbandry',
    'D32': 'Washing, Cleaning or Drying Machines',
    'D34': 'Material or Article Handling Equipment',
    'D99': 'Miscellaneous',
}

In [3]:
# Maps single-digit design class codes ('D1'..'D9') to the zero-padded form
# ('D01'..'D09') used as keys in `class_labels`.
class_dictionary = dict(
    D1='D01',
    D2='D02',
    D3='D03',
    D4='D04',
    D5='D05',
    D6='D06',
    D7='D07',
    D8='D08',
    D9='D09',
)

# Discrete Mainclass Analysis

In [4]:
# `master` is the patent universe used as the denominator in the summaries below.
master = pd.read_csv('data/master.csv')

# Mainclass-level tables: d2d = design-to-design, d2u = design-to-utility.
# The *_cont files feed the continuous (commonness-based) analysis.
d2d_seen, d2d_seen_cont, d2u_seen, d2u_seen_cont = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/final_main_d2d_seen.csv',
        'data/final_main_d2d_seen_cont.csv',
        'data/final_main_d2u_seen.csv',
        'data/final_main_d2u_seen_cont.csv',
    )
)

In [5]:
# Read only the three columns needed from the tab-separated USPC file.
# Both id columns are forced to str so values such as 'D19' and 'D257752'
# are kept as text.
patent_classifications = pd.read_csv(
    'data/uspc_current.tsv',
    sep='\t',
    usecols=['patent_id', 'mainclass_id', 'sequence'],
    dtype={'patent_id': str, 'mainclass_id': str},
)
patent_classifications.head()

Unnamed: 0,patent_id,mainclass_id,sequence
0,3930271,2,0
1,3930271,2,1
2,3930271,2,2
3,3930271,473,3
4,3930272,5,0


In [6]:
# De-duplicated patent numbers that occur in either mainclass-level table;
# used only for the membership filter on `patent_classifications`.
listed_patents = list(
    set(d2d_seen['patent_number'].unique()).union(d2u_seen['patent_number'].unique())
)

In [7]:
# Restrict the classification table to patents present in the d2d/d2u tables.
is_listed = patent_classifications['patent_id'].isin(listed_patents)
patent_classifications = patent_classifications.loc[is_listed]
patent_classifications.head()

Unnamed: 0,patent_id,mainclass_id,sequence
21969998,D257752,D19,0
21970267,D257924,D06,0
21970995,D258382,D23,0
21970996,D258383,D23,0
21971443,D258678,D23,0


In [8]:
# Keep only the first-listed classification of each patent (sequence == 0).
is_first_classification = patent_classifications['sequence'] == 0
patent_classifications = patent_classifications[is_first_classification]
patent_classifications.head()

Unnamed: 0,patent_id,mainclass_id,sequence
21969998,D257752,D19,0
21970267,D257924,D06,0
21970995,D258382,D23,0
21970996,D258383,D23,0
21971443,D258678,D23,0


In [9]:
# Rename the key column to 'patent_number' so this table can be merged with the
# *_seen_cont tables later on; index=str also converts the row labels to strings.
patent_classifications = patent_classifications.rename(
    columns={'patent_id': 'patent_number'},
    index=str,
)

Let's examine how many novel combinations exist in the discrete measure

In [10]:
# Number of first-seen (first_seen == 1) design-to-design rows per focal main class.
d2d_novel_rows = d2d_seen[d2d_seen['first_seen'] == 1]
d2d_novel_count = (
    d2d_novel_rows
    .groupby('focal_mainclass')['patent_number']
    .count()
    .reset_index(name='novelty_count')
    .rename(index=str)
)

In [11]:
# Number of first-seen (first_seen == 1) design-to-utility rows per focal main class.
d2u_novel_rows = d2u_seen[d2u_seen['first_seen'] == 1]
d2u_novel_count = (
    d2u_novel_rows
    .groupby('focal_mainclass')['patent_number']
    .count()
    .reset_index(name='novelty_count')
    .rename(index=str)
)

In [12]:
# Distinct patents with at least one first-seen row at the mainclass level,
# as a count and as a fraction of the distinct patents in `master`.
n_master_patents = master.patent_number.nunique()
n_d2d_main_novel = d2d_seen.loc[d2d_seen.first_seen == 1].patent_number.nunique()
n_d2u_main_novel = d2u_seen.loc[d2u_seen.first_seen == 1].patent_number.nunique()

print('Design to design number:', n_d2d_main_novel, '\n',
      'Design to design percentage of dataset', n_d2d_main_novel / n_master_patents, '\n',
      'Design to utility number:', n_d2u_main_novel, '\n',
      'Design to utility percentage of dataset', n_d2u_main_novel / n_master_patents)

Design to design number: 1026 
 Design to design percentage of dataset 0.0019524634150982893 
 Design to utility number: 10496 
 Design to utility percentage of dataset 0.019973738796171193


Examine all design patents together to see how many patents are novel in total

In [13]:
# Stack both mainclass tables so a patent that is novel in either one is counted once.
all_seen = pd.concat([d2d_seen, d2u_seen])
n_all_main_novel = all_seen.loc[all_seen.first_seen == 1].patent_number.nunique()
print('all novel design patents:', n_all_main_novel)
print('percent of dataset novel:', n_all_main_novel / master.patent_number.nunique())

all novel design patents: 11175
percent of dataset novel: 0.021265866143979906


Means table for each focal design category

In [14]:
# Descriptive statistics of the 0/1 first_seen flag per focal main class;
# the mean is the share of first-seen rows in that class.
d2d_means_table = (
    d2d_seen
    .groupby('focal_mainclass')['first_seen']
    .describe()
    .reset_index()
)

In [15]:
# Descriptive statistics of the 0/1 first_seen flag per focal main class
# for the design-to-utility table.
d2u_means_table = (
    d2u_seen
    .groupby('focal_mainclass')['first_seen']
    .describe()
    .reset_index()
)

In [16]:
# Put the per-class novelty counts next to the per-class describe() statistics.
d2d_main_table = d2d_novel_count.merge(
    d2d_means_table,
    how='inner',
    on='focal_mainclass',
)

In [17]:
# Drop the quantile columns from describe() (they carry little information for
# a 0/1 flag) and attach the readable class name.
d2d_main_table = d2d_main_table.drop(columns=['min', '25%', '50%', '75%', 'max'])
d2d_main_table['class_name'] = d2d_main_table['focal_mainclass'].map(class_labels)
d2d_main_table

Unnamed: 0,focal_mainclass,novelty_count,count,mean,std,class_name
0,D01,21,6593.0,0.003185,0.056352,Edible Products
1,D02,48,27623.0,0.001738,0.04165,Apparel and Haberdashery
2,D03,45,38424.0,0.001171,0.034202,"Travel Goods, Personal Belongings, and Storage..."
3,D04,44,7819.0,0.005627,0.074809,Brushware
4,D05,25,5015.0,0.004985,0.070436,Textile or Paper Yard Goods; Sheet Material
5,D06,65,79474.0,0.000818,0.028587,Furnishings
6,D07,55,59429.0,0.000925,0.030408,Equipment for Preparing or Serving Food or Dri...
7,D08,43,59637.0,0.000721,0.026843,Tools and Hardware
8,D09,58,67506.0,0.000859,0.029299,Packages and Containers for Goods
9,D10,43,31618.0,0.00136,0.036853,"Measuring, Testing or Signaling Instruments"


In [18]:
# Design-to-utility counterpart: counts + describe() statistics, without the
# quantile columns, labelled with the class name.
d2u_main_table = d2u_novel_count.merge(d2u_means_table, how='inner', on='focal_mainclass')
d2u_main_table = d2u_main_table.drop(columns=['min', '25%', '50%', '75%', 'max'])
d2u_main_table['class_name'] = d2u_main_table['focal_mainclass'].map(class_labels)
d2u_main_table

Unnamed: 0,focal_mainclass,novelty_count,count,mean,std,class_name
0,D01,305,4944.0,0.061691,0.240618,Edible Products
1,D02,444,31882.0,0.013926,0.117187,Apparel and Haberdashery
2,D03,510,52616.0,0.009693,0.097975,"Travel Goods, Personal Belongings, and Storage..."
3,D04,349,12818.0,0.027227,0.162752,Brushware
4,D05,411,8260.0,0.049758,0.217457,Textile or Paper Yard Goods; Sheet Material
5,D06,673,86547.0,0.007776,0.087839,Furnishings
6,D07,532,72832.0,0.007304,0.085154,Equipment for Preparing or Serving Food or Dri...
7,D08,751,102477.0,0.007328,0.085293,Tools and Hardware
8,D09,575,84551.0,0.006801,0.082186,Packages and Containers for Goods
9,D10,593,42228.0,0.014043,0.117669,"Measuring, Testing or Signaling Instruments"


# Discrete Subclass Analysis

In [19]:
# Subclass-level counterparts of the mainclass tables (d2d / d2u, discrete and
# continuous variants).
d2d_sub_seen, d2d_sub_seen_cont, d2u_sub_seen, d2u_sub_seen_cont = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/final_sub_d2d_seen.csv',
        'data/final_sub_d2d_seen_cont.csv',
        'data/final_sub_d2u_seen.csv',
        'data/final_sub_d2u_seen_cont.csv',
    )
)

In [20]:
# Split 'focal_subclass' on '/' into its main-class and subclass parts.
# NOTE(review): assumes every value contains exactly one '/' — confirm against the data.
for seen_table in (d2d_sub_seen, d2u_sub_seen):
    seen_table[['mainclass', 'subclass']] = seen_table['focal_subclass'].str.split('/', expand=True)

In [21]:
# Rewrite single-digit codes ('D1'..'D9') to the zero-padded form so they match
# the keys of `class_labels`; other values pass through unchanged.
for seen_table in (d2d_sub_seen, d2u_sub_seen):
    seen_table['mainclass'] = seen_table['mainclass'].replace(class_dictionary)

In [22]:
# First-seen row counts per main class at the subclass level.
# NOTE(review): the count column keeps the name 'patent_number' here, whereas the
# mainclass-level tables rename it to 'novelty_count'.
d2d_count = (
    d2d_sub_seen[d2d_sub_seen['first_seen'] == 1]
    .groupby('mainclass')['patent_number']
    .count()
    .reset_index()
)
d2u_count = (
    d2u_sub_seen[d2u_sub_seen['first_seen'] == 1]
    .groupby('mainclass')['patent_number']
    .count()
    .reset_index()
)

In [23]:
# describe() statistics of the first_seen flag per main class, subclass-level data.
d2d_sub_means_table = (
    d2d_sub_seen
    .groupby('mainclass')['first_seen']
    .describe()
    .reset_index()
)
d2u_sub_means_table = (
    d2u_sub_seen
    .groupby('mainclass')['first_seen']
    .describe()
    .reset_index()
)


In [24]:
# Subclass-level design-to-design summary: counts + statistics, no quantile
# columns, with the class name attached.
d2d_sub_table = d2d_count.merge(d2d_sub_means_table, how='inner', on='mainclass')
d2d_sub_table = d2d_sub_table.drop(columns=['min', '25%', '50%', '75%', 'max'])
d2d_sub_table['class_name'] = d2d_sub_table['mainclass'].map(class_labels)
d2d_sub_table

Unnamed: 0,mainclass,patent_number,count,mean,std,class_name
0,D01,5799,20159.0,0.287663,0.452684,Edible Products
1,D02,20300,255304.0,0.079513,0.270538,Apparel and Haberdashery
2,D03,30257,159724.0,0.189433,0.391853,"Travel Goods, Personal Belongings, and Storage..."
3,D04,4396,28472.0,0.154397,0.361336,Brushware
4,D05,5121,21382.0,0.239501,0.426789,Textile or Paper Yard Goods; Sheet Material
5,D06,60128,302750.0,0.198606,0.398951,Furnishings
6,D07,62145,388858.0,0.159814,0.366434,Equipment for Preparing or Serving Food or Dri...
7,D08,29630,176773.0,0.167616,0.373526,Tools and Hardware
8,D09,70817,684321.0,0.103485,0.304592,Packages and Containers for Goods
9,D10,19002,115692.0,0.164246,0.370501,"Measuring, Testing or Signaling Instruments"


In [25]:
# Subclass-level design-to-utility summary: counts + statistics, no quantile
# columns, with the class name attached.
d2u_sub_table = d2u_count.merge(d2u_sub_means_table, how='inner', on='mainclass')
d2u_sub_table = d2u_sub_table.drop(columns=['min', '25%', '50%', '75%', 'max'])
d2u_sub_table['class_name'] = d2u_sub_table['mainclass'].map(class_labels)
d2u_sub_table

Unnamed: 0,mainclass,patent_number,count,mean,std,class_name
0,D01,15344,32573.0,0.471065,0.49917,Edible Products
1,D02,73824,251377.0,0.293678,0.455447,Apparel and Haberdashery
2,D03,117707,280794.0,0.419193,0.493428,"Travel Goods, Personal Belongings, and Storage..."
3,D04,23053,66220.0,0.348127,0.47638,Brushware
4,D05,30129,60351.0,0.49923,0.500004,Textile or Paper Yard Goods; Sheet Material
5,D06,198972,453547.0,0.438702,0.496229,Furnishings
6,D07,194228,479789.0,0.40482,0.490858,Equipment for Preparing or Serving Food or Dri...
7,D08,196784,470704.0,0.418063,0.493241,Tools and Hardware
8,D09,280597,679096.0,0.413192,0.492407,Packages and Containers for Goods
9,D10,96442,187463.0,0.514459,0.499792,"Measuring, Testing or Signaling Instruments"


In [26]:
# Distinct patents with at least one first-seen row at the subclass level,
# as a count and as a fraction of the distinct patents in `master`.
n_master_total = master.patent_number.nunique()
n_d2d_sub_novel = d2d_sub_seen.loc[d2d_sub_seen.first_seen == 1].patent_number.nunique()
n_d2u_sub_novel = d2u_sub_seen.loc[d2u_sub_seen.first_seen == 1].patent_number.nunique()

print('Design to design number:', n_d2d_sub_novel, '\n',
      'Design to design percentage of dataset', n_d2d_sub_novel / n_master_total, '\n',
      'Design to utility number:', n_d2u_sub_novel, '\n',
      'Design to utility percentage of dataset', n_d2u_sub_novel / n_master_total)

Design to design number: 158010 
 Design to design percentage of dataset 0.3006907838398447 
 Design to utility number: 215898 
 Design to utility percentage of dataset 0.41085082494433767


In [27]:
# Same combined count as for the mainclass tables, now on the subclass-level
# data; this rebinds `all_seen`.
all_seen = pd.concat([d2d_sub_seen, d2u_sub_seen])
n_all_sub_novel = all_seen.loc[all_seen.first_seen == 1].patent_number.nunique()
print('all novel design patents:', n_all_sub_novel)
print('percent of dataset novel:', n_all_sub_novel / master.patent_number.nunique())

all novel design patents: 280178
percent of dataset novel: 0.5331747511846087


# Continuous mainclass Analysis

In [28]:
# Novelty is the negative natural log of commonness: the lower the commonness,
# the higher the novelty score.
d2d_seen_cont['novelty'] = -np.log(d2d_seen_cont['commonness'])
d2u_seen_cont['novelty'] = -np.log(d2u_seen_cont['commonness'])

In [29]:
# One row per patent: the median novelty over all of that patent's rows.
novelty_median = (
    d2d_seen_cont
    .groupby(['patent_number'])['novelty']
    .median()
    .reset_index(name='median_novelty')
    .rename(index=str)
)
novelty_median.head()

Unnamed: 0,patent_number,median_novelty
0,D258382,-0.797657
1,D258383,-0.797657
2,D258678,-0.797657
3,D258755,-0.797657
4,D258990,0.777764


In [30]:
# Attach each patent's median novelty to every one of its rows.
d2d_seen_cont = d2d_seen_cont.merge(novelty_median, how='left', on='patent_number')
d2d_seen_cont.head()

Unnamed: 0,patent_number,priority_date,cite_mainclass,focal_mainclass,first_seen,Nt,Nijt,Nit,Nij,commonness,novelty,median_novelty
0,D258382,1980,D23,D23,0,3970,200,598,598,2.220333,-0.797657,-0.797657
1,D258383,1980,D23,D23,0,3970,200,598,598,2.220333,-0.797657,-0.797657
2,D258678,1980,D23,D23,0,3970,200,598,598,2.220333,-0.797657,-0.797657
3,D258755,1980,D23,D23,0,3970,200,598,598,2.220333,-0.797657,-0.797657
4,D258990,1980,D25,D25,0,3970,78,289,289,3.707571,-1.310377,0.777764


In [31]:
# Median of the per-patent median novelty within each focal main class.
# The median runs over rows, so a patent contributes once per row it has in the class.
d2d_cont_main_table = (
    d2d_seen_cont
    .groupby('focal_mainclass')['median_novelty']
    .median()
    .reset_index()
)
d2d_cont_main_table['class_name'] = d2d_cont_main_table['focal_mainclass'].map(class_labels)
d2d_cont_main_table

Unnamed: 0,focal_mainclass,median_novelty,class_name
0,D01,0.650593,Edible Products
1,D02,-1.481058,Apparel and Haberdashery
2,D03,1.130378,"Travel Goods, Personal Belongings, and Storage..."
3,D04,-0.529835,Brushware
4,D05,0.58031,Textile or Paper Yard Goods; Sheet Material
5,D06,0.893309,Furnishings
6,D07,0.775071,Equipment for Preparing or Serving Food or Dri...
7,D08,0.967645,Tools and Hardware
8,D09,0.586829,Packages and Containers for Goods
9,D10,0.403832,"Measuring, Testing or Signaling Instruments"


In [32]:
# Design-to-utility version: per-patent median novelty, attached to every row,
# then summarised as a median per focal main class.
novelty_median = (
    d2u_seen_cont
    .groupby(['patent_number'])['novelty']
    .median()
    .reset_index(name='median_novelty')
    .rename(index=str)
)
d2u_seen_cont = d2u_seen_cont.merge(novelty_median, how='left', on='patent_number')
d2u_cont_main_table = (
    d2u_seen_cont
    .groupby('focal_mainclass')['median_novelty']
    .median()
    .reset_index()
)
d2u_cont_main_table['class_name'] = d2u_cont_main_table['focal_mainclass'].map(class_labels)
d2u_cont_main_table

Unnamed: 0,focal_mainclass,median_novelty,class_name
0,D01,-1.64652,Edible Products
1,D02,-1.436201,Apparel and Haberdashery
2,D03,-0.656893,"Travel Goods, Personal Belongings, and Storage..."
3,D04,-1.751577,Brushware
4,D05,-1.865339,Textile or Paper Yard Goods; Sheet Material
5,D06,-0.626614,Furnishings
6,D07,-0.965901,Equipment for Preparing or Serving Food or Dri...
7,D08,-0.728793,Tools and Hardware
8,D09,-1.139961,Packages and Containers for Goods
9,D10,-0.937778,"Measuring, Testing or Signaling Instruments"


In [38]:
# Alternative continuous mainclass summary: group by the sequence-0 USPC main
# class from `patent_classifications` instead of by `focal_mainclass`.
#
# The merged frame is kept under its own name so that `d2d_sub_seen_cont` (the
# subclass-level table loaded from 'data/final_sub_d2d_seen_cont.csv') is not
# overwritten before the continuous subclass analysis uses it; that analysis
# needs the 'focal_subclass' column, which the mainclass-level data lacks.
d2d_cont_classified = pd.merge(d2d_seen_cont, patent_classifications, on='patent_number', how='left')

# Zero-pad single-digit design codes so they match the keys of `class_labels`.
d2d_cont_classified['mainclass_id'] = d2d_cont_classified['mainclass_id'].replace(class_dictionary)

d2d_cont_main_table_v2 = d2d_cont_classified.groupby('mainclass_id')['median_novelty'].median().to_frame().reset_index()
# Classes without an entry in `class_labels` get NaN as their name.
d2d_cont_main_table_v2['name'] = d2d_cont_main_table_v2['mainclass_id'].map(class_labels)
d2d_cont_main_table_v2

Unnamed: 0,mainclass_id,median_novelty,name
0,1,0.194767,
1,24,-0.655067,
2,D01,0.641893,Edible Products
3,D02,-1.481058,Apparel and Haberdashery
4,D03,1.119594,"Travel Goods, Personal Belongings, and Storage..."
5,D04,-0.550149,Brushware
6,D05,0.58031,Textile or Paper Yard Goods; Sheet Material
7,D06,0.888962,Furnishings
8,D07,0.775071,Equipment for Preparing or Serving Food or Dri...
9,D08,0.954205,Tools and Hardware


In [39]:
# Alternative continuous mainclass summary for design-to-utility: group by the
# sequence-0 USPC main class from `patent_classifications` instead of by
# `focal_mainclass`.
#
# The merged frame is kept under its own name so that `d2u_sub_seen_cont` (the
# subclass-level table loaded from 'data/final_sub_d2u_seen_cont.csv') is not
# overwritten before the continuous subclass analysis uses it; that analysis
# needs the 'focal_subclass' column, which the mainclass-level data lacks.
d2u_cont_classified = pd.merge(d2u_seen_cont, patent_classifications, on='patent_number', how='left')

# Zero-pad single-digit design codes so they match the keys of `class_labels`.
d2u_cont_classified['mainclass_id'] = d2u_cont_classified['mainclass_id'].replace(class_dictionary)

d2u_cont_main_table_v2 = d2u_cont_classified.groupby('mainclass_id')['median_novelty'].median().to_frame().reset_index()
# Classes without an entry in `class_labels` get NaN as their name.
d2u_cont_main_table_v2['name'] = d2u_cont_main_table_v2['mainclass_id'].map(class_labels)
d2u_cont_main_table_v2

Unnamed: 0,mainclass_id,median_novelty,name
0,70,-0.53823,
1,D01,-1.64652,Edible Products
2,D02,-1.467828,Apparel and Haberdashery
3,D03,-0.664472,"Travel Goods, Personal Belongings, and Storage..."
4,D04,-1.764574,Brushware
5,D05,-1.851823,Textile or Paper Yard Goods; Sheet Material
6,D06,-0.629953,Furnishings
7,D07,-0.960429,Equipment for Preparing or Serving Food or Dri...
8,D08,-0.734952,Tools and Hardware
9,D09,-1.15473,Packages and Containers for Goods


# Continuous subclass analysis

In [None]:
# Subclass-level novelty: negative natural log of commonness.
d2d_sub_seen_cont['novelty'] = -np.log(d2d_sub_seen_cont['commonness'])
d2u_sub_seen_cont['novelty'] = -np.log(d2u_sub_seen_cont['commonness'])

In [None]:
# Per-patent median novelty at the subclass level, attached to every row of that patent.
novelty_median = d2d_sub_seen_cont.groupby(['patent_number'])['novelty'].median().to_frame().reset_index().rename(index=str, columns={'novelty':'median_novelty'})
d2d_sub_seen_cont = pd.merge(d2d_sub_seen_cont, novelty_median, on='patent_number', how='left')

# Split 'focal_subclass' on '/' into main class and subclass.
d2d_sub_seen_cont[['mainclass', 'subclass']] = d2d_sub_seen_cont['focal_subclass'].str.split('/', expand=True)

# Zero-pad the main-class code BEFORE grouping, as the discrete subclass analysis
# does. Grouping on raw codes and relabelling afterwards would leave two rows
# for one class whenever both spellings (e.g. 'D1' and 'D01') occur in the data.
d2d_sub_seen_cont['mainclass'] = d2d_sub_seen_cont['mainclass'].replace(class_dictionary)

d2d_cont_sub_table = d2d_sub_seen_cont.groupby('mainclass')['median_novelty'].median().to_frame().reset_index()
d2d_cont_sub_table['class_name'] = d2d_cont_sub_table['mainclass'].map(class_labels)
d2d_cont_sub_table = d2d_cont_sub_table.sort_values('mainclass')
d2d_cont_sub_table

In [None]:
# Per-patent median novelty at the subclass level, attached to every row of that patent.
novelty_median = d2u_sub_seen_cont.groupby(['patent_number'])['novelty'].median().to_frame().reset_index().rename(index=str, columns={'novelty':'median_novelty'})
d2u_sub_seen_cont = pd.merge(d2u_sub_seen_cont, novelty_median, on='patent_number', how='left')

# Split 'focal_subclass' on '/' into main class and subclass.
d2u_sub_seen_cont[['mainclass', 'subclass']] = d2u_sub_seen_cont['focal_subclass'].str.split('/', expand=True)

# Zero-pad the main-class code BEFORE grouping, as the discrete subclass analysis
# does. Grouping on raw codes and relabelling afterwards would leave two rows
# for one class whenever both spellings (e.g. 'D1' and 'D01') occur in the data.
d2u_sub_seen_cont['mainclass'] = d2u_sub_seen_cont['mainclass'].replace(class_dictionary)

d2u_cont_sub_table = d2u_sub_seen_cont.groupby('mainclass')['median_novelty'].median().to_frame().reset_index()
d2u_cont_sub_table['class_name'] = d2u_cont_sub_table['mainclass'].map(class_labels)
d2u_cont_sub_table = d2u_cont_sub_table.sort_values('mainclass')
d2u_cont_sub_table

# Save the data!

In [None]:
# Export of the four discrete summary tables. Every line is commented out, so
# running this cell writes nothing; uncomment to (re)generate the CSV files.
# d2d_main_table.to_csv('data/d2d_main_table.csv')
# d2u_main_table.to_csv('data/d2u_main_table.csv')
# d2d_sub_table.to_csv('data/d2d_sub_table.csv')
# d2u_sub_table.to_csv('data/d2u_sub_table.csv')

In [None]:
# Export of the four continuous summary tables. Every line is commented out, so
# running this cell writes nothing; uncomment to (re)generate the CSV files.
# d2d_cont_main_table.to_csv('data/d2d_cont_main_table.csv')
# d2u_cont_main_table.to_csv('data/d2u_cont_main_table.csv')
# d2u_cont_sub_table.to_csv('data/d2u_cont_sub_table.csv')
# d2d_cont_sub_table.to_csv('data/d2d_cont_sub_table.csv')

How many focal patents have multiple mainclasses?

In [None]:
# Only the patent number and its focal main class are needed for this check.
final = pd.read_csv(
    'data/final.csv',
    usecols=['patent_number', 'focal_mainclass'],
)
final.head()

In [None]:
# Collapse repeated (patent_number, focal_mainclass) pairs to one row each.
final = final.drop_duplicates()
final.head()

In [None]:
# Non-null focal main classes per patent; the displayed value is the number of
# patents in `final`.
mainclasses_by_patent = final.groupby('patent_number')['focal_mainclass']
num_mainclass = mainclasses_by_patent.count()
num_mainclass.size

In [None]:
# Number of patents with more than one focal main class.
int((num_mainclass > 1).sum())

In [None]:
# Ratio of patents with more than one focal main class. It is computed from
# `num_mainclass` rather than from hand-typed counts, so the figure stays in
# sync with 'data/final.csv' whenever the data changes.
len(num_mainclass.loc[num_mainclass > 1]) / len(num_mainclass)