The purpose of this notebook is to split the data into 4 groups for analysis:
1. Design citing design at mainclass level
2. Design citing utility at mainclass level
3. Design citing design at subclass level
4. Design citing utility at subclass level

In [1]:
import sys

import numpy as np
import pandas as pd
# Print NumPy arrays in full (no "..." summarization) so that every class code
# is visible in the sanity-check printouts below. Current NumPy rejects NaN as
# a threshold; sys.maxsize is the documented value for untruncated output.
np.set_printoptions(threshold=sys.maxsize)

In [2]:
# Load the cleaned citation table. Each row pairs a focal patent
# (patent_number, priority_date, focal_mainclass, focal_subclass) with one
# classification of a patent it cites (cited_patent_number, cite_mainclass,
# cite_subclass).
final = pd.read_csv(filepath_or_buffer='data/final.csv')
final.head(5)

Unnamed: 0,patent_number,priority_date,cited_patent_number,cite_mainclass,cite_subclass,focal_mainclass,focal_subclass
0,D257752,1980,4162014,211,211/50,D19,D19/75
1,D257752,1980,4162014,211,211/55,D19,D19/75
2,D257752,1980,4162014,211,211/126.1,D19,D19/75
3,D257924,1980,4162014,211,211/50,D06,D6/573
4,D257924,1980,4162014,211,211/55,D06,D6/573


# mainclass
First we will select just the mainclass levels

In [3]:
# Main-class view: keep every column except the two subclass columns.
final_main = final.drop(columns=['cite_subclass', 'focal_subclass'])
final_main.head(5)

Unnamed: 0,patent_number,priority_date,cited_patent_number,cite_mainclass,focal_mainclass
0,D257752,1980,4162014,211,D19
1,D257752,1980,4162014,211,D19
2,D257752,1980,4162014,211,D19
3,D257924,1980,4162014,211,D06
4,D257924,1980,4162014,211,D06


In [4]:
# Design citing design: keep the rows whose cited patent number contains a "D".
final_main_d2d = final_main.loc[final_main['cited_patent_number'].str.contains('D')]

# Sanity check: list every main class that appears on the cited and focal sides.
cited_classes = np.sort(final_main_d2d['cite_mainclass'].unique())
focal_classes = np.sort(final_main_d2d['focal_mainclass'].unique())
print('Cited mainclasses:', cited_classes, '\n\n', 'Focal Mainclasses:', focal_classes)

Cited mainclasses: ['1' '100' '101' '102' '104' '106' '108' '111' '112' '114' '116' '118'
 '119' '12' '123' '124' '125' '126' '128' '131' '132' '135' '137' '138'
 '139' '14' '140' '141' '142' '144' '15' '150' '156' '157' '16' '160'
 '165' '166' '168' '169' '171' '172' '173' '174' '175' '177' '180' '181'
 '182' '184' '188' '19' '190' '193' '194' '198' '2' '200' '206' '209'
 '210' '211' '215' '216' '217' '219' '220' '221' '222' '223' '224' '227'
 '228' '229' '232' '235' '238' '239' '24' '241' '242' '244' '245' '246'
 '248' '249' '250' '251' '254' '256' '26' '261' '267' '269' '27' '271'
 '273' '277' '279' '28' '280' '281' '283' '285' '289' '29' '292' '293'
 '294' '296' '297' '30' '301' '305' '310' '312' '313' '323' '324' '33'
 '34' '340' '341' '342' '343' '345' '346' '347' '348' '351' '353' '356'
 '358' '359' '36' '360' '361' '362' '366' '368' '369' '37' '374' '378'
 '379' '38' '381' '382' '383' '384' '386' '392' '396' '399' '4' '40' '400'
 '401' '402' '403' '404' '405' '407' '408' '409' 

In [5]:
# Some cited design patents have utility classifications mixed in. Ignore them
# by keeping only citation classes that contain "D". The cited patent number is
# no longer needed after that, so drop it, then drop duplicate rows so each
# class pair is represented once per patent.
#
# The frame is rebuilt from `final_main` instead of overwriting
# `final_main_d2d` with a transformation of itself, so this cell can be re-run
# without failing on the already-removed `cited_patent_number` column.
final_main_d2d = (
    final_main
    .loc[lambda df: df.cited_patent_number.str.contains('D')]
    .loc[lambda df: df.cite_mainclass.str.contains('D')]
    .drop(columns=['cited_patent_number'])
    .drop_duplicates()
)

final_main_d2d.sample(10)

Unnamed: 0,patent_number,priority_date,cite_mainclass,focal_mainclass
62098,D274345,1982,D21,D21
2432057,D388231,1997,D14,D99
3950043,D445055,2000,D10,D11
11629265,D623771,2009,D23,D25
14899700,D698038,2011,D99,D24
10089139,D590257,2007,D19,D09
10364883,D599584,2008,D06,D06
20933085,D775130,2015,D14,D14
1491733,D364190,1993,D19,D19
17328292,D714973,2013,D26,D26


In [6]:
# Same procedure as for design citing design, but for design citing utility at
# the main-class level: keep cited patent numbers that do NOT contain "D".
final_main_d2u = final_main.loc[~final_main.cited_patent_number.str.contains('D')]

# Sanity check of the classes on both sides. The second list holds the focal
# patents' main classes, so it is labelled as such.
print('Cited mainclass: ', np.sort(final_main_d2u.cite_mainclass.unique()), '\n\n',
      'Focal mainclass: ', np.sort(final_main_d2u.focal_mainclass.unique()))

Cited mainclass:  ['1' '100' '101' '102' '104' '105' '106' '108' '109' '110' '111' '112'
 '114' '116' '117' '118' '119' '12' '122' '123' '124' '125' '126' '127'
 '128' '131' '132' '134' '135' '136' '137' '138' '139' '14' '140' '141'
 '142' '144' '147' '148' '149' '15' '150' '152' '156' '157' '159' '16'
 '160' '162' '163' '164' '165' '166' '168' '169' '171' '172' '173' '174'
 '175' '177' '178' '180' '181' '182' '184' '185' '186' '187' '188' '19'
 '190' '191' '192' '193' '194' '196' '198' '199' '2' '200' '201' '202'
 '203' '204' '205' '206' '208' '209' '210' '211' '212' '213' '215' '216'
 '217' '218' '219' '220' '221' '222' '223' '224' '225' '226' '227' '228'
 '229' '23' '231' '232' '234' '235' '236' '237' '238' '239' '24' '241'
 '242' '244' '245' '246' '248' '249' '250' '251' '252' '254' '256' '257'
 '258' '26' '260' '261' '264' '266' '267' '269' '27' '270' '271' '273'
 '277' '278' '279' '28' '280' '281' '283' '285' '289' '29' '290' '291'
 '292' '293' '294' '295' '296' '297' '298' '299'

In [7]:
# All the classes seem to be correct, so no class filter is applied here.
# Drop the cited patent number and any duplicate rows so that each class
# combination is represented once per patent.
#
# The frame is rebuilt from `final_main` instead of overwriting
# `final_main_d2u` with a transformation of itself, so this cell can be re-run
# without failing on the already-removed `cited_patent_number` column.
final_main_d2u = (
    final_main
    .loc[lambda df: ~df.cited_patent_number.str.contains('D')]
    .drop(columns=['cited_patent_number'])
    .drop_duplicates()
)

final_main_d2u.sample(10)

Unnamed: 0,patent_number,priority_date,cite_mainclass,focal_mainclass
17212774,D710976,2013,137,D23
6435861,D513306,2004,239,D23
11270053,D614749,2009,428,D06
17547044,D723015,2013,455,D14
210772,D290815,1985,417,D10
17265962,D712767,2013,222,D24
11822890,D631544,2009,604,D24
13887898,D656652,2010,362,D26
6487905,D515184,2004,210,D11
18561181,D727461,2013,210,D23


# Subclass
Repeat the same procedure as above, except at the subclass level

In [8]:
# Subclass view: remove the two main-class columns, then drop rows that are
# now exact duplicates of each other.
final_sub = final.drop(columns=['cite_mainclass', 'focal_mainclass']).drop_duplicates()
final_sub.head(5)

Unnamed: 0,patent_number,priority_date,cited_patent_number,cite_subclass,focal_subclass
0,D257752,1980,4162014,211/50,D19/75
1,D257752,1980,4162014,211/55,D19/75
2,D257752,1980,4162014,211/126.1,D19/75
3,D257924,1980,4162014,211/50,D6/573
4,D257924,1980,4162014,211/55,D6/573


In [9]:
# Design citing design at the subclass level: the cited patent number contains
# a "D" and the cited subclass starts with "D". There are far more subclasses
# than main classes, so they are not printed out here (slow and unnecessary).
final_sub_d2d = (
    final_sub
    .loc[lambda df: df['cited_patent_number'].str.contains('D')]
    .loc[lambda df: df['cite_subclass'].str.startswith('D')]
    .drop(columns=['cited_patent_number'])
    .drop_duplicates()
)

final_sub_d2d.sample(10)

Unnamed: 0,patent_number,priority_date,cite_subclass,focal_subclass
9152375,D572133,2007,D7/584,D9/500
2931923,D411371,1998,D30/104,D6/300
15200671,D673447,2012,D9/737,D9/424
19932714,D778684,2014,D7/702,D7/600.4
13636374,D647876,2008,D16/208,D14/496
19757041,D766752,2014,D10/97,D10/30
4953709,D473837,2002,D12/211,D12/211
21600050,D794377,2015,D7/549,D7/354
16051016,D699017,2012,D6/601,D1/199
9814732,D622719,2007,D13/110,D14/341


In [10]:
# Design citing utility at the subclass level: the cited patent number has no
# "D" and the cited subclass does not start with "D".
final_sub_d2u = (
    final_sub
    .loc[lambda df: ~df['cited_patent_number'].str.contains('D')]
    .loc[lambda df: ~df['cite_subclass'].str.startswith('D')]
    .drop(columns=['cited_patent_number'])
    .drop_duplicates()
)
final_sub_d2u.sample(10)

Unnamed: 0,patent_number,priority_date,cite_subclass,focal_subclass
18750300,D734956,2014,211/132.1,D6/525
18731905,D734144,2014,493/51,D9/416
3744133,D467180,1998,264/532,D9/574
10606386,D608858,2008,239/353,D23/226
3338030,D423638,1999,220/565,D23/202
3718352,D449073,1998,401/59,D19/75
9823563,D630524,2007,607/96,D9/725
2376950,D400027,1996,108/33,D6/686
921595,D333306,1991,381/382,D14/205
17252499,D712259,2010,220/906,D9/449


In [11]:
# Count of distinct patent numbers present in the design-citing-utility subclass table.
final_sub_d2u['patent_number'].nunique()

308725

# Stats
Quick comparisons of the dataframes

In [12]:
def quick_stats(frame, focal_column, cited_column):
    """Print a five-line size summary of a citation-pair dataframe.

    Parameters
    ----------
    frame : DataFrame
        Must have a ``patent_number`` column plus the two columns named below.
    focal_column : str
        Name of the column holding the focal patents' class.
    cited_column : str
        Name of the column holding the cited patents' class.

    Prints the row count, the number of distinct patent numbers, the number of
    distinct values in each class column, and the mean number of rows per
    patent number rounded to two decimals. Returns None.
    """
    summary = [
        ('Number of entries: ', len(frame)),
        ('Number of unique patents: ', frame.patent_number.nunique()),
        ('Number of classes represented by focal patents: ', frame[focal_column].nunique()),
        ('Number of classes represented by cited patents: ', frame[cited_column].nunique()),
        ('Mean number of citation pairs per patent: ',
         round(frame.groupby('patent_number').size().mean(), 2)),
    ]
    # Each label is followed by a space and its value; entries are separated by
    # ' \n ', which is why every line after the first starts with a space.
    print(' \n '.join(label + ' ' + str(value) for label, value in summary))
    

In [13]:
# Run the same summary over all four splits; each heading is printed verbatim
# before its statistics.
for heading, split_frame, focal_column, cited_column in [
    ('Design from Design mainclass:', final_main_d2d, 'focal_mainclass', 'cite_mainclass'),
    (' \n Design from utility mainclass:', final_main_d2u, 'focal_mainclass', 'cite_mainclass'),
    (' \n Design from Design subclass:', final_sub_d2d, 'focal_subclass', 'cite_subclass'),
    (' \n Design from utility subclass', final_sub_d2u, 'focal_subclass', 'cite_subclass'),
]:
    print(heading)
    quick_stats(split_frame, focal_column, cited_column)


Design from Design mainclass:
Number of entries:  973666 
 Number of unique patents:  475267 
 Number of classes represented by focal patents:  33 
 Number of classes represented by cited patents:  33 
 Mean number of citation pairs per patent:  2.05
 
 Design from utility mainclass:
Number of entries:  1328315 
 Number of unique patents:  308725 
 Number of classes represented by focal patents:  33 
 Number of classes represented by cited patents:  437 
 Mean number of citation pairs per patent:  4.3
 
 Design from Design subclass:
Number of entries:  4171957 
 Number of unique patents:  475267 
 Number of classes represented by focal patents:  5487 
 Number of classes represented by cited patents:  5490 
 Mean number of citation pairs per patent:  8.78
 
 Design from utility subclass
Number of entries:  7020641 
 Number of unique patents:  308725 
 Number of classes represented by focal patents:  5361 
 Number of classes represented by cited patents:  100717 
 Mean number of citation

Save the dataframes for future analysis

In [14]:

# Write both main-class splits to disk. index_label=False keeps the index
# values in each row but leaves the index name out of the header line.
for split_frame, csv_path in [
    (final_main_d2d, 'data/final_main_d2d.csv'),
    (final_main_d2u, 'data/final_main_d2u.csv'),
]:
    split_frame.to_csv(csv_path, index_label=False)

In [15]:

# Write both subclass splits to disk. index_label=False keeps the index
# values in each row but leaves the index name out of the header line.
for split_frame, csv_path in [
    (final_sub_d2d, 'data/final_sub_d2d.csv'),
    (final_sub_d2u, 'data/final_sub_d2u.csv'),
]:
    split_frame.to_csv(csv_path, index_label=False)