The purpose of this notebook is to split the data into 4 groups for analysis.
1. Design citing design at mainclass level
2. Design citing utility at mainclass level
3. Design citing design at subclass level
4. Design citing utility at subclass level

In [1]:
import sys

import numpy as np
import pandas as pd
# Print arrays in full (no "..." summarization) so every class can be inspected.
# NumPy rejects a NaN threshold; sys.maxsize is the documented way to disable
# summarization.
np.set_printoptions(threshold=sys.maxsize)

In [2]:
# Read in the cleaned data. This CSV lists each patent with its priority date,
# its main class and subclass, the patents it cites, and each cited patent's
# main class and subclass.
final = pd.read_csv('data/final.csv')
final.head()

Unnamed: 0,patent_number,priority_date,cited_patent_number,cite_mainclass,cite_subclass,focal_mainclass,focal_subclass
0,D257752,1980,4162014,211,211/50,D19,D19/75
1,D257752,1980,4162014,211,211/55,D19,D19/75
2,D257752,1980,4162014,211,211/126.1,D19,D19/75
3,D257924,1980,4162014,211,211/50,D06,D6/573
4,D257924,1980,4162014,211,211/55,D06,D6/573


In [3]:
# Keep only the citation rows whose focal patent number appears in the master file.
master = pd.read_csv('data/master_437678.csv')

in_master = final['patent_number'].isin(master['patent_number'])
final = final[in_master]

In [4]:
# Number of distinct focal patents left after filtering
final['patent_number'].nunique()

437678

# mainclass
First we will select just the mainclass levels

In [5]:
# Mainclass-level view: discard both subclass columns
subclass_columns = ['cite_subclass', 'focal_subclass']
final_main = final.drop(columns=subclass_columns)
final_main.head()

Unnamed: 0,patent_number,priority_date,cited_patent_number,cite_mainclass,focal_mainclass
0,D257752,1980,4162014,211,D19
1,D257752,1980,4162014,211,D19
2,D257752,1980,4162014,211,D19
3,D257924,1980,4162014,211,D06
4,D257924,1980,4162014,211,D06


In [6]:
# Design citing design: keep the rows whose cited patent number contains a "D".
cites_design = final_main['cited_patent_number'].str.contains('D')
final_main_d2d = final_main[cites_design]

# Print the distinct classes on both sides to check for proper classifications.
cited_classes = np.sort(final_main_d2d['cite_mainclass'].unique())
focal_classes = np.sort(final_main_d2d['focal_mainclass'].unique())
print('Cited mainclasses:', cited_classes, '\n\n', 'Focal Mainclasses:', focal_classes)

Cited mainclasses: ['1' '100' '101' '102' '104' '106' '108' '111' '112' '114' '116' '118'
 '119' '12' '123' '124' '125' '126' '128' '131' '132' '135' '137' '138'
 '139' '14' '140' '141' '142' '144' '15' '150' '156' '157' '16' '160'
 '165' '166' '168' '169' '171' '172' '173' '174' '175' '177' '180' '181'
 '182' '184' '188' '19' '190' '193' '194' '198' '2' '200' '206' '209'
 '210' '211' '215' '216' '217' '219' '220' '221' '222' '223' '224' '227'
 '228' '229' '232' '235' '238' '239' '24' '241' '242' '244' '245' '246'
 '248' '249' '250' '251' '254' '256' '26' '261' '267' '269' '27' '271'
 '273' '277' '279' '28' '280' '281' '283' '285' '289' '29' '292' '293'
 '294' '296' '297' '30' '301' '305' '310' '312' '313' '323' '324' '33'
 '34' '340' '341' '342' '343' '345' '346' '347' '348' '351' '353' '356'
 '358' '359' '36' '360' '361' '362' '366' '368' '369' '37' '374' '378'
 '379' '38' '381' '382' '383' '384' '386' '392' '396' '399' '4' '40' '400'
 '401' '402' '403' '404' '405' '407' '408' '409' 

In [7]:
# It seems as though some cited design patents have utility classifications
# mixed in. We ignore them by only keeping citation classes that contain "D".
# The "cited patent number" column is dropped because it is no longer needed.
# Duplicate class pairs are not removed here; that happens in the
# de-duplication step that follows.
has_design_class = final_main_d2d['cite_mainclass'].str.contains('D')
final_main_d2d = final_main_d2d[has_design_class].drop(columns=['cited_patent_number'])

In [8]:
# We only want non-repeating combinations of class pairings for each patent:
#   * drop self-pairings like A-A, B-B, C-C, etc.
#   * if both A-B and B-A occur for the same patent, keep only the first one.

#drop A-A
final_main_d2d = final_main_d2d.loc[final_main_d2d.cite_mainclass != final_main_d2d.focal_mainclass]

#drop A-B, B-A duplicates
# Sort each (cite, focal) pair so that A-B and B-A produce the same key.
# np.sort along axis 1 gives the same ordering as the row-wise
# DataFrame.apply(sorted, axis=1) it replaces, without a Python call per row.
sorted_cols = pd.DataFrame(
    np.sort(final_main_d2d[['cite_mainclass', 'focal_mainclass']].to_numpy(dtype=object), axis=1),
    index=final_main_d2d.index,
)
# A row is a repeat when (patent_number, sorted pair) was already seen;
# duplicated() leaves the first occurrence unmarked.
is_repeat = sorted_cols.assign(patent_number=final_main_d2d['patent_number']).duplicated()
final_main_d2d = final_main_d2d.loc[~is_repeat]

# Fixed seed so the displayed sample is the same on every run
final_main_d2d.sample(10, random_state=0)

Unnamed: 0,patent_number,priority_date,cite_mainclass,focal_mainclass
1197776,D347451,1992,D20,D21
18254990,D762459,2013,D19,D08
14254775,D666792,2011,D29,D02
14403178,D671013,2011,D07,D09
18080777,D744595,2013,D12,D20
7468324,D537480,2005,D06,D19
10550100,D606501,2008,D26,D13
7413026,D535102,2005,D30,D09
11498831,D619960,2008,D14,D13
9129940,D571369,2006,D18,D14


In [9]:
# Similar procedure as above, except for design citing utility at the mainclass
# level: select cited patent numbers that do NOT contain "D".
cites_utility = ~final_main.cited_patent_number.str.contains('D')
final_main_d2u = final_main.loc[cites_utility]

# The second array holds the focal patents' mainclasses, so it is labelled as
# such (it was previously printed under the label "Cited subclass").
print('Cited mainclass: ', np.sort(final_main_d2u.cite_mainclass.unique()), '\n\n',
      'Focal mainclass: ', np.sort(final_main_d2u.focal_mainclass.unique()))

Cited mainclass:  ['1' '100' '101' '102' '104' '105' '106' '108' '109' '110' '111' '112'
 '114' '116' '117' '118' '119' '12' '122' '123' '124' '125' '126' '127'
 '128' '131' '132' '134' '135' '136' '137' '138' '139' '14' '140' '141'
 '142' '144' '147' '148' '149' '15' '150' '152' '156' '157' '159' '16'
 '160' '162' '163' '164' '165' '166' '168' '169' '171' '172' '173' '174'
 '175' '177' '178' '180' '181' '182' '184' '185' '186' '187' '188' '19'
 '190' '191' '192' '193' '194' '196' '198' '199' '2' '200' '201' '202'
 '203' '204' '205' '206' '208' '209' '210' '211' '212' '213' '215' '216'
 '217' '218' '219' '220' '221' '222' '223' '224' '225' '226' '227' '228'
 '229' '23' '231' '232' '234' '235' '236' '237' '238' '239' '24' '241'
 '242' '244' '245' '246' '248' '249' '250' '251' '252' '254' '256' '257'
 '258' '26' '260' '261' '264' '266' '267' '269' '27' '270' '271' '273'
 '277' '278' '279' '28' '280' '281' '283' '285' '289' '29' '290' '291'
 '292' '293' '294' '295' '296' '297' '298' '299'

In [11]:
# All the classes seem to be correct.
# As above, drop the cited patent number and any duplicates from the data frame
# to ensure that each class combination is represented once for each patent.
# NOTE(review): unlike the subclass-level design-citing-utility selection, cited
# classes are not filtered to exclude "D" classes here - confirm this is intended.

#drop A-A
final_main_d2u = final_main_d2u.loc[final_main_d2u.cite_mainclass != final_main_d2u.focal_mainclass]

#drop A-B, B-A duplicates
# Sort each (cite, focal) pair so that A-B and B-A produce the same key.
# np.sort along axis 1 gives the same ordering as the row-wise
# DataFrame.apply(sorted, axis=1) it replaces, without a Python call per row.
sorted_cols = pd.DataFrame(
    np.sort(final_main_d2u[['cite_mainclass', 'focal_mainclass']].to_numpy(dtype=object), axis=1),
    index=final_main_d2u.index,
)
# A row is a repeat when (patent_number, sorted pair) was already seen;
# duplicated() leaves the first occurrence unmarked.
is_repeat = sorted_cols.assign(patent_number=final_main_d2u['patent_number']).duplicated()
final_main_d2u = final_main_d2u.loc[~is_repeat].drop(columns=['cited_patent_number'])

# Fixed seed so the displayed sample is the same on every run
final_main_d2u.sample(10, random_state=0)

Unnamed: 0,patent_number,priority_date,cite_mainclass,focal_mainclass
10688944,D612851,2007,348,D14
6767120,D529151,2004,296,D23
13012717,D649744,2010,428,D02
2147392,D382174,1996,222,D07
12488852,D635246,2010,366,D24
5460011,D485255,2003,181,D14
15237478,D674897,2012,606,D24
462815,D307198,1988,15,D28
7882211,D567373,2005,435,D24
691963,D329628,1989,222,D03


# Subclass
Repeat the same procedure as above, except at the subclass level

In [12]:
# Subclass-level view: discard both mainclass columns, then remove rows that
# became exact duplicates.
mainclass_columns = ['cite_mainclass', 'focal_mainclass']
final_sub = final.drop(columns=mainclass_columns).drop_duplicates()
final_sub.head()

Unnamed: 0,patent_number,priority_date,cited_patent_number,cite_subclass,focal_subclass
0,D257752,1980,4162014,211/50,D19/75
1,D257752,1980,4162014,211/55,D19/75
2,D257752,1980,4162014,211/126.1,D19/75
3,D257924,1980,4162014,211/50,D6/573
4,D257924,1980,4162014,211/55,D6/573


In [13]:
# Select just design citing design: the cited patent number contains a "D" and
# the cited subclass starts with "D".
# The subclass level contains MANY more classes than the main class, so printing
# them all out is slow and unnecessary.
cites_design = final_sub.cited_patent_number.str.contains('D')
final_sub_d2d = final_sub.loc[cites_design]
final_sub_d2d = final_sub_d2d.loc[final_sub_d2d.cite_subclass.str.startswith('D')]\
.drop(['cited_patent_number'], axis=1)

#drop A-A
final_sub_d2d = final_sub_d2d.loc[final_sub_d2d.cite_subclass != final_sub_d2d.focal_subclass]

#drop A-B, B-A duplicates
# Sort each (cite, focal) pair so that A-B and B-A produce the same key.
# np.sort along axis 1 gives the same ordering as the row-wise
# DataFrame.apply(sorted, axis=1) it replaces, without a Python call per row.
sorted_cols = pd.DataFrame(
    np.sort(final_sub_d2d[['cite_subclass', 'focal_subclass']].to_numpy(dtype=object), axis=1),
    index=final_sub_d2d.index,
)
# A row is a repeat when (patent_number, sorted pair) was already seen;
# duplicated() leaves the first occurrence unmarked.
is_repeat = sorted_cols.assign(patent_number=final_sub_d2d['patent_number']).duplicated()
final_sub_d2d = final_sub_d2d.loc[~is_repeat]

# Fixed seed so the displayed sample is the same on every run
final_sub_d2d.sample(10, random_state=0)


Unnamed: 0,patent_number,priority_date,cite_subclass,focal_subclass
17605682,D725323,2013,D13/184,D30/199
3647520,D439595,1998,D16/320,D16/304
16516737,D725909,2012,D9/519,D3/304
15757524,D691245,2012,D13/174,D23/261
1862483,D372327,1995,D8/352,D26/142
7068587,D520364,2005,D9/552,D9/545
811873,D331340,1990,D7/332,D6/691.4
9693065,D598773,2007,D9/559,D9/542
1283379,D369125,1992,D9/425,D11/152
6174100,D502026,2004,D11/118,D6/674


In [14]:
# Design citing utility: the cited patent number does NOT contain a "D" and the
# cited subclass does NOT start with "D".
cites_utility = ~final_sub.cited_patent_number.str.contains('D')
final_sub_d2u = final_sub.loc[cites_utility]
final_sub_d2u = final_sub_d2u.loc[~final_sub_d2u.cite_subclass.str.startswith('D')]\
.drop(['cited_patent_number'], axis=1)

#drop A-A
final_sub_d2u = final_sub_d2u.loc[final_sub_d2u.cite_subclass != final_sub_d2u.focal_subclass]

#drop A-B, B-A duplicates
# Sort each (cite, focal) pair so that A-B and B-A produce the same key.
# np.sort along axis 1 gives the same ordering as the row-wise
# DataFrame.apply(sorted, axis=1) it replaces, without a Python call per row.
sorted_cols = pd.DataFrame(
    np.sort(final_sub_d2u[['cite_subclass', 'focal_subclass']].to_numpy(dtype=object), axis=1),
    index=final_sub_d2u.index,
)
# A row is a repeat when (patent_number, sorted pair) was already seen;
# duplicated() leaves the first occurrence unmarked.
is_repeat = sorted_cols.assign(patent_number=final_sub_d2u['patent_number']).duplicated()
final_sub_d2u = final_sub_d2u.loc[~is_repeat]

# Fixed seed so the displayed sample is the same on every run
final_sub_d2u.sample(10, random_state=0)

Unnamed: 0,patent_number,priority_date,cite_subclass,focal_subclass
429713,D313574,1987,264/554,D11/153
1855596,D371960,1995,220/293,D1/129
2675816,D406433,1997,280/770,D34/23
17546988,D723010,2013,181/169,D14/222
8079571,D543020,2006,224/255,D3/215
3276167,D419055,1999,403/178,D8/382
3241934,D445958,1998,81/489,D28/48
12505260,D635731,2010,280/30,D34/12
3494293,D431273,1999,29/450,D21/757
101774,D280085,1983,60/325,D11/143


# Stats
Quick comparisons of the dataframes

In [15]:
def quick_stats(frame, focal_column, cited_column):
    """Print summary counts for one citation-pair dataframe.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain a ``patent_number`` column and the two columns named by
        ``focal_column`` and ``cited_column``.
    focal_column : str
        Name of the column holding the focal patents' classes.
    cited_column : str
        Name of the column holding the cited patents' classes.

    Returns
    -------
    None
        The figures are printed, not returned: the number of rows, the number
        of distinct patent numbers, the number of distinct values in each of
        the two class columns, and the mean number of rows per patent number
        rounded to two decimals.
    """
    rows_per_patent = frame.groupby('patent_number').size()
    report = (
        ('Number of entries: ', len(frame)),
        ('Number of unique patents: ', frame['patent_number'].nunique()),
        ('Number of classes represented by focal patents: ', frame[focal_column].nunique()),
        ('Number of classes represented by cited patents: ', frame[cited_column].nunique()),
        ('Mean number of citation pairs per patent: ', round(rows_per_patent.mean(), 2)),
    )

    # Interleave label, value and a newline token and hand them to one print
    # call (space-separated); no newline token follows the final value.
    pieces = []
    for label, value in report:
        pieces.extend((label, value, '\n'))
    print(*pieces[:-1])
    

In [16]:
# One entry per group: (heading, dataframe, focal class column, cited class column).
# Headings are printed verbatim, including their leading ' \n ' spacing.
groups = [
    ('Design from Design mainclass:', final_main_d2d, 'focal_mainclass', 'cite_mainclass'),
    (' \n Design from utility mainclass:', final_main_d2u, 'focal_mainclass', 'cite_mainclass'),
    (' \n Design from Design subclass:', final_sub_d2d, 'focal_subclass', 'cite_subclass'),
    (' \n Design from utility subclass', final_sub_d2u, 'focal_subclass', 'cite_subclass'),
]
for heading, group_frame, focal_col, cited_col in groups:
    print(heading)
    quick_stats(group_frame, focal_col, cited_col)


Design from Design mainclass:
Number of entries:  372974 
 Number of unique patents:  180687 
 Number of classes represented by focal patents:  33 
 Number of classes represented by cited patents:  33 
 Mean number of citation pairs per patent:  2.06
 
 Design from utility mainclass:
Number of entries:  1087784 
 Number of unique patents:  265176 
 Number of classes represented by focal patents:  33 
 Number of classes represented by cited patents:  437 
 Mean number of citation pairs per patent:  4.1
 
 Design from Design subclass:
Number of entries:  2848165 
 Number of unique patents:  356791 
 Number of classes represented by focal patents:  5453 
 Number of classes represented by cited patents:  5475 
 Mean number of citation pairs per patent:  7.98
 
 Design from utility subclass
Number of entries:  5561120 
 Number of unique patents:  265176 
 Number of classes represented by focal patents:  5323 
 Number of classes represented by cited patents:  97139 
 Mean number of citation 

Save the dataframes for future analysis

In [17]:

# final_main_d2d.to_csv('data/final_main_d2d.csv', index_label=False)
# final_main_d2u.to_csv('data/final_main_d2u.csv', index_label=False)

In [18]:

# final_sub_d2d.to_csv('data/final_sub_d2d.csv', index_label=False)
# final_sub_d2u.to_csv('data/final_sub_d2u.csv', index_label=False)

In [19]:
# Write each of the four de-duplicated frames to its own CSV file.
# index_label=False is passed to every call.
outputs = [
    ('data/final_main_d2d_drops.csv', final_main_d2d),
    ('data/final_main_d2u_drops.csv', final_main_d2u),
    ('data/final_sub_d2d_drops.csv', final_sub_d2d),
    ('data/final_sub_d2u_drops.csv', final_sub_d2u),
]
for csv_path, out_frame in outputs:
    out_frame.to_csv(csv_path, index_label=False)