In [1]:
import pandas as pd
import numpy as np
from itertools import combinations
from collections import Counter

Example instructions if we are focusing on design (focal) to utility (cited):
1. Retrieve all design patents.
2. Retrieve all design patent classes for each focal design patent and all technology classes for the utility patents cited by the focal patent.
3. List all pairwise combinations of design-utility classes (from the focal patent and its references) for each design patent.
4. Record the priority year t for each design patent.
5. Pool together all design-utility class pairs of design patents applied for in the same year to construct a universe of patent class pairs for each year.

In [2]:
# final = pd.read_csv('data/final_no_drops.csv')
# final.head()

Unnamed: 0,patent_number,priority_date,cited_patent_number,cite_mainclass,cite_subclass,focal_mainclass,focal_subclass
0,D257752,1980,4162014,211,211/50,D19,D19/75
1,D257752,1980,4162014,211,211/55,D19,D19/75
2,D257752,1980,4162014,211,211/126.1,D19,D19/75
3,D257924,1980,4162014,211,211/50,D06,D06/573
4,D257924,1980,4162014,211,211/55,D06,D06/573


In [3]:
# final.patent_number.nunique()

503128

# Mainclass aggregation, design citing utility

In [4]:
# d2u_main = final.loc[(~final.cited_patent_number.str.contains('D')) & (~final.cite_mainclass.str.contains('D'))]
# d2u_main.head()

Unnamed: 0,patent_number,priority_date,cited_patent_number,cite_mainclass,cite_subclass,focal_mainclass,focal_subclass
0,D257752,1980,4162014,211,211/50,D19,D19/75
1,D257752,1980,4162014,211,211/55,D19,D19/75
2,D257752,1980,4162014,211,211/126.1,D19,D19/75
3,D257924,1980,4162014,211,211/50,D06,D06/573
4,D257924,1980,4162014,211,211/55,D06,D06/573


In [5]:
# print(d2u_main.shape, d2u_main.patent_number.nunique())

(10707423, 7) 309304


In [6]:
# #cleaning up duplicates
# d2u_main = d2u_main.drop(labels=['cited_patent_number','cite_subclass','focal_subclass'], axis=1).drop_duplicates()
# d2u_main.head()

Unnamed: 0,patent_number,priority_date,cite_mainclass,focal_mainclass
0,D257752,1980,211,D19
3,D257924,1980,211,D06
10,D258766,1980,52,D25
36,D258766,1980,40,D25
38,D258766,1980,49,D25


In [7]:
# print(d2u_main.shape, d2u_main.patent_number.nunique())

(1342837, 4) 309304


design patents, with design classes, that cite utility patents, with utility classes: 309,304

In [14]:
# d2u_main.reset_index(drop=True, inplace=True)

In [18]:
# Nt = number of class-pair rows (non-null `column_name`) recorded in each priority year
def calc_nt(df, column_name):
    """Add an ``Nt`` column holding the per-year row count.

    For every distinct ``priority_date`` the non-null entries of
    ``column_name`` are counted, and that count is written back onto each
    row of the same year.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``priority_date`` and ``column_name``. Modified in place.
    column_name : str
        Column whose non-null values are counted within each year.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ``Nt`` column added or overwritten.

    Notes
    -----
    Assumes ``priority_date`` has no missing values -- TODO confirm against
    the input data.
    """
    # transform('count') broadcasts each group's count back to its rows, which
    # replaces the former per-row Python lookup (df.apply(..., axis=1)).
    df['Nt'] = df.groupby('priority_date')[column_name].transform('count')
    return df

In [20]:
# NOTE(review): every visible cell that builds `d2u_main` (filter, de-duplication,
# reset_index) is commented out, so on a fresh kernel this cell has nothing to
# operate on unless those cells are re-enabled first.
# Adds the per-year count column `Nt`, counted over `cite_mainclass`.
d2u_main = calc_nt(d2u_main, 'cite_mainclass')
d2u_main.head()

Unnamed: 0,patent_number,priority_date,cite_mainclass,focal_mainclass,Nt
0,D257752,1980,211,D19,4617
1,D257924,1980,211,D06,4617
2,D258766,1980,52,D25,4617
3,D258766,1980,40,D25,4617
4,D258766,1980,49,D25,4617


In [25]:
# Nijt = number of rows carrying the (focal class i, cited class j) pair in priority year t
def calc_nijt(df, focal_class, cite_class):
    """Add an ``Nijt`` column: how often each class pair occurs in its year.

    Rows are grouped by ``priority_date``, ``focal_class`` and ``cite_class``;
    the number of non-null ``patent_number`` entries in each group is written
    back onto every row of that group.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``priority_date``, ``patent_number`` and the two class
        columns. Modified in place.
    focal_class : str
        Name of the column holding the focal patent's class.
    cite_class : str
        Name of the column holding the cited patent's class.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the ``Nijt`` column added or overwritten.

    Notes
    -----
    Assumes none of the three grouping columns contains missing values --
    TODO confirm against the input data.
    """
    pair_keys = ['priority_date', focal_class, cite_class]
    # Broadcast the group count back to the rows instead of doing one
    # MultiIndex .loc lookup per row.
    df['Nijt'] = df.groupby(pair_keys)['patent_number'].transform('count')
    return df

In [37]:
# d2u_main = calc_nijt(d2u_main,'focal_mainclass','cite_mainclass')
# d2u_main.head()

Unnamed: 0,patent_number,priority_date,cite_mainclass,focal_mainclass,Nt,Nijt
0,D257752,1980,211,D19,4617,10
1,D257924,1980,211,D06,4617,38
2,D258766,1980,52,D25,4617,76
3,D258766,1980,40,D25,4617,1
4,D258766,1980,49,D25,4617,24


In [51]:
def calc_ni_nj(df, focal_class, cite_class):
    """Add per-year class frequencies ``Nit`` (focal class) and ``Nij`` (cited class).

    Within each ``priority_date`` the values of ``focal_class`` and
    ``cite_class`` are pooled into a single list and tallied, so a class is
    counted wherever it appears, in either column. ``Nit`` is that tally for
    the row's focal class and ``Nij`` the tally for the row's cited class.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``priority_date`` and the two class columns. Modified
        in place.
    focal_class : str
        Name of the column holding the focal patent's class.
    cite_class : str
        Name of the column holding the cited patent's class.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``Nit`` and ``Nij`` added or overwritten.

    Notes
    -----
    The output column is named ``Nij`` (not ``Njt``) because ``commonness``
    and the saved CSVs use that name.
    Assumes the year and class columns contain no missing values -- TODO
    confirm against the input data.
    """
    years = df['priority_date']

    # Pool both class columns so each class is tallied over focal and cited
    # occurrences together, as the original Counter-based version did.
    pooled = pd.DataFrame({
        'priority_date': pd.concat([years, years], ignore_index=True),
        'class_label': pd.concat([df[focal_class], df[cite_class]], ignore_index=True),
    })
    # One explicit (year, class) -> count table. Building it with size() gives
    # a fixed long shape, unlike groupby.apply, whose result shape depends on
    # whether every year happens to contain the same set of classes.
    class_counts = pooled.groupby(['priority_date', 'class_label'], sort=False).size()

    year_values = years.to_numpy()
    focal_keys = pd.MultiIndex.from_arrays([year_values, df[focal_class].to_numpy()])
    cite_keys = pd.MultiIndex.from_arrays([year_values, df[cite_class].to_numpy()])

    # reindex performs all lookups at once; to_numpy() drops the MultiIndex so
    # the values are assigned positionally onto df's own index.
    df['Nit'] = class_counts.reindex(focal_keys).to_numpy()
    df['Nij'] = class_counts.reindex(cite_keys).to_numpy()
    return df

In [61]:
# d2u_main = calc_ni_nj(d2u_main,'focal_mainclass','cite_mainclass')

In [62]:
# d2u_main.head()

Unnamed: 0,patent_number,priority_date,cite_mainclass,focal_mainclass,Nt,Nijt,Nit,Nij
0,D257752,1980,211,D19,4617,10,95,93
1,D257924,1980,211,D06,4617,38,330,93
2,D258766,1980,52,D25,4617,76,221,140
3,D258766,1980,40,D25,4617,1,221,77
4,D258766,1980,49,D25,4617,24,221,44


In [63]:
def commonness(row):
    """Return ``(Nijt * Nt) / (Nit * Nij)`` for one record.

    ``row`` is anything indexable by the keys ``'Nijt'``, ``'Nt'``, ``'Nit'``
    and ``'Nij'`` (for example a DataFrame row passed by ``apply(axis=1)``).
    """
    numerator = row['Nijt'] * row['Nt']
    denominator = row['Nit'] * row['Nij']
    return numerator / denominator

In [65]:
# d2u_main['commonness'] = d2u_main.apply(commonness, axis=1)
# d2u_main.head()

Unnamed: 0,patent_number,priority_date,cite_mainclass,focal_mainclass,Nt,Nijt,Nit,Nij,commonness
0,D257752,1980,211,D19,4617,10,95,93,5.225806
1,D257924,1980,211,D06,4617,38,330,93,5.716716
2,D258766,1980,52,D25,4617,76,221,140,11.341047
3,D258766,1980,40,D25,4617,1,221,77,0.271317
4,D258766,1980,49,D25,4617,24,221,44,11.395311


# Mainclass aggregation, design citing design

In [76]:
# d2d_main = final.loc[(final.cited_patent_number.str.contains('D')) & (final.cite_mainclass.str.contains('D'))]
# d2d_main.head()

Unnamed: 0,patent_number,priority_date,cited_patent_number,cite_mainclass,cite_subclass,focal_mainclass,focal_subclass
6,D258382,1980,D253842,D23,D23/214,D23,D23/214
7,D258383,1980,D253842,D23,D23/214,D23,D23/214
8,D258678,1980,D253842,D23,D23/214,D23,D23/214
9,D258755,1980,D253842,D23,D23/214,D23,D23/214
102,D258990,1980,D244939,D25,D25/2,D25,D25/2


In [77]:
# print(d2d_main.shape, d2d_main.patent_number.nunique())

(10449629, 7) 475276


In [78]:
# #cleaning up duplicates
# d2d_main = d2d_main.drop(labels=['cited_patent_number','cite_subclass','focal_subclass'], axis=1).drop_duplicates()
# d2d_main.head()

Unnamed: 0,patent_number,priority_date,cite_mainclass,focal_mainclass
6,D258382,1980,D23,D23
7,D258383,1980,D23,D23
8,D258678,1980,D23,D23
9,D258755,1980,D23,D23
102,D258990,1980,D25,D25


In [79]:
# print(d2d_main.shape, d2d_main.patent_number.nunique())

(979346, 4) 475276


In [80]:
# d2d_main.reset_index(drop=True, inplace=True)

In [81]:
# d2d_main = calc_nt(d2d_main, 'cite_mainclass')
# d2d_main = calc_nijt(d2d_main,'focal_mainclass','cite_mainclass')
# d2d_main = calc_ni_nj(d2d_main,'focal_mainclass','cite_mainclass')
# d2d_main['commonness'] = d2d_main.apply(commonness, axis=1)
# d2d_main.head()

Unnamed: 0,patent_number,priority_date,cite_mainclass,focal_mainclass,Nt,Nijt,Nit,Nij,commonness
0,D258382,1980,D23,D23,4119,200,607,607,2.235859
1,D258383,1980,D23,D23,4119,200,607,607,2.235859
2,D258678,1980,D23,D23,4119,200,607,607,2.235859
3,D258755,1980,D23,D23,4119,200,607,607,2.235859
4,D258990,1980,D25,D25,4119,78,295,295,3.691836


In [86]:
# #save what we have so far

# d2d_main.to_csv('data/d2d_main.csv', index_label=False)
# d2u_main.to_csv('data/d2u_main.csv', index_label=False)

# Subclass aggregation, design citing design

In [94]:
# d2d_sub = final.loc[(final.cited_patent_number.str.contains('D')) & (final.cite_mainclass.str.contains('D'))]
# d2d_sub.head()

Unnamed: 0,patent_number,priority_date,cited_patent_number,cite_mainclass,cite_subclass,focal_mainclass,focal_subclass
6,D258382,1980,D253842,D23,D23/214,D23,D23/214
7,D258383,1980,D253842,D23,D23/214,D23,D23/214
8,D258678,1980,D253842,D23,D23/214,D23,D23/214
9,D258755,1980,D253842,D23,D23/214,D23,D23/214
102,D258990,1980,D244939,D25,D25/2,D25,D25/2


In [95]:
# #cleaning up duplicates
# d2d_sub = d2d_sub.drop(labels=['cited_patent_number','cite_mainclass','focal_mainclass'], axis=1).drop_duplicates()
# d2d_sub.head()

Unnamed: 0,patent_number,priority_date,cite_subclass,focal_subclass
6,D258382,1980,D23/214,D23/214
7,D258383,1980,D23/214,D23/214
8,D258678,1980,D23/214,D23/214
9,D258755,1980,D23/214,D23/214
102,D258990,1980,D25/2,D25/2


In [96]:
# print(d2d_sub.shape, d2d_sub.patent_number.nunique())

(4186249, 4) 475276


In [97]:
# NOTE(review): `d2d_sub` is built by cells that are currently commented out above;
# re-enable them before running this cell on a fresh kernel.

# Reassign instead of resetting in place: `d2d_sub` comes from filtering `final`,
# and the calc_* helpers add columns to whatever frame they receive.
d2d_sub = d2d_sub.reset_index(drop=True)
d2d_sub = calc_nt(d2d_sub, 'cite_subclass')
d2d_sub = calc_nijt(d2d_sub, 'focal_subclass', 'cite_subclass')
d2d_sub = calc_ni_nj(d2d_sub, 'focal_subclass', 'cite_subclass')
# `commonness` only indexes by column name and does arithmetic, so it can be
# applied to whole columns at once rather than row by row.
d2d_sub['commonness'] = commonness(d2d_sub)
d2d_sub.to_csv('data/d2d_sub.csv', index_label=False)
# Preview goes last so the notebook actually renders it.
d2d_sub.head()

# Subclass aggregation, design citing utility

In [98]:
# d2u_sub = final.loc[(~final.cited_patent_number.str.contains('D')) & (~final.cite_mainclass.str.contains('D'))]
# d2u_sub.head()

Unnamed: 0,patent_number,priority_date,cited_patent_number,cite_mainclass,cite_subclass,focal_mainclass,focal_subclass
0,D257752,1980,4162014,211,211/50,D19,D19/75
1,D257752,1980,4162014,211,211/55,D19,D19/75
2,D257752,1980,4162014,211,211/126.1,D19,D19/75
3,D257924,1980,4162014,211,211/50,D06,D06/573
4,D257924,1980,4162014,211,211/55,D06,D06/573


In [99]:
# #cleaning up duplicates
# d2u_sub = d2u_sub.drop(labels=['cited_patent_number','cite_mainclass','focal_mainclass'], axis=1).drop_duplicates()
# d2u_sub.head()

Unnamed: 0,patent_number,priority_date,cite_subclass,focal_subclass
0,D257752,1980,211/50,D19/75
1,D257752,1980,211/55,D19/75
2,D257752,1980,211/126.1,D19/75
3,D257924,1980,211/50,D06/573
4,D257924,1980,211/55,D06/573


In [100]:
# print(d2u_sub.shape, d2u_sub.patent_number.nunique())

(7064702, 4) 309304


In [101]:
# d2u_sub.reset_index(drop=True, inplace=True)
# d2u_sub = calc_nt(d2u_sub, 'cite_subclass')
# d2u_sub = calc_nijt(d2u_sub,'focal_subclass','cite_subclass')
# d2u_sub = calc_ni_nj(d2u_sub,'focal_subclass','cite_subclass')
# d2u_sub['commonness'] = d2u_sub.apply(commonness, axis=1)
# d2u_sub.head()
# d2u_sub.to_csv('data/d2u_sub.csv', index_label=False)