This notebook creates the binary novelty measure for a dataframe. A patent is considered novel if ANY of its backward class citation combinations is novel. A combination is considered novel if it had never been seen before its focal patent's priority year; combinations that first appear in the same priority year are all treated as novel.


In [15]:
import pandas as pd
import numpy as np

In [2]:
def first_occurence_index(df, cite_col_name, focal_col_name):
    '''
    Return the index labels of the rows where each class pair shows up first.

    Pairs are unordered: class A citing class B is the same pair as class B
    citing class A. Rows are scanned in their current order, so the dataframe
    should already be sorted by date before it is passed in. Simultaneous
    novelty is NOT handled here; only the first row of each pair is returned.

    Parameters
    ----------
    df : dataframe with one citing/focal class pair per row
    cite_col_name : name of the column holding the cited class
    focal_col_name : name of the column holding the focal class

    Returns
    -------
    Index holding the label of the first row of every distinct pair
    '''
    class_columns = [cite_col_name, focal_col_name]

    # sorting the two values inside each row removes the citation direction
    unordered_pairs = df[class_columns].apply(sorted, axis=1, result_type='expand')

    # whatever survives de-duplication is the earliest row of each pair
    earliest_rows = unordered_pairs.drop_duplicates(keep='first')
    return earliest_rows.index

In [3]:
def simultanious_occurence_index(df, cite_col_name, focal_col_name):
    '''
    This function marks patents of simultaneous novelty in a given year.
    If two separate patents use the same class pair in a year, and that year
    was the first occurrence of that pairing, then both patents should be
    listed as novel.
    The first occurrences should be identified prior to using this function:
    df must already carry a 'first_seen' column (1 on first occurrences) and
    a 'priority_date' column.

    methodology
    1. select the citation and focal class columns
    2. make classes invariant to citation order, e.g. class A citing class B
       is equal to class B citing class A
    3. build one (year, class1, class2) key per row
    4. collect the keys of the rows that have been marked as novel
    5. select every row whose key is among the keys from the previous step
    6. return the index labels of these rows

    Parameters
    ----------
    df : dataframe with 'priority_date', 'first_seen' and the two class columns
    cite_col_name : name of the column holding the cited class
    focal_col_name : name of the column holding the focal class

    Returns
    -------
    Index holding the labels (taken from df.index) of every row that shares
    its year and class pair with a row marked as first seen; the marked rows
    themselves are included
    '''
    # nothing can match in an empty frame, and the row-wise apply below does
    # not yield the two pair columns when there are no rows to expand
    if len(df) == 0:
        return df.index[:0]

    sorted_pairs = df[[cite_col_name, focal_col_name]]\
    .apply(sorted, axis=1, result_type='expand')

    # The key parts are compared as a tuple of strings rather than as one
    # concatenated string: without a separator, distinct pairs can collapse
    # into the same text (1906 + '1' + '23' and 1906 + '12' + '3').
    keys = list(zip(df['priority_date'].astype(str),
                    sorted_pairs.iloc[:, 0].astype(str),
                    sorted_pairs.iloc[:, 1].astype(str)))

    is_first = (df['first_seen'] == 1).tolist()
    novel_keys = {key for key, flagged in zip(keys, is_first) if flagged}

    # positional boolean mask, so the result does not depend on df having a
    # default 0..n-1 index
    matches = pd.Series([key in novel_keys for key in keys], dtype=bool)
    also_seen = df.index[matches.values]

    return also_seen

In [4]:
def mark_binary_novelty(df, cite_col_name, focal_col_name):
    '''
    This identifies novel class pairs using the binary measure

    methodology
    1. Sort the rows by priority date
    2. Mark first novel class pairs
    3. Mark simultaneously novel class pairs

    Parameters
    ----------
    df : dataframe with a 'priority_date' column and the two class columns;
         it is not modified
    cite_col_name : name of the column holding the cited class
    focal_col_name : name of the column holding the focal class

    Returns
    -------
    A new dataframe sorted by 'priority_date' with a fresh 0..n-1 index, the
    previous row labels kept in an 'index' column, the citation column cast
    to string, and a 'first_seen' column that is 1 for novel pairs, else 0
    '''
    # Exploit indices by sorting by date. sort_values/reset_index build a new
    # frame, so every assignment below lands on that frame and the dataframe
    # handed in by the caller is left untouched.
    df = df.sort_values('priority_date').reset_index()

    df['first_seen'] = 0

    # standardize datatype of citations to string
    # (presumably so a cited class can be compared with a focal class when
    # the pairs are sorted -- verify against the input data)
    df[cite_col_name] = df[cite_col_name].astype(str)

    indx = first_occurence_index(df, cite_col_name, focal_col_name)
    df.loc[indx, 'first_seen'] = 1

    # find simultaneous novelty and mark it; the index is still the fresh
    # 0..n-1 range created above, so no second reset is needed
    also_seen = simultanious_occurence_index(df, cite_col_name, focal_col_name)
    df.loc[also_seen, 'first_seen'] = 1
    return df


In [5]:
'''
import design citing design mainclass
design citing utility mainclass
design citing design subclass
design citing utility subclass
'''
# read the four citation tables, in the order listed above
main_d2d, main_d2u, sub_d2d, sub_d2u = [
    pd.read_csv(csv_path)
    for csv_path in (
        'data/final_main_d2d.csv',
        'data/final_main_d2u.csv',
        'data/final_sub_d2d.csv',
        'data/final_sub_d2u.csv',
    )
]

Create the measures from the data

In [6]:
# flag the novel main-class pairs of the design-citing-design table
main_d2d = mark_binary_novelty(
    main_d2d,
    cite_col_name='cite_mainclass',
    focal_col_name='focal_mainclass',
)

# inspect the first 50 rows, ordered by year and then by class pair
main_d2d.head(50).sort_values(by=['priority_date', 'cite_mainclass', 'focal_mainclass'])

Unnamed: 0,index,patent_number,priority_date,cite_mainclass,focal_mainclass,first_seen
0,4602315,D466542,1902,D16,D16,1
1,6144839,D499408,1904,D14,D14,1
2,6362160,D510547,1905,D12,D12,1
4,7136555,D523461,1906,D14,D16,1
3,7136450,D523461,1906,D16,D16,0
6,3193750,D428991,1908,D08,D24,1
8,8125635,D545734,1908,D12,D12,0
7,3193745,D428991,1908,D24,D24,1
5,3193752,D428991,1908,D32,D24,1
10,10301398,D597396,1909,D07,D08,1


In [7]:
# flag the novel main-class pairs of the design-citing-utility table
main_d2u = mark_binary_novelty(
    main_d2u,
    cite_col_name='cite_mainclass',
    focal_col_name='focal_mainclass',
)

# inspect the first 50 rows, ordered by year and then by class pair
main_d2u.head(50).sort_values(by=['priority_date', 'cite_mainclass', 'focal_mainclass'])

Unnamed: 0,index,patent_number,priority_date,cite_mainclass,focal_mainclass,first_seen
0,4602316,D466542,1902,2,D16,1
1,6144836,D499408,1904,312,D14,1
2,6362164,D510547,1905,180,D12,1
9,7136459,D523461,1906,16,D16,1
10,7136615,D523461,1906,2,D16,0
6,7136393,D523461,1906,345,D16,1
7,7136396,D523461,1906,348,D16,1
5,7136354,D523461,1906,351,D16,1
8,7136426,D523461,1906,359,D16,1
11,7136624,D523461,1906,379,D16,1


In [8]:
# flag the novel subclass pairs of the design-citing-design table
sub_d2d = mark_binary_novelty(
    sub_d2d,
    cite_col_name='cite_subclass',
    focal_col_name='focal_subclass',
)

# inspect the first 50 rows, ordered by year and then by class pair
sub_d2d.head(50).sort_values(by=['priority_date', 'cite_subclass', 'focal_subclass'])

Unnamed: 0,index,patent_number,priority_date,cite_subclass,focal_subclass,first_seen
0,4602321,D466542,1902,D16/303,D16/303,1
1,4602315,D466542,1902,D16/311,D16/303,1
2,6144840,D499408,1904,D14/441,D14/444,1
3,6144839,D499408,1904,D14/444,D14/444,1
5,6362162,D510547,1905,D12/110,D12/111,1
4,6362160,D510547,1905,D12/111,D12/111,1
6,6362166,D510547,1905,D12/178,D12/111,1
11,7136555,D523461,1906,D14/189,D16/309,1
12,7136556,D523461,1906,D14/189,D16/330,1
13,7136557,D523461,1906,D14/189,D16/335,1


In [9]:
# flag the novel subclass pairs of the design-citing-utility table
sub_d2u = mark_binary_novelty(
    sub_d2u,
    cite_col_name='cite_subclass',
    focal_col_name='focal_subclass',
)

# inspect the first 50 rows, ordered by year and then by class pair
sub_d2u.head(50).sort_values(by=['priority_date', 'cite_subclass', 'focal_subclass'])

Unnamed: 0,index,patent_number,priority_date,cite_subclass,focal_subclass,first_seen
0,4602316,D466542,1902,2/428,D16/303,1
2,4602318,D466542,1902,2/442,D16/303,1
3,4602320,D466542,1902,2/445,D16/303,1
1,4602317,D466542,1902,2/452,D16/303,1
4,6144836,D499408,1904,312/223.2,D14/444,1
5,6144837,D499408,1904,312/306,D14/444,1
6,6144838,D499408,1904,312/319.2,D14/444,1
8,6362164,D510547,1905,180/219,D12/111,1
7,6362165,D510547,1905,180/311,D12/111,1
45,7136615,D523461,1906,2/422,D16/309,1


In [11]:
def is_novel_patent_level(df):
    '''
    A patent is considered novel if any of its class combinations is
    considered novel

    Methodology:
    1. group the 'first_seen' flags by patent number
    2. if any of the flags of a patent is set, mark the patent as novel

    Parameters
    ----------
    df : dataframe with a 'patent_number' and a 'first_seen' column

    Returns
    -------
    Dataframe with one row per patent, ordered by patent number, holding the
    columns 'patent_number' and 'is novel' (1 if novel, otherwise 0)
    '''
    # The built-in any() aggregation on the flag column alone replaces a
    # Python lambda that was called once per patent on the whole group frame;
    # it also yields a correctly labelled (empty) result for empty input.
    any_novel = df.groupby('patent_number')['first_seen'].any()

    is_novel = any_novel.astype(int).reset_index(name='is novel')
    return is_novel

In [19]:
def novel_count_patent_level(df):
    '''
    This function counts the number of binary novel combinations in a patent

    Methodology:
    1. group the 'first_seen' flags by patent number
    2. add up the flags of each patent

    Parameters
    ----------
    df : dataframe with a 'patent_number' and a 'first_seen' column

    Returns
    -------
    Dataframe with one row per patent, ordered by patent number, holding the
    columns 'patent_number' and 'novelty count'
    '''
    # The built-in sum() aggregation on the flag column alone replaces a
    # Python lambda that was called once per patent on the whole group frame;
    # it also yields a correctly labelled (empty) result for empty input.
    flag_totals = df.groupby('patent_number')['first_seen'].sum()

    novel_count = flag_totals.reset_index(name='novelty count')
    return novel_count

In [23]:
def consolidate(df):
    '''
    Build one patent-level table that holds both novelty measures:
    the is-novel flag and the novelty count.

    df is passed unchanged to is_novel_patent_level and
    novel_count_patent_level; the two results are merged into one dataframe.
    '''
    novelty_flags = is_novel_patent_level(df)
    novelty_counts = novel_count_patent_level(df)

    # no keys are given, so merge joins on the columns the two tables share
    return pd.merge(novelty_flags, novelty_counts)
    

Consolidate the data at the patent level

In [24]:
# roll each of the four citation tables up to the patent level
main_d2d_out, main_d2u_out, sub_d2d_out, sub_d2u_out = map(
    consolidate, (main_d2d, main_d2u, sub_d2d, sub_d2u)
)

In [28]:
# preview the first rows of the consolidated subclass design-citing-design table
sub_d2d_out.head()

Unnamed: 0,patent_number,is novel,novelty count
0,D258382,1,1
1,D258383,1,1
2,D258678,1,1
3,D258755,1,1
4,D258990,1,3


In [30]:
# Save the resulting data frames for further analysis.
# NOTE(review): index_label=False only drops the header field for the index;
# the index values themselves are still written as the first column of every
# row (pandas documents this form as easier to import in R). Confirm that
# index=False was not what was intended.
main_d2d_out.to_csv('data/d2d_main_binary.csv', index_label = False)
main_d2u_out.to_csv('data/d2u_main_binary.csv', index_label = False)
sub_d2d_out.to_csv('data/d2d_sub_binary.csv', index_label = False) 
sub_d2u_out.to_csv('data/d2u_sub_binary.csv', index_label = False)