This notebook creates the measure for continuous novelty. Continuous novelty is calculated from the commonness of a particular class pair (i, j) in a year t.

$$\text{Commonness}_{ijt} = \frac{\text{Observed number of pairs}_{ijt}}{\text{Expected number of pairs}_{ijt}} = \frac{N_{ijt} \cdot N_{t}}{N_{it} \cdot N_{jt}}$$

Where $N_{ijt}$ is the number of i-j class pairs in year t, $N_{it}$ is the number of class pairs that include class i in year t, $N_{jt}$ is the number of class pairs that include class j in year t, and $N_{t}$ is the number of all pairs in year t.

Novelty is the negative natural logarithm of commonness. Then, for each patent, the maximum novelty across its class pairs is taken as the final continuous measure of novelty.


In [1]:
import pandas as pd
import numpy as np
from itertools import combinations
from collections import Counter

In [2]:
def calc_nt(df, column_name):
    '''
    Add a column Nt: the total number of class pairs in each row's priority year t.

    Methodology:
    1. Group the rows by their priority year ('priority_date')
    2. Count the non-null entries of `column_name` within each year
    3. Write each year's count onto every row of that year as column 'Nt'

    Parameters:
    df          -- DataFrame with one row per class pair; needs the columns
                   'priority_date' and `column_name`
    column_name -- name of the column whose non-null entries are counted per year

    Returns:
    The same DataFrame object that was passed in, with the 'Nt' column added
    (the frame is modified in place).
    '''
    # transform('count') performs the per-year count and broadcasts it back to
    # the rows in a single vectorized step, so no per-row Python lookup is needed.
    df['Nt'] = df.groupby('priority_date')[column_name].transform('count')
    return df



In [3]:
def calc_nijt(df, focal_class, cite_class):
    '''
    Add a column Nijt: the number of i-j class pairs that exist in year t.

    Methodology:
    1. Group the rows by priority date, focal class and citation class
    2. Count the non-null 'patent_number' entries in each group
    3. Write each group's count onto every row of that group as column 'Nijt'

    Parameters:
    df          -- DataFrame with one row per class pair; needs the columns
                   'priority_date', 'patent_number', `focal_class`, `cite_class`
    focal_class -- name of the column holding class i (the focal class)
    cite_class  -- name of the column holding class j (the cited class)

    Returns:
    The same DataFrame object that was passed in, with the 'Nijt' column added
    (the frame is modified in place).
    '''
    pair_groups = df.groupby(['priority_date', focal_class, cite_class])
    # transform('count') broadcasts the group size back to the member rows,
    # replacing a per-row MultiIndex lookup with one vectorized operation.
    df['Nijt'] = pair_groups['patent_number'].transform('count')
    return df

In [4]:
def calc_ni_nj(df, focal_class, cite_class):
    '''
    Add the columns Nit and Nij: for each row, how often its class i and its
    class j occur among the pairs of the row's priority year.

    Methodology:
    1. For each year, count how many times each class occurs in either the
       focal class column or the citation class column.
       For pairings A-B, B-C, C-D: A and D appear once, B and C appear twice.
       A pairing A-A contributes two occurrences of A.
    2. For each row, store the count of its focal class (class i) in 'Nit'
    3. For each row, store the count of its citation class (class j) in 'Nij'
       (the column is named 'Nij' because that is the name `commonness` reads)

    Parameters:
    df          -- DataFrame with one row per class pair; needs the columns
                   'priority_date', `focal_class` and `cite_class`
    focal_class -- name of the column holding class i (the focal class)
    cite_class  -- name of the column holding class j (the cited class)

    Returns:
    The same DataFrame object that was passed in, with 'Nit' and 'Nij' added
    (the frame is modified in place).
    '''
    years = df['priority_date'].tolist()
    focal_keys = list(zip(years, df[focal_class].tolist()))
    cite_keys = list(zip(years, df[cite_class].tolist()))

    # One counter keyed by (year, class) covers both columns, so a class is
    # tallied no matter which side of the pair it appears on.
    occurrences = Counter(focal_keys)
    occurrences.update(cite_keys)

    # Plain dictionary lookups in row order; the lists line up positionally
    # with the rows of df, so they can be assigned directly as columns.
    df['Nit'] = [occurrences[key] for key in focal_keys]
    df['Nij'] = [occurrences[key] for key in cite_keys]
    return df

In [5]:
def commonness(row):
    '''
    Commonness of the class pairing (i, j) in year t for one row.

    The entries 'Nijt', 'Nt', 'Nit' and 'Nij' must already be available
    on `row` before this is called.

    Methodology:
    Cijt = (Nijt * Nt) / (Nit * Nij)
    '''
    numerator = row['Nijt'] * row['Nt']
    denominator = row['Nit'] * row['Nij']
    return numerator / denominator


In [6]:
def commonness_to_novelty(df):
    '''
    Derive the novelty of every row from its commonness.

    Methodology:
    For each row
    novelty = -log(commonness), using the natural logarithm

    Reads the 'commonness' column, writes the 'novelty' column onto the
    same DataFrame and returns that DataFrame.
    '''
    log_commonness = np.log(df['commonness'])
    df['novelty'] = -log_commonness
    return df

In [7]:
def max_patent_novelty(df):
    '''
    Reduce the pair-level novelty values to one value per patent.

    Methodology:
    1. Group the rows by 'patent_number'
    2. Take the maximum 'novelty' within each patent
    3. Return a new dataframe with the columns patent_number and max_novelty;
       its row labels are converted to strings
    '''
    novelty_by_patent = df.groupby('patent_number')['novelty']
    highest = novelty_by_patent.max()
    table = highest.reset_index()
    return table.rename(index=str, columns={'novelty': 'max_novelty'})

In [8]:
def continuous_novelty(df, focal_col_name, cite_col_name):
    '''
    Calculate the continuous novelty of each patent: the maximum, over the
    patent's class pairs, of -log(commonness).

    Methodology:
    1. Calculate Nt for each row
    2. Calculate Nijt for each row
    3. Calculate Ni and Nj for each row
    4. Calculate the commonness of each row
    5. Calculate the novelty of each row
    6. Take the maximum novelty of each patent

    Parameters:
    df             -- DataFrame with one row per class pair; the helpers read the
                      columns 'priority_date', 'patent_number', `focal_col_name`
                      and `cite_col_name`
    focal_col_name -- name of the column holding the focal class (class i)
    cite_col_name  -- name of the column holding the cited class (class j)

    Returns:
    The DataFrame produced by max_patent_novelty (columns patent_number and
    max_novelty). The DataFrame passed in is not modified.
    '''
    # reset_index without inplace returns a new frame; the helper columns added
    # below land on that working frame, leaving the caller's data untouched.
    pairs = df.reset_index(drop=True)

    pairs = calc_nt(pairs, cite_col_name)
    pairs = calc_nijt(pairs, focal_col_name, cite_col_name)
    pairs = calc_ni_nj(pairs, focal_col_name, cite_col_name)

    # commonness only performs column arithmetic, so it can be evaluated once
    # on the whole frame instead of once per row.
    pairs['commonness'] = commonness(pairs)
    pairs = commonness_to_novelty(pairs)
    return max_patent_novelty(pairs)

In [9]:
'''
import design citing design mainclass
design citing utility mainclass
design citing design subclass
design citing utility subclass
'''
# The four inputs are read in the order listed above.
main_d2d, main_d2u, sub_d2d, sub_d2u = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/final_main_d2d.csv',
        'data/final_main_d2u.csv',
        'data/final_sub_d2d.csv',
        'data/final_sub_d2u.csv',
    )
)

Create the measures from the data

In [10]:
main_d2d = continuous_novelty(
    df=main_d2d,
    focal_col_name='focal_mainclass',
    cite_col_name='cite_mainclass',
)

# Examine the first 50 rows
main_d2d.head(n=50)

Unnamed: 0,patent_number,max_novelty
0,D258382,-0.797657
1,D258383,-0.797657
2,D258678,-0.797657
3,D258755,-0.797657
4,D258990,0.809274
5,D259247,0.614096
6,D259248,0.614096
7,D259249,0.614096
8,D259250,0.614096
9,D259251,0.614096


In [11]:
main_d2u = continuous_novelty(
    df=main_d2u,
    focal_col_name='focal_mainclass',
    cite_col_name='cite_mainclass',
)

# Examine the first 50 rows
main_d2u.head(n=50)

Unnamed: 0,patent_number,max_novelty
0,D257752,-1.621493
1,D257924,-1.711278
2,D258766,0.935415
3,D259281,-2.407163
4,D259509,-1.583463
5,D259510,0.099624
6,D259740,-2.407163
7,D259867,-3.47095
8,D259875,-1.994316
9,D260061,-0.986043


In [12]:
sub_d2d = continuous_novelty(
    df=sub_d2d,
    focal_col_name='focal_subclass',
    cite_col_name='cite_subclass',
)

# Examine the first 50 rows
sub_d2d.head(n=50)

Unnamed: 0,patent_number,max_novelty
0,D258382,-5.392617
1,D258383,-5.392617
2,D258678,-5.392617
3,D258755,-5.392617
4,D258990,-1.809098
5,D259247,-3.339265
6,D259248,-3.339265
7,D259249,-3.078982
8,D259250,-3.339265
9,D259251,-3.339265


In [13]:
sub_d2u = continuous_novelty(
    df=sub_d2u,
    focal_col_name='focal_subclass',
    cite_col_name='cite_subclass',
)

# Examine the first 50 rows
sub_d2u.head(n=50)

Unnamed: 0,patent_number,max_novelty
0,D257752,-4.951017
1,D257924,-5.724207
2,D258766,-2.018799
3,D259281,-2.355271
4,D259509,-7.852439
5,D259510,-3.575773
6,D259740,-2.355271
7,D259867,-5.935516
8,D259875,-6.417355
9,D260061,-7.852439


In [23]:
# Save the four novelty tables for future analysis.

for novelty_table, csv_path in (
    (main_d2d, 'data/d2d_main_cont.csv'),
    (main_d2u, 'data/d2u_main_cont.csv'),
    (sub_d2d, 'data/d2d_sub_cont.csv'),
    (sub_d2u, 'data/d2u_sub_cont.csv'),
):
    novelty_table.to_csv(csv_path, index_label=False)