In [1]:
import pandas as pd
import numpy as np
import itertools

In [2]:
#import relevant data
master = pd.read_csv('data/master.csv')
citations = pd.read_csv('data/citations.csv', dtype={'patent_number':str})
focal_citation_link = pd.read_csv('data/citations_patents_level.csv')
focal_class = pd.read_csv('data/uspcs_level.csv')

In [3]:
master.head()

Unnamed: 0,patent_number,grant_year,app_year,num_inventors,us_inventor,cite_foreign_patent,is_missing,num_design_cited,num_utility_cited,non-pat_refs,num_figures,num_assignees,priority_date
0,D257752,1981,1980,1,1.0,0.0,1,1,2,1.0,6.0,1,1980
1,D257924,1981,1980,1,1.0,0.0,1,1,2,1.0,6.0,1,1980
2,D258382,1981,1980,2,1.0,0.0,1,2,1,0.0,5.0,1,1980
3,D258383,1981,1980,2,1.0,0.0,1,2,1,0.0,5.0,1,1980
4,D258571,1981,1980,1,1.0,0.0,1,1,1,0.0,2.0,1,1980


In [4]:
focal = master[['patent_number','priority_date']]
focal.head()

Unnamed: 0,patent_number,priority_date
0,D257752,1980
1,D257924,1980
2,D258382,1980
3,D258383,1980
4,D258571,1980


In [5]:
citations.head()

Unnamed: 0,uspc_mainclass_id,uspc_subclass_id,patent_number
0,248,248/423,3930272
1,403,403/107,3930272
2,5,5/11,3930272
3,5,5/100,3930272
4,47,47/58.1R,3930335


In [6]:
focal_citation_link.head()

Unnamed: 0,cited_patent_number,patent_number
0,4162014,D257752
1,4162014,D257924
2,D253842,D258382
3,D253842,D258383
4,D253842,D258678


In [7]:
focal_citation_link.isnull().sum()

cited_patent_number    0
patent_number          0
dtype: int64

Let's Test this out on a sample set

In [8]:
focal_citation = pd.merge(focal, focal_citation_link, on='patent_number', how='left')
focal_citation.head()

Unnamed: 0,patent_number,priority_date,cited_patent_number
0,D257752,1980,4162014
1,D257924,1980,4162014
2,D258382,1980,D253842
3,D258383,1980,D253842
4,D258571,1980,


In [9]:
focal_citation['patent_number'].nunique()

525490

In [10]:
focal_citation.isnull().sum()

patent_number              0
priority_date              0
cited_patent_number    18363
dtype: int64

Let's see what is null and why

In [11]:
focal_citation.loc[focal_citation['cited_patent_number'].isnull()]

Unnamed: 0,patent_number,priority_date,cited_patent_number
4,D258571,1980,
5,D258650,1980,
8,D258760,1980,
21,D259183,1980,
45,D259575,1980,
46,D259610,1980,
56,D259669,1980,
63,D259753,1980,
65,D259820,1980,
75,D259959,1980,


After viewing a sample, these patents either don't make references, rare, or all of their references are pre 1976, more common. Since they are not useful for analysis, I will drop them

In [12]:
# percentage of patents affected
focal_citation['cited_patent_number'].value_counts(normalize=True, dropna=False).head()

NaN        0.002822
D504889    0.000077
D337569    0.000071
D341848    0.000068
D561358    0.000059
Name: cited_patent_number, dtype: float64

Less than 1 percent of patents are affected by this

In [13]:
focal_citation.dropna(inplace=True)
focal_citation.patent_number.nunique()

507127

# current number of patents represened: 507,127

In [14]:
citations.rename(index=str, columns={'patent_number':'cited_patent_number'}, inplace=True)
focal_citation_citeclass = pd.merge(focal_citation, citations, on='cited_patent_number', how='left')
focal_citation_citeclass.patent_number.nunique()

507127

In [15]:
focal_citation_citeclass.head()

Unnamed: 0,patent_number,priority_date,cited_patent_number,uspc_mainclass_id,uspc_subclass_id
0,D257752,1980,4162014,211,211/50
1,D257752,1980,4162014,211,211/55
2,D257752,1980,4162014,211,211/126.1
3,D257924,1980,4162014,211,211/50
4,D257924,1980,4162014,211,211/55


In [16]:
focal_citation_citeclass.rename(index=str, columns={'uspc_mainclass_id':'cite_mainclass', 'uspc_subclass_id':'cite_subclass'}, inplace=True)
focal_citation_citeclass.isnull().sum()

patent_number            0
priority_date            0
cited_patent_number      0
cite_mainclass         530
cite_subclass          530
dtype: int64

Drop nulls. Without classification its useles

In [17]:
focal_citation_citeclass.dropna(inplace=True)
focal_citation_citeclass['patent_number'].nunique()

507126

# Current number of patents represented: 507,126

In [18]:
final = pd.merge(focal_citation_citeclass, focal_class, on='patent_number', how='left')
final.head()

Unnamed: 0,patent_number,priority_date,cited_patent_number,cite_mainclass,cite_subclass,uspc_mainclass_id,uspc_subclass_id
0,D257752,1980,4162014,211,211/50,D19,D19/75
1,D257752,1980,4162014,211,211/55,D19,D19/75
2,D257752,1980,4162014,211,211/126.1,D19,D19/75
3,D257924,1980,4162014,211,211/50,D06,D06/573
4,D257924,1980,4162014,211,211/55,D06,D06/573


In [19]:
final.rename(index=str, columns={'uspc_mainclass_id':'focal_mainclass', 'uspc_subclass_id':'focal_subclass'}, inplace=True)

In [20]:
final.shape

(21778093, 7)

In [21]:
final.patent_number.nunique()

507126

In [22]:
final.isnull().sum()

patent_number               0
priority_date               0
cited_patent_number         0
cite_mainclass              0
cite_subclass               0
focal_mainclass        207651
focal_subclass         207651
dtype: int64

4000 of our patents had null citations. These are mainly withdrawn patents and newly granted patent. I assume this has to do with patentsview getting updated information

In [23]:
final.dropna(inplace=True)
final['patent_number'].nunique()

503128

# Current patent count: 503,128

Let's make this smaller to be easier for memory to deal with

In [24]:
final['patent_number'].nunique() /master['patent_number'].nunique()

0.9574454318826239

95.74% of the original dataset is still represented

In [25]:
final.head()

Unnamed: 0,patent_number,priority_date,cited_patent_number,cite_mainclass,cite_subclass,focal_mainclass,focal_subclass
0,D257752,1980,4162014,211,211/50,D19,D19/75
1,D257752,1980,4162014,211,211/55,D19,D19/75
2,D257752,1980,4162014,211,211/126.1,D19,D19/75
3,D257924,1980,4162014,211,211/50,D06,D06/573
4,D257924,1980,4162014,211,211/55,D06,D06/573


In [26]:
final_combinations = final.drop(['cited_patent_number','cite_mainclass','focal_mainclass'], axis=1).drop_duplicates().sort_values(by=['priority_date','patent_number','focal_subclass'])
final_combinations.head()

Unnamed: 0,patent_number,priority_date,cite_subclass,focal_subclass
4602315,D466542,1902,D16/311,D16/303
4602316,D466542,1902,2/428,D16/303
4602317,D466542,1902,2/452,D16/303
4602318,D466542,1902,2/442,D16/303
4602320,D466542,1902,2/445,D16/303


In [27]:
#necessary for accurate indexing
final_combinations = final_combinations.sort_values('priority_date').reset_index(drop=True)
final_combinations.head()

Unnamed: 0,patent_number,priority_date,cite_subclass,focal_subclass
0,D466542,1902,D16/311,D16/303
1,D466542,1902,2/428,D16/303
2,D466542,1902,2/452,D16/303
3,D466542,1902,2/442,D16/303
4,D466542,1902,2/445,D16/303


In [31]:
p_year = final_combinations['priority_date'].unique()

In [33]:
for year in p_year:
    print(year,' ',final_combinations.loc[final_combinations['priority_date'] == year].shape[0])

1902   6
1904   5
1905   5
1906   195
1908   13
1909   12
1913   7
1919   37
1920   1116
1934   21
1963   24
1966   7
1977   23
1978   71
1979   1453
1980   22957
1981   26423
1982   30758
1983   40300
1984   50685
1985   54035
1986   62019
1987   74464
1988   88342
1989   103199
1990   126546
1991   138690
1992   136701
1993   163258
1994   187675
1995   199728
1996   219570
1997   257621
1998   284649
1999   307193
2000   308700
2001   339476
2002   393695
2003   418418
2004   512301
2005   527442
2006   488292
2007   522454
2008   570858
2009   602760
2010   717878
2011   722273
2012   785476
2013   771273
2014   706202
2015   482274


Unnamed: 0,patent_number,priority_date,cite_subclass,focal_subclass
0,D466542,1902,D16/311,D16/303
1,D466542,1902,2/428,D16/303
2,D466542,1902,2/452,D16/303
3,D466542,1902,2/442,D16/303
4,D466542,1902,2/445,D16/303
5,D466542,1902,D16/303,D16/303


# Analysis with subclass granularity

In [34]:
def unique_combinations(class1, class2):
    pairs = list(zip(class1, class2))
    return list(set(tuple(sorted(p)) for p in pairs))

In [35]:
subclass_combos = unique_combinations(final_combinations['focal_subclass'], final_combinations['cite_subclass'])

In [36]:
test =final_combinations.loc[final_combinations['priority_date'] == 1902]

In [37]:
test

Unnamed: 0,patent_number,priority_date,cite_subclass,focal_subclass
0,D466542,1902,D16/311,D16/303
1,D466542,1902,2/428,D16/303
2,D466542,1902,2/452,D16/303
3,D466542,1902,2/442,D16/303
4,D466542,1902,2/445,D16/303
5,D466542,1902,D16/303,D16/303


In [93]:
final_combinations.shape

(11447580, 5)

11,447,580 rows of pairs

# Number of unique patent backwards combinations: 2,716,435

Let's split by reference class type. Maybe this will be faster?

In [38]:
#selecting Design -> Design

design_to_design = final_combinations.loc[(final_combinations['focal_subclass'].str.contains('D')) & (final_combinations['cite_subclass'].str.contains('D'))]
design_to_design.head()

Unnamed: 0,patent_number,priority_date,cite_subclass,focal_subclass
0,D466542,1902,D16/311,D16/303
5,D466542,1902,D16/303,D16/303
9,D499408,1904,D14/444,D14/444
10,D499408,1904,D14/441,D14/444
11,D510547,1905,D12/178,D12/111


In [99]:
print(design_to_design.shape, design_to_design['patent_number'].nunique())

(4389261, 5) 484611


4,389,261 combinations are represented, 484,611 parents are represented too

In [39]:
design_to_design = design_to_design.sort_values('priority_date').reset_index(drop=True)

In [48]:
for year in design_to_design['priority_date'].unique():
    print(year, ' ',design_to_design.loc[design_to_design['priority_date'] == year].shape[0])

1902   2
1904   2
1905   3
1906   30
1908   6
1909   11
1913   4
1919   20
1920   720
1934   15
1963   8
1966   7
1977   10
1978   38
1979   590
1980   8783
1981   10208
1982   12636
1983   16423
1984   19938
1985   22538
1986   24665
1987   26919
1988   33375
1989   39536
1990   48299
1991   50612
1992   51138
1993   57399
1994   66197
1995   74168
1996   82218
1997   98346
1998   101484
1999   113581
2000   117993
2001   120274
2002   143001
2003   150211
2004   182020
2005   201082
2006   198399
2007   212634
2008   225623
2009   229956
2010   274780
2011   273300
2012   290939
2013   301492
2014   290461
2015   217167


In [45]:
design_design_subclass_combos = unique_combinations(design_to_design['focal_subclass'], design_to_design['cite_subclass'])

In [55]:
seen_pairs = set()
seen_pairs

set()

In [59]:
year1 = design_to_design.loc[design_to_design['priority_date'] == 1902]
year1['first_seen'] = 0
year1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


Unnamed: 0,patent_number,priority_date,cite_subclass,focal_subclass,first_seen
0,D466542,1902,D16/311,D16/303,0
1,D466542,1902,D16/303,D16/303,0


In [53]:
year1_pairs = unique_combinations(year1['focal_subclass'], year1['cite_subclass'])
year1_pairs

[('D16/303', 'D16/311'), ('D16/303', 'D16/303')]

In [57]:
for pair in year1_pairs:
    print(pair in seen_pairs)
    fo

False
False


In [92]:
year1.isin(year1_pairs[0])

Unnamed: 0,patent_number,priority_date,cite_subclass,focal_subclass,first_seen
0,False,False,True,True,False
1,False,False,True,True,False


In [109]:
def mark_first_novel_appearance(df, unique_combos, class1_name, class2_name, year_name):
    df['first_seen'] = 0
    year_col = df.columns.get_loc(year_name)
    i = 0
    for combo in(unique_combos):
        if (i% 1000 == 0):
            print('loop counter: ',i )
        # calculate year of first appearance
        index = df[[class1_name,class2_name]].isin(combo).all(axis=1).idxmax()
        year = df.iloc[index, year_col]
        year_boolean = df[[year_name]].isin([year]).all(axis=1)

#         # set all single or simulatnious novel appearances
        combo_boolean = df[[class1_name,class2_name]].isin(combo).all(axis=1)
        novel_indicies = np.where(combo_boolean & year_boolean)[0]
#         print(novel_indicies)
        df.loc[novel_indicies, 'first_seen'] = 1
        i = i+ 1
    return df

In [103]:
len(design_design_subclass_combos)

565186

565,186 combinations represented

In [None]:
design_design_1 = mark_first_novel_appearance(design_to_design, design_design_subclass_combos, 'focal_subclass', 'cite_subclass', 'priority_date')

loop counter:  0
loop counter:  1000
loop counter:  2000
loop counter:  3000


# Protoype dataframe

In [None]:
data = [('D111111',2000, 'A','B'),('D222222', 2001, 'B', 'A'),('D333333',2000,'B','A'),('D444444',2005, 'A','B'), ('D555555',2000,'C','D'),('D666666',2010,'D','C')]
cols = ['id', 'year','class1','class2']

In [None]:
df = pd.DataFrame.from_records(data, columns=cols)
df

In [None]:
df = df.sort_values('year').reset_index(drop=True)
df

In [None]:
df['first_seen'] = 0
df

In [None]:
list1 = df['class1'].tolist()
list2 = df['class2'].tolist()

In [None]:
pairs = list(zip(list1, list2))

In [None]:
unique_combos = list(set(tuple(sorted(p)) for p in pairs))
unique_combos

In [None]:
i = df[['class1','class2']].isin(unique_combos[0]).all(axis=1).idxmax()

In [None]:
year = df.iloc[i,1]
year

In [None]:
b1 = df[['class1','class2']].isin(unique_combos[0]).all(axis=1)

In [None]:
b2 = df[['year']].isin([year]).all(axis=1)

In [None]:
(b1 & b2)

In [None]:
test = np.where(b1 & b2)[0]

In [None]:
acol = df.columns.get_loc('first_seen')

In [None]:
df

In [None]:
test = [0,1]

In [None]:
df.loc[test, 'first_seen'] = 1

In [None]:
df