In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as py
%matplotlib inline

In [2]:
focal = pd.read_csv('designYearClass.csv', usecols=['patent_number','year','class'])
citation_util = pd.read_csv('utility_patent_citations_clean.csv', usecols=['patent_number','class'])
citation_des = pd.read_csv('design_patent_citation_clean.csv', usecols=['patent_number','class'])
citation_link = pd.read_csv('designCitations.csv', usecols=['patent_number','reference'])

In [3]:
focal.head()

Unnamed: 0,patent_number,year,class
0,D257752,1980,D19/75
1,D257924,1980,D06/573
2,D258382,1980,D23/214
3,D258383,1980,D23/214
4,D258571,1980,D09/560


Let's do some sanity checks

In [4]:
#number of patents
num_pats = focal.groupby('patent_number').ngroups

In [5]:
citation_link.head()

Unnamed: 0,patent_number,reference
0,D257752,4162014
1,D257924,4162014
2,D258382,D253842
3,D258383,D253842
4,D258678,D253842


In [6]:
#number of patents with references
num_pats_with_cits = citation_link.groupby('patent_number').ngroups

In [7]:
#number of patents without references
print('percent of patents without refernces:', (num_pats -num_pats_with_cits) / num_pats)

percent of patents without refernces: 0.03535776008333399


Let's use the patents that have at least one citation

Question: what to do with patents that do not have a citation?

In [8]:
focal_with_cits = focal.loc[focal['patent_number'].isin(citation_link['patent_number'])]

In [9]:
#sanity check
assert (focal_with_cits.groupby('patent_number').ngroups == num_pats_with_cits)

In [10]:
test_df = focal_with_cits.head(25)

Now let's add the reference patents to the list

In [11]:
# test_df.groupby(['year','patent_number']).count()

In [12]:
# pd.merge(test_df, citation_link, how='inner', on='patent_number').groupby(['year','patent_number','reference']).count()

In [13]:
master = pd.merge(focal_with_cits, citation_link, how='inner', on='patent_number')

In [14]:
#doing some sanity checking
# master.set_index(['year','pat])

Now let's add the reference patents classes. Utitly and design references were separated, so must ensure that they add up

In [15]:
citation_util.head()

Unnamed: 0,patent_number,class
0,4069641,52/202
1,4069641,52/476
2,4069641,52/717.1
3,4069641,D25/48.7
4,4069641,D25/119


In [16]:
citation_util.dtypes

patent_number     int64
class            object
dtype: object

In [17]:
citation_util['patent_number'] = citation_util['patent_number'].astype(str)

In [18]:
citation_util.loc[citation_util['patent_number'].isin(citation_link['reference'])].groupby('patent_number').ngroups

463113

In [19]:
citation_util.groupby('patent_number').ngroups

463148

So a few patents got lost in this step. But 35 is a negligable number. Let's keep going and do the same with Design

In [20]:
utility = citation_util.loc[citation_util['patent_number'].isin(citation_link['reference'])]

In [21]:
citation_des.loc[citation_des['patent_number'].isin(citation_link['reference'])].groupby('patent_number').ngroups

445321

In [22]:
citation_des.groupby('patent_number').ngroups

445322

In [23]:
design = citation_des.loc[citation_des['patent_number'].isin(citation_link['reference'])]

Merge the reference patent class information to master

In [24]:
utility.rename(index=str, columns={"patent_number": "reference"},inplace=True)
design.rename(index=str, columns={"patent_number": "reference"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  return super(DataFrame, self).rename(**kwargs)


In [25]:
test = master.head(100)

In [26]:
#some data validation
# pd.merge(test,design, how='inner', on='reference').groupby(['year','patent_number','reference','class_x','class_y']).count()

In [27]:
frames = pd.concat([design, utility])

In [28]:
frames.groupby('reference').ngroups

908434

In [29]:
citation_link.groupby('reference').ngroups

915572

In [36]:
master  = pd.merge(master, frames, how='inner',on='reference')

In [37]:
master.groupby('patent_number').ngroups

487695

In [38]:
master.shape

(20301306, 5)

# In our final dataset 487,695 design parents are represented.
These patents all have at least one citation. Due to combinatorial explosion, there are 20,301,306 reference-focal class combinations are represented from patents applied from years 1980-2015

In [39]:
master.head()

Unnamed: 0,patent_number,year,class_x,reference,class_y
0,D257752,1980,D19/75,4162014,211/50
1,D257752,1980,D19/75,4162014,211/55
2,D257752,1980,D19/75,4162014,211/126.1
3,D257924,1980,D06/573,4162014,211/50
4,D257924,1980,D06/573,4162014,211/55


In [42]:
#delete class_x and class_y to save memory

master['class_combo'] = master['class_x'] +'-'+ master['class_y']
master.drop(['class_x','class_y'], inplace=True, axis=1)

master.head()

Unnamed: 0,patent_number,year,reference,class_combo
0,D257752,1980,4162014,D19/75-211/50
1,D257752,1980,4162014,D19/75-211/55
2,D257752,1980,4162014,D19/75-211/126.1
3,D257924,1980,4162014,D06/573-211/50
4,D257924,1980,4162014,D06/573-211/55


In [43]:
master['new_combo'] = 0

# Let's rock and roll here!

In [21]:
# to be more time efficient, it is best to sort the dataframe as best as possible
master_merge.sort_by('year')

AttributeError: 'DataFrame' object has no attribute 'sort_by'

In [13]:
def check_new_combo(row):
#     combo = row['class_combo']
#     base_year = row['year']
    earlier_years = master_merge[master_merge['year'] < row['year']]
    return not earlier_years['class_combo'].isin([row['class_combo']]).any()
    

In [14]:
master_merge.sort_values('year').reset_index(inplace=True)

In [None]:
master_merge['new_combo'] = master_merge.apply(check_new_combo, axis=1)

In [15]:
mini_year = master_merge[master_merge.year == 2000]

In [None]:
mini_year['new_combo'] = mini_year.apply(check_new_combo, axis=1)