# Gathering the foundational dataset that this project will be based on

What: Granted design patents from the USPTO that were applied for during the years 1980-2015

How: Using the PatentsView API, which already disambiguates and aggregates patent data for us. This is a product that comes directly from the USPTO, making it a reliable source.

Data fields we are interested in:
1. patent number
2. application year
3. number of inventors
4. number of assignees
5. number of cited design patents
6. number of cited utility patents
7. number of cited non-patent prior arts
8. assignee name
9. assignee city
10. assignee state
11. assignee country
12. grant year
13. priority date
14. if cited any foreign patents
15. design patent class
16. design patent subclass
17. number of figures
18. if US inventor
19. if any missing citations

In [1]:
import requests
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline



In [2]:

# The PatentsView endpoint departs from conventional REST style: the query,
# the requested fields and the paging options are all sent as JSON fragments
# in the query string. Everything defined here stays the same for every request.
# Query-language reference: http://www.patentsview.org/api/query-language.html

base_url = "http://www.patentsview.org/api/patents/query?"

# The "f" parameter: fields requested for every patent. Written as adjacent
# string literals, one field per line; the spacing after the last commas is
# part of the original value and is kept as is.
field_list = (
    '&f=['
    '"patent_number",'
    '"app_date",'
    '"inventor_id",'
    '"assignee_city",'
    '"assignee_country",'
    '"assignee_state",'
    '"assignee_organization",'
    '"patent_year",'
    '"uspc_subclass_id",'
    '"uspc_mainclass_id",'
    '"cited_patent_number",'
    '"patent_num_foreign_citations",'
    '"patent_num_us_patent_citations",'
    '"forprior_country", '
    '"forprior_date", '
    '"patent_firstnamed_inventor_country"'
    ']'
)

# Seven independent, initially empty accumulators: one for the patent-level
# rows and one for each nested list that is flattened separately.
(
    top_level,
    applications_level,
    assignee_level,
    cited_patents_level,
    inventor_level,
    foreign_priority_level,
    uspcs_level,
) = (pd.DataFrame() for _ in range(7))

In [3]:
# The API caps how many results a single request can return, so the download
# is split by application year and, within each year, by page.

# Page size requested from the API. A page holding fewer patents than this is
# treated as the last page of its year.
per_page = 10000

# Newer pandas exposes json_normalize at the top level; older releases only
# provide it as pd.io.json.json_normalize. Use whichever is available.
json_normalize = getattr(pd, "json_normalize", None) or pd.io.json.json_normalize


def _append_records(accumulated, patents, record_path=None):
    """Flatten one page of patents and append the rows to ``accumulated``.

    Parameters:
        accumulated: DataFrame holding the rows gathered so far.
        patents: the list of patent dicts from one API response.
        record_path: name of a nested list inside each patent to flatten
            (one output row per nested record, tagged with the owning
            ``patent_number``). ``None`` flattens the patent-level fields.

    Returns:
        A new DataFrame: ``accumulated`` followed by this page's rows,
        re-indexed from 0.
    """
    if record_path is None:
        page_df = json_normalize(patents)
    else:
        page_df = json_normalize(patents, record_path=[record_path], meta=['patent_number'])
    return pd.concat([accumulated, page_df], ignore_index=True)


for year in range(1980, 2016):
    # The query depends only on the year, so it is built once per year.
    query = "q={{\"_and\":[{{\"patent_type\":\"Design\"}},{{\"_gte\":{{\"app_date\":\"{0}-01-01\"}}}},{{\"_lte\":{{\"app_date\":\"{0}-12-31\"}}}}]}}".format(year)
    page = 1
    more_patents = True

    while more_patents:
        options = "&o={{\"per_page\":{},\"page\":{}}}".format(per_page, page)
        full_url = base_url + query + field_list + options

        r = requests.get(full_url)
        print(r)
        # Stop on an HTTP error instead of trying to parse an error body as JSON.
        r.raise_for_status()
        data = r.json()
        print(year, "-", page, "-", data['count'])

        patents = data.get('patents')
        if not patents:
            # Nothing to flatten on this page. Presumably this happens when a
            # year's total is an exact multiple of the page size, or a year has
            # no matches -- verify against the API's behaviour.
            break

        # Several fields in the JSON are nested lists, which would otherwise
        # produce nested dataframes. Each nested list gets its own flat table,
        # keyed by patent_number, so they can all be combined later.
        top_level = _append_records(top_level, patents)
        applications_level = _append_records(applications_level, patents, 'applications')
        assignee_level = _append_records(assignee_level, patents, 'assignees')
        cited_patents_level = _append_records(cited_patents_level, patents, 'cited_patents')
        inventor_level = _append_records(inventor_level, patents, 'inventors')
        foreign_priority_level = _append_records(foreign_priority_level, patents, 'foreign_priority')
        uspcs_level = _append_records(uspcs_level, patents, 'uspcs')

        # A full page means there may be more for this year; a short page
        # means the year is finished.
        if data['count'] < per_page:
            more_patents = False
        else:
            page += 1
            
      


<Response [200]>
1980 - 1 - 5034
<Response [200]>
1981 - 1 - 4807
<Response [200]>
1982 - 1 - 5216
<Response [200]>
1983 - 1 - 5495
<Response [200]>
1984 - 1 - 6020
<Response [200]>
1985 - 1 - 6506
<Response [200]>
1986 - 1 - 6337
<Response [200]>
1987 - 1 - 6950
<Response [200]>
1988 - 1 - 7409
<Response [200]>
1989 - 1 - 7581
<Response [200]>
1990 - 1 - 8388
<Response [200]>
1991 - 1 - 8533
<Response [200]>
1992 - 1 - 8722
<Response [200]>
1993 - 1 - 9296
<Response [200]>
1994 - 1 - 10000
<Response [200]>
1994 - 2 - 932
<Response [200]>
1995 - 1 - 10000
<Response [200]>
1995 - 2 - 1779
<Response [200]>
1996 - 1 - 10000
<Response [200]>
1996 - 2 - 2402
<Response [200]>
1997 - 1 - 10000
<Response [200]>
1997 - 2 - 3494
<Response [200]>
1998 - 1 - 10000
<Response [200]>
1998 - 2 - 4274
<Response [200]>
1999 - 1 - 10000
<Response [200]>
1999 - 2 - 4990
<Response [200]>
2000 - 1 - 10000
<Response [200]>
2000 - 2 - 5837
<Response [200]>
2001 - 1 - 10000
<Response [200]>
2001 - 2 - 5589
<Re

Some of the data isn't available via the API, so it has to be extracted from the raw data downloaded from the PatentsView website.
This includes the number of non-patent citations and the number of figures.

In [4]:
# Figure counts come from a tab-separated bulk-download file. Only the patent
# id and the figure count are read; the id column is renamed to
# 'patent_number', the key the API-derived tables use.
figures = pd.read_csv(
    'figures.tsv',
    sep='\t',
    usecols=['patent_id', 'num_figures'],
).rename(index=str, columns={"patent_id": "patent_number"})
figures.head()

Unnamed: 0,patent_number,num_figures
0,7484162,4
1,6682704,21
2,7589277,25
3,9560644,13
4,8679767,27


In [5]:
# Non-patent ("other") references also come from a tab-separated bulk file.
# Lines the parser cannot read are skipped with a warning, not treated as fatal.
import inspect

# The keyword that tolerates malformed lines was renamed between pandas
# releases (error_bad_lines -> on_bad_lines); use the one this install accepts.
if 'on_bad_lines' in inspect.signature(pd.read_csv).parameters:
    _bad_line_kwargs = {'on_bad_lines': 'warn'}
else:
    _bad_line_kwargs = {'error_bad_lines': False}

otherrefs = pd.read_csv('otherreference.tsv', delimiter='\t', usecols=['uuid', 'patent_id'],
                        engine='python', **_bad_line_kwargs)

# One row per patent with its number of non-patent references. Naming the
# index and the counts explicitly avoids relying on the default column names
# produced by value_counts().reset_index(), which differ between pandas versions.
otherrefs = (
    otherrefs['patent_id']
    .value_counts()
    .rename_axis('patent_number')
    .reset_index(name='non-pat_refs')
    .rename(index=str)  # index labels stay strings, as before
)
otherrefs.head()

Skipping line 68509: '	' expected after '"'
Skipping line 477416: '	' expected after '"'
Skipping line 1613021: '	' expected after '"'
Skipping line 1629932: '	' expected after '"'
Skipping line 1722194: '	' expected after '"'
Skipping line 1802364: '	' expected after '"'
Skipping line 4451386: '	' expected after '"'
Skipping line 15481262: '	' expected after '"'
Skipping line 19864269: '	' expected after '"'
Skipping line 20327012: '	' expected after '"'
Skipping line 21623945: '	' expected after '"'
Skipping line 22887714: '	' expected after '"'
Skipping line 23737968: '	' expected after '"'
Skipping line 24016042: '	' expected after '"'
Skipping line 24561541: '	' expected after '"'
Skipping line 24709825: '	' expected after '"'
Skipping line 25651401: '	' expected after '"'
Skipping line 26432186: '	' expected after '"'
Skipping line 27273914: '	' expected after '"'
Skipping line 27432379: '	' expected after '"'
Skipping line 27964407: '	' expected after '"'
Skipping line 28058407:

Unnamed: 0,patent_number,non-pat_refs
0,8401902,2964
1,9487823,2725
2,9100375,2501
3,9037713,2380
4,9094399,2376


In [6]:
# Report (rows, columns) of every table gathered so far, on a single line.
shape_report = [
    ("top level: ", top_level),
    ("applications: ", applications_level),
    ("assignee: ", assignee_level),
    ("cited patents: ", cited_patents_level),
    ("inventors: ", inventor_level),
    ("foreign priority: ", foreign_priority_level),
    ("classes: ", uspcs_level),
]
print(*(item for label, frame in shape_report for item in (label, frame.shape)))

top level:  (525512, 11) applications:  (525512, 3) assignee:  (529678, 5) cited patents:  (6777757, 2) inventors:  (935273, 3) foreign priority:  (527951, 3) classes:  (740512, 3)


# Data cleaning and combining

Now that we have all the basic data we are interested in, it's time to clean it up into neat, tidy rows for useful analysis.

In [8]:
# Start from an empty frame; the following cells add one column at a time.
master = pd.DataFrame()

Let's look at the top level, non-nested, level data

In [7]:
# Preview the patent-level table; its nested columns still hold lists of dicts.
top_level.head()

Unnamed: 0,applications,assignees,cited_patents,foreign_priority,inventors,patent_firstnamed_inventor_country,patent_num_foreign_citations,patent_num_us_patent_citations,patent_number,patent_year,uspcs
0,"[{'app_date': '1980-01-14', 'app_id': '06/1118...","[{'assignee_city': 'Columbus', 'assignee_count...","[{'cited_patent_number': None}, {'cited_patent...","[{'forprior_country': None, 'forprior_date': N...","[{'inventor_id': '5325978-1', 'inventor_key_id...",US,0,5,D257752,1981,"[{'uspc_subclass_id': 'D19/75', 'uspc_mainclas..."
1,"[{'app_date': '1980-01-14', 'app_id': '06/1118...","[{'assignee_city': 'Columbus', 'assignee_count...","[{'cited_patent_number': None}, {'cited_patent...","[{'forprior_country': None, 'forprior_date': N...","[{'inventor_id': '5325978-1', 'inventor_key_id...",US,0,5,D257924,1981,"[{'uspc_subclass_id': 'D06/573', 'uspc_maincla..."
2,"[{'app_date': '1980-01-17', 'app_id': '06/1130...","[{'assignee_city': None, 'assignee_country': N...","[{'cited_patent_number': None}, {'cited_patent...","[{'forprior_country': None, 'forprior_date': N...","[{'inventor_id': '4248260-1', 'inventor_key_id...",US,0,10,D258382,1981,"[{'uspc_subclass_id': 'D23/214', 'uspc_maincla..."
3,"[{'app_date': '1980-01-17', 'app_id': '06/1130...","[{'assignee_city': None, 'assignee_country': N...","[{'cited_patent_number': None}, {'cited_patent...","[{'forprior_country': None, 'forprior_date': N...","[{'inventor_id': '4248260-1', 'inventor_key_id...",US,0,10,D258383,1981,"[{'uspc_subclass_id': 'D23/214', 'uspc_maincla..."
4,"[{'app_date': '1980-04-03', 'app_id': '06/1368...","[{'assignee_city': None, 'assignee_country': N...",[{'cited_patent_number': None}],"[{'forprior_country': None, 'forprior_date': N...","[{'inventor_id': 'D258571-1', 'inventor_key_id...",US,0,4,D258571,1981,"[{'uspc_subclass_id': 'D09/560', 'uspc_maincla..."


In [12]:
# Seed master with the patent numbers from the patent-level table.
master = master.assign(patent_number=top_level['patent_number'])
master.head()

Unnamed: 0,patent_number,grant_year
525507,D806321,2017
525508,D806324,2017
525509,D806336,2017
525510,D806337,2017
525511,D806338,2017


In [13]:
# The API's patent_year field is stored in master under the name grant_year.
master = master.assign(grant_year=top_level['patent_year'])
master.head()

Unnamed: 0,patent_number,grant_year
525507,D806321,2017
525508,D806324,2017
525509,D806336,2017
525510,D806337,2017
525511,D806338,2017


Extracting application date

In [15]:
# Preview the flattened applications table (application date, id and patent number).
applications_level.head()

Unnamed: 0,app_date,app_id,patent_number
0,1980-01-14,06/111875,D257752
1,1980-01-14,06/111813,D257924
2,1980-01-17,06/113084,D258382
3,1980-01-17,06/113090,D258383
4,1980-04-03,06/136808,D258571


In [21]:
# Reduce each application date to its calendar year (unparseable dates become
# missing) and attach the result to master by patent number.
# The year is computed on a copy: overwriting applications_level['app_date']
# in place would discard the full dates and make this cell unsafe to re-run,
# because a second run would parse the already-extracted years instead of dates.
app_year = pd.to_datetime(applications_level['app_date'], errors='coerce').apply(lambda ts: ts.year)
master = pd.merge(master, applications_level.assign(app_date=app_year), how='left', on='patent_number')

In [24]:
# Give the year column its final name, then discard the application id,
# which came along with the merge but is not needed in master.
renamed = master.rename(index=str, columns={"app_date": "app_year"})
master = renamed.drop(labels=['app_id'], axis=1)
master.head()

Unnamed: 0,patent_number,grant_year,app_year
0,D257752,1981,1980
1,D257924,1981,1980
2,D258382,1981,1980
3,D258383,1981,1980
4,D258571,1981,1980


Number of inventors

In [25]:
# Preview the flattened inventors table; a patent appears once per inventor record.
inventor_level.head()

Unnamed: 0,inventor_id,inventor_key_id,patent_number
0,5325978-1,1012091,D257752
1,5325978-1,1012091,D257924
2,4248260-1,286926,D258382
3,4248260-2,286927,D258382
4,4248260-1,286926,D258383


In [31]:
# Number of inventors per patent = number of rows the patent has in
# inventor_level. The index and the count column are named explicitly so the
# result does not depend on the default column names of
# value_counts().reset_index(), which differ between pandas versions.
number_inventors = (
    inventor_level['patent_number']
    .value_counts()
    .rename_axis('patent_number')
    .reset_index(name='num_inventors')
    .rename(index=str)  # index labels stay strings, as before
)
master = pd.merge(master, number_inventors, how='left', on='patent_number')
master.head()

Unnamed: 0,patent_number,grant_year,app_year,num_inventors
0,D257752,1981,1980,1
1,D257924,1981,1980,1
2,D258382,1981,1980,2
3,D258383,1981,1980,2
4,D258571,1981,1980,1


If US-first inventor

In [36]:
# Bring the first-named inventor's country into master, matched on patent number.
first_inventor_country = top_level[['patent_firstnamed_inventor_country', 'patent_number']]
master = master.merge(first_inventor_country, how='left', on='patent_number')

In [37]:
# Check that the country column was merged in.
master.head()

Unnamed: 0,patent_number,grant_year,app_year,num_inventors,patent_firstnamed_inventor_country
0,D257752,1981,1980,1,US
1,D257924,1981,1980,1,US
2,D258382,1981,1980,2,US
3,D258383,1981,1980,2,US
4,D258571,1981,1980,1,US


In [44]:
# Build a numeric flag (as float) marking country codes that match 'US'.
# The flag goes into a temporary column, and the raw country column
# takes the name 'us_inventor' for now.
is_us = master['patent_firstnamed_inventor_country'].str.match('US')
master['patent_firstnamed_inventor_countrymaster'] = is_us.astype(float)
master = master.rename(index=str, columns={'patent_firstnamed_inventor_country': 'us_inventor'})
master.head()

Unnamed: 0,patent_number,grant_year,app_year,num_inventors,us_inventor,patent_firstnamed_inventor_countrymaster
0,D257752,1981,1980,1,US,1.0
1,D257924,1981,1980,1,US,1.0
2,D258382,1981,1980,2,US,1.0
3,D258383,1981,1980,2,US,1.0
4,D258571,1981,1980,1,US,1.0


In [51]:
# Discard the raw country column and let the numeric flag take over the
# 'us_inventor' name.
master = (
    master
    .drop(labels=['us_inventor'], axis=1)
    .rename(index=str, columns={'patent_firstnamed_inventor_countrymaster': 'us_inventor'})
)
master.head()

Unnamed: 0,patent_number,grant_year,app_year,num_inventors,us_inventor
0,D257752,1981,1980,1,1.0
1,D257924,1981,1980,1,1.0
2,D258382,1981,1980,2,1.0
3,D258383,1981,1980,2,1.0
4,D258571,1981,1980,1,1.0


any foreign patents cited

In [63]:
# Turn the foreign-citation count into a 0.0/1.0 indicator:
# 1.0 when the patent cites at least one foreign patent, otherwise 0.0.
foreign_counts = top_level[['patent_num_foreign_citations', 'patent_number']]
master = master.merge(foreign_counts, how='left', on='patent_number')

cites_foreign = master['patent_num_foreign_citations'].astype(float) > 0
master['patent_num_foreign_citations'] = cites_foreign.astype(float)

master = master.rename(index=str, columns={'patent_num_foreign_citations': 'cite_foreign_patent'})
master.head()

Unnamed: 0,patent_number,grant_year,app_year,num_inventors,us_inventor,cite_foreign_patent
0,D257752,1981,1980,1,1.0,0.0
1,D257924,1981,1980,1,1.0,0.0
2,D258382,1981,1980,2,1.0,0.0
3,D258383,1981,1980,2,1.0,0.0
4,D258571,1981,1980,1,1.0,0.0


Any missing citations

In [65]:
# Preview the flattened citations table: one row per (citing patent, cited patent number).
cited_patents_level.head()

Unnamed: 0,cited_patent_number,patent_number
0,,D257752
1,4162014.0,D257752
2,,D257924
3,4162014.0,D257924
4,,D258382


In [81]:
# Number of citation rows the API returned for each citing patent.
# The index and the count column are named explicitly so the result does not
# depend on the default column names of value_counts().reset_index(), which
# differ between pandas versions.
missing_cits = (
    cited_patents_level['patent_number']
    .value_counts()
    .rename_axis('patent_number')
    .reset_index(name='num_cits_reported')
    .rename(index=str)  # index labels stay strings, as before
)
missing_cits.head()

Unnamed: 0,patent_number,num_cits_reported
0,D680220,1062
1,D748259,992
2,D754357,917
3,D724745,851
4,D788312,824


In [105]:
# Compare the number of citation rows actually returned with the US-citation
# count reported in the patent-level data; flag patents with fewer rows than
# the reported count.
us_citation_counts = top_level[['patent_number', 'patent_num_us_patent_citations']]
missing_cits = missing_cits.merge(us_citation_counts, how='inner', on='patent_number')

expected_us_cites = missing_cits['patent_num_us_patent_citations'].astype(int)
missing_cits['reported_less_than_count'] = missing_cits['num_cits_reported'] < expected_us_cites

missing_cits.head()

Unnamed: 0,patent_number,num_cits_reported,patent_num_us_patent_citations,reported_less_than_count
0,D680220,1062,1113,True
1,D748259,992,1014,True
2,D754357,917,924,True
3,D724745,851,858,True
4,D788312,824,825,True


In [103]:
# For every citing patent: True when at least one of its cited patent numbers
# is null. The null test is done once on the whole column and then reduced
# per patent, giving one row per patent_number with the boolean stored under
# the 'cited_patent_number' column name.
cited_is_null = cited_patents_level['cited_patent_number'].isnull()
any_null_cites = cited_is_null.groupby(cited_patents_level['patent_number']).any().reset_index()
any_null_cites.head()

In [108]:
# Attach the per-patent "has a null cited number" flag to the citation summary.
missing_cits = missing_cits.merge(any_null_cites, how='inner', on='patent_number')
missing_cits.head()

Unnamed: 0,patent_number,num_cits_reported,patent_num_us_patent_citations,reported_less_than_count,cited_patent_number
0,D680220,1062,1113,True,True
1,D748259,992,1014,True,True
2,D754357,917,924,True,True
3,D724745,851,858,True,True
4,D788312,824,825,True,True


In [122]:
# A patent counts as having missing citations (1) when either check fired:
# fewer rows returned than reported, or a null cited patent number. Otherwise 0.
has_gap = missing_cits['reported_less_than_count'] | missing_cits['cited_patent_number']
missing_cits['is_missing'] = has_gap.astype(int)
missing_cits.head()

Unnamed: 0,patent_number,num_cits_reported,patent_num_us_patent_citations,reported_less_than_count,cited_patent_number,is_missing
0,D680220,1062,1113,True,True,1
1,D748259,992,1014,True,True,1
2,D754357,917,924,True,True,1
3,D724745,851,858,True,True,1
4,D788312,824,825,True,True,1


In [123]:
# Carry only the final missing-citation flag over to master.
missing_flag = missing_cits[['patent_number', 'is_missing']]
master = master.merge(missing_flag, on='patent_number', how='left')
master.head()

Unnamed: 0,patent_number,grant_year,app_year,num_inventors,us_inventor,cite_foreign_patent,is_missing_x,is_missing_y
0,D257752,1981,1980,1,1.0,0.0,1,1
1,D257924,1981,1980,1,1.0,0.0,1,1
2,D258382,1981,1980,2,1.0,0.0,1,1
3,D258383,1981,1980,2,1.0,0.0,1,1
4,D258571,1981,1980,1,1.0,0.0,1,1


In [124]:
# Clean up after the is_missing merge having been run more than once, which
# leaves 'is_missing_x' / 'is_missing_y' behind. When the merge ran only once
# those columns do not exist, so the drop tolerates their absence
# (errors='ignore') and the rename is then a no-op.
master = (
    master
    .drop(labels=['is_missing_x'], axis=1, errors='ignore')
    .rename(index=str, columns={'is_missing_y': 'is_missing'})
)
master.head()

Unnamed: 0,patent_number,grant_year,app_year,num_inventors,us_inventor,cite_foreign_patent,is_missing
0,D257752,1981,1980,1,1.0,0.0,1
1,D257924,1981,1980,1,1.0,0.0,1
2,D258382,1981,1980,2,1.0,0.0,1
3,D258383,1981,1980,2,1.0,0.0,1
4,D258571,1981,1980,1,1.0,0.0,1


# Pick up from here tomorrow. Be sure to rerun the cells above to pull the assignee data with the patent number attached

In [172]:
# Checkpoint everything to CSV so work can resume without re-downloading.
master.to_csv('still_cleaning.csv', index=False)

# top_level is the only table written together with its index.
top_level.to_csv('top_level.csv')

# The remaining tables are written without their index, in this order.
checkpoint_tables = [
    (applications_level, 'applications_level.csv'),
    (cited_patents_level, 'cited_patents_level.csv'),
    (inventor_level, 'inventor_level.csv'),
    (foreign_priority_level, 'foreign_priority_level.csv'),
    (uspcs_level, 'uspcs_level.csv'),
    (figures, 'figures.csv'),
    (otherrefs, 'otherrefs.csv'),
]
for table, csv_path in checkpoint_tables:
    table.to_csv(csv_path, index=False)

Number of cited design patents

In [None]:
pattern = "D\d{6}"

In [None]:
# For each citing patent, count how many of its cited patent numbers match the
# design-patent pattern. str.contains() needs the pattern as an argument, and
# null cited numbers are treated as non-matches (na=False) so they do not turn
# the per-patent count into NaN.
# NOTE(review): counting the matches is inferred from this section's heading
# ("Number of cited design patents") -- confirm this is the intended result.
cited_patents_level.groupby('patent_number')['cited_patent_number'].apply(
    lambda x: x.str.contains(pattern, na=False).sum()
)

In [125]:
# Take 100 random citation rows to experiment on before touching the full table.
test = cited_patents_level.sample(100)

In [131]:
# Inspect every citation row belonging to one example patent.
is_example_patent = cited_patents_level['patent_number'].str.match('D372097')
cited_patents_level[is_example_patent]

Unnamed: 0,cited_patent_number,patent_number
512943,D343684,D372097
512944,D349960,D372097
512945,D359805,D372097
512946,D359806,D372097
512947,D359807,D372097
512948,4474570,D372097
512949,4942883,D372097
512950,5088978,D372097


In [162]:
test.groupby('patent_number')['cited_patent_number'].apply(lambda x:x.isser

patent_number
D291180    1
D314766    1
D318860    1
D323631    1
D360894    1
D367600    1
D370345    1
D372097    1
D377094    1
D382101    1
D391263    1
D399885    1
D402796    1
D423600    1
D431167    1
D433800    1
D440580    1
D441405    1
D441434    1
D453510    1
D470753    1
D473013    1
D473216    1
D479815    1
D487837    1
D497081    1
D511284    1
D512713    1
D514272    1
D516562    1
          ..
D656284    1
D660314    1
D668006    1
D668906    1
D668960    1
D670415    1
D673649    1
D674366    1
D682854    1
D688558    1
D688897    1
D696845    1
D698109    1
D704903    1
D716133    1
D719871    1
D721981    1
D723007    1
D728276    1
D728445    1
D729891    1
D744095    1
D748830    1
D750376    1
D752412    1
D760379    1
D773840    1
D777230    1
D780155    1
D786820    1
Name: cited_patent_number, Length: 100, dtype: int64

Since design patents are designated by a D at the beginning of their number, let's see if any were misclassified. If so, they need to be removed.

In [None]:
# List rows whose patent number does not START with 'D'. A 'D' elsewhere in
# the number does not make it a design patent, and rows with a missing number
# are listed too (na=False) instead of breaking the boolean mask.
master[~master.patent_number.str.startswith('D', na=False)]

In [None]:
#filter to make sure mis-classified patents are not included
def remove_non_design(df):
    """Return only the rows of ``df`` whose patent number starts with 'D'.

    Parameters:
        df: DataFrame with a ``patent_number`` column of strings.

    Returns:
        The matching rows of ``df``, with their original index labels. Rows
        whose patent number is missing are dropped.
    """
    # startswith, not contains: only a leading 'D' marks a design patent.
    # na=False keeps the mask strictly boolean when a patent number is missing.
    is_design = df.patent_number.str.startswith('D', na=False)
    return df[is_design]
    

In [None]:
# Apply the remove_non_design filter to master.
master = remove_non_design(master)

# (rows, columns) of master after filtering.
master.shape

Extract application and grant dates

In [None]:
# Use the first few rows of master as a small sample to experiment on.
test = master.head()