# Gathing the foundational dataset that this project will be based on

What: Granted Design patents from the USPTO that were applied during the years 1980-2015

How: Using the PatentsView API, which already disambiguates and aggregates patent data for us. This is a product directly from USPTO, makig it a realiable source. 

Data fields we are interested in:
1. patent number
2. application year
3. number of inventors
4. number of assignees
5. number of cited design patents
6. number of cited utility patents
7. number of cited non-patent prior arts
8. assignee name
9. assignee city
10. assignee state
11. assignee country
12. grant year
13. priority date
14. if cited any foreign patents
15. design patent class
16. design patent subclass
17. number of figures
18. if US inventor
19. if any missing citations

In [1]:
import requests
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline



In [2]:

#This particular API doesn't quite follow the standard REST protocol
#This is setting up the parts of the URL that won't change between patents
#See http://www.patentsview.org/api/query-language.html for more info


base_url = "http://www.patentsview.org/api/patents/query?"
field_list = "&f=[\"patent_number\",\"app_date\",\"inventor_id\",\"assignee_city\",\"assignee_country\",\"assignee_state\",\"assignee_organization\",\"patent_year\",\"uspc_subclass_id\",\"uspc_mainclass_id\",\"cited_patent_number\",\"patent_num_foreign_citations\",\"patent_num_us_patent_citations\",\"forprior_country\", \"forprior_date\", \"patent_firstnamed_inventor_country\"]"

top_level = pd.DataFrame()
applications_level = pd.DataFrame()
assignee_level = pd.DataFrame()
cited_patents_level = pd.DataFrame()
inventor_level = pd.DataFrame()
foreign_priority_level = pd.DataFrame()
uspcs_level = pd.DataFrame()

In [3]:
#Due to the limits of the amount of results the API can return, query must be performed in loop by date and page

for year in range(1980,2016):
    page = 1
    more_patents = True
    
    while more_patents:
        
        query = "q={{\"_and\":[{{\"patent_type\":\"Design\"}},{{\"_gte\":{{\"app_date\":\"{0}-01-01\"}}}},{{\"_lte\":{{\"app_date\":\"{0}-12-31\"}}}}]}}".format(year)
        options = "&o={{\"per_page\":10000,\"page\":{}}}".format(page)

        full_url = base_url + query + field_list + options
#         print(full_url)
        r = requests.get(full_url)
        print(r)
        data = r.json()
        print(year,"-",page,"-",data['count'])

        # several columns in the JSON are nested, which will create a nested dataframe.
        # This creates different dataframes for each nested object, which can all be combined later
        df = pd.io.json.json_normalize(data['patents'])
        top_level = pd.concat([top_level, df], ignore_index=True)
        
        df = pd.io.json.json_normalize(data['patents'], record_path=['applications'], meta=['patent_number'])
        applications_level = pd.concat([applications_level, df], ignore_index=True)
        
        df = pd.io.json.json_normalize(data['patents'], record_path=['assignees'])
        assignee_level = pd.concat([assignee_level, df], ignore_index=True)
        
        df = pd.io.json.json_normalize(data['patents'], record_path = ['cited_patents'], meta=['patent_number'])
        cited_patents_level = pd.concat([cited_patents_level, df], ignore_index=True)
        
        df = pd.io.json.json_normalize(data['patents'], record_path=['inventors'], meta=['patent_number'])
        inventor_level = pd.concat([inventor_level, df], ignore_index=True)
        
        df = pd.io.json.json_normalize(data['patents'], record_path=['foreign_priority'], meta=['patent_number'])
        foreign_priority_level = pd.concat([foreign_priority_level, df], ignore_index=True)
        
        df = pd.io.json.json_normalize(data['patents'], record_path = ['uspcs'], meta=['patent_number'])
        uspcs_level = pd.concat([uspcs_level, df], ignore_index=True)


        
        # decide if to continue to next year or next page
        if data['count'] < 10000:
            more_patents = False
        else:
            page += 1
            
      


<Response [200]>
1980 - 1 - 5034
<Response [200]>
1981 - 1 - 4807
<Response [200]>
1982 - 1 - 5216
<Response [200]>
1983 - 1 - 5495
<Response [200]>
1984 - 1 - 6020
<Response [200]>
1985 - 1 - 6506
<Response [200]>
1986 - 1 - 6337
<Response [200]>
1987 - 1 - 6950
<Response [200]>
1988 - 1 - 7409
<Response [200]>
1989 - 1 - 7581
<Response [200]>
1990 - 1 - 8388
<Response [200]>
1991 - 1 - 8533
<Response [200]>
1992 - 1 - 8722
<Response [200]>
1993 - 1 - 9296
<Response [200]>
1994 - 1 - 10000
<Response [200]>
1994 - 2 - 932
<Response [200]>
1995 - 1 - 10000
<Response [200]>
1995 - 2 - 1779
<Response [200]>
1996 - 1 - 10000
<Response [200]>
1996 - 2 - 2402
<Response [200]>
1997 - 1 - 10000
<Response [200]>
1997 - 2 - 3494
<Response [200]>
1998 - 1 - 10000
<Response [200]>
1998 - 2 - 4274
<Response [200]>
1999 - 1 - 10000
<Response [200]>
1999 - 2 - 4990
<Response [200]>
2000 - 1 - 10000
<Response [200]>
2000 - 2 - 5837
<Response [200]>
2001 - 1 - 10000
<Response [200]>
2001 - 2 - 5589
<Re

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [4]:
cited_patents_level.shape

(5537201, 2)

Some of the data isnt available via the API, so it has to be extracted from the raw data downloaded from PatentsView website.
This includes the number of non-patent citations and number of figures

In [None]:
master_df.shape

In [None]:
figures = pd.read_csv('figures.tsv',delimiter='\t',usecols=['patent_id','num_figures'])
figures.head()

In [None]:
figures.rename(index=str, columns={"patent_id":"patent_number"}, inplace=True)
master_df = pd.merge(master_df, figures, how='left',on='patent_number')
master_df.head()

In [None]:
master_df.shape

In [None]:
otherrefs = pd.read_csv('otherreference.tsv', delimiter='\t', usecols=['uuid','patent_id'], engine='python', error_bad_lines=False)
otherrefs.head()

In [None]:
otherrefs = otherrefs['patent_id'].value_counts().reset_index().rename(index=str, columns={'index':'patent_number','patent_id':'non-pat_refs'})
otherrefs.head()

In [None]:
master_df = pd.merge(master_df, otherrefs, how='left', on='patent_number')
master_df.head()

In [None]:
master_df.shape

# Data cleaning

Now that we have all the basic data that we are interested in, it's time to clean them up into neat pretty rows for useful analysis

Since design patents are designated by an D in the beginning of their number, let's see if any were missclassified. If so, they need to be removed

In [None]:
master[~master.patent_number.str.contains('D')]

In [None]:
#filter to make sure mis-classified patents are not included
def remove_non_design(df):
    return df[df.patent_number.str.contains('D')]
    

In [None]:
master = remove_non_design(master)

master.shape

Extract application and grant dates

In [None]:
test = master.head()

In [None]:
def extract_date(df):
    #extract application date and year
    df['app_date'] = df['applications'].astype(str).str.extract('(\d{4}-\d{2}-\d{2})')    
    df['app_date'] = pd.to_datetime(df['app_date'], errors='coerce').apply(lambda x: x.year)
    return df

In [None]:
master = extract_date(master)

Extract numer of inventors

In [None]:
def extract_num_inventors(df):
    df['num_inventors'] = df['inventors'].str.count("inventor_id")
    return df

In [None]:
master = extract_num_inventors(master)

Extract number of assignees, assignees name, city, and country

In [None]:
def extract_num_assignee(df):
    df['num_assignees'] = df['assignees'].str.count('assignee_organization')
    return df

Extract number of cited design and utility patents, non-patent prior art, and if any foreign patents were cited

In [None]:
def any_foreign_patents(df):
    df['foreign'] = (df['patent_num_foreign_citations'] > 0).astype(int)
    return df

In [None]:
master = any_foreign_patents(master)

In [None]:
list1 = master.iloc[176,1].split(',')

In [None]:
str(list1[0]).split(':')

In [None]:
for i in range(5):
    print(test.iloc[i,1])
# test.iloc[1,1]

In [None]:
def extract_class(df):
    keep = df.copy()
    #extract class information
    pattern = "([D0-9]\d{2}/\d{1,3}\.?\d{1,2})"

    keep['uspcs'] = keep['uspcs'].astype(str).str.findall(pattern)
    holder = keep.apply(lambda x: pd.Series(x['uspcs']),axis=1).stack().reset_index(level=1, drop=True)
    holder.name = 'class'
    return keep.drop('uspcs', axis=1).join(holder)

In [None]:
def mark_missing_citaitons(df):
    
    """ Marks patents that have missing citaiton data
        
        Args:
        df(dataframe): the datafame to be cleaned
        
        Returns:
        df(dataframe): the proccessed dataframe with additional columns is_missing(boolean) and num_missing(int)
    """
    
    keep = df.copy()

    keep['num_cited_returned'] = keep['cited_patents'].astype(str).str.findall('(\d{7}|D\d{6})').apply(lambda x: len(x))
    keep['patent_num_us_patent_citations'] = keep['patent_num_us_patent_citations'].astype(np.int64)
    keep['num_missing'] = keep['patent_num_us_patent_citations'].sub(keep['num_cited_returned'])
    keep['is_missing'] = np.where(keep['num_missing'] >0, 1, 0)
    
    return keep
    

In [None]:
master_df.sample(5)

In [None]:
master_df = remove_non_design(master_df)

In [None]:
master_df.to_csv('designDirty')

In [None]:
master_df = extract_date(master_df)

In [None]:
master_df.head()

How has design patent activity changed over time?

In [None]:
by_year = master_df.set_index('patent_number')

In [None]:
by_year.head()

In [None]:
by_year = by_year[['uspcs', 'year']]

In [None]:
by_year.groupby('year').agg('count').plot(kind='line', legend=False, title="USPTO Design patent Activity by Application Year")

What percentage of all patent applications are design patents?

In [None]:
utility_df = pd.DataFrame()
field_list = "&f=[\"patent_number\",\"app_date\"]"

In [None]:
#Due to the limits of the amount of results the API can return, query must be performed in loop by date and page

for year in range(1993,2016):
    page = 1
    more_patents = True
    
    while more_patents:
        
        query = "q={{\"_and\":[{{\"patent_type\":\"Utility\"}},{{\"_gte\":{{\"app_date\":\"{0}-01-01\"}}}},{{\"_lte\":{{\"app_date\":\"{0}-12-31\"}}}}]}}".format(year)
        options = "&o={{\"per_page\":10000,\"page\":{}}}".format(page)

        full_url = base_url + query + field_list + options
        r = requests.get(full_url)
        print(r)
        data = r.json()
        print(year,"-",page,"-",data['count'])

        
        df = pd.io.json.json_normalize(data['patents'])
        utility_df = pd.concat([master_df, df], ignore_index=True)

        
        # decide if to continue to next year or next page
        if data['count'] < 10000:
            more_patents = False
        else:
            page += 1
            
      


In [None]:
utility_df.tail()

In [None]:
classes = extract_date(master_df)

In [None]:
classes = extract_class(classes)

In [None]:
classes.drop(columns = ['cited_patents','patent_num_us_patent_citations','app_date'], inplace=True)

In [None]:
classes.head()

# How much citation data is missing?

In [None]:
# # master_df = remove_non_design(master_df)
# # master_df = extract_date(master_df)
# # master_df = extract_class(master_df)
# missing = mark_missing_citaitons(master_df)

In [None]:
# missing.head()

In [None]:
# missing.shape

In [None]:

# (missing[missing['num_missing'] >0].shape[0]/master_df.shape[0]) *100

Approx 54.926% of citation data is missing. This is mostly due to citing patents that were granted before 1976, which is the yeat that patentViews beings coverage.

Let's break it down by year

In [None]:

# master_df.groupby(['year']).apply((lambda x: (x[x['num_missing'] > 0].shape[0]/ x.shape[0])*100)).plot(title="Percent of Patents with missing Citation Data")

In [None]:
# master_df['test'] = master_df['cited_patents'].astype(str).str.findall(pattern)

In [None]:
# missing.drop(columns=['applications','patent_num_us_patent_citations','uspcs','num_cited_returned','num_missing'], inplace=True)

In [None]:
# pattern = "([D0-9]\d{6})"

In [None]:
# missing['cited_patents'] = missing['cited_patents'].astype(str).str.findall(pattern)

In [None]:
# holder = missing.apply(lambda x: pd.Series(x['cited_patents']),axis=1).stack().reset_index(level=1, drop=True)

In [None]:
# holder.name = 'class'

In [None]:
# missing.drop('cited_patents', axis=1).join(holder)