# Using the PatentsView API to pull data about design patents granted by the USPTO applied for from 1980-2015

Associated data fields:
1. patent number
2. application data
3. USPC class at the subclass level

In [1]:
import requests
import json
import pandas as pd

In [2]:
def clean_dataframe(df):
    """ Cleans dataframe created by PatentsView JSON object
        
        Args:
        df(dataframe): the datafame to be cleaned
        
        Returns:
        df(dataframe): the proccessed dataframe
    """
    
    df = df[df.patent_number.str.contains('D')]

    df['app_date'] = df['applications'].astype(str).str.extract('(\d{4}-\d{2}-\d{2})')
    pd.to_datetime(df.app_date, errors='coerce')
    df.drop('applications', axis=1, inplace=True)

    df['uspcs'] = df['uspcs'].astype(str).str.findall('(D\d{2}\/\d{3}|\d{3}\/\d{3})')
    holder = df.apply(lambda x: pd.Series(x['uspcs']),axis=1).stack().reset_index(level=1, drop=True)
    holder.name = 'class'
    df = df.drop('uspcs', axis=1).join(holder)
    df['mainclass'] = df['class'].str[0:3]
    return df


In [3]:
# building the query dynamically
base_url = "http://www.patentsview.org/api/patents/query?"
field_list = "&f=[\"patent_number\",\"app_date\",\"uspc_subclass_id\"]"

master_df = pd.DataFrame()

In [4]:
#Due to the limits of the amount of results the API can return, query must be performed in loop by date and page

for year in range(1980,2016):
#     print(year)
    page = 1
    more_patents = True
    
    while more_patents:
        
        query = "q={{\"_and\":[{{\"patent_type\":\"Design\"}},{{\"_gte\":{{\"app_date\":\"{0}-01-01\"}}}},{{\"_lte\":{{\"app_date\":\"{0}-12-31\"}}}}]}}".format(year)
        options = "&o={{\"per_page\":10000,\"page\":{}}}".format(page)

        full_url = base_url + query + field_list + options
        r = requests.get(full_url)
        data = r.json()
        print(year,"-",page,"-",data['count'])

        df = pd.io.json.json_normalize(data['patents'])
        cleaned = clean_dataframe(df)
        master_df = pd.concat([master_df, cleaned], ignore_index=True)

        
        # decide if to continue to next year or next page
        if data['count'] < 10000:
            more_patents = False
        else:
            page += 1
            
      


1980 - 1 - 5034


  del sys.path[0]


1981 - 1 - 4807


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


1982 - 1 - 5216
1983 - 1 - 5495
1984 - 1 - 6020
1985 - 1 - 6506
1986 - 1 - 6337
1987 - 1 - 6950
1988 - 1 - 7409
1989 - 1 - 7581
1990 - 1 - 8388
1991 - 1 - 8533
1992 - 1 - 8722
1993 - 1 - 9296
1994 - 1 - 10000
1994 - 2 - 932
1995 - 1 - 10000
1995 - 2 - 1779
1996 - 1 - 10000
1996 - 2 - 2402
1997 - 1 - 10000
1997 - 2 - 3494
1998 - 1 - 10000
1998 - 2 - 4274
1999 - 1 - 10000
1999 - 2 - 4990
2000 - 1 - 10000
2000 - 2 - 5837
2001 - 1 - 10000
2001 - 2 - 5589
2002 - 1 - 10000
2002 - 2 - 7202
2003 - 1 - 10000
2003 - 2 - 8665
2004 - 1 - 10000
2004 - 2 - 9673
2005 - 1 - 10000
2005 - 2 - 10000
2005 - 3 - 966
2006 - 1 - 10000
2006 - 2 - 10000
2006 - 3 - 471
2007 - 1 - 10000
2007 - 2 - 10000
2007 - 3 - 1711
2008 - 1 - 10000
2008 - 2 - 10000
2008 - 3 - 1459
2009 - 1 - 10000
2009 - 2 - 10000
2009 - 3 - 863
2010 - 1 - 10000
2010 - 2 - 10000
2010 - 3 - 3437
2011 - 1 - 10000
2011 - 2 - 10000
2011 - 3 - 4647
2012 - 1 - 10000
2012 - 2 - 10000
2012 - 3 - 6286
2013 - 1 - 10000
2013 - 2 - 10000
2013 - 3 - 8788

In [7]:
master_df.isnull().sum()

patent_number         0
app_date              0
class            110356
mainclass        110356
dtype: int64

In [9]:
master_df.dropna(inplace=True) # removing data poins where the classification data is null, since its not useful.
#most of the data null data is from 2015, so this might improve with time.
master_df.to_csv('USDesign.csv',index=False)