In [1]:
import requests
import json
import pandas as pd

In [2]:
def clean_dataframe(df):
    """ Cleans dataframe created by PatentsView JSON object
        
        Args:
        df(dataframe): the datafame to be cleaned
        
        Returns:
        df(dataframe): the proccessed dataframe
    """
    
    df = df[df.patent_number.str.contains('D')]

    df['app_date'] = df['applications'].astype(str).str.extract('(\d{4}-\d{2}-\d{2})')
    pd.to_datetime(df.app_date, errors='coerce')
    df.drop('applications', axis=1, inplace=True)

    df['uspcs'] = df['uspcs'].astype(str).str.findall('(\d{3}|D\d{2})')
    holder = df.apply(lambda x: pd.Series(x['uspcs']),axis=1).stack().reset_index(level=1, drop=True)
    holder.name = 'class'
    return df.drop('uspcs', axis=1).join(holder)


In [3]:
base_url = "http://www.patentsview.org/api/patents/query?"
field_list = "&f=[\"patent_number\",\"app_date\",\"uspc_mainclass_id\"]"

master_df = pd.DataFrame()

In [4]:
#Due to the limits of the amount of results the API can return, query must be performed in loop by date and page

for year in range(1980,2016):
    print(year)
    page = 1
    more_patents = True
    
    while more_patents:
        
        query = "q={{\"_and\":[{{\"patent_type\":\"Design\"}},{{\"_gte\":{{\"app_date\":\"{0}-01-01\"}}}},{{\"_lte\":{{\"app_date\":\"{0}-12-31\"}}}}, {{\"patent_firstnamed_inventor_country\":\"US\"}}]}}".format(year)
        options = "&o={{\"per_page\":10000,\"page\":{}}}".format(page)

        full_url = base_url + query + field_list + options
        r = requests.get(full_url)
        data = r.json()
#         print(year, " ", page, "", data['count'])

        df = pd.io.json.json_normalize(data['patents'])
        cleaned = clean_dataframe(df)
        master_df = pd.concat([master_df, cleaned], ignore_index=True)

        
        # decide if to continue to next year or next page
        if data['count'] < 10000:
            more_patents = False
        else:
            page += 1
            
      


1980


  del sys.path[0]


1981
1982


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015


In [5]:
master_df.isnull().sum()

patent_number       0
app_date            0
class            2535
dtype: int64

In [7]:
master_df.dropna(inplace=True)
master_df.to_csv('USDesign.csv',index=False)