In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import grequests
import json
import asyncio
import concurrent.futures
import random
import asyncio
from aiohttp import ClientSession
import os
import aiohttp


In [None]:
# debugging ascyncio

os.environ['PYTHONASYNCIODEBUG'] = '1'

# Note 1
If the citation data is not local avaiable on your machine, run the following code blocks. Otherwise this can be skipped until note 2

In [None]:
master = pd.read_csv('designDirty.csv', usecols=['applications','cited_patents','patent_number','uspcs'])
master.head()

In [None]:
#clean data
def remove_non_design(df):
    #filter to make sure mis-classified patents are not included
    return df[df.patent_number.str.contains('D')]

In [None]:
master = remove_non_design(master)

In [None]:
master.shape

In [None]:
def extract_date(df):
    keep = df.copy()
    #extract application date and year
    keep['app_date'] = keep['applications'].astype(str).str.extract('(\d{4}-\d{2}-\d{2})')
#     set_trace()
    pd.to_datetime(keep.app_date, errors='coerce')
    keep.drop('applications', axis=1, inplace=True)
    
    keep['app_date'] = pd.to_datetime(keep['app_date'], errors='coerce')
    keep['year'] =keep['app_date'].apply(lambda x: x.year)
    return keep

In [None]:
des = extract_date(master)

In [None]:
des = des[['patent_number','year','uspcs']]
des.head()

In [None]:
def extract_class(df):
    keep = df.copy()
    #extract class information
    pattern = "([D0-9]\d{2}/\d{1,3}\.?\d{1,2})"

    keep['uspcs'] = keep['uspcs'].astype(str).str.findall(pattern)
    holder = keep.apply(lambda x: pd.Series(x['uspcs']),axis=1).stack().reset_index(level=1, drop=True)
    holder.name = 'class'
    return keep.drop('uspcs', axis=1).join(holder)

In [None]:
des = extract_class(des)

In [None]:
des.isnull().sum()

In [None]:
des[des['class'].isnull()]

In [None]:
#remove the patents without a class because they are useless for our anaysis
# figure out why they are null
master[master.patent_number == 'D806240']

In [None]:
des = des.dropna().reset_index()

In [None]:
des.shape

In [None]:
# number of unique patent numbers represented
des.patent_number.nunique()

In [None]:
des.to_csv('designYearClass.csv')

In [None]:
#creating data frame just for citations
des_citations = master[master.patent_number.isin(des.drop_duplicates(subset=['patent_number']).patent_number)]

#checking for the same number of patents represented
des_citations.shape

In [None]:
afiliatedes_citations = des_citations[['patent_number','cited_patents']].reset_index(drop=True)

In [None]:
des_citations.tail()

In [None]:
def extract_citation(df):
    keep = df.copy()
    #extract class information
    pattern = "(\d{7}|D\d{6})"

    keep['cited_patents'] = keep['cited_patents'].astype(str).str.findall(pattern)
    holder = keep.apply(lambda x: pd.Series(x['cited_patents']),axis=1).stack().reset_index(level=1, drop=True)
    holder.name = 'reference'
    return keep.drop('cited_patents', axis=1).join(holder)

In [None]:
# because of how much citations explode, the extraction of citations must be done iterativly and combined for the final dataframe
citations = pd.DataFrame()

for i in range(0,des_citations.shape[0],50):
    print(i)
    if (des_citations.shape[0] - i < 50):
        temp = extract_citation(des_citations.loc[i:])
    else:
        temp = extract_citation(des_citations.iloc[i:i+50])
    
    citations = pd.concat([citations, temp], ignore_index=True)

In [None]:
citations.sample(50)

In [None]:
citations.shape

In [None]:
citations.dropna(subset=['reference'], inplace=True)

In [None]:
citations.shape

In [None]:
citations.groupby('patent_number').agg('count').describe()

In [None]:
citations.to_csv('designCitations.csv')

# Note 2
If the citation data is available locally on your machine, just run the following code blocks, including the commented out immediatly after this. Be sure to run the first code block for the imports

In [8]:
citations = pd.read_csv('designCitations.csv', usecols=['patent_number','reference'])

In [9]:
# Just design citations
citations_design = citations[citations.reference.str.contains('D')].reset_index(drop=True)
citations_design.groupby('patent_number').agg('count').describe()

Unnamed: 0,reference
count,461903.0
mean,9.413589
std,14.965143
min,1.0
25%,3.0
50%,6.0
75%,11.0
max,561.0


In [10]:
# just utility citations
citations_utility = citations[~citations.reference.str.contains('D')].reset_index(drop=True)
citations_utility.groupby('patent_number').agg('count').describe()

Unnamed: 0,reference
count,301536.0
mean,6.246083
std,16.454657
min,1.0
25%,1.0
50%,3.0
75%,6.0
max,988.0


In [11]:
def gather_urls(series):
    urls = []
    base_url = "http://www.patentsview.org/api/patents/query?"
    field_list = "&f=[\"patent_number\",\"uspc_subclass_id\"]"
    for i, value in series.iteritems():
        query = "q={{\"patent_number\":\"{}\"}}".format(value)
        urls.append(base_url + query + field_list)
    return urls

In [12]:
utility_urls = gather_urls(citations_utility.drop_duplicates(subset=['reference']).reference)
len(utility_urls)

470214

In [13]:
des_urls = gather_urls(citations_design.drop_duplicates(subset=['reference']).reference)
len(des_urls)

445358

In [14]:
#explor these lists so its easier to work with at home
pd.DataFrame(utility_urls).to_csv('utility_urls.csv', index=False, header=True)
pd.DataFrame(des_urls).to_csv('des_urls.csv', index=False, header=True)

# Note 3
the list of utilies already cleaned and ready to go
 

In [19]:
u = pd.read_csv('utility_urls.csv')
d = pd.read_csv('des_urls.csv')

In [20]:
utility_urls = u.values.tolist()
des_urls = d.values.tolist()

In [21]:
# in order to not overload the server and to keep track of which patents are being choosen, divide into 10 groups then add their results back
# together. Not the most elegant or pythonic method, but this will prevent overloading and restarting
#The patents view server is experiemental and seems to get overloaded easily
partition = int(np.floor(len(utility_urls)/10))
u1 = utility_urls[:partition]
u2 = utility_urls[partition:partition*2]
u3 = utility_urls[partition*2:partition*3]
u4 = utility_urls[partition*3:partition*4]
u5 = utility_urls[partition*4:partition*5]
u6 = utility_urls[partition*5:partition*6]
u7 = utility_urls[partition*6:partition*7]
u8 = utility_urls[partition*7:partition*8]
u9 = utility_urls[partition*8:partition*9]
u10 = utility_urls[partition*9:]

In [24]:
u1[2][0]

'http://www.patentsview.org/api/patents/query?q={"patent_number":"3955331"}&f=["patent_number","uspc_subclass_id"]'

In [38]:
step = 1000
sucessful_responses = []

In [39]:
def pull_urls(urls):
    runloop(urls[i:i+step])

async def fetch(url, session):
    async with session.get(url) as response:
        print(response.status)
        if(response.status == 503):
            print('dead')
        else:
            sucessful_responses.append(await response.json())
        

async def bound_fetch(sem, url, session):
    # Getter function with semaphore.
    async with sem:
        await fetch(url, session)


async def run(r, urls):
    tasks = []
    # create instance of Semaphore
    sem = asyncio.Semaphore(1000)

    # Create client session that will ensure we dont open new connection
    # per each request.
    async with ClientSession() as session:
        for i in range(r):
            # pass Semaphore and session to every GET request
            task = asyncio.ensure_future(bound_fetch(sem, urls[i][0], session))
            tasks.append(task)

        responses = asyncio.gather(*tasks)
        await responses

def runloop(urls):
    number = step
    loop = asyncio.get_event_loop()

    future = asyncio.ensure_future(run(number, urls))
    loop.run_until_complete(future)

In [40]:
pull_urls(u1)

0
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
20

200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200


CancelledError: 