In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import grequests
import json
import asyncio
import concurrent.futures
import random
import asyncio
from aiohttp import ClientSession
import os
import aiohttp


In [4]:
# Debugging asyncio: ask for asyncio's debug mode through the environment.
# NOTE(review): presumably this is only honoured when it is set before asyncio
# is imported / the event loop is created — confirm it takes effect in this kernel.

os.environ['PYTHONASYNCIODEBUG'] = '1'

# Note 1
If the citation data is not locally available on your machine, run the following code blocks. Otherwise, they can be skipped until Note 2.

In [None]:
# Read the raw design-patent dump, loading only the four columns listed here.
master = pd.read_csv(
    'designDirty.csv',
    usecols=['applications', 'cited_patents', 'patent_number', 'uspcs'],
)
master.head()

In [None]:
#clean data
def remove_non_design(df):
    """Return the rows of ``df`` whose ``patent_number`` contains the letter 'D'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a string-valued ``patent_number`` column.

    Returns
    -------
    pandas.DataFrame
        The matching rows, with their original index labels.
    """
    # filter to make sure mis-classified patents are not included
    # na=False: a missing patent number counts as "not a design patent" instead
    # of putting NaN into the mask (which makes boolean indexing raise).
    # regex=False: 'D' is a literal character, not a pattern.
    is_design = df['patent_number'].str.contains('D', regex=False, na=False)
    return df[is_design]

In [None]:
# Rebind `master` to the output of remove_non_design.
master = remove_non_design(master)

In [None]:
# Show the (rows, columns) shape of `master`.
master.shape

In [None]:
def extract_date(df):
    """Replace the ``applications`` column with a parsed date and its year.

    The first ``YYYY-MM-DD`` substring found in each (stringified)
    ``applications`` value is taken as the application date.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``applications`` column. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` without ``applications`` and with two columns appended:
        ``app_date`` (datetime; NaT when no date is found or it cannot be
        parsed) and ``year`` (year of ``app_date``; NaN where it is NaT).
    """
    keep = df.copy()
    # extract application date and year
    # Raw string so the regex escapes are not read as string escapes;
    # expand=False returns a Series for the single capture group.
    date_text = keep['applications'].astype(str).str.extract(
        r'(\d{4}-\d{2}-\d{2})', expand=False
    )
    keep = keep.drop('applications', axis=1)

    # Parse once and keep the result (an earlier conversion was computed and thrown away).
    keep['app_date'] = pd.to_datetime(date_text, errors='coerce')
    keep['year'] = keep['app_date'].dt.year
    return keep

In [None]:
# Build `des` by running extract_date on `master`.
des = extract_date(master)

In [None]:
# Narrow `des` to three columns (patent number, year, raw class text) and preview it.
des = des[['patent_number','year','uspcs']]
des.head()

In [None]:
def extract_class(df):
    """Expand ``uspcs`` into one row per class code found in its text.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``uspcs`` column; values are stringified before matching.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``uspcs`` and with a ``class`` column appended. A source
        row appears once per matched code; a row with no match appears once
        with a missing ``class``.
    """
    keep = df.copy()
    # extract class information
    # NOTE(review): the pattern needs at least two digits after the slash
    # (\d{1,3} followed by \d{1,2}), so a one-digit subclass such as 'D14/1'
    # is never matched — confirm this is intended.
    pattern = r"([D0-9]\d{2}/\d{1,3}\.?\d{1,2})"

    matches = keep['uspcs'].astype(str).str.findall(pattern)
    # explode() (pandas >= 0.25) emits one row per list element and NaN for an
    # empty list. Building a Series per row instead creates a frame as wide as
    # the longest list, which is slow and memory-hungry.
    holder = matches.explode().rename('class')
    return keep.drop('uspcs', axis=1).join(holder)

In [None]:
# Rebind `des` to the output of extract_class (one row per extracted class code).
des = extract_class(des)

In [None]:
# Count missing values in each column of `des`.
des.isnull().sum()

In [None]:
# Show the rows of `des` whose `class` is missing.
des[des['class'].isnull()]

In [None]:
# Remove the patents without a class because they are useless for our analysis.
# TODO: figure out why they are null — look up patent D806240 in the raw frame.
master[master.patent_number == 'D806240']

In [None]:
# Drop every row that has a missing value in any column, then renumber the rows.
# NOTE(review): reset_index() without drop=True keeps the previous index as an
# extra 'index' column — confirm that column is wanted in later steps.
des = des.dropna().reset_index()

In [None]:
# Show the (rows, columns) shape of `des`.
des.shape

In [None]:
# Number of distinct patent numbers present in `des`.
des.patent_number.nunique()

In [None]:
# Write `des` to disk; the index is written too because index=False is not passed.
des.to_csv('designYearClass.csv')

In [None]:
# Citation frame: the rows of `master` whose patent number also appears in `des`.
des_citations = master.loc[master['patent_number'].isin(des['patent_number'].unique())]

# Row count here should equal the number of distinct patents in `des`.
des_citations.shape

In [None]:
# Keep only the two columns needed for citation extraction and renumber the
# rows 0..n-1. The result is bound back to `des_citations`: the cell used to
# assign it to the otherwise-unused name `afiliatedes_citations`, which left
# `des_citations` with all of its columns and its old index labels.
des_citations = des_citations[['patent_number','cited_patents']].reset_index(drop=True)
afiliatedes_citations = des_citations  # previous name, kept so any later reference still resolves

In [None]:
# Preview the last rows of `des_citations`.
des_citations.tail()

In [None]:
def extract_citation(df):
    """Expand ``cited_patents`` into one row per patent number found in its text.

    A reference is either seven consecutive digits or a 'D' followed by six digits.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``cited_patents`` column; values are stringified before
        matching. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``cited_patents`` and with a ``reference`` column
        appended. A source row appears once per matched number; a row with no
        match appears once with a missing ``reference``.
    """
    keep = df.copy()
    # extract cited patent numbers
    pattern = r"(\d{7}|D\d{6})"

    matches = keep['cited_patents'].astype(str).str.findall(pattern)
    # explode() (pandas >= 0.25) emits one row per list element and NaN for an
    # empty list. Building a Series per row instead creates a frame as wide as
    # the longest citation list, which is what makes this step blow up in memory.
    holder = matches.explode().rename('reference')
    return keep.drop('cited_patents', axis=1).join(holder)

In [None]:
# Because of how much citations explode, extraction is done in chunks of rows
# and the pieces are combined into the final dataframe.
CHUNK_SIZE = 50

citation_chunks = []
for start in range(0, des_citations.shape[0], CHUNK_SIZE):
    print(start)
    # Always slice by position: iloc clips at the end of the frame, so the last
    # (short) chunk needs no special case. Slicing the tail by label with .loc
    # picks the wrong rows whenever the index is not a plain 0..n-1 range.
    chunk = des_citations.iloc[start:start + CHUNK_SIZE]
    citation_chunks.append(extract_citation(chunk))

# Concatenate once at the end; concatenating inside the loop re-copies every
# accumulated row on each iteration.
citations = (
    pd.concat(citation_chunks, ignore_index=True)
    if citation_chunks
    else pd.DataFrame()
)

In [None]:
# Inspect 50 randomly chosen rows of `citations`.
citations.sample(50)

In [None]:
# Show the (rows, columns) shape of `citations`.
citations.shape

In [None]:
# Discard the rows for which no reference was extracted.
citations = citations.dropna(subset=['reference'])

In [None]:
# Show the (rows, columns) shape of `citations` again.
citations.shape

In [None]:
# Summary statistics of the per-patent non-null counts.
citations.groupby('patent_number').count().describe()

In [None]:
# Write `citations` to disk; the index is written too because index=False is not passed.
citations.to_csv('designCitations.csv')

# Note 2
If the citation data is available locally on your machine, just run the following code blocks, including the commented-out ones immediately after this note. Be sure to run the first code block for the imports.

In [5]:
# Load the saved citation file, reading only the patent_number and reference columns.
citations = pd.read_csv('designCitations.csv', usecols=['patent_number','reference'])

In [6]:
# Design citations only: references whose number contains a 'D'.
citations_design = citations.loc[citations['reference'].str.contains('D')].reset_index(drop=True)
citations_design.groupby('patent_number').count().describe()

Unnamed: 0,reference
count,461903.0
mean,9.413589
std,14.965143
min,1.0
25%,3.0
50%,6.0
75%,11.0
max,561.0


In [7]:
# Utility citations only: references whose number has no 'D' in it.
citations_utility = citations.loc[~citations['reference'].str.contains('D')].reset_index(drop=True)
citations_utility.groupby('patent_number').count().describe()

Unnamed: 0,reference
count,301536.0
mean,6.246083
std,16.454657
min,1.0
25%,1.0
50%,3.0
75%,6.0
max,988.0


In [None]:
# ## Now pull citation
# def pull_citation_classes(starter_df, reference_df):
#     base_url = "http://www.patentsview.org/api/patents/query?"
#     field_list = "&f=[\"patent_number\",\"uspc_subclass_id\"]"

#     for i, value in reference_df.reference.iteritems():
#         query = "q={{\"patent_number\":\"{}\"}}".format(value)
#         full_url = base_url + query + field_list
#     #     print(full_url)
#         r = requests.get(full_url)
#         print(i)
#         print(r)
#         data = r.json()
#         df = pd.io.json.json_normalize(data['patents'])
#         starter_df = pd.concat([starter_df, df], ignore_index=True)
#     return starter_df

In [None]:
# citations_design_class = pd.DataFrame()
# citations_design_class = pull_citation_classes(citations_design_class, citations_design)

In [8]:
def gather_urls(series):
    """Build one PatentsView query URL per patent number.

    Each URL asks for the ``patent_number`` and ``uspc_subclass_id`` fields of
    the patent whose number equals the given value.

    Parameters
    ----------
    series : pandas.Series or any iterable
        Patent numbers; each value is inserted into the query with ``str.format``.

    Returns
    -------
    list of str
        URLs in the same order as the input values (empty for empty input).
    """
    base_url = "http://www.patentsview.org/api/patents/query?"
    field_list = "&f=[\"patent_number\",\"uspc_subclass_id\"]"
    # Iterate over the values directly: the index was never used, and
    # Series.iteritems() no longer exists in pandas 2.x.
    return [
        base_url + "q={{\"patent_number\":\"{}\"}}".format(value) + field_list
        for value in series
    ]

In [None]:
# def urls_to_dataframe(urls, size=100):
#     results = grequests.map((grequests.get(u) for u in urls), size=size)
#     starter_df = pd.DataFrame()
#     for r in results:
#         data = r.json()
#         df = pd.io.json.json_normalize(data['patents'])
#         starter_df = pd.concat([starter_df, df], ignore_index=True)
#     return starter_df
        

In [9]:
# Build one query URL per utility reference, then report how many URLs there are.
utility_urls = gather_urls(citations_utility.reference)

len(utility_urls)

1883419

In [38]:
# Work with the first 1000 URLs only.
util_mini = utility_urls[:1000]

In [39]:
# Notebook-level lists, initialised empty.
# NOTE(review): presumably meant to be filled as a side effect of the fetch
# coroutines; the only visible append to them is commented out — confirm.
sucessful_responses = []
unsucessful_responses = []## Bad coding practice! creates a weird coupling, will fix later (maybe)

In [41]:
async def fetch(url, session):
    """GET ``url`` through ``session`` and print the response status.

    When the status is 503 a second line, 'this is where I died', is printed.
    The response body is not read and nothing is returned.
    """
    request = session.get(url)
    async with request as reply:
        status = reply.status
        print(status)
        if status != 503:
            return
        print('this is where I died')
        # Not implemented yet (only sketched in earlier, disabled code): store
        # the JSON payload in `sucessful_responses`, release the response otherwise.
        


async def bound_fetch(sem, url, session):
    """Run ``fetch(url, session)`` while holding the semaphore ``sem``."""
    await sem.acquire()
    try:
        await fetch(url, session)
    finally:
        # Give the slot back whether the fetch succeeded or raised.
        sem.release()


async def run(r, urls=None, max_concurrency=1000):
    """Request the first ``r`` URLs concurrently through one shared client session.

    Parameters
    ----------
    r : int
        How many URLs to request, counted from the start of ``urls``. If fewer
        than ``r`` URLs exist, all of them are requested.
    urls : sequence of str, optional
        URLs to request. Defaults to the notebook-level ``util_mini`` list.
    max_concurrency : int, optional
        Size of the semaphore that caps how many requests run at once.
        Defaults to 1000. In the recorded run with 1000 requests the server
        answered part of them with 503; a smaller cap may be gentler on it.

    Returns
    -------
    None
        Completes once every request coroutine has finished.
    """
    if urls is None:
        urls = util_mini

    # create instance of Semaphore
    sem = asyncio.Semaphore(max_concurrency)

    # Create client session that will ensure we dont open new connection
    # per each request.
    async with ClientSession() as session:
        # pass Semaphore and session to every GET request
        tasks = [
            asyncio.ensure_future(bound_fetch(sem, url, session))
            for url in urls[:r]
        ]

        # gather() takes the awaitables as separate positional arguments.
        # asyncio.wait() expects ONE collection, so wait(*tasks) raises TypeError
        # as soon as there is more than one task.
        await asyncio.gather(*tasks)
        
# Number of URLs to request; equals the length of the 1000-entry `util_mini` slice.
number = 1000
loop = asyncio.get_event_loop()

# Schedule run() on the loop and block until it has finished.
# NOTE(review): run_until_complete() raises RuntimeError if the loop is already
# running, which is presumably the case in recent Jupyter kernels — confirm, and
# use a top-level `await run(number)` there instead.
future = asyncio.ensure_future(run(number))
loop.run_until_complete(future)

Executing <Task pending coro=<run() running at <ipython-input-41-347a7b8bf17e>:32> wait_for=<_GatheringFuture pending cb=[<TaskWakeupMethWrapper object at 0x7f8bea9f8df8>()] created at /home/tiera/anaconda3/lib/python3.6/asyncio/tasks.py:549> cb=[_run_until_complete_cb() at /home/tiera/anaconda3/lib/python3.6/asyncio/base_events.py:176] created at <ipython-input-41-347a7b8bf17e>:37> took 0.285 seconds
Executing <Task pending coro=<bound_fetch() running at <ipython-input-41-347a7b8bf17e>:15> wait_for=<Future pending cb=[<TaskWakeupMethWrapper object at 0x7f8bf44117c8>()] created at /home/tiera/anaconda3/lib/python3.6/asyncio/base_events.py:275> cb=[gather.<locals>._done_callback(214)() at /home/tiera/anaconda3/lib/python3.6/asyncio/tasks.py:616] created at <ipython-input-41-347a7b8bf17e>:28> took 0.430 seconds
Executing <Handle <TaskWakeupMethWrapper object at 0x7f8be91fabb8>(<Future finis...events.py:275>) created at /home/tiera/anaconda3/lib/python3.6/asyncio/locks.py:431> took 0.471 

200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
200
503
this is where I died
503
this is where I died
503
this is where I died
503
this is where I died
503
this is where I died
503
this is where I died
503
this is where I died
503
this is where I died
503
this is where I died
503
this is where I died
503
this is where I died


CancelledError: 

In [31]:
# Exploratory: list the attribute names of aiohttp's ClientResponse class.
dir(aiohttp.ClientResponse)

['ATTRS',
 '__aenter__',
 '__aexit__',
 '__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_cleanup_writer',
 '_closed',
 '_connection',
 '_content_dict',
 '_content_type',
 '_headers',
 '_notify_content',
 '_parse_content_type',
 '_reader',
 '_response_eof',
 '_source_traceback',
 '_stored_content_type',
 'charset',
 'close',
 'closed',
 'connection',
 'content',
 'content_disposition',
 'content_length',
 'content_type',
 'get_encoding',
 'headers',
 'history',
 'host',
 'json',
 'raise_for_status',
 'raw_headers',
 'read',
 'reason',
 'release',
 'request_info',
 'start',
 'status',
 'text',
 'url',
 'url_obj',
 'version',
 'wait_for_close']

In [None]:
# Flatten the 'patents' records of every response and stack them into one frame.
# NOTE(review): `results` is not assigned in any active cell visible in this
# notebook (only inside commented-out code) — confirm where it comes from.

# pandas >= 1.0 exposes json_normalize at the top level; older releases only
# provide it as pandas.io.json.json_normalize.
normalize = getattr(pd, 'json_normalize', None) or pd.io.json.json_normalize

response_frames = [normalize(r.json()['patents']) for r in results]

# Concatenate once: growing the frame inside the loop re-copies every
# accumulated row on each iteration. An empty input still yields an empty frame.
starter_df = (
    pd.concat(response_frames, ignore_index=True)
    if response_frames
    else pd.DataFrame()
)

In [None]:
# Display the combined frame.
starter_df

In [26]:
# Number of entries currently held in `sucessful_responses`.
len(sucessful_responses)

3017

In [24]:
# Number of entries currently held in `unsucessful_responses`.
len(unsucessful_responses)

1

In [19]:
# Preview the first few payloads only; echoing the whole list (thousands of
# nested dicts) floods the notebook output and bloats the saved file.
sucessful_responses[:5]

[{'count': 1,
  'patents': [{'patent_number': '3939620',
    'uspcs': [{'uspc_subclass_id': '52/717.5'},
     {'uspc_subclass_id': '52/476'},
     {'uspc_subclass_id': '52/775'},
     {'uspc_subclass_id': '52/781'}]}],
  'total_patent_count': 1},
 {'count': 1,
  'patents': [{'patent_number': '4021980',
    'uspcs': [{'uspc_subclass_id': '40/732'},
     {'uspc_subclass_id': '49/61'},
     {'uspc_subclass_id': '52/202'},
     {'uspc_subclass_id': '52/476'}]}],
  'total_patent_count': 1},
 {'count': 1,
  'patents': [{'patent_number': '4069641',
    'uspcs': [{'uspc_subclass_id': '52/202'},
     {'uspc_subclass_id': '52/476'},
     {'uspc_subclass_id': '52/717.1'},
     {'uspc_subclass_id': 'D25/48.7'},
     {'uspc_subclass_id': 'D25/119'}]}],
  'total_patent_count': 1},
 {'count': 1,
  'patents': [{'patent_number': '3935984',
    'uspcs': [{'uspc_subclass_id': '600/499'},
     {'uspc_subclass_id': 'D24/165'}]}],
  'total_patent_count': 1},
 {'count': 1,
  'patents': [{'patent_number': '39