In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import grequests
import json
import asyncio
import concurrent.futures
import random
import asyncio
from aiohttp import ClientSession


# Note 1
If the citation data is not local avaiable on your machine, run the following code blocks. Otherwise this can be skipped until note 2

In [2]:
master = pd.read_csv('designDirty.csv', usecols=['applications','cited_patents','patent_number','uspcs'])
master.head()

Unnamed: 0,applications,cited_patents,patent_number,uspcs
0,"[{'app_date': '1980-01-14', 'app_id': '06/1118...","[{'cited_patent_number': None}, {'cited_patent...",D257752,[{'uspc_subclass_id': 'D19/75'}]
1,"[{'app_date': '1980-01-14', 'app_id': '06/1118...","[{'cited_patent_number': None}, {'cited_patent...",D257924,[{'uspc_subclass_id': 'D06/573'}]
2,"[{'app_date': '1980-01-17', 'app_id': '06/1130...","[{'cited_patent_number': None}, {'cited_patent...",D258382,[{'uspc_subclass_id': 'D23/214'}]
3,"[{'app_date': '1980-01-17', 'app_id': '06/1130...","[{'cited_patent_number': None}, {'cited_patent...",D258383,[{'uspc_subclass_id': 'D23/214'}]
4,"[{'app_date': '1980-04-03', 'app_id': '06/1368...",[{'cited_patent_number': None}],D258571,[{'uspc_subclass_id': 'D09/560'}]


In [3]:
#clean data
def remove_non_design(df):
    #filter to make sure mis-classified patents are not included
    return df[df.patent_number.str.contains('D')]

In [4]:
master = remove_non_design(master)

In [5]:
master.shape

(525490, 4)

In [6]:
def extract_date(df):
    keep = df.copy()
    #extract application date and year
    keep['app_date'] = keep['applications'].astype(str).str.extract('(\d{4}-\d{2}-\d{2})')
#     set_trace()
    pd.to_datetime(keep.app_date, errors='coerce')
    keep.drop('applications', axis=1, inplace=True)
    
    keep['app_date'] = pd.to_datetime(keep['app_date'], errors='coerce')
    keep['year'] =keep['app_date'].apply(lambda x: x.year)
    return keep

In [7]:
des = extract_date(master)

  after removing the cwd from sys.path.


In [8]:
des = des[['patent_number','year','uspcs']]
des.head()

Unnamed: 0,patent_number,year,uspcs
0,D257752,1980,[{'uspc_subclass_id': 'D19/75'}]
1,D257924,1980,[{'uspc_subclass_id': 'D06/573'}]
2,D258382,1980,[{'uspc_subclass_id': 'D23/214'}]
3,D258383,1980,[{'uspc_subclass_id': 'D23/214'}]
4,D258571,1980,[{'uspc_subclass_id': 'D09/560'}]


In [9]:
def extract_class(df):
    keep = df.copy()
    #extract class information
    pattern = "([D0-9]\d{2}/\d{1,3}\.?\d{1,2})"

    keep['uspcs'] = keep['uspcs'].astype(str).str.findall(pattern)
    holder = keep.apply(lambda x: pd.Series(x['uspcs']),axis=1).stack().reset_index(level=1, drop=True)
    holder.name = 'class'
    return keep.drop('uspcs', axis=1).join(holder)

In [10]:
des = extract_class(des)

In [11]:
des.isnull().sum()

patent_number        0
year                 0
class            18614
dtype: int64

In [12]:
des[des['class'].isnull()]

Unnamed: 0,patent_number,year,class
10,D258990,1980,
127,D261221,1980,
222,D261664,1980,
363,D262413,1980,
496,D262884,1980,
541,D263109,1980,
549,D263150,1980,
550,D263151,1980,
602,D263301,1980,
682,D263548,1980,


In [13]:
#remove the patents without a class because they are useless for our anaysis
# figure out why they are null
master[master.patent_number == 'D806240']

Unnamed: 0,applications,cited_patents,patent_number,uspcs
525494,"[{'app_date': '2015-10-09', 'app_id': '29/5419...","[{'cited_patent_number': None}, {'cited_patent...",D806240,[{'uspc_subclass_id': None}]


In [14]:
des = des.dropna().reset_index()

In [15]:
des.shape

(712801, 4)

In [16]:
# number of unique patent numbers represented
des.patent_number.nunique()

506876

In [29]:
des.to_csv('designYearClass.csv')

In [17]:
#creating data frame just for citations
des_citations = master[master.patent_number.isin(des.drop_duplicates(subset=['patent_number']).patent_number)]

#checking for the same number of patents represented
des_citations.shape

(506876, 4)

In [18]:
afiliatedes_citations = des_citations[['patent_number','cited_patents']].reset_index(drop=True)

In [19]:
des_citations.tail()

Unnamed: 0,patent_number,cited_patents
506871,D806098,"[{'cited_patent_number': 'D479846'}, {'cited_p..."
506872,D806099,"[{'cited_patent_number': 'D479846'}, {'cited_p..."
506873,D806157,"[{'cited_patent_number': None}, {'cited_patent..."
506874,D806194,"[{'cited_patent_number': None}, {'cited_patent..."
506875,D806225,"[{'cited_patent_number': 'D508560'}, {'cited_p..."


In [20]:
def extract_citation(df):
    keep = df.copy()
    #extract class information
    pattern = "(\d{7}|D\d{6})"

    keep['cited_patents'] = keep['cited_patents'].astype(str).str.findall(pattern)
    holder = keep.apply(lambda x: pd.Series(x['cited_patents']),axis=1).stack().reset_index(level=1, drop=True)
    holder.name = 'reference'
    return keep.drop('cited_patents', axis=1).join(holder)

In [21]:
# because of how much citations explode, the extraction of citations must be done iterativly and combined for the final dataframe
citations = pd.DataFrame()

for i in range(0,des_citations.shape[0],50):
    print(i)
    if (des_citations.shape[0] - i < 50):
        temp = extract_citation(des_citations.loc[i:])
    else:
        temp = extract_citation(des_citations.iloc[i:i+50])
    
    citations = pd.concat([citations, temp], ignore_index=True)

0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750
3800
3850
3900
3950
4000
4050
4100
4150
4200
4250
4300
4350
4400
4450
4500
4550
4600
4650
4700
4750
4800
4850
4900
4950
5000
5050
5100
5150
5200
5250
5300
5350
5400
5450
5500
5550
5600
5650
5700
5750
5800
5850
5900
5950
6000
6050
6100
6150
6200
6250
6300
6350
6400
6450
6500
6550
6600
6650
6700
6750
6800
6850
6900
6950
7000
7050
7100
7150
7200
7250
7300
7350
7400
7450
7500
7550
7600
7650
7700
7750
7800
7850
7900
7950
8000
8050
8100
8150
8200
8250
8300
8350
8400
8450
8500
8550
8600
8650
8700
8750
8800
8850
8900
8950
9000
9050
9100
9150
9200
9250
9300
9350
9400
9450
9500
9550
9600
9650
9700
9750
9800
9850
9900
9950
10000
10050
10100
10150

70200
70250
70300
70350
70400
70450
70500
70550
70600
70650
70700
70750
70800
70850
70900
70950
71000
71050
71100
71150
71200
71250
71300
71350
71400
71450
71500
71550
71600
71650
71700
71750
71800
71850
71900
71950
72000
72050
72100
72150
72200
72250
72300
72350
72400
72450
72500
72550
72600
72650
72700
72750
72800
72850
72900
72950
73000
73050
73100
73150
73200
73250
73300
73350
73400
73450
73500
73550
73600
73650
73700
73750
73800
73850
73900
73950
74000
74050
74100
74150
74200
74250
74300
74350
74400
74450
74500
74550
74600
74650
74700
74750
74800
74850
74900
74950
75000
75050
75100
75150
75200
75250
75300
75350
75400
75450
75500
75550
75600
75650
75700
75750
75800
75850
75900
75950
76000
76050
76100
76150
76200
76250
76300
76350
76400
76450
76500
76550
76600
76650
76700
76750
76800
76850
76900
76950
77000
77050
77100
77150
77200
77250
77300
77350
77400
77450
77500
77550
77600
77650
77700
77750
77800
77850
77900
77950
78000
78050
78100
78150
78200
78250
78300
78350
78400
78450
7850

133050
133100
133150
133200
133250
133300
133350
133400
133450
133500
133550
133600
133650
133700
133750
133800
133850
133900
133950
134000
134050
134100
134150
134200
134250
134300
134350
134400
134450
134500
134550
134600
134650
134700
134750
134800
134850
134900
134950
135000
135050
135100
135150
135200
135250
135300
135350
135400
135450
135500
135550
135600
135650
135700
135750
135800
135850
135900
135950
136000
136050
136100
136150
136200
136250
136300
136350
136400
136450
136500
136550
136600
136650
136700
136750
136800
136850
136900
136950
137000
137050
137100
137150
137200
137250
137300
137350
137400
137450
137500
137550
137600
137650
137700
137750
137800
137850
137900
137950
138000
138050
138100
138150
138200
138250
138300
138350
138400
138450
138500
138550
138600
138650
138700
138750
138800
138850
138900
138950
139000
139050
139100
139150
139200
139250
139300
139350
139400
139450
139500
139550
139600
139650
139700
139750
139800
139850
139900
139950
140000
140050
140100
140150

191800
191850
191900
191950
192000
192050
192100
192150
192200
192250
192300
192350
192400
192450
192500
192550
192600
192650
192700
192750
192800
192850
192900
192950
193000
193050
193100
193150
193200
193250
193300
193350
193400
193450
193500
193550
193600
193650
193700
193750
193800
193850
193900
193950
194000
194050
194100
194150
194200
194250
194300
194350
194400
194450
194500
194550
194600
194650
194700
194750
194800
194850
194900
194950
195000
195050
195100
195150
195200
195250
195300
195350
195400
195450
195500
195550
195600
195650
195700
195750
195800
195850
195900
195950
196000
196050
196100
196150
196200
196250
196300
196350
196400
196450
196500
196550
196600
196650
196700
196750
196800
196850
196900
196950
197000
197050
197100
197150
197200
197250
197300
197350
197400
197450
197500
197550
197600
197650
197700
197750
197800
197850
197900
197950
198000
198050
198100
198150
198200
198250
198300
198350
198400
198450
198500
198550
198600
198650
198700
198750
198800
198850
198900

250400
250450
250500
250550
250600
250650
250700
250750
250800
250850
250900
250950
251000
251050
251100
251150
251200
251250
251300
251350
251400
251450
251500
251550
251600
251650
251700
251750
251800
251850
251900
251950
252000
252050
252100
252150
252200
252250
252300
252350
252400
252450
252500
252550
252600
252650
252700
252750
252800
252850
252900
252950
253000
253050
253100
253150
253200
253250
253300
253350
253400
253450
253500
253550
253600
253650
253700
253750
253800
253850
253900
253950
254000
254050
254100
254150
254200
254250
254300
254350
254400
254450
254500
254550
254600
254650
254700
254750
254800
254850
254900
254950
255000
255050
255100
255150
255200
255250
255300
255350
255400
255450
255500
255550
255600
255650
255700
255750
255800
255850
255900
255950
256000
256050
256100
256150
256200
256250
256300
256350
256400
256450
256500
256550
256600
256650
256700
256750
256800
256850
256900
256950
257000
257050
257100
257150
257200
257250
257300
257350
257400
257450
257500

309000
309050
309100
309150
309200
309250
309300
309350
309400
309450
309500
309550
309600
309650
309700
309750
309800
309850
309900
309950
310000
310050
310100
310150
310200
310250
310300
310350
310400
310450
310500
310550
310600
310650
310700
310750
310800
310850
310900
310950
311000
311050
311100
311150
311200
311250
311300
311350
311400
311450
311500
311550
311600
311650
311700
311750
311800
311850
311900
311950
312000
312050
312100
312150
312200
312250
312300
312350
312400
312450
312500
312550
312600
312650
312700
312750
312800
312850
312900
312950
313000
313050
313100
313150
313200
313250
313300
313350
313400
313450
313500
313550
313600
313650
313700
313750
313800
313850
313900
313950
314000
314050
314100
314150
314200
314250
314300
314350
314400
314450
314500
314550
314600
314650
314700
314750
314800
314850
314900
314950
315000
315050
315100
315150
315200
315250
315300
315350
315400
315450
315500
315550
315600
315650
315700
315750
315800
315850
315900
315950
316000
316050
316100

367550
367600
367650
367700
367750
367800
367850
367900
367950
368000
368050
368100
368150
368200
368250
368300
368350
368400
368450
368500
368550
368600
368650
368700
368750
368800
368850
368900
368950
369000
369050
369100
369150
369200
369250
369300
369350
369400
369450
369500
369550
369600
369650
369700
369750
369800
369850
369900
369950
370000
370050
370100
370150
370200
370250
370300
370350
370400
370450
370500
370550
370600
370650
370700
370750
370800
370850
370900
370950
371000
371050
371100
371150
371200
371250
371300
371350
371400
371450
371500
371550
371600
371650
371700
371750
371800
371850
371900
371950
372000
372050
372100
372150
372200
372250
372300
372350
372400
372450
372500
372550
372600
372650
372700
372750
372800
372850
372900
372950
373000
373050
373100
373150
373200
373250
373300
373350
373400
373450
373500
373550
373600
373650
373700
373750
373800
373850
373900
373950
374000
374050
374100
374150
374200
374250
374300
374350
374400
374450
374500
374550
374600
374650

426100
426150
426200
426250
426300
426350
426400
426450
426500
426550
426600
426650
426700
426750
426800
426850
426900
426950
427000
427050
427100
427150
427200
427250
427300
427350
427400
427450
427500
427550
427600
427650
427700
427750
427800
427850
427900
427950
428000
428050
428100
428150
428200
428250
428300
428350
428400
428450
428500
428550
428600
428650
428700
428750
428800
428850
428900
428950
429000
429050
429100
429150
429200
429250
429300
429350
429400
429450
429500
429550
429600
429650
429700
429750
429800
429850
429900
429950
430000
430050
430100
430150
430200
430250
430300
430350
430400
430450
430500
430550
430600
430650
430700
430750
430800
430850
430900
430950
431000
431050
431100
431150
431200
431250
431300
431350
431400
431450
431500
431550
431600
431650
431700
431750
431800
431850
431900
431950
432000
432050
432100
432150
432200
432250
432300
432350
432400
432450
432500
432550
432600
432650
432700
432750
432800
432850
432900
432950
433000
433050
433100
433150
433200

484650
484700
484750
484800
484850
484900
484950
485000
485050
485100
485150
485200
485250
485300
485350
485400
485450
485500
485550
485600
485650
485700
485750
485800
485850
485900
485950
486000
486050
486100
486150
486200
486250
486300
486350
486400
486450
486500
486550
486600
486650
486700
486750
486800
486850
486900
486950
487000
487050
487100
487150
487200
487250
487300
487350
487400
487450
487500
487550
487600
487650
487700
487750
487800
487850
487900
487950
488000
488050
488100
488150
488200
488250
488300
488350
488400
488450
488500
488550
488600
488650
488700
488750
488800
488850
488900
488950
489000
489050
489100
489150
489200
489250
489300
489350
489400
489450
489500
489550
489600
489650
489700
489750
489800
489850
489900
489950
490000
490050
490100
490150
490200
490250
490300
490350
490400
490450
490500
490550
490600
490650
490700
490750
490800
490850
490900
490950
491000
491050
491100
491150
491200
491250
491300
491350
491400
491450
491500
491550
491600
491650
491700
491750

In [22]:
citations.sample(50)

Unnamed: 0,patent_number,reference
3919755,D672848,7854396
4712659,D703731,D547355
5870493,D763341,D554168
3225118,D617723,D506172
4444408,D702240,7135782
5904623,D766198,D719118
3616482,D681246,D536464
5899170,D765740,D495345
3359825,D637474,D552968
3645248,D643606,D539014


In [23]:
citations.shape

(6249506, 2)

In [24]:
citations.dropna(subset=['reference'], inplace=True)

In [25]:
citations.shape

(6231584, 2)

In [28]:
citations.groupby('patent_number').agg('count').describe()

Unnamed: 0,reference
count,488954.0
mean,12.744724
std,22.099597
min,1.0
25%,4.0
50%,8.0
75%,14.0
max,1060.0


In [31]:
citations.to_csv('designCitations.csv')

# Note 2
If the citation data is available locally on your machine, just run the following code blocks, including the commented out immediatly after this. Be sure to run the first code block for the imports

In [2]:
citations = pd.read_csv('designCitations.csv', usecols=['patent_number','reference'])

In [3]:
# Just design citations
citations_design = citations[citations.reference.str.contains('D')].reset_index(drop=True)
citations_design.groupby('patent_number').agg('count').describe()

Unnamed: 0,reference
count,461903.0
mean,9.413589
std,14.965143
min,1.0
25%,3.0
50%,6.0
75%,11.0
max,561.0


In [9]:
# just utility citations
citations_utility = citations[~citations.reference.str.contains('D')].reset_index(drop=True)
citations_utility.groupby('patent_number').agg('count').describe()

Unnamed: 0,reference
count,301536.0
mean,6.246083
std,16.454657
min,1.0
25%,1.0
50%,3.0
75%,6.0
max,988.0


In [60]:
# ## Now pull citation
# def pull_citation_classes(starter_df, reference_df):
#     base_url = "http://www.patentsview.org/api/patents/query?"
#     field_list = "&f=[\"patent_number\",\"uspc_subclass_id\"]"

#     for i, value in reference_df.reference.iteritems():
#         query = "q={{\"patent_number\":\"{}\"}}".format(value)
#         full_url = base_url + query + field_list
#     #     print(full_url)
#         r = requests.get(full_url)
#         print(i)
#         print(r)
#         data = r.json()
#         df = pd.io.json.json_normalize(data['patents'])
#         starter_df = pd.concat([starter_df, df], ignore_index=True)
#     return starter_df

In [7]:
# citations_design_class = pd.DataFrame()
# citations_design_class = pull_citation_classes(citations_design_class, citations_design)

In [10]:
def gather_urls(series):
    urls = []
    base_url = "http://www.patentsview.org/api/patents/query?"
    field_list = "&f=[\"patent_number\",\"uspc_subclass_id\"]"
    for i, value in series.iteritems():
        query = "q={{\"patent_number\":\"{}\"}}".format(value)
        urls.append(base_url + query + field_list)
    return urls

In [9]:
# def urls_to_dataframe(urls, size=100):
#     results = grequests.map((grequests.get(u) for u in urls), size=size)
#     starter_df = pd.DataFrame()
#     for r in results:
#         data = r.json()
#         df = pd.io.json.json_normalize(data['patents'])
#         starter_df = pd.concat([starter_df, df], ignore_index=True)
#     return starter_df
        

In [11]:
utility_urls = gather_urls(citations_utility.reference)

len(utility_urls)

1883419

In [23]:
util_mini = utility_urls[:100000]

In [33]:
# modified fetch function with semaphore
# import random
# import asyncio
# from aiohttp import ClientSession

url_responses = [] ## Bad coding practice! creates a weird coupling, will fix later (maybe)

async def fetch(url, session):
    async with session.get(url) as response:
        print(response)
        url_responses.append(await response.json())


async def bound_fetch(sem, url, session):
    # Getter function with semaphore.
    async with sem:
        await fetch(url, session)


async def run(r):
#     url = "http://localhost:8090/{}"
    tasks = []
    # create instance of Semaphore
    sem = asyncio.Semaphore(1000)

    # Create client session that will ensure we dont open new connection
    # per each request.
    async with ClientSession() as session:
        for i in range(r):
            # pass Semaphore and session to every GET request
            task = asyncio.ensure_future(bound_fetch(sem, util_mini[i], session))
            tasks.append(task)

        responses = asyncio.gather(*tasks)
        await responses

number = 100000
loop = asyncio.get_event_loop()

future = asyncio.ensure_future(run(number))
loop.run_until_complete(future)

<ClientResponse(http://www.patentsview.org/api/patents/query?q=%7B%22patent_number%22:%223939620%22%7D&f=%5B%22patent_number%22,%22uspc_subclass_id%22%5D) [200 OK]>
<CIMultiDictProxy('Content-Type': 'application/json; charset=utf-8', 'Server': 'Microsoft-IIS/8.5', 'X-Powered-By': 'PHP/5.6.31', 'X-Powered-By': 'ASP.NET', 'Access-Control-Allow-Origin': '*', 'Access-Control-Allow-Headers': '*', 'Access-Control-Allow-Methods': 'GET, PUT, POST, DELETE, OPTIONS', 'Access-Control-Max-Age': '1000', 'Date': 'Thu, 26 Apr 2018 08:58:45 GMT', 'Content-Length': '210')>

<ClientResponse(http://www.patentsview.org/api/patents/query?q=%7B%22patent_number%22:%223971178%22%7D&f=%5B%22patent_number%22,%22uspc_subclass_id%22%5D) [200 OK]>
<CIMultiDictProxy('Content-Type': 'application/json; charset=utf-8', 'Server': 'Microsoft-IIS/8.5', 'X-Powered-By': 'PHP/5.6.31', 'X-Powered-By': 'ASP.NET', 'Access-Control-Allow-Origin': '*', 'Access-Control-Allow-Headers': '*', 'Access-Control-Allow-Methods': 'GET, PUT


<ClientResponse(http://www.patentsview.org/api/patents/query?q=%7B%22patent_number%22:%223992809%22%7D&f=%5B%22patent_number%22,%22uspc_subclass_id%22%5D) [200 OK]>
<CIMultiDictProxy('Content-Type': 'application/json; charset=utf-8', 'Server': 'Microsoft-IIS/8.5', 'X-Powered-By': 'PHP/5.6.31', 'X-Powered-By': 'ASP.NET', 'Access-Control-Allow-Origin': '*', 'Access-Control-Allow-Headers': '*', 'Access-Control-Allow-Methods': 'GET, PUT, POST, DELETE, OPTIONS', 'Access-Control-Max-Age': '1000', 'Date': 'Thu, 26 Apr 2018 08:58:45 GMT', 'Content-Length': '115')>

<ClientResponse(http://www.patentsview.org/api/patents/query?q=%7B%22patent_number%22:%224164309%22%7D&f=%5B%22patent_number%22,%22uspc_subclass_id%22%5D) [200 OK]>
<CIMultiDictProxy('Content-Type': 'application/json; charset=utf-8', 'Server': 'Microsoft-IIS/8.5', 'X-Powered-By': 'PHP/5.6.31', 'X-Powered-By': 'ASP.NET', 'Access-Control-Allow-Origin': '*', 'Access-Control-Allow-Headers': '*', 'Access-Control-Allow-Methods': 'GET, PU

<ClientResponse(http://www.patentsview.org/api/patents/query?q=%7B%22patent_number%22:%224039094%22%7D&f=%5B%22patent_number%22,%22uspc_subclass_id%22%5D) [503 Service Unavailable]>
<CIMultiDictProxy('Content-Length': '175')>



ContentTypeError: 0, message='Attempt to decode JSON with unexpected mimetype: '

In [21]:
starter_df = pd.DataFrame()
for r in results:
    data = r.json()
    df = pd.io.json.json_normalize(data['patents'])
    starter_df = pd.concat([starter_df, df], ignore_index=True)

In [22]:
starter_df

Unnamed: 0,patent_number,uspcs
0,D253842,[{'uspc_subclass_id': 'D23/214'}]
1,D253842,[{'uspc_subclass_id': 'D23/214'}]
2,D253842,[{'uspc_subclass_id': 'D23/214'}]
3,D253842,[{'uspc_subclass_id': 'D23/214'}]
4,D243463,"[{'uspc_subclass_id': 'D07/521'}, {'uspc_subcl..."
