In [138]:
import requests 
import json 
from bs4 import BeautifulSoup
import pandas as pd 
import numpy as np
import re 
from nltk.corpus import reuters, stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [139]:
def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext

In [140]:
def process_text(doc):
    doc = cleanhtml(str(doc))
    sw = set(stopwords.words('english'))#set of default stopwords
    sw_addon= {'ul', 'itemscope', 'repeatli', 'itemscope', 'repeatspan', 'itempropcpcs','itempropcpcs',
              'itempropcodegspanspan','itempropdescriptionphysicsspanlili', 'li'}
    regex = re.compile('[^a-zA-Z ]')#compile all non-letter character
    re_clean = regex.sub('', doc)# substitute all non-letter characters
    words = word_tokenize(re_clean)# break down articles into words
    words = [lemmatizer.lemmatize(word) for word in words]# lemmatize each word
    output = [word.lower() for word in words if word.lower() not in sw.union(sw_addon)]# list of words not part of sw
    output = ' '.join(output)
    return output

def getClaims(soup):
    claim = soup.findAll(class_ = 'claim-text')
    return pd.Series(process_text(claim))

def getDescription(soup):
    description = soup.findAll(class_ = 'description')
    return pd.Series(process_text(description))

def getClassification(soup):
    classification = soup.findAll(itemprop = 'cpcs')
    return pd.Series(process_text(classification))

def getCitations(soup): 
    a = len(soup.findAll(itemprop = 'forwardReferencesOrig'))
    b = len(soup.findAll(itemprop="forwardReferencesFamily"))
    return pd.Series(a + b)


data = pd.read_csv('pricingData.csv')

links_a = pd.Series(data['Patent No. Hyperlink']).dropna()
links_b = pd.Series(data['Publication No. Hyperlink']).dropna()

links = links_a.append(links_b)


columns = ['Citations', 'Description', 'Classification', 'Claims']

#finalDF = pd.DataFrame(columns = columns, index = range(135))
finalDF = pd.DataFrame(columns = columns)

index = 0 
for link in links:
    url = link
    result = requests.get(url)
    html = result.text
    soup = BeautifulSoup(html,'lxml')
    df = pd.DataFrame({
        'Citations' : getCitations(soup),
        'Description' : getDescription(soup),
        'Classification' : getClassification(soup),
        'Claims' : getClaims(soup)
    })
    finalDF = pd.concat([finalDF, df])

In [141]:
finalDF

Unnamed: 0,Citations,Description,Classification,Claims
0,12,applicant hereby claim priority provisional pa...,gphysicsgmeasuring testinggsradio directionfin...,positioning system comprising first correlatio...
0,15,background inventionthe present invention rela...,gphysicsgcomputing calculating countinggfelect...,system finding trading ad network comprisinga ...
0,24,field inventionthe present invention relates e...,gphysicsgcomputing calculating countinggqdata ...,method performed computer server establishing ...
0,53,related applicationsthis application claim pri...,gphysicsgcomputing calculating countinggqdata ...,method marketing marketable entity selected gr...
0,4,background invention field inventionthe presen...,gphysicsgcomputing calculating countinggqdata ...,userconcerned information provision system com...
...,...,...,...,...
0,8,related applications application continuation ...,gf gf gphysicsgcomputing calculating countingg...,data synchronisation system comprisinga data s...
0,7,cross reference related application patent app...,gf gf gphysicsgcomputing calculating countingg...,method comprisingsending request network serve...
0,30,crossreference related applications patent app...,helectricityhelectric communication techniqueh...,system transmitting talking replay electronic ...
0,58,claim benefit prior applications present appli...,helectricityhelectric communication techniqueh...,canceled device locationbased authentication u...


In [144]:
df.iloc[0,1]

'crossreference related applications patent application continuation us patent application ser filed jun entitled method system delivery content communication networks continuation us patent application ser filed jan entitled system method augmenting rich media content using multiple content repositories entirety herein incorporated reference technical field present disclosure relates generally content distribution specifically computerimplemented system method enabling augmenting content traditionally delivered broadcast medium background traditional broadcast medium ie content bound television radio delivered audience channel terrestrial radio satellite coax fiberoptic cable delivered within context schedule eg radio programming schedule television broadcast schedule convention content item eg television episode movie radio program scheduled fit within time slot begin end either top bottom hour eg conforming convention broadcaster ie company transmit delivery broadcast medium attract

In [143]:
finalDF.to_csv('draftDF.csv')