# Data Download and pre-processing 
In this notebook we will do web scrapping to collect patents data from Google's patent database. 

In [1]:
# Import initial libraries
import pandas as pd 
from bs4 import BeautifulSoup
import requests
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re 

# instantiate WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [2]:
def cleanhtml(raw_html):
    # clean html tags
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    
    # remove newlines, \n, and turn text into long string
    output = " ".join(cleantext.splitlines()) 
    return output

In [3]:
def removeSpecialCharacter(string):
    # Removes '-:;' from combined words 
    regex = re.compile('[-:;,]')
    clean_text = regex.sub(' ', string)
    return clean_text

In [4]:
def process_text(doc):
    doc = cleanhtml(str(doc)) # remove html characters
    sw = set(stopwords.words('english')) # set of default stopwords
    sw_addon = {'according', 'system', 'comprising', 'comprise','means', 'method', 'claim', 'input', 'output', 'wherein', 'one','said',
                'a', 'though', 'least', 'plurality', 'first', 'second', 'member', 'aparatus', 'part'}
    regex = re.compile('[^a-zA-Z\-\:\;\, ]') # compile all non-letter character. Ignore '-' and '\n'
    re_clean = regex.sub('', doc) #substitute all non-letter characters
    final_clean = removeSpecialCharacter(re_clean) #remove '-' from combined words
    words = word_tokenize(final_clean) # break down articles into words
    words = [lemmatizer.lemmatize(word) for word in words] #lemmatize each word
    output = [word.lower() for word in words if word.lower() not in sw.union(sw_addon)]# list of words not part of sw
    output = ' '.join(output)
    return output

In [5]:
def getClaims(url): 
    # Retrieve text-based html content
    # create Beatiful Soup object 
    result = requests.get(url)
    text = result.text
    soup = BeautifulSoup(text, 'lxml')
    
    # Find all instances of Claims
    claims = soup.findAll('div', {'class':"claim-text"})
    
    # Clean and return claim data
    return process_text(claims)    

In [6]:
# import df with patents' links
links_df = pd.read_csv('G06_FINAL.csv', header=None).rename(columns={0:'Patent No. Hyperlink', 1:'Class'})
links_df.head()

Unnamed: 0,Patent No. Hyperlink,Class
0,https://patents.google.com/patent/US3476313A/en,G06C
1,https://patents.google.com/patent/US3451618A/en,G06C
2,https://patents.google.com/patent/US3441211A/en,G06C
3,https://patents.google.com/patent/US3472450A/en,G06C
4,https://patents.google.com/patent/US3476312A/en,G06C


In [7]:
claims_df = pd.DataFrame(columns = ['URL','Class', 'Claims'])
for url, class_ in zip(links_df['Patent No. Hyperlink'], links_df['Class']):
    # Run the following.
    try:       
        df = pd.DataFrame({
            'URL': url,
            'Class': class_,
            'Claims': getClaims(url)
        }, index = [0])
        
        # Check for patents with blank claims.
        # Skip patent if true
        if df['Claims'][0] == '':
            continue
        else: 
            claims_df = pd.concat([claims_df,df])
         
    # If an error/exception occurs, pass and go to the next iteration of the loop.   
    except: 
        pass

In [8]:
claims_df.reset_index(drop=True, inplace = True)

In [9]:
claims_df.to_csv('G06_final_1.csv')

In [10]:
claims_df.head()

Unnamed: 0,URL,Class,Claims
0,https://patents.google.com/patent/US3561674A/en,G06C,business machine apparatus constructed arrange...
1,https://patents.google.com/patent/US3869081A/en,G06C,ten key calculating maching value mechanism re...
2,https://patents.google.com/patent/US3432094A/en,G06C,printing calculator ten digit key four functio...
3,https://patents.google.com/patent/US3682379A/en,G06C,cash register amount bank key includes differe...
4,https://patents.google.com/patent/US3604619A/en,G06C,machine calculating decimal number coded accor...


In [11]:
df = claims_df
df.head()

Unnamed: 0,URL,Class,Claims
0,https://patents.google.com/patent/US3561674A/en,G06C,business machine apparatus constructed arrange...
1,https://patents.google.com/patent/US3869081A/en,G06C,ten key calculating maching value mechanism re...
2,https://patents.google.com/patent/US3432094A/en,G06C,printing calculator ten digit key four functio...
3,https://patents.google.com/patent/US3682379A/en,G06C,cash register amount bank key includes differe...
4,https://patents.google.com/patent/US3604619A/en,G06C,machine calculating decimal number coded accor...


In [12]:
df.drop(columns='URL', inplace=True)
df.head()

Unnamed: 0,Class,Claims
0,G06C,business machine apparatus constructed arrange...
1,G06C,ten key calculating maching value mechanism re...
2,G06C,printing calculator ten digit key four functio...
3,G06C,cash register amount bank key includes differe...
4,G06C,machine calculating decimal number coded accor...


In [13]:
df.to_csv('G06_updated_cleaned_words.csv')