# Web Scraping Example

In [7]:
from urllib.request import urlopen
import numpy as np

In [8]:
# address of the web page fetched in the cells below
url = "https://www.iss.nus.edu.sg/"

In [9]:
# open the web page
# urlopen() returns an HTTPResponse object
page = urlopen(url)

In [10]:
# shows the repr of the response object, not the page content
page

<http.client.HTTPResponse at 0x2a512411490>

In [5]:
# read() returns the response body as bytes and consumes it:
# calling read() again on the same object returns b''
page.read()

b'<html>\r\n<head>\r\n<META NAME="robots" CONTENT="noindex,nofollow">\r\n<script src="/_Incapsula_Resource?SWJIYLWA=5074a744e2e3d891814e9a2dace20bd4,719d34d31c8e3a6e6fffd425f7e032f3">\r\n</script>\r\n<body>\r\n</body></html>\r\n'

Processing HTML Page

In [12]:
# Beautiful Soup is used to pull data out of HTML files
#           https://www.crummy.com/software/BeautifulSoup/bs4/doc/

# if the package is not installed yet, install it with
#    pip install beautifulsoup4

from bs4 import BeautifulSoup
from urllib.request import urlopen

ModuleNotFoundError: No module named 'bs4'

In [None]:
# extract the HTML from the page
# An HTTPResponse body can only be read once, and the earlier `page.read()`
# cell already consumed it; reading `page` again would return b'' and produce
# an empty soup. Fetch the page again instead; the `with` block also closes
# the connection when the read is done.
with urlopen(url) as response:
    html_bytes = response.read()  # returns a sequence of bytes
html = html_bytes.decode("utf-8")  # decodes the bytes to a string
soup = BeautifulSoup(html, "html.parser")

In [None]:
# prettify is a method: without the call parentheses print() shows the
# bound-method repr instead of the indented HTML
print(soup.prettify())

Load a smaller HTML page for this demonstration

In [None]:
import codecs
# Parse the local HTML file. The built-in open() with an explicit encoding
# replaces codecs.open(), and the `with` block closes the file handle as soon
# as BeautifulSoup has finished reading it.
with open("data/TimeTable.html", "r", encoding="utf-8") as file:
    print(file)  # shows the file object itself, not its contents
    soup = BeautifulSoup(file, "html.parser")

# prettify is a method: call it to get the indented markup
print(soup.prettify())

In [None]:
# attribute access by tag name returns the first matching tag: here the <title> element
soup.title

In [None]:
# .name is the tag's name as a string
soup.title.name

In [None]:
# .string is the text contained in the tag
soup.title.string

In [None]:
# first <h3> element in the document
soup.h3

In [None]:
# find_all() returns every <td> element in the document, in document order
soup.find_all('td')

In [None]:
# attribute access returns only the first <td> element (compare with find_all above)
soup.td

In [None]:
# first <table> element in the document, including everything nested inside it
soup.table

In [None]:
# Collect the stripped text of every <td> cell, one list per table row.
data = []
table = soup.find('table', attrs={'class':'student_time_table'})
if table is None:
    # soup.find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise ValueError("no <table class='student_time_table'> found in the page")
rows = table.find_all('tr')
# rows[0] is skipped, exactly as the original index loop starting at 1 did
# (presumably it is the header row - confirm against the HTML file).
for row in rows[1:]:
    cols = [cell.text.strip() for cell in row.find_all('td')]
    data.append(cols)

In [None]:
# display the scraped rows: a list with one list of cell strings per table row
data

Put the data into a dataframe

In [None]:
import pandas as pd
# one DataFrame row per scraped table row; no column names are passed,
# so pandas labels the columns 0, 1, 2, ...
df = pd.DataFrame(data)
df

# Regular Expression
Sometimes you may need to process the HTML data further. Regular expressions can be useful for this.

A regular expression is a sequence of characters that defines a search pattern.
Python's `re` module provides functions to search a string with a pattern (https://docs.python.org/3/library/re.html#regular-expression-syntax)

In [None]:
import re

In [None]:
# pattern 'ab*c': an 'a', followed by zero or more 'b', then a 'c'
# search() returns only the first match; group() gives the matched text ('abbc' here)
re.search('ab*c', 'abbcc ac a ab').group()

In [None]:
# findall() finds all non-overlapping substrings where the RE matches and returns them as a list
# (here: ['abbc', 'ac'])
re.findall('ab*c', 'abbcc ac a ab')

In [None]:
# Lazy match: "<o" followed by as few characters as possible up to the next ">".
# search() stops at the first tag that starts with "o".
text = " <fox> <jumps> <over>  <dog> <ox>"
res = re.compile("<o.*?>").search(text)
result = res.group(0)
print(result)

In [None]:
# Same pattern, but collect every tag that starts with "o" instead of only the first.
sentence = "<the> <quick> <brown> <fox> <jumps> <over> <the> <lazy> <dog> <ox>"
res = re.compile("<o.*?>").findall(sentence)
result = res
print(result)

In [None]:
# group - return the string matched by the RE
# group extraction (https://developers.google.com/edu/python/regular-expressions)
# group() / group(0) - the whole match; group(1) - the first parenthesised group

# The patterns are raw strings (r'...'): in a normal string literal '\(' is an
# invalid escape sequence and triggers a warning on recent Python versions.

# search a pattern enclosed by "()"
module_code = re.search(r'\((.*?)\)', "OOPCS(4010)").group()
print(module_code)  # whole match, parentheses included: (4010)

module_code = re.search(r'\((.*?)\)', "OOPCS(4010)").group(1)
#year = re.search(r'\((.*?)\)', cols[1]).group(1)
print(module_code)  # captured group only: 4010

In [None]:
# re.sub() : replace every match of the pattern with a string

# remove "(4010)" -> 'OOPCS'
# (raw string: '\(' in a normal string literal is an invalid escape sequence)
re.sub(r'\((.*?)\)', "", 'OOPCS(4010)')


In [None]:
#Possible Answer
# Same scrape as the earlier table cell, but the second column is cleaned up:
# the text inside the parentheses is captured and the parenthesised part is
# then removed from the cell.

# Compiled once outside the loop. Raw string: in a normal string literal
# '\(' is an invalid escape sequence and triggers a warning.
paren_pattern = re.compile(r'\((.*?)\)')

data = []
table = soup.find('table', attrs={'class':'student_time_table'})
if table is None:
    # soup.find() returns None when nothing matches; fail with a clear message
    # instead of an AttributeError on the next line.
    raise ValueError("no <table class='student_time_table'> found in the page")
#table_body = table.find('tbody')
#rows = table_body.find_all('tr')
rows = table.find_all('tr')
for row in rows[1:]:  # rows[0] is skipped, as the original index loop did
    cols = [cell.text.strip() for cell in row.find_all('td')]
    year = None
    # Guard against rows with fewer than two cells and cells without
    # parentheses; both made the unguarded version raise.
    if len(cols) > 1:
        match = paren_pattern.search(cols[1])
        if match:
            year = match.group(1)
            cols[1] = paren_pattern.sub("", cols[1])
    # NOTE(review): `year` is extracted but never stored in `data`;
    # append it to the row here if it is meant to become its own column.
    data.append(cols)
    
# display the data