# Advanced Topics in Data Science (CS5661). Cal State Univ. LA, CS Dept.
### Dr. Mohammad Porhoumayoun
----------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------

# Data Cleaning and Scraping
## ----------------------------------------------------------------------------------------------

### All Needed Libraries/Modules:

In [2]:
## all imports:
from IPython.display import HTML
import numpy as np
import pandas as pd
import html5lib
import matplotlib as plt

import requests
from bs4 import BeautifulSoup
import csv
import re # regular expressions library

%matplotlib inline

## ---------------------------------------------------------------------------------------------------------
# WARNING: 
## In web data extraction, parsing, download, and web scraping, make sure to review and follow the webpage's Copyrights and Permissions!!!
## ---------------------------------------------------------------------------------------------------------


# Web Scraping in Python:
### HTML format:

In [3]:
# sample HTML in python:

my_html_string = """<!DOCTYPE html>
<html>
  <head>
    <title>This is a title</title>
  </head>
  <body>
    <h3> Data Science </h3>
    <p>Hello world!</p>
    <p>I love my Data Science Class!</p>
  </body>
</html>"""

my_html = HTML(my_html_string)
my_html

### The following cell just defines a url as a string and then reads the data from that url using the `requests` library. 

In [7]:
my_url = 'https://en.wikipedia.org/wiki/California_State_University,_Los_Angeles'

webpage = requests.get(my_url)

webpage_content = webpage.text # this returns the webpage html content

print(webpage_content)


<!DOCTYPE html>
<html class="client-nojs" lang="en" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>California State University, Los Angeles - Wikipedia</title>
<script>document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );</script>
<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"California_State_University,_Los_Angeles","wgTitle":"California State University, Los Angeles","wgCurRevisionId":775309817,"wgRevisionId":775309817,"wgArticleId":489557,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Webarchive template wayback links","Use mdy dates from June 2013","Articles needing additional references from January 2013","All articles needing additional references","Pages using deprecated image syntax","Instances of Infobox university using 

### Some interesting questions:
* Is the word 'Faculty' mentioned on the above webpage?
* How many times is the word 'Science' mentioned on this webpage?
* Where is the phrase "Downtown Los Angeles campus" mentioned?


In [8]:
# Is the word 'Faculty' mentioned on the above webpage?
print('Faculty' in webpage_content)

# How many times does the word 'Science' happen on this webpage?
print(webpage_content.count('Science'))

# Where is the phrase "Downtown Los Angeles campus" mentioned?
location =  webpage_content.find("Downtown Los Angeles campus")
print(location)


True
32
18666


# Beautiful Soup
### Beautiful Soup is a Python package for parsing HTML and XML documents, and extracting data out of it.

In [18]:
from bs4 import BeautifulSoup

web_content = """<!DOCTYPE html>
<html>
  <head>
    <title>This is a title</title>
  </head>
  <body>
    <h3> Data Science </h3>
    <p>Hello world!</p>
    <p>I love my Data Science Class!</p>
  </body>
</html>"""

# get bs4 object
soup = BeautifulSoup(web_content, 'html.parser')
#print(soup)
#print(soup.prettify)

# get html root node
root_node = soup.html
#print(root_node)
#print(root_node.contents) # content of the html in the form of a list

# get head from root using contents
head = root_node.contents[1]
#OR:
head = soup.head
#print(head)

# get body from root
body = root_node.contents[3]
#OR:
head = soup.body
#print(body)

## get h3 tag from body
h3 = body.contents[1]
#OR:
head = soup.h3
#print(h3)

['\n', <head>
<title>This is a title</title>
</head>, '\n', <body>
<h3> Data Science </h3>
<p>Hello world!</p>
<p>I love my Data Science Class!</p>
</body>, '\n']


## Parsing a Website using Beautiful Soup:

In [19]:
from bs4 import BeautifulSoup

my_url = 'https://en.wikipedia.org/wiki/California_State_University,_Los_Angeles'

# Use the requests library to grab the page source:
webpage = requests.get(my_url)
webpage_content = webpage.text # this returns the webpage html content

# turn the webpage into a BeautifulSoup object 
# Use BeautifulSoup to parse the html and navigate to data:
my_soup = BeautifulSoup(webpage_content, 'html.parser')

print(my_soup.prettify)

<bound method Tag.prettify of <!DOCTYPE html>

<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>California State University, Los Angeles - Wikipedia</title>
<script>document.documentElement.className = document.documentElement.className.replace( /(^|\s)client-nojs(\s|$)/, "$1client-js$2" );</script>
<script>(window.RLQ=window.RLQ||[]).push(function(){mw.config.set({"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"California_State_University,_Los_Angeles","wgTitle":"California State University, Los Angeles","wgCurRevisionId":775309817,"wgRevisionId":775309817,"wgArticleId":489557,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Webarchive template wayback links","Use mdy dates from June 2013","Articles needing additional references from January 2013","All articles needing additional references","Pages using deprecated image syntax","Instanc

### Example: printing all paragraphs:

In [21]:
# finding and printing all paragraph tags:
all_paragraphs = my_soup.findAll('p')

#print(all_paragraphs)

#print(all_paragraphs[0])

<p><b>California State University, Los Angeles</b> (<b>Cal State LA</b>) is a <a href="/wiki/Public_university" title="Public university">public</a> <a class="mw-redirect" href="/wiki/Comprehensive_university" title="Comprehensive university">comprehensive</a> university in the heart of Los Angeles, one of the 23 universities in the <a href="/wiki/California_State_University" title="California State University">California State University</a> (CSU) system. Cal State LA is located in <a class="mw-redirect" href="/wiki/East_Los_Angeles_(region)" title="East Los Angeles (region)">the eastern region</a> of Los Angeles, California, United States, in the <a href="/wiki/University_Hills,_Los_Angeles" title="University Hills, Los Angeles">University Hills</a> district, facing the <a href="/wiki/San_Gabriel_Mountains" title="San Gabriel Mountains">San Gabriel Mountains</a>, at the center of Los Angeles <a href="/wiki/Metropolitan_area" title="Metropolitan area">metropolitan area</a> just five m

In [22]:
# finding and printing all paragraphs (no tag!):
all_paragraphs = my_soup.findAll('p')

for paragraph in all_paragraphs:
    print(paragraph.text)

California State University, Los Angeles (Cal State LA) is a public comprehensive university in the heart of Los Angeles, one of the 23 universities in the California State University (CSU) system. Cal State LA is located in the eastern region of Los Angeles, California, United States, in the University Hills district, facing the San Gabriel Mountains, at the center of Los Angeles metropolitan area just five miles (8 km) east of Downtown Los Angeles.
Cal State LA offers 129 Bachelor's degrees, 112 Master's degrees, 3 Doctoral degrees: a Ph.D. in special education, Doctor of Education (Ed.D), Doctor of Nursing Practice (DNP) and 22 teaching credentials.[7][8] Cal State LA is a Hispanic-serving institution.


Cal State LA has a student body of more than 24,000 students primarily from the greater Los Angeles area,[9] as well as 240,000 alumni. Cal State LA operated year-round on the quarter system with four quarters, each 11 weeks in duration. In fall 2016, the university converted to the

### Example: finding and storing tables:

In [23]:
tables = my_soup.findAll('table')

print(tables)

[<table class="plainlinks metadata ambox ambox-content ambox-Refimprove" role="presentation">
<tr>
<td class="mbox-image">
<div style="width:52px"><a class="image" href="/wiki/File:Question_book-new.svg"><img alt="" data-file-height="399" data-file-width="512" height="39" src="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/50px-Question_book-new.svg.png" srcset="//upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/75px-Question_book-new.svg.png 1.5x, //upload.wikimedia.org/wikipedia/en/thumb/9/99/Question_book-new.svg/100px-Question_book-new.svg.png 2x" width="50"/></a></div>
</td>
<td class="mbox-text"><span class="mbox-text-span">This article <b>needs additional citations for <a href="/wiki/Wikipedia:Verifiability" title="Wikipedia:Verifiability">verification</a></b>. <span class="hide-when-compact">Please help <a class="external text" href="//en.wikipedia.org/w/index.php?title=California_State_University,_Los_Angeles&amp;action=edit">improve thi

In [24]:
# only table2:
table2 = my_soup.findAll('table')[2]

print(table2)

<table class="wikitable">
<tr>
<th></th>
<th>Presidents of Cal State LA</th>
<th>Years as president</th>
</tr>
<tr>
<td>1</td>
<td>P. Victor Peterson</td>
<td>1947–49</td>
</tr>
<tr>
<td>2</td>
<td><a href="/wiki/Howard_S._McDonald" title="Howard S. McDonald">Howard S. McDonald</a></td>
<td>1949–62</td>
</tr>
<tr>
<td>3</td>
<td>Albert D. Graves</td>
<td>1962–63</td>
</tr>
<tr>
<td>4</td>
<td>Franklyn A. Johnson</td>
<td>1963–65</td>
</tr>
<tr>
<td>5</td>
<td>John A. Greenlee</td>
<td>1965–79</td>
</tr>
<tr>
<td>6</td>
<td>James M. Rosser</td>
<td>1979–2013</td>
</tr>
<tr>
<td>7</td>
<td>William A. Covino<sup class="reference" id="cite_ref-lat20130522_20-0"><a href="#cite_note-lat20130522-20">[20]</a></sup></td>
<td>2013–</td>
</tr>
</table>


In [25]:
# Table header:
header = table2.findAll('th')

column_headers = [th.get_text() for th in header]

print(column_headers)


['', 'Presidents of Cal State LA', 'Years as president']


In [26]:
# Table Data (cells):
rows = table2.find_all('tr')[1:]

data_dict = { column_headers[0] : [], column_headers[1] : [], column_headers[2] : [] }

for row in rows:
    cols = row.find_all('td')
    data_dict[column_headers[0]].append( cols[0].get_text() )
    data_dict[column_headers[1]].append( cols[1].get_text() )
    data_dict[column_headers[2]].append( cols[2].get_text() )

print(data_dict)



{'': ['1', '2', '3', '4', '5', '6', '7'], 'Presidents of Cal State LA': ['P. Victor Peterson', 'Howard S. McDonald', 'Albert D. Graves', 'Franklyn A. Johnson', 'John A. Greenlee', 'James M. Rosser', 'William A. Covino[20]'], 'Years as president': ['1947–49', '1949–62', '1962–63', '1963–65', '1965–79', '1979–2013', '2013–']}


In [33]:
data_frame = pd.DataFrame(data_dict, columns = column_headers)

print(data_frame)

     Presidents of Cal State LA Years as president
0  1         P. Victor Peterson            1947–49
1  2         Howard S. McDonald            1949–62
2  3           Albert D. Graves            1962–63
3  4        Franklyn A. Johnson            1963–65
4  5           John A. Greenlee            1965–79
5  6            James M. Rosser          1979–2013
6  7      William A. Covino[20]              2013–


### Example: finding and storing tables directly using pandas:

In [38]:
import html5lib

my_url = 'https://en.wikipedia.org/wiki/California_State_University,_Los_Angeles'

dfs = pd.read_html(my_url,header=0) # all tables

#print(dfs[2])

for df in dfs:
    print(df)


Empty DataFrame
Columns: [Unnamed: 0, This article needs additional citations for verification. Please help improve this article by adding citations to reliable sources. Unsourced material may be challenged and removed. (January 2013) (Learn how and when to remove this template message)]
Index: []
          Unnamed: 0                                         Unnamed: 1
0       Former names  Los Angeles State College of Applied Arts and ...
1              Motto                           Vox Veritas Vita (Latin)
2   Motto in English  "Voice Truth Life" – Speak the truth as a way ...
3               Type                                  Public land-grant
4        Established                                            1947[1]
5          Endowment                            $29.2 million (2016)[2]
6          President                            William A. Covino[3][4]
7            Provost                                    Lynn Mahoney[5]
8     Academic staff                                 

### Example: finding all of the links

In [41]:
links = my_soup.findAll('a')
link_list = []

for l in links:
    link_list.append(l.get('href'))
    
link_list    

[None,
 '#mw-head',
 '#p-search',
 '/wiki/University_of_California,_Los_Angeles',
 '/wiki/California_State_University,_Los_Angeles_(LACMTA_station)',
 '/wiki/File:Question_book-new.svg',
 '/wiki/Wikipedia:Verifiability',
 '//en.wikipedia.org/w/index.php?title=California_State_University,_Los_Angeles&action=edit',
 '/wiki/Help:Introduction_to_referencing_with_Wiki_Markup/1',
 '/wiki/Help:Maintenance_template_removal',
 '/wiki/File:CSULA_seal.png',
 '/wiki/Latin',
 '/wiki/Public_university',
 '/wiki/Land-grant_college',
 '#cite_note-1',
 '/wiki/Financial_endowment',
 '#cite_note-2',
 '/wiki/University_president',
 '#cite_note-3',
 '#cite_note-4',
 '/wiki/Provost_(education)',
 '#cite_note-5',
 '#cite_note-Enrollment-6',
 '/wiki/Undergraduate_education',
 '#cite_note-Enrollment-6',
 '/wiki/Postgraduate_education',
 '#cite_note-Enrollment-6',
 '/wiki/University_Hills,_Los_Angeles',
 '/wiki/School_colors',
 '/wiki/National_Collegiate_Athletic_Association',
 '/wiki/NCAA_Division_II',
 '/wiki

In [40]:
# get all links in the page
link_list = [l.get('href') for l in my_soup.findAll('a')]

link_list

[None,
 '#mw-head',
 '#p-search',
 '/wiki/University_of_California,_Los_Angeles',
 '/wiki/California_State_University,_Los_Angeles_(LACMTA_station)',
 '/wiki/File:Question_book-new.svg',
 '/wiki/Wikipedia:Verifiability',
 '//en.wikipedia.org/w/index.php?title=California_State_University,_Los_Angeles&action=edit',
 '/wiki/Help:Introduction_to_referencing_with_Wiki_Markup/1',
 '/wiki/Help:Maintenance_template_removal',
 '/wiki/File:CSULA_seal.png',
 '/wiki/Latin',
 '/wiki/Public_university',
 '/wiki/Land-grant_college',
 '#cite_note-1',
 '/wiki/Financial_endowment',
 '#cite_note-2',
 '/wiki/University_president',
 '#cite_note-3',
 '#cite_note-4',
 '/wiki/Provost_(education)',
 '#cite_note-5',
 '#cite_note-Enrollment-6',
 '/wiki/Undergraduate_education',
 '#cite_note-Enrollment-6',
 '/wiki/Postgraduate_education',
 '#cite_note-Enrollment-6',
 '/wiki/University_Hills,_Los_Angeles',
 '/wiki/School_colors',
 '/wiki/National_Collegiate_Athletic_Association',
 '/wiki/NCAA_Division_II',
 '/wiki

In [42]:
# all links:
# link_list = [l.get('href') for l in my_soup.findAll('a')]


# Let's keep only the external links:
# it is an external link if it starts with 'http' (and also it is not None!):

[l for l in link_list if l is not None and l[:4] == 'http']

['http://www.calstatela.edu',
 'http://www.calstatela.edu/academic/eep',
 'http://www.equality-of-opportunity.org/',
 'http://www.coolstatela.com/',
 'http://www.calstatela.edu/univ/ppa/media/cslainf1.php',
 'http://www.nacubo.org/Documents/EndowmentFiles/2016-Endowment-Market-Values.pdf',
 'http://www.calstate.edu/administration/bios/presidents/Rosser.shtml',
 'http://www.calstatela.edu/univ/welcome.htm',
 'http://www.calstatela.edu/provost/about-provost',
 'http://www.calstate.edu/as/stat_reports/2016-2017/f16_01.htm',
 'http://degrees.calstate.edu/csu_degree_search2?noCache=311:1358366446',
 'http://degrees.calstate.edu/uploads/55/64/5564d4b6ec1584227ca2d1054c759f0f/Credential-Programs-08212012.pdf',
 'http://www.calstatela.edu/student/prospect.htm',
 'http://web.calstatela.edu/academic/aa/semester/',
 'http://www.calstatela.edu/academic/aa/dcc/indexcollege.htm',
 'http://www.calstatela.edu/academic/eep/aboutEEP.php',
 'http://www.lachsa.org',
 'http://www.calstatela.edu/univ/ppa/me

# Regular Expressions for Cleaning the Text

In [None]:
import re

# Split:
string1 = 'I love Data Science!'

print(re.split(r'\s+', string1)) # splitting based on space(s) excluding the space (delimiter)
# s means space
# * means zero or more
# + means one or more

print(re.split(r'(\s+)', string1)) # splitting based on space(s) including the space (delimiter)

In [None]:
print(re.split(r'[c-f]', string1)) # using c,d,e,f as delimiter

print(re.split(r'[c-f,a]', string1)) # using c,d,e,f,a as delimiter

### Example: Searching for an address:

In [None]:
# Search:
# Example: Searching for an address:
string2 = "There is great restaurant in 123 main st. that serves very good sushi"

print(re.findall(r'\d+', string2)) 

# "findall" to search for something
# \d to search for numbers
# \D to search for anything but digit

In [None]:
print(re.findall(r'\d{1,5}', string2))  # 1 to 5 digits

In [None]:
print(re.findall(r'\d{1,5}\s\w+', string2)) # digit + space + word
# \w to search for a digit or letter
# + means one or more

In [None]:
print(re.findall(r'\d{1,5}\s\w+\s\w+', string2)) # digit + space + word
# second \w+ search for "st" or "ave" ...

In [None]:
print(re.findall(r'\d{1,5}\s\w+\s\w+\.', string2)) # digit + space + word
# . means anything

### Example:

In [None]:
my_url = 'https://en.wikipedia.org/wiki/California_State_University,_Los_Angeles'
#my_url = 'http://www.crummy.com/software/BeautifulSoup'

webpage = requests.get(my_url)

webpage_content = webpage.text # this returns the webpage html content

print(webpage_content)

In [None]:
title = re.findall(r'<title>.*</title>',webpage_content)

print(title)


In [None]:
title_string = ''.join(title)

print(title_string[7:-8])

### For more infor about regular expression see https://docs.python.org/3.3/library/re.html