### Load Libraries

In [None]:
# Import/load libraries

from bs4 import BeautifulSoup
import pandas as pd
import requests
import csv

### Using BeautifulSoup

In [None]:
# Load in the html

url = 'https://en.wikipedia.org/wiki/Comma-separated_values'
# A timeout stops the cell from hanging forever on a stalled connection.
r = requests.get(url, timeout=30)
# Fail here on an HTTP error (4xx/5xx) instead of parsing an error page below.
r.raise_for_status()
html_content = r.text
soup = BeautifulSoup(html_content, 'html.parser')

In [None]:
# Raw HTML text of the response, before any parsing
html_content

In [None]:
# Print the parsed document as a formatted (indented) tree
print(soup.prettify())

In [None]:
# The page's <title> element, printed with its surrounding tag
print(soup.title)

In [None]:
# Name of the tag itself (this is the tag name, not an HTML attribute)
print(soup.title.name)

In [None]:
# Text contained inside the <title> tag
print(soup.title.string)

In [None]:
# Beginning navigation: name of the tag that contains <title>
print(soup.title.parent.name)

In [None]:
# Getting specific values
#print(soup.p)

##### Scrape the table format version (1st interpretation)

In [None]:
# Take the first <table> on the page that carries the "wikitable" class.
# find_all() is the current method name; findAll() is a deprecated bs4 alias.
table1 = soup.find_all(name="table", attrs={"class": "wikitable"})[0]
table1

In [None]:
# Collect every row (<tr>) of the table.
# find_all() is the current method name; findAll() is a deprecated bs4 alias.
tr = table1.find_all('tr')
tr

In [None]:
# Write each table row to car.csv: header cells (<th>) first, then data cells (<td>).
# The with-block closes the file even if writing a row raises part-way through.
# newline='' is the csv module's documented way to open a file for csv.writer.
with open("car.csv", 'wt', newline='', encoding='utf-8') as csvFile:
    writer = csv.writer(csvFile)

    for cell in tr:
        # Header cells: trim newline characters from both ends of the text.
        th_data = [col.text.strip('\n') for col in cell.find_all('th')]
        # Data cells: remove every newline so the value stays on one line.
        row = [i.text.replace('\n', '') for i in cell.find_all('td')]
        writer.writerow(th_data + row)

In [None]:
# Read car.csv back into a DataFrame to verify what was written
pd.read_csv('car.csv')

References:
* https://medium.com/analytics-vidhya/web-scraping-html-table-from-wiki-9b18cf169359

##### Scrape the CSV format version (2nd interpretation)

In [None]:
# Get the example CSV text: the first <pre> block that follows the element with id="Example"
ths = soup.find(id="Example")
# find_next() is the current method name; findNext() is a deprecated bs4 alias.
table = ths.find_next('pre').text
print(table)

In [None]:
# Write to csv file
# utf-8 is what pd.read_csv assumes by default when this file is read back;
# the with-block closes the file even if the write fails.
with open('car_alt.csv', 'w', encoding='utf-8') as f:
    f.write(table)

In [None]:
# Read car_alt.csv back into a DataFrame to verify what was written
pd.read_csv('car_alt.csv')

### Using Pandas

##### Scrape the table format version (1st interpretation)

In [None]:
# Use Pandas read_html to extract all html tables in website
# (the URL is passed straight to read_html; the result is iterated and indexed as a list of tables)

dfs  = pd.read_html(url)
# Print every table so the wanted one can be picked out by its position
for df in dfs:
    print("=====", df)

In [None]:
# Pick the table by its position in dfs; the index is hard-coded, so re-check it
# if the page layout changes
df = dfs[1] # this is the table we want
df

In [None]:
# Write to csv file (index=False leaves the DataFrame's row index out of the file)
df.to_csv('car2.csv', index=False)

# Read from csv file to verify
pd.read_csv('car2.csv')

References:
* https://www.youtube.com/watch?v=ODNMNwgtehk

### Using Selenium

##### Install and Load

In [None]:
# Install Selenium
#!pip install selenium

# Install Chromedriver
# Download from https://chromedriver.chromium.org/downloads and store it on your computer

from io import StringIO

import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

In [None]:
# Provide path of downloaded Chromedriver; your location may differ.
# Raw string: "\w" and "\c" are invalid escape sequences in a normal string
# literal, which newer Python versions warn about. The resulting path is unchanged.
# NOTE(review): Selenium 4.10+ no longer accepts executable_path; there the path is
# passed via selenium.webdriver.chrome.service.Service - confirm the installed version.
driver = webdriver.Chrome(executable_path=r"C:\webdrivers\chromedriver.exe")

url = 'https://en.wikipedia.org/wiki/Comma-separated_values'
driver.get(url)
driver.maximize_window()

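Selenium methods for finding multiple elements on a web page (the `find_elements_by_*` helpers listed below were removed in Selenium 4.3; on newer versions use `find_elements(By.<STRATEGY>, value)` instead).

To find a single element, replace `elements` with `element`.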

* find_elements_by_name
* find_elements_by_xpath
* find_elements_by_link_text
* find_elements_by_partial_link_text
* find_elements_by_tag_name
* find_elements_by_class_name
* find_elements_by_css_selector

##### Scrape the table format version (1st interpretation)

In [None]:
# Method 1 - Extract table by xpath
# find_element(By.XPATH, ...) replaces find_element_by_xpath, which Selenium 4.3 removed.
table_element = driver.find_element(By.XPATH, '//*[@id="mw-content-text"]/div[1]/table[2]')
# read_html wants a file-like object: passing a literal HTML string is deprecated in pandas 2.1+.
# It returns a list of tables; the element holds a single <table>, hence [0].
df_tmp = pd.read_html(StringIO(table_element.get_attribute('outerHTML')))[0]
df_tmp

In [None]:
# Method 2 - Extract table by tag name
# find_elements(By.TAG_NAME, ...) replaces find_elements_by_tag_name, which Selenium 4.3 removed.
elements = driver.find_elements(By.TAG_NAME, 'table') #extract list of elements

# Print each table's HTML so the wanted one can be picked out by its position
for element in elements:
    print("=====", element.get_attribute('outerHTML'))

In [None]:
# Extract the table that we want (the second <table> element on the page).
# read_html wants a file-like object: passing a literal HTML string is deprecated in pandas 2.1+.
df_tmp = pd.read_html(StringIO(elements[1].get_attribute('outerHTML')))[0]
df_tmp

In [None]:
# Write to CSV file (index=False leaves the DataFrame's row index out of the file)
df_tmp.to_csv('car3.csv', index=False)

# Read from CSV file to verify
pd.read_csv('car3.csv')

##### Scrape the CSV format version (2nd interpretation)

In [None]:
# Extract the <pre> block holding the CSV example, located by xpath
# find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which Selenium 4.3 removed.
elements = driver.find_elements(By.XPATH, '//*[@id="mw-content-text"]/div[1]/pre[1]')

# Print the HTML of every match to confirm the right block was found
for element in elements:
    print("=====", element.get_attribute('outerHTML'))

In [None]:
# Text of the first matched <pre> element
table = elements[0].text
table

In [None]:
# Write the scraped CSV text to car3_alt.csv.
# utf-8 is what pd.read_csv assumes by default when this file is read back;
# the with-block closes the file even if the write fails.
with open('car3_alt.csv', 'w', encoding='utf-8') as f:
    f.write(table)

In [None]:
# Read car3_alt.csv back into a DataFrame to verify what was written
pd.read_csv('car3_alt.csv')