# Web Scraping

## Libraries and prerequisites
### Main libraries

In [None]:
pip install beautifulsoup4

pip install lxml

pip install requests


### Secondary libraries
pip install --upgrade certifi
pip install html5lib
pip install openpyxl

# headers
- headers mimic browsers
- for header information go to whoishostingthis.com/tools/user-agent
- copy and paste that
- syntax
- dictionary {'key': 'value'}
- headers = {'User-Agent': 'result from webpage'}
- the following should be enough:
- headers = {'User-Agent': 'Mozilla/5.0'}

In [2]:
import requests
from bs4 import BeautifulSoup

# Page to scrape: ESPN's all-time NBA assists leaders.
url = 'http://www.espn.com/nba/history/leaders/_/stat/assists'

# Browser-like User-Agent so the request is sent with browser headers.
user_headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36 RuxitSynthetic/1.0 v5595704681 t38550'}

# Make the request with user_headers and store the result in `response`.
# requests has no default timeout, so one is set explicitly to keep the cell
# from hanging forever if the server never answers.
response = requests.get(url, headers=user_headers, timeout=10)
response

<Response [200]>

## response code

In [3]:
# A 200 status code means the request succeeded.
# 403 means Forbidden (the server refused the request).
# Look up a list of HTTP status codes for the meaning of other values.
response.status_code

200

In [6]:
# Fetch a second page. This rebinds `response`, so any soup built from
# `response` after this cell comes from this page, not from the ESPN request.
#
# The address below is on a private network (192.168.x.x). If you cannot reach
# it, use a public page instead, e.g.:
#   response = requests.get("https://stackoverflow.com/questions", timeout=10)
#
# Everything after "#" in a URL is a fragment handled by the browser; it is not
# sent to the server, so only http://192.168.1.68:8069/web is requested here.
# The timeout keeps the cell from hanging when the host is unreachable.
response = requests.get(
    "http://192.168.1.68:8069/web#id=38&action=343&model=nomina_base.escenario_nomina&view_type=form&cids=3&menu_id=219",
    timeout=10,
)

response.status_code

200

In [7]:
# Build the soup from the response body using Python's built-in "html.parser".
soup = BeautifulSoup(response.text, "html.parser")  # response.text is the body decoded to str

In [8]:
# Display the parsed document. The whole page is shown, so the output can be long.
soup


<!DOCTYPE html>

<html style="height: 100%;">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge,chrome=1" http-equiv="X-UA-Compatible"/>
<meta content="width=device-width, initial-scale=1, user-scalable=no" name="viewport"/>
<meta content="width=device-width, initial-scale=1, user-scalable=no" name="viewport"/>
<title>Odoo</title>
<link href="/web/static/src/img/favicon.ico" rel="shortcut icon" type="image/x-icon"/>
<link as="font" crossorigin="" href="/web/static/lib/fontawesome/fonts/fontawesome-webfont.woff2?v=4.7.0" rel="preload"/>
<link data-asset-version="cdacdaf" data-asset-xmlid="web.assets_common" href="/web/content/600-cdacdaf/web.assets_common.css" rel="stylesheet" type="text/css"/>
<link data-asset-version="b3d303c" data-asset-xmlid="web.assets_frontend" href="/web/content/601-b3d303c/web.assets_frontend.css" rel="stylesheet" type="text/css"/>
<script id="web.layout.odooscript" type="text/javascript">
                    var odoo = {
                        csrf_token:

In [8]:
# Inspect the website in the browser to locate the table, starting from one of its rows.
# The stat row's class was "stathead", but we need the class on the enclosing
# <table> tag, which is "tablehead".
# find_all returns every match, so a length of 1 verifies the element is unique.
# NOTE(review): `soup` is built from the most recent `response`; presumably this
# selector was written for the ESPN page - confirm `soup` holds that page when
# running the notebook top to bottom.
stat_table = soup.find_all('table', class_ = "tablehead")
len(stat_table)

1

## SSLCertVerificationError
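Workaround for the HTTPS certificate error: open the URL with an unverified SSL context.
This restores the same behavior as having no certificate verification, so use it only with sites you trust.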

In [11]:
## SSLCertVerificationError
# Demonstrates the certificate failure: requests verifies HTTPS certificates by
# default, and this host's certificate chain cannot be validated locally.
# The error is caught and printed so that "Run All" does not stop at this cell.
url = 'https://infosen.senado.gob.mx/formatos_INAI-INFOSEN/index.php?c=votaciones&a=data'
try:
    response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
    print(response.status_code)
except requests.exceptions.SSLError as ssl_error:
    # `response` is left unchanged when the request fails.
    print(f"SSLError: {ssl_error}")

SSLError: HTTPSConnectionPool(host='infosen.senado.gob.mx', port=443): Max retries exceeded with url: /formatos_INAI-INFOSEN/index.php?c=votaciones&a=data (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1123)')))

In [14]:
import ssl
import urllib.request  # needed for urlopen; `import ssl` alone does not provide it

# Open the page with an SSL context that skips certificate verification.
# WARNING: this disables protection against man-in-the-middle attacks; only use
# it for hosts you trust.
# Despite its name, `soup` here is the raw HTTP response object returned by
# urlopen (not a BeautifulSoup); later cells call soup.read() on it.
soup = urllib.request.urlopen(
    "https://infosen.senado.gob.mx/formatos_INAI-INFOSEN/index.php?c=votaciones&a=data",
    timeout=10,
    context=ssl._create_unverified_context())

# Report the status of THIS request. `response.status_code` would show the
# status of an earlier requests call instead.
soup.status

200

In [None]:
# decode utf8

pip install chardet

pip install cchardet 

Unicode, Dammit’s guesses will get a lot more accurate if you install the chardet or cchardet library.

In [15]:
# Read the raw response body as bytes. Special characters do not show up
# correctly here: they appear as escaped byte sequences (e.g. \xc3\xba).
# NOTE: read() consumes the response stream, so re-running this cell without
# re-running the urlopen cell presumably returns empty bytes - verify.
extracted_soup = soup.read()
print(extracted_soup)

b'<!doctype html>\n<html lang="en">\n<head>\n  <title>Senado de la Rep\xc3\xbablica</title>\n  <!-- Required meta tags -->\n  <meta charset="utf-8">\n  <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">\n  <link rel="shortcut icon" type="image/x-icon" href="Webroot/img/favicon.ico" />\n  <!-- Bootstrap CSS -->\n  <link rel="stylesheet" href="Webroot/css/bootstrap.min.css" >\n  <link rel="stylesheet" href="Webroot/css/custom-navbar.css" >\n  <link rel="stylesheet" href="Webroot/css/custom.css" >\n  <!-- Optional JavaScript -->\n  <!-- jQuery first, then Popper.js, then Bootstrap JS -->\n  <script src="Webroot/js/jquery-3.2.1.min.js"  ></script>\n  <script src="Webroot/js/popper.min.js"  ></script>\n  <script src="Webroot/js/bootstrap.min.js"></script>\n  <script>\n    $(function () {\n      $(\'[data-toggle="tooltip"]\').tooltip();\n    });\n  </script>\n  <style>\n  .tooltip-inner {\n    max-width: 450px;\n    width: 450px;\n  }\n</style>\n</head>\n<

In [19]:
# UnicodeDammit must be imported explicitly; only BeautifulSoup was imported
# from bs4 earlier, so this cell would otherwise fail with a NameError.
from bs4 import UnicodeDammit

# UnicodeDammit detects the encoding of the raw bytes and decodes them, which
# fixes the special characters.
decoded_soup = UnicodeDammit(extracted_soup)
# Printing the object only shows its repr; the decoded text is in
# decoded_soup.unicode_markup.
print(decoded_soup)

<bs4.dammit.UnicodeDammit object at 0x0517C8F8>


In [20]:
# unicode_markup holds the document decoded to a Unicode string.
print(decoded_soup.unicode_markup)

<!doctype html>
<html lang="en">
<head>
  <title>Senado de la República</title>
  <!-- Required meta tags -->
  <meta charset="utf-8">
  <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">
  <link rel="shortcut icon" type="image/x-icon" href="Webroot/img/favicon.ico" />
  <!-- Bootstrap CSS -->
  <link rel="stylesheet" href="Webroot/css/bootstrap.min.css" >
  <link rel="stylesheet" href="Webroot/css/custom-navbar.css" >
  <link rel="stylesheet" href="Webroot/css/custom.css" >
  <!-- Optional JavaScript -->
  <!-- jQuery first, then Popper.js, then Bootstrap JS -->
  <script src="Webroot/js/jquery-3.2.1.min.js"  ></script>
  <script src="Webroot/js/popper.min.js"  ></script>
  <script src="Webroot/js/bootstrap.min.js"></script>
  <script>
    $(function () {
      $('[data-toggle="tooltip"]').tooltip();
    });
  </script>
  <style>
  .tooltip-inner {
    max-width: 450px;
    width: 450px;
  }
</style>
</head>
<body>
  <div id="container">
    <nav cl

In [21]:
# The encoding UnicodeDammit detected for the original bytes.
decoded_soup.original_encoding

'utf-8'

In [23]:
# 401 error - CHECK WHEN FINDING 401
# When a plain requests.get() is answered with 401, try a Session instead:
# a Session keeps cookies and connection state across requests.
import requests

url = 'https://gamefaqs.gamespot.com/'

session = requests.Session()
# requests has no default timeout; set one so the cell cannot hang indefinitely.
response = session.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
response.status_code

200