## 1 - Installs and imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import pprint
import requests

%matplotlib inline
from urllib.request import urlopen
from bs4 import BeautifulSoup

## 2 - URL -> HTML -> *Beautiful Soup* object

In [2]:
url = "https://tenstripes.nl"
html = urlopen(url)

In [3]:
soup = BeautifulSoup(html, 'lxml')
type(soup)

bs4.BeautifulSoup

## 3 - An introduction to *Beautiful soup* commands to extract information from a website

### 3.1 - Get the title

In [4]:
title = soup.title
print(title)

<title>Ten Stripes: ðŸ’› datagedreven storytelling</title>


Let's print only the string part of the title, not the tags

In [5]:
print(soup.title.string)

Ten Stripes: ðŸ’› datagedreven storytelling


3.2 - Print out the text

In [6]:
text = soup.get_text()
print(soup.text)








(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
        new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
        j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
        'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
        })(window,document,'script','dataLayer','GTM-5TLVL37');


















            !function(f,b,e,v,n,t,s)
            {if(f.fbq)return;n=f.fbq=function(){n.callMethod?
                n.callMethod.apply(n,arguments):n.queue.push(arguments)};
                if(!f._fbq)f._fbq=n;n.push=n;n.loaded=!0;n.version='2.0';
                n.queue=[];t=b.createElement(e);t.async=!0;
                t.src=v;s=b.getElementsByTagName(e)[0];
                s.parentNode.insertBefore(t,s)}(window, document,'script',
                'https://connect.facebook.net/en_US/fbevents.js');
            fbq('init', '309501069779382');
            fbq('track', 'PageView');
        







Prettify the indentation (Optional)

In [None]:
print(soup.prettify())

<!DOCTYPE html>
<html lang="nl">
 <head>
  <title>
   Ten Stripes: ðŸ’› datagedreven storytelling
  </title>
  <style id="rocket-critical-css">
  </style>
  <link as="style" data-minify="1" data-rocket-async="style" href="https://tenstripes.nl/wp-content/cache/min/1/4f1f97e8794f23805a436ea46ce650b9.css" media="all" onload="this.onload=null;this.rel='stylesheet'" rel="preload"/>
  <!--	----        	     ---- -->
  <!--	  ----     	       ----
				----         ----
				  ----	   ----
					---- ----				''ZOEK EN GIJ ZULT VINDEN!''
					   ---
					---- ----						- XXX TEN Stripes <3									
				  ----	   ----										
				----         ----
			  ----     	       ----
			----        	     ----

  
		-->
  <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
  <meta content="width=device-width, initial-scale=1, shrink-to-fit=no" name="viewport"/>
  <!-- Google Tag Manager -->
  <script>
   (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
        new Date().getTime

### 3.2 - Extract the hyperlinks

Print all the hyperlinks present in the webpage

In [7]:
soup.find_all('a')

[<a class="navbar-brand" href="/">
 <svg id="ten-sripes-logo" viewbox="0 0 103 37" xmlns="http://www.w3.org/2000/svg">
 <path class="st0" d="M30.9 31.8L2.2 3.1l.9-.9 28.7 28.7-.9.9z" id="logo-1"></path>
 <path class="st0" d="M29.1 33.6L.4 4.9l.9-.9L30 32.7l-.9.9z" id="logo-2"></path>
 <path class="st0" d="M32.7 30L4 1.3l.9-.9 28.7 28.7-.9.9z" id="logo-3"></path>
 <path class="st0" d="M21 13.9l-.9-.9L30.9 2.2l.9.9L21 13.9z" id="logo-4"></path>
 <path class="st0" d="M19.2 12.1l-.9-.9L29.1.4l.9.9-10.8 10.8z" id="logo-5"></path>
 <path class="st0" d="M22.8 15.7l-.9-.9L32.7 4l.9.9-10.8 10.8z" id="logo-6"></path>
 <path class="st0" d="M3.1 31.8l-.9-.9L13 20.1l.9.9L3.1 31.8z" id="logo-7"></path>
 <path class="st0" d="M1.3 30l-.9-.9 10.8-10.8.9.9L1.3 30z" id="logo-8"></path>
 <path class="st0" d="M4.9 33.6l-.9-.9 10.8-10.8.9.9L4.9 33.6z" id="logo-9"></path>
 <path class="st2" d="M52,12.6c0.4,0.4,1,0.6,1.7,0.6c0.5,0,1-0.1,1.4-0.4c0.4-0.3,0.7-0.5,0.7-0.8h2.4c-0.4,1.2-0.9,2-1.7,2.5
              

Print only the hyperlinks of the webpage

In [8]:
all_links = soup.find_all("a")
for link in all_links:
    print(link.get("href"))

/
https://tenstripes.nl/werkwijze/
https://tenstripes.nl/werkwijze/onze-diensten/
https://tenstripes.nl/strategie/
https://tenstripes.nl/datagedreven-storytelling/
https://tenstripes.nl/impact-marketing/
https://tenstripes.nl/cases/
https://tenstripes.nl/goede-verhalen/
/kernwaarden-bedrijf-bepalen/
/storytelling/
/missie-visie-formuleren/
https://tenstripes.nl/de-customer-journey-alles-wat-je-moet-weten/
https://tenstripes.nl/duurzaam-ondernemen/
https://tenstripes.nl/het-belang-van-greenwashing/
https://tenstripes.nl/marketingstrategie-checklist/
https://tenstripes.nl/over-ons/
https://tenstripes.nl/het-team/
https://tenstripes.nl/manifest/
https://tenstripes.nl/impact/
https://tenstripes.nl/wall-of-impact/
https://tenstripes.nl/marketing-3-0/
https://tenstripes.nl/partners/
https://tenstripes.nl/vacatures/
https://tenstripes.nl/contact/
https://tenstripes.nl/wordfan/
https://tenstripes.nl/datagedreven-storytelling/
https://tenstripes.nl/marketingstrategie-checklist/
https://tenstrip

### 3.3 - Print out only the table rows

In [9]:
rows = soup.find_all('tr')
print(rows)

[<tr><th class="cookielawinfo-column-1">Cookie</th><th class="cookielawinfo-column-3">Duration</th><th class="cookielawinfo-column-4">Description</th></tr>, <tr class="cookielawinfo-row"><td class="cookielawinfo-column-1">cookielawinfo-checkbox-analytics</td><td class="cookielawinfo-column-3">11 months</td><td class="cookielawinfo-column-4">This cookie is set by GDPR Cookie Consent plugin. The cookie is used to store the user consent for the cookies in the category "Analytics".</td></tr>, <tr class="cookielawinfo-row"><td class="cookielawinfo-column-1">cookielawinfo-checkbox-functional</td><td class="cookielawinfo-column-3">11 months</td><td class="cookielawinfo-column-4">The cookie is set by GDPR cookie consent to record the user consent for the cookies in the category "Functional".</td></tr>, <tr class="cookielawinfo-row"><td class="cookielawinfo-column-1">cookielawinfo-checkbox-necessary</td><td class="cookielawinfo-column-3">11 months</td><td class="cookielawinfo-column-4">This coo

Fetch all the table tags (alternative command)

In [10]:
all_table = soup.find_all('table')
print(all_table)

[<table class="cookielawinfo-row-cat-table cookielawinfo-winter"><thead><tr><th class="cookielawinfo-column-1">Cookie</th><th class="cookielawinfo-column-3">Duration</th><th class="cookielawinfo-column-4">Description</th></tr></thead><tbody><tr class="cookielawinfo-row"><td class="cookielawinfo-column-1">cookielawinfo-checkbox-analytics</td><td class="cookielawinfo-column-3">11 months</td><td class="cookielawinfo-column-4">This cookie is set by GDPR Cookie Consent plugin. The cookie is used to store the user consent for the cookies in the category "Analytics".</td></tr><tr class="cookielawinfo-row"><td class="cookielawinfo-column-1">cookielawinfo-checkbox-functional</td><td class="cookielawinfo-column-3">11 months</td><td class="cookielawinfo-column-4">The cookie is set by GDPR cookie consent to record the user consent for the cookies in the category "Functional".</td></tr><tr class="cookielawinfo-row"><td class="cookielawinfo-column-1">cookielawinfo-checkbox-necessary</td><td class="c

### 3.4 - Find out a specific tag

In [11]:
print (soup.find_all(re.compile("^div")))

[<div class="container">
<button class="navbar-toggler" data-target="#navbarCollapse" data-toggle="dropdown" data-word="Menu" type="button">
<!-- <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarCollapse" aria-controls="navbarCollapse" aria-expanded="false" aria-label="Toggle navigation"> -->
<span></span>
<span></span>
<span></span>
<span></span>
<span></span>
<span></span>
</button>
<a class="navbar-brand" href="/">
<svg id="ten-sripes-logo" viewbox="0 0 103 37" xmlns="http://www.w3.org/2000/svg">
<path class="st0" d="M30.9 31.8L2.2 3.1l.9-.9 28.7 28.7-.9.9z" id="logo-1"></path>
<path class="st0" d="M29.1 33.6L.4 4.9l.9-.9L30 32.7l-.9.9z" id="logo-2"></path>
<path class="st0" d="M32.7 30L4 1.3l.9-.9 28.7 28.7-.9.9z" id="logo-3"></path>
<path class="st0" d="M21 13.9l-.9-.9L30.9 2.2l.9.9L21 13.9z" id="logo-4"></path>
<path class="st0" d="M19.2 12.1l-.9-.9L29.1.4l.9.9-10.8 10.8z" id="logo-5"></path>
<path class="st0" d="M22.8 15.7l-.9-.9L32.7 4l.9.

### 3.5 - Print out the tags and their lengths

In [12]:
taglist = []

for tag in soup.find_all(True): 
  taglist.append(tag.name)

taglist = list (set(taglist))

for i in range (len(taglist)):
  print ("Length of", taglist[i], "-->", len(soup.select(taglist[i])))

Length of noscript --> 4
Length of main --> 1
Length of body --> 1
Length of section --> 10
Length of footer --> 1
Length of span --> 17
Length of strong --> 1
Length of meta --> 33
Length of li --> 38
Length of table --> 1
Length of head --> 1
Length of header --> 1
Length of h3 --> 2
Length of style --> 7
Length of g --> 12
Length of label --> 8
Length of link --> 21
Length of button --> 4
Length of path --> 67
Length of center --> 1
Length of thead --> 1
Length of a --> 70
Length of html --> 1
Length of ul --> 8
Length of p --> 21
Length of script --> 26
Length of iframe --> 2
Length of tbody --> 1
Length of div --> 147
Length of h2 --> 11
Length of th --> 3
Length of quotient --> 3
Length of rect --> 1
Length of title --> 1
Length of input --> 16
Length of h1 --> 3
Length of img --> 19
Length of h4 --> 1
Length of i --> 6
Length of td --> 18
Length of tr --> 7
Length of br --> 5
Length of form --> 2
Length of em --> 2
Length of nav --> 1
Length of svg --> 5


## 4 - Defining various functions to extract website information

### 4.1 - Scrape the title of the page

In [22]:
def get_title(soup):
    """Scrape page title."""
    title = None
    if soup.title.string:
        title = soup.title.string
    elif soup.find("meta", property="og:title"):
        title = soup.find("meta", property="og:title").get('content')
    elif soup.find("meta", property="twitter:title"):
        title = soup.find("meta", property="twitter:title").get('content')
    elif soup.find("h1"):
        title = soup.find("h1").string
    return title

In [23]:
get_title (soup)

'Ten Stripes: ðŸ’› datagedreven storytelling'

### 4.2 - Scrape the description of the page

In [24]:
def get_description(soup):
    """Scrape page description."""
    description = None
    if soup.find("meta", property="description"):
        description = soup.find("meta", property="description").get('content')
    elif soup.find("meta", property="og:description"):
        description = soup.find("meta", property="og:description").get('content')
    elif soup.find("meta", property="twitter:description"):
        description = soup.find("meta", property="twitter:description").get('content')
    elif soup.find("p"):
        description = soup.find("p").contents
    return description

In [25]:
get_description (soup)

'Oprechte Ã©n ijzersterke verhalen op basis van een datagedreven marketing 3.0 strategie. ps. Het liefst voor bedrijven met duurzame ambities.'

### 4.3 - Retreive the image/logo of the website

In [26]:
def get_image(soup):
    """Scrape share image."""
    image = None
    if soup.find("meta", property="image"):
        image = soup.find("meta", property="image").get('content')
    elif soup.find("meta", property="og:image"):
        image = soup.find("meta", property="og:image").get('content')
    elif soup.find("meta", property="twitter:image"):
        image = soup.find("meta", property="twitter:image").get('content')
    elif soup.find("img", src=True):
        image = soup.find_all("img").get('src')
    return image

In [27]:
get_image (soup)

'https://tenstripes.nl/media/Ten-Stripes-Logo-1-1024x358.png'

### 4.4 - Get the site name

In [28]:
def get_site_name(soup, url):
    """Scrape site name."""
    if soup.find("meta", property="og:site_name"):
        site_name = soup.find("meta", property="og:site_name").get('content')
    elif soup.find("meta", property='twitter:title'):
        site_name = soup.find("meta", property="twitter:title").get('content')
    else:
        site_name = url.split('//')[1]
        return site_name.split('/')[0].rsplit('.')[1].capitalize()
    return site_name

In [29]:
get_site_name (soup, url)

'Ten Stripes'

### 4.5 - Retrieve the favicon

In [32]:
def get_favicon(soup, url):
    """Scrape favicon."""
    if soup.find("link", attrs={"rel": "icon"}):
        favicon = soup.find("link", attrs={"rel": "icon"}).get('href')
    elif soup.find("link", attrs={"rel": "shortcut icon"}):
        favicon = soup.find("link", attrs={"rel": "shortcut icon"}).get('href')
    else:
        favicon = f'{url.rstrip("/")}/favicon.ico'
    return favicon

In [33]:
get_favicon (soup, url)

'https://tenstripes.nl/wp-content/themes/TenStripes/assets/icons/favicon-32x32.png'

### 4.6 - Get the theme color of the website

In [34]:
def get_theme_color(soup):
    """Scrape brand color."""
    if soup.find("meta", property="theme-color"):
        color = soup.find("meta", property="theme-color").get('content')
        return color
    return None

In [35]:
get_theme_color (soup)

### 4.7 - Scrape the associated metadata of the page

In [36]:
def scrape_page_metadata(url):
    """Scrape target URL for metadata."""
    headers = {
        'Access-Control-Allow-Origin': '*',
        'Access-Control-Allow-Methods': 'GET',
        'Access-Control-Allow-Headers': 'Content-Type',
        'Access-Control-Max-Age': '3600',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }
    pp = pprint.PrettyPrinter(indent=4)
    r = requests.get(url, headers=headers)
    html = BeautifulSoup(r.content, 'html.parser')
    metadata = {
        'title': get_title(html),
        'description': get_description(html),
        'image': get_image (html),
        'favicon': get_favicon(html, url),
        'color': get_theme_color(html),
        'url': url
        }
    pp.pprint(metadata)
    return metadata

In [37]:
scrape_page_metadata(url)

{   'color': None,
    'description': 'Oprechte Ã©n ijzersterke verhalen op basis van een '
                   'datagedreven marketing 3.0 strategie. ps. Het liefst voor '
                   'bedrijven met duurzame ambities.',
    'favicon': 'https://tenstripes.nl/wp-content/themes/TenStripes/assets/icons/favicon-32x32.png',
    'image': 'https://tenstripes.nl/media/Ten-Stripes-Logo-1-1024x358.png',
    'title': 'Ten Stripes: ðŸ’› datagedreven storytelling',
    'url': 'https://tenstripes.nl'}


{'color': None,
 'description': 'Oprechte Ã©n ijzersterke verhalen op basis van een datagedreven marketing 3.0 strategie. ps. Het liefst voor bedrijven met duurzame ambities.',
 'favicon': 'https://tenstripes.nl/wp-content/themes/TenStripes/assets/icons/favicon-32x32.png',
 'image': 'https://tenstripes.nl/media/Ten-Stripes-Logo-1-1024x358.png',
 'title': 'Ten Stripes: ðŸ’› datagedreven storytelling',
 'url': 'https://tenstripes.nl'}