# URL harvesting
Fetching URLs from websites.

In [4]:
""" 
Crawling websites for URLs
"""
from urllib.request import urlopen, Request
import requests
from bs4 import BeautifulSoup
import datetime
import random
import re
import time

sec_timeout = 60 * 1  # crawl time budget in seconds (1 minute)
while_timeout = time.time() + sec_timeout  # absolute deadline, fixed at the moment this line runs
# Seed from the OS entropy source / current time. Passing a datetime object to
# random.seed() is not a supported seed type: it is deprecated since Python 3.9
# and raises TypeError on Python 3.11+.
random.seed()

def get_links(site, pageUrl, timeout=10):
    """Fetch a page and return its anchor tags whose href starts with '/'.

    Args:
        site: URL prefix (scheme and host); concatenated directly with pageUrl.
        pageUrl: Path appended to ``site`` to form the URL to fetch.
        timeout: Socket timeout in seconds handed to ``urlopen`` so a stalled
            server cannot block the caller indefinitely.

    Returns:
        The ``find_all`` result holding the matching ``<a>`` tags, or None if
        fetching or parsing failed (the error is printed, not raised).
    """
    try:
        request = Request('{0}{1}'.format(site, pageUrl), headers={'User-Agent': 'Webperf.se Crawler'})
        # The context manager closes the HTTP response even if parsing fails.
        with urlopen(request, timeout=timeout) as html:
            bs = BeautifulSoup(html, 'html.parser')

        return bs.find_all('a', href=re.compile('^/'))  # only URLs starting with /
    except Exception as e:
        # Best-effort crawler: report the failure and let the caller carry on.
        print(site, pageUrl, '\n', e)
        return None

def check_for_redirect(url):
    """Send a HEAD request to *url*, following redirects, and describe the result.

    Args:
        url: Absolute URL to probe.

    Returns:
        A tuple ``(final_url, history, status_code, content_type)`` taken from
        the final response, or None if the request itself failed (connection
        error, timeout, invalid URL, ...). ``content_type`` is an empty string
        when the response carries no Content-Type header.
    """
    try:
        r = requests.head(url, allow_redirects=True, timeout=5)
    except requests.RequestException:
        # Only request-level failures are treated as "no answer"; programming
        # errors and KeyboardInterrupt are no longer silently swallowed.
        return None
    return r.url, r.history, r.status_code, r.headers.get('Content-Type', '')

def harvest_links(site = 'https://www.vgregion.se', initial_page = '/', max_pages = 50):
    """Random-walk a site and collect URLs of distinct HTML pages.

    Starting from ``initial_page``, a random site-relative link is picked from
    the current page. If it is new, passes the filters and resolves (after
    redirects) to a 200 text/html response on the same site, it is recorded
    and its own links become the next candidates.

    Args:
        site: URL prefix (scheme and host) that every harvested URL starts with.
        initial_page: Path of the page where the walk begins.
        max_pages: Maximum number of URLs to collect.

    Returns:
        List of absolute URLs in the order they were accepted. May hold fewer
        than ``max_pages`` entries if the time budget (``sec_timeout`` seconds,
        counted from the start of this call) runs out or no links are found.
    """
    # Substrings that disqualify a candidate href (mail links, fragments,
    # embedded absolute URLs and document downloads).
    excluded_parts = ('mailto', '#', 'http', '.pdf', '.docx', '.pptx')

    # Per-call deadline, so every invocation gets the full time budget instead
    # of sharing one that was fixed when the module/cell was executed.
    deadline = time.time() + sec_timeout

    # get_links returns None on failure; the loop condition treats that the
    # same as "no links" instead of crashing on len(None).
    links = get_links(site, initial_page)
    urls = []

    print('Looking for at most {} URLs at {} for {} seconds'.format(max_pages, site, sec_timeout))

    while links and len(urls) < max_pages and time.time() < deadline:
        newPage = random.choice(links).attrs['href']
        candidate = site + newPage

        if candidate in urls or any(part in newPage for part in excluded_parts):
            continue

        check_redir = check_for_redirect(candidate)
        if (check_redir is not None and site in check_redir[0]
                and check_redir[2] == 200 and 'text/html' in check_redir[3]):
            urls.append(candidate)
            print(len(urls), newPage)

            # Continue the walk from the accepted page; keep the current
            # candidates if that page could not be fetched.
            new_links = get_links(site, newPage)
            if new_links is not None:
                links = new_links
        else:
            print('Redirection, content-type or status code error detected. URL skipped.\n', check_redir) # check_redir[0] do not work for all content-types

    return urls

# Example run: collect at most 25 URLs from regionhalland.se; the printed
# progress and the returned list appear as the cell output below.
harvest_links(site='http://www.regionhalland.se', max_pages=25)

Looking for at most 25 URLs at http://www.regionhalland.se for 60 seconds
1 /vard-halsa/hitta-din-vard/sjukvardsradgivningen/
2 /vard-halsa/sjukdomar-och-behandlingar/
3 /om-region-halland/
4 /om-region-halland/sa-styrs-region-halland/folj-fullmaktige-live/
5 /om-region-halland/nyheter/
6 /om-region-halland/nyheter/valkommen-att-nominera-arets-medarbetare-i-region-halland/
7 /om-region-halland/postdiariet1/
8 /om-region-halland/att-jobba-hos-oss/
9 /
10 /om-region-halland/sa-styrs-region-halland/anslagstavla/
11 /om-region-halland/att-jobba-hos-oss/lediga-jobb1/lediga-jobb/
12 /sidhuvud/bestall-ladda-ner/
13 /sidhuvud/bestall-ladda-ner/kultur1/regionbiblioteket/
14 /vard-halsa/
15 /sidhuvud/om-cookies-pa-regionhallandse/
16 /om-region-halland/organisation/dataskydd/
17 /utveckling-och-tillvaxt/
18 /utveckling-och-tillvaxt/for-organisationer/
19 /kultur/
20 /kultur/upplev-kultur/kulturevenemang/rum-for-dans-people-what-people1/
21 /sidhuvud/akuta-telefonnummer/
22 /kontakta-oss/kontakta

['http://www.regionhalland.se/vard-halsa/hitta-din-vard/sjukvardsradgivningen/',
 'http://www.regionhalland.se/vard-halsa/sjukdomar-och-behandlingar/',
 'http://www.regionhalland.se/om-region-halland/',
 'http://www.regionhalland.se/om-region-halland/sa-styrs-region-halland/folj-fullmaktige-live/',
 'http://www.regionhalland.se/om-region-halland/nyheter/',
 'http://www.regionhalland.se/om-region-halland/nyheter/valkommen-att-nominera-arets-medarbetare-i-region-halland/',
 'http://www.regionhalland.se/om-region-halland/postdiariet1/',
 'http://www.regionhalland.se/om-region-halland/att-jobba-hos-oss/',
 'http://www.regionhalland.se/',
 'http://www.regionhalland.se/om-region-halland/sa-styrs-region-halland/anslagstavla/',
 'http://www.regionhalland.se/om-region-halland/att-jobba-hos-oss/lediga-jobb1/lediga-jobb/',
 'http://www.regionhalland.se/sidhuvud/bestall-ladda-ner/',
 'http://www.regionhalland.se/sidhuvud/bestall-ladda-ner/kultur1/regionbiblioteket/',
 'http://www.regionhalland.se/