# coding: utf-8
""" This file is where things are stuffed away. Probably you don't ever need to alter these definitions.
"""
import sys
import os
import uuid
import dateutil.parser
import datetime
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import gzip
import requests
import json
# internal
import _privatekeys as privatekeys
def writeFile(file, content):
    """Writes a file at the given location.
    Attributes: file for location, content for the file's contents
    """
    with open(file, 'w') as f:
        f.write(content)

def delete_file(file):
    """Deletes the file at the given location."""
    os.remove(file)

def getUniqueId(length=5):
    """Returns a pseudo-unique id string of (at most) the given length."""
    return str(uuid.uuid1()).replace('-', '')[:length]

def getKey(item):
    """Sort key helper, returns the first element of a tuple."""
    return item[0]
def fetchUrlsFromSitemap(url, limit=None):
    """Given the URL of a sitemap or sitemap index, the contained URLs are returned
    as a list of (lastmod, loc) tuples, newest first. Optionally only URLs changed
    after a given date are included.
    Attributes: url (string), limit (datetime string)
    """
# Documentation for sitemaps - https://www.sitemaps.org
found_urls = list()
sitemap = httpRequestGetContent(url)
if limit is not None:
        limit = dateutil.parser.parse(limit).replace(tzinfo=None)  # normalise to a naive datetime so lastmod values can be compared
if ('<sitemapindex' in str(sitemap)): # is the sitemap itself an index of sitemaps
sitemap_content = BeautifulSoup(sitemap, "html.parser")
for url in sitemap_content.findAll("loc"):
print("Siteindex found. Including URL:s from sitemap: '{0}'".format(url.text))
# fetching sitemap
sitemap_from_index = httpRequestGetContent(url.text)
sitemap_iteration = BeautifulSoup(sitemap_from_index, "html.parser")
            # skip URLs that point at static assets or non-HTTP schemes
            excluded = ('.pdf', '.jpg', '.mp4', '.mp3', '.txt', '.png', '.gif',
                        '.svg', '.eps', '.doc', '.docx', '.xls', '.js', '.css',
                        '.xlsx', '.ttf', '.eot', '.bak', '.woff',
                        'javascript:', 'tel:', 'mailto:', '#')
            for lvl1_url in sitemap_iteration.findAll("url"):
                date = None
                if not any(pattern in lvl1_url.text.lower() for pattern in excluded):
                    if lvl1_url.lastmod is not None:
                        date = dateutil.parser.parse(lvl1_url.lastmod.string).replace(tzinfo=None)
                    # URLs without a lastmod date are never included
                    if date is not None and (limit is None or date > limit):
                        found_urls.append((lvl1_url.lastmod.string, lvl1_url.loc.string))
        print('Found {0} URLs from multiple sitemaps in the sitemap index you provided.'.format(len(found_urls)))
        return sorted(found_urls, key=getKey, reverse=True)
    else:
        soup = BeautifulSoup(sitemap, "html.parser")
        for url in soup.findAll("url"):
            date = None
            if url.lastmod is not None:
                date = dateutil.parser.parse(url.lastmod.string).replace(tzinfo=None)
            # URLs without a lastmod date are never included
            if date is not None and (limit is None or date > limit):
                found_urls.append((url.lastmod.string, url.loc.string))
        print('Found {0} URLs in the sitemap you provided.'.format(len(found_urls)))
        return sorted(found_urls, key=getKey, reverse=True)
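# A minimal usage sketch for fetchUrlsFromSitemap (the sitemap URL and the date
# below are hypothetical): the result is a list of (lastmod, loc) tuples, newest
# lastmod first, so it can be iterated directly.
#
#   changed_since = '2017-02-17T06:19:00+01:00'
#   for lastmod, loc in fetchUrlsFromSitemap('https://example.com/sitemap.xml', changed_since):
#       print('{0} lastmod for {1}'.format(lastmod, loc))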
def fetchUrlsFromPage(url, num_limit=None, local_only=True):
    """Given the URL of a page, the URLs linked from it are returned as a list of
    strings. Optionally the number of URLs is limited and/or only URLs within the
    same website are included.
    Attributes: url (string), num_limit (integer), local_only (bool)
    """
main_url = urlparse(url)
found_urls = list()
page = httpRequestGetContent(url)
soup = BeautifulSoup(page, "html.parser")
i = 0
    # skip URLs that point at static assets or non-HTTP schemes
    excluded = ('.pdf', '.jpg', '.mp4', '.mp3', '.txt', '.png', '.gif',
                '.svg', '.eps', '.doc', '.docx', '.xls', '.js', '.css',
                '.xlsx', '.ttf', '.eot', '.bak', '.woff',
                'javascript:', 'tel:', 'callto:', 'mailto:', '#')
    for the_url in soup.find_all('a', href=True):
        if not any(pattern in the_url['href'].lower() for pattern in excluded):
            found_url = urlparse(the_url['href'])
            if not local_only or len(found_url.netloc) == 0 or found_url.netloc == main_url.netloc:
                # resolve relative URLs against the page URL (absolute URLs pass through unchanged)
                absolute_url = urljoin(url, found_url.geturl())
                if absolute_url not in found_urls:  # making the entries unique
                    found_urls.append(absolute_url)
            i += 1  # counts every link that passed the filter above
if num_limit is not None:
found_urls = found_urls[:num_limit]
print('Found {0} URLs on the page you provided, returning {1} of them.'.format(i, len(found_urls)))
    return found_urls
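# A minimal usage sketch for fetchUrlsFromPage (the page URL is hypothetical):
# at most 10 unique absolute URLs found on the page are returned, restricted to
# the same website since local_only defaults to True.
#
#   for page_url in fetchUrlsFromPage('https://example.com/', num_limit=10):
#       print(page_url)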
def getGzipedContentFromUrl(url):
"""
Fetching a gziped file from Internet, unpacks it and returns its contents.
"""
unique_id = getUniqueId(5)
file_name = 'tmp/file-{0}.gz'.format(unique_id)
try:
r = requests.get(url, stream=True)
with open(file_name, 'wb') as fd:
for chunk in r.iter_content(chunk_size=128):
fd.write(chunk)
with gzip.open(file_name, 'rb') as f:
file_content = f.read()
return file_content
    except requests.exceptions.SSLError:
        if 'http://' in url:  # retrying the same URL over HTTPS
return getGzipedContentFromUrl(url.replace('http://', 'https://'))
else:
return None
    except:
        print(
            'Error! Unfortunately the request for URL "{0}" failed.\nMessage:\n{1}'.format(
                url, sys.exc_info()[0]))
        return None
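# A minimal usage sketch for getGzipedContentFromUrl (hypothetical URL): the
# function writes a temporary file under tmp/ and returns the unpacked contents
# as bytes, or None on failure.
#
#   raw = getGzipedContentFromUrl('https://example.com/sitemap.xml.gz')
#   if raw is not None:
#       print(raw[:200])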
def httpRequestGetContent(url):
"""Trying to fetch the response content
Attributes: url, as for the URL to fetch
"""
if '.gz' in url or '.gzip' in url:
# the url indicates that it is compressed using Gzip
return getGzipedContentFromUrl(url)
    timeout_in_seconds = 30
    try:
        a = requests.get(url, timeout=timeout_in_seconds)
        return a.text
    except requests.exceptions.SSLError:
        if 'http://' in url:  # retrying the same URL over HTTPS
            print('Info: Trying HTTPS before giving up.')
            return httpRequestGetContent(url.replace('http://', 'https://'))
    except requests.exceptions.ConnectionError:
        print(
            'Connection error! Unfortunately the request for URL "{0}" failed.\nMessage:\n{1}'.format(
                url, sys.exc_info()[0]))
    except:
        print(
            'Error! Unfortunately the request for URL "{0}" either timed out or failed for other reason(s). The timeout is set to {1} seconds.\nMessage:\n{2}'.format(
                url, timeout_in_seconds, sys.exc_info()[0]))
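# A minimal usage sketch for httpRequestGetContent (hypothetical URL): plain
# responses are returned as text, URLs containing .gz/.gzip are routed through
# getGzipedContentFromUrl, and None is returned when the request fails.
#
#   content = httpRequestGetContent('https://example.com/sitemap.xml')
#   if content is not None and is_sitemap(content):
#       print('Looks like a sitemap.')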
def is_sitemap(content):
    """Checks a string to see if its content is a sitemap or sitemap index.
    Attributes: content (string)
    """
    return 'http://www.sitemaps.org/schemas/sitemap/' in content or '<sitemapindex' in content
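# A minimal usage sketch for is_sitemap, using inline XML/HTML snippets rather
# than real fetched content:
#
#   is_sitemap('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"></urlset>')  # True
#   is_sitemap('<html><body>Not a sitemap</body></html>')  # False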
"""
If file is executed on itself then call a definition, mostly for testing purposes
"""
if __name__ == '__main__':
# fetchUrlsFromSitemap('http://webbstrategiforalla.se/sitemap.xml')
# tmp = fetchUrlsFromSitemap('http://www.varberg.se/sitemap.xml', '2017-02-17T06:19:00+01:00')
# print(len(tmp))
# for bla in tmp:
# print('{0} lastmod for {1}'.format(bla[0], bla[1]))
for url in fetchUrlsFromPage('https://www.arbetsformedlingen.se/', 20):
print(url)
# httpRequestGetContent('http://vgregion.se')