
Check for string, fetch URLs from page, URL harvesting

Also more fault tolerant.
marcusosterberg committed Nov 27, 2017
1 parent f1ca4fe commit 415c409547a6fec4966314129415f18f2272806c
Showing with 107 additions and 14 deletions.
  1. +15 −0 checks/content.py
  2. +20 −1 default.py
  3. +72 −13 helper.py
checks/content.py
@@ -103,6 +103,21 @@ def content_check(check_url, strategy='mobile'):
return return_dict
def find_string(to_find, check_url):
check_url = check_url.strip()
try:
get_content = helper.httpRequestGetContent(check_url)
# soup = BeautifulSoup(get_content, "html.parser") # to use if not checking HTML code
if to_find in get_content:
return True
else:
return False
except:
print('Error! Unfortunately the request for URL "{0}" failed, message:\n{1}'.format(check_url, sys.exc_info()[0]))
pass
# For Swedish
# Under 25 Barnböcker.
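A minimal usage sketch of the new find_string check, assuming checks/content.py is importable from the project root; the URL below is illustrative only, while the 'piwik' search string is the same one default.py uses:

from checks.content import find_string

# True if the page's HTML contains the substring, False if it does not,
# and None (implicitly) if the request itself fails.
has_piwik = find_string('piwik', 'https://example.com/')
print('Contains "piwik": {0}'.format(has_piwik))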
default.py
@@ -13,7 +13,7 @@
import test
import helper
from checks.google_pagespeed import google_pagespeed_check
from checks.content import content_check # uncomment this line to try the preview of content checks
# from checks.content import content_check, find_string # uncomment this line to try the preview of content checks
# local variables
# url_for_mainProcess = 'http://vgregion.se/'
@@ -65,6 +65,20 @@ def oneOffProcess(file, test_regime='httpStatusCodeCheck'):
is_sitemap = helper.is_sitemap(helper.httpRequestGetContent(url))
print('Is sitemap: {0}'.format(is_sitemap))
output_file += '{0}, {1}, {2}\n'.format(url.replace('\n', ''), status_code, is_sitemap)
elif test_regime == 'urlHarvest':
"""
Fetches URLs from a page's content
"""
i = 0
print('Harvesting URLs from {0}'.format(url))
try:
for found_url in helper.fetchUrlsFromPage(url, 50):
output_file += '{0}\n'.format(found_url)
i+=1
except:
print('Error! The URL {0} failed.'.format(url))
pass
#print('Found {0} URLs from {1}'.format(i,url))
elif test_regime == 'googlePageSpeed':
check_page = google_pagespeed_check(url)
if bool(check_page):
@@ -88,6 +102,11 @@ def oneOffProcess(file, test_regime='httpStatusCodeCheck'):
for key, value in content_check(url).items():
output_file = output_file + '{0},{1},{2}\n'.format(url, key, value)
i = i + 1
elif test_regime == 'findString':
searching = find_string('piwik', url)
print("{0}. Checking for string in URL '{1}' - {2}".format(i, url, searching))
output_file = output_file + '{0},{1}\n'.format(url, searching)
i = i + 1
# sleep(time_to_sleep_in_seconds) # sleeping for n seconds
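A hedged sketch of how the two new test regimes might be driven, assuming default.py can be imported as a module and that the file argument of oneOffProcess is a path to a text file of URLs ('urls.txt' below is hypothetical; the function may instead expect an open file handle). Note that the findString regime also needs the find_string import near the top of default.py to be active, which the diff shows commented out:

from default import oneOffProcess

# Harvest up to 50 URLs from each page listed in the file.
oneOffProcess('urls.txt', test_regime='urlHarvest')

# Check each listed URL for the string 'piwik'.
oneOffProcess('urls.txt', test_regime='findString')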
helper.py
@@ -7,6 +7,7 @@
import dateutil.parser
import datetime
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
import gzip
import requests
import json
@@ -78,7 +79,11 @@ def fetchUrlsFromSitemap(url, limit=None):
".ttf" not in lvl1_url.text.lower()) and (
".eot" not in lvl1_url.text.lower()) and (
".bak" not in lvl1_url.text.lower()) and (
".woff" not in lvl1_url.text.lower()):
".woff" not in lvl1_url.text.lower()) and (
"javascript:" not in lvl1_url.text.lower()) and (
"tel:" not in lvl1_url.text.lower()) and (
"mailto:" not in lvl1_url.text.lower()) and (
"#" not in lvl1_url.text.lower()):
if lvl1_url.lastmod is not None:
date = dateutil.parser.parse(lvl1_url.lastmod.string).replace(tzinfo=None)
if limit is not None and date is not None and date > limit:
@@ -106,6 +111,58 @@ def fetchUrlsFromSitemap(url, limit=None):
return sorted(found_urls, key=getKey, reverse=True)
def fetchUrlsFromPage(url, num_limit=None, local_only=True):
"""Given a URL contained URLs are returned as a list with tuples. Optional to number of URLs and if to only include URLs within the local website.
Attributes: url (string), num_limit (integer), local_only (bool)
"""
main_url = urlparse(url)
found_urls = list()
page = httpRequestGetContent(url)
soup = BeautifulSoup(page, "html.parser")
i = 0
for the_url in soup.find_all('a', href=True):
if (".pdf" not in the_url['href'].lower()) and (
".jpg" not in the_url['href'].lower()) and (
".mp4" not in the_url['href'].lower()) and (
".mp3" not in the_url['href'].lower()) and (
".txt" not in the_url['href'].lower()) and (
".png" not in the_url['href'].lower()) and (
".gif" not in the_url['href'].lower()) and (
".svg" not in the_url['href'].lower()) and (
".eps" not in the_url['href'].lower()) and (
".doc" not in the_url['href'].lower()) and (
".docx" not in the_url['href'].lower()) and (
".xls" not in the_url['href'].lower()) and (
".js" not in the_url['href'].lower()) and (
".css" not in the_url['href'].lower()) and (
".xlsx" not in the_url['href'].lower()) and (
".ttf" not in the_url['href'].lower()) and (
".eot" not in the_url['href'].lower()) and (
".bak" not in the_url['href'].lower()) and (
".woff" not in the_url['href'].lower()) and (
"javascript:" not in the_url['href'].lower()) and (
"tel:" not in the_url['href'].lower()) and (
"callto:" not in the_url['href'].lower()) and (
"mailto:" not in the_url['href'].lower()) and (
"#" not in the_url['href'].lower()):
found_url = urlparse(the_url['href'])
if local_only and (len(found_url.netloc) == 0 or found_url.netloc == main_url.netloc):
if len(found_url.netloc) == 0:
found_url = urljoin(url, found_url.geturl())
if found_url not in found_urls: # making the entries unique
found_urls.append(found_url)
i+=1
if num_limit is not None:
found_urls = found_urls[:num_limit]
print('Found {0} URLs on the page you provided, returning {1} of them.'.format(i, len(found_urls)))
return found_urls[:num_limit]
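The long chain of and-ed substring checks above could be collapsed into a single helper; a sketch of that alternative, using the same exclusion list and otherwise unchanged behaviour (the helper name is illustrative):

EXCLUDED_PATTERNS = ('.pdf', '.jpg', '.mp4', '.mp3', '.txt', '.png', '.gif',
                     '.svg', '.eps', '.doc', '.docx', '.xls', '.xlsx', '.js',
                     '.css', '.ttf', '.eot', '.bak', '.woff',
                     'javascript:', 'tel:', 'callto:', 'mailto:', '#')

def is_wanted_href(href):
    # True when the href matches none of the excluded file types or schemes.
    href = href.lower()
    return not any(pattern in href for pattern in EXCLUDED_PATTERNS)

# The filter in fetchUrlsFromPage could then read:
# if is_wanted_href(the_url['href']):
#     ...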
def getGzipedContentFromUrl(url):
"""
Fetching a gziped file from Internet, unpacks it and returns its contents.
@@ -149,16 +206,18 @@ def httpRequestGetContent(url):
a = requests.get(url)
return a.text
except SSLError:
except requests.exceptions.SSLError:
if 'http://' in url: # trying the same URL over SSL/TLS
print('Info: Trying SSL before giving up.')
return httpRequestGetContent(url.replace('http://', 'https://'))
else:
return None
except requests.exceptions.ConnectionError:
print(
'Connection error! Unfortunately the request for URL "{0}" failed.\nMessage:\n{1}'.format(url, sys.exc_info()[0]))
pass
except:
print(
'Error! Unfortunately the request for URL "{0}" either timed out or failed for other reason(s). The timeout is set to {1} seconds.\nMessage:\n{2}'.format(
url, timeout_in_seconds, sys.exc_info()[0]))
pass  # should this be None?
'Error! Unfortunately the request for URL "{0}" either timed out or failed for other reason(s). The timeout is set to {1} seconds.\nMessage:\n{2}'.format(url, timeout_in_seconds, sys.exc_info()[0]))
pass
def is_sitemap(content):
@@ -177,12 +236,12 @@ def is_sitemap(content):
"""
if __name__ == '__main__':
# fetchUrlsFromSitemap('http://webbstrategiforalla.se/sitemap.xml')
tmp = fetchUrlsFromSitemap('http://www.varberg.se/sitemap.xml',
'2017-02-17T06:19:00+01:00')
print(len(tmp))
# tmp = fetchUrlsFromSitemap('http://www.varberg.se/sitemap.xml', '2017-02-17T06:19:00+01:00')
# print(len(tmp))
for bla in tmp:
print('{0} lastmod for {1}'.format(bla[0], bla[1]))
# print('Tjo')
# for bla in tmp:
# print('{0} lastmod for {1}'.format(bla[0], bla[1]))
for url in fetchUrlsFromPage('https://www.arbetsformedlingen.se/', 20):
print(url)
# httpRequestGetContent('http://vgregion.se')
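On the fault-tolerance theme of the commit: httpRequestGetContent still calls requests.get without an explicit timeout even though its error message mentions timeout_in_seconds. A hedged sketch of how the request and exception handling could be consolidated, assuming timeout_in_seconds is the setting the message already refers to; the function name and default value are illustrative, not the committed code:

import sys
import requests

def http_request_get_content_sketch(url, timeout_in_seconds=10):
    """Fetch a URL and return its body text, or None on any request failure."""
    try:
        response = requests.get(url, timeout=timeout_in_seconds)
        return response.text
    except requests.exceptions.SSLError:
        if 'http://' in url:  # retry the same URL over HTTPS before giving up
            return http_request_get_content_sketch(
                url.replace('http://', 'https://'), timeout_in_seconds)
        return None
    except requests.exceptions.RequestException:
        # Covers ConnectionError, Timeout and other requests-level failures.
        print('Error! The request for URL "{0}" failed, message:\n{1}'.format(
            url, sys.exc_info()[0]))
        return None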
