Permalink
Switch branches/tags
Nothing to show
Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
executable file 134 lines (115 sloc) 3.03 KB
#!/usr/bin/env python
#############################################
# GetRobots - Get titles for all #
# robots.txt URI's. #
# Author: Slurpgeit #
# Github: https://github.com/Slurpgeit #
#############################################
## Imports
import sys
import urllib2
import time
import socket
# Check for BeautifulSoup
try:
    from bs4 import BeautifulSoup
except ImportError:
    # Only a missing module should trigger this message; a bare except would
    # also hide unrelated errors raised while importing bs4.
    print('Please install the BS4 python module.')
    sys.exit()
## Basic stuff
# The helptext
# Usage text shown for -h/--help or when -u is missing.  Built from adjacent
# string literals; the byte content is identical to the original.
helptext = (
    "==========\n"
    "GetRobots\n"
    "==========\n"
    "Usage: " + sys.argv[0] + " -u <url> <options>\n"
    "-u <url> The URL to use\n"
    "-P <ip>:<port> Use HTTP proxy\n"
    "-t <timeout> Timeout (Default is 1 second)\n"
    "-v Verbose (Show HTTP errors)\n"
    "-h Display this help"
)
# See if URL is set
try:
    url = urllib2.unquote(sys.argv[sys.argv.index('-u') + 1])
    if ('http://' not in url) and ('https://' not in url):
        url = 'http://' + url
except (ValueError, IndexError):
    # -u is missing (ValueError from .index) or given without a value.
    print(helptext)
    sys.exit()
# Check if URL resolves
try:
    # str.strip() removes a character *set*, not a prefix, so the original
    # 'strip("http://")' was wrong -- and its result was discarded by the
    # second strip anyway.  Split the scheme off, then drop any path so only
    # the bare hostname is resolved.
    rawurl = url.split('://', 1)[-1]
    rawurl = rawurl.split('/', 1)[0]
    socket.gethostbyname(rawurl)
except socket.error:
    # socket.gaierror (resolution failure) subclasses socket.error.
    print('\033[91m[-]\033[0m Could not resolve "' + url + '"')
    sys.exit()
# Parse command line
if ('-h' in sys.argv) or ('--help' in sys.argv):
    print(helptext)
    sys.exit()
if '-P' in sys.argv:
    # Install a global opener so every later urlopen goes via the HTTP proxy.
    proxyaddr = urllib2.unquote(sys.argv[sys.argv.index('-P') + 1])
    proxy = urllib2.ProxyHandler({'http': proxyaddr})
    opener = urllib2.build_opener(proxy)
    urllib2.install_opener(opener)
if '-t' in sys.argv:
    timeout = int(urllib2.unquote(sys.argv[sys.argv.index('-t') + 1]))
else:
    # Delay (seconds) between requests in getcontent().
    timeout = 1
# Always define 'verbose' -- the original left it undefined unless -v was
# passed, forcing a NameError guard in getcontent().
verbose = 1 if '-v' in sys.argv else 0
## Functions
# Get all "Disallow: " lines
def getdisallow(url):
    """Fetch url + '/robots.txt' and return its interesting Disallow lines.

    Blanket rules ('Disallow: /' and 'Disallow: /*') are skipped since they
    name no concrete URI to request.  On any fetch error the status (or the
    error itself) is printed and the script exits -- the original printed
    the HTTP code and then fell through to use the undefined 'req', which
    raised a NameError.
    """
    disallow = []
    try:
        req = urllib2.urlopen(url + '/robots.txt')
    except Exception as error:
        # urllib2.HTTPError carries an HTTP status code; URLError and other
        # failures do not, so fall back to printing the error itself.
        if hasattr(error, 'code'):
            print('\033[91m[-]\033[0m ' + str(error.code) + ' | robots.txt')
        else:
            print('\033[91m[-]\033[0m Error: ' + str(error))
        sys.exit()
    print('\033[92m[+]\033[0m ' + str(req.getcode()) + ' | robots.txt')
    for line in req.readlines():
        if 'Disallow:' in line:
            if ('Disallow: /\n' not in line) and ('Disallow: /*\n' not in line):
                disallow.append(line)
    return disallow
# Get all URI titles
def getcontent(uris):
    """Request each URI under the global 'url' and print its HTML title.

    uris -- list of URI paths to fetch.  The original signature took 'uri'
            but ignored it and iterated the global 'uris'; the parameter is
            now actually used (callers pass it positionally, so this is
            backward compatible).

    Reads the module-level 'url' and 'timeout', and 'verbose' when defined.
    """
    for uri in uris:
        try:
            req = urllib2.urlopen(url + uri)
            status = str(req.getcode())
            html = BeautifulSoup(req)
            try:
                title = html.title.string
                title = title.strip('\n').strip('\t')
            except AttributeError:
                # html.title (or .string) is None when the page has no title.
                title = 'No title'
            print('\033[92m[+]\033[0m ' + status + ' | ' + uri.rstrip() + ' | "' + title.rstrip() + '" | ' + url + uri.rstrip())
        except urllib2.URLError as error:
            try:
                if verbose:
                    print('\033[91m[-]\033[0m ' + str(error.code) + ' | ' + uri.rstrip())
            except (NameError, AttributeError):
                # 'verbose' is only set when -v was passed, and URLError
                # (unlike HTTPError) has no .code attribute.
                pass
        time.sleep(timeout)
## Running code
# Fill 'uris' array
uris = []
disallow = getdisallow(url)
if not disallow:
    print('No "Disallow:" entries in robots.txt.')
else:
    for entry in disallow:
        # str.strip('Disallow: ') strips a character *set* from both ends,
        # so it could also eat path characters (e.g. the trailing 's' of
        # '/admins') and it left the trailing newline in place, which then
        # ended up inside the request URL.  Take everything after the first
        # ':' and trim surrounding whitespace instead.
        uris.append(entry.split(':', 1)[1].strip())
# Request all URI's
getcontent(uris)