spider.py
#!/usr/bin/env python3
# ----------------------------------------------------------------
# Modules required for basic functionality
import argparse
import logging
import os
import sys
from functools import reduce
import operator
import HTTPClient
import CorpusTable
import Features
import Normalisation
# ----------------------------------------------------------------
# Pluggable module imports
#
# Add new modules below if you write any!
#
from filter import DuplicateFilter, MinimumLengthFilter, MaximumLengthFilter, URLCountFilter, MetadataRegexpFilter
from urlfilter import HTTPURLFilter, PreciseDuplicateURLFilter
from endcondition import CorpusSizeEndCondition, RuntimeEndCondition, SampleEndCondition
from fitness import SimplicityURLRank, SampleURLRank, HumanReadableURLRank
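#
# A minimal sketch of what a new pluggable page filter might look like,
# inferred from how the filters configured below are used (an accept()
# method plus a .name attribute for logging); the module and class names
# here are hypothetical:
#
#   class KeywordFilter:
#       """Accept only pages whose body text contains a given keyword."""
#       def __init__(self, keyword):
#           self.name = "KeywordFilter"
#           self.keyword = keyword
#
#       def accept(self, body, metadata):
#           return self.keyword in body
#
# A new module would be imported above and an instance added to the
# page_filters list further down.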
# ----------------------------------------------------------------
# Parse command-line arguments
#
parser = argparse.ArgumentParser(description="A web crawler demo for the UCREL Summer School 2016")
parser.add_argument('-seeds', action="store", dest="list")
parser.add_argument('-db', action="store", dest="dbdir", default="output")
parser.add_argument('-loglevel', action="store", dest="loglevel", default="INFO")
args = parser.parse_args()
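#
# For example (assuming a plain-text seed file called seeds.txt):
#   ./spider.py -seeds seeds.txt -db output -loglevel DEBUG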
# ----------------------------------------------------------------
# Configure logging to console and to file using the logging framework.
#
log_level = logging.getLevelName(args.loglevel.upper())
logging.basicConfig(format='%(levelname)7s - %(name)s - %(asctime)s: %(message)s',
                    filename='run.log', level=log_level)
console = logging.StreamHandler()
console.setFormatter(logging.Formatter('%(levelname)7s - %(name)-8s: %(message)s'))
logging.getLogger('').addHandler(console)
log = logging.getLogger('main')
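# With the console formatter above, console output looks roughly like:
#    INFO - main    : Retrieving http://example.org/...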
# ----------------------------------------------------------------
# Load various components, and configure the modules that control
# the crawling process
#
corpus_table = CorpusTable.CorpusTable(args.dbdir) # Storage layer
spider = HTTPClient.HTTPClient() # Retrieval code
url_normaliser = Normalisation.URLNormaliser() # URL normaliser
feature_extractor = Features.Features(url_normaliser, ['title', 'h1']) # Feature extractor
# URL Fitness Function
#url_rank_function = SimplicityURLRank.SimplicityURLRank() # Prefer simple URLs
#url_rank_function = SampleURLRank.SampleURLRank() # Sample code
url_rank_function = HumanReadableURLRank.HumanReadableURLRank() # Prefer human-readable URLs
page_filters = [                                                 # Filters for page rejection
    # FuzzyDuplicateFilter.FuzzyDuplicateFilter(corpus_table),   # Fuzzy hash using ssdeep
    DuplicateFilter.DuplicateFilter(corpus_table),               # Perfect duplicate checker
    MinimumLengthFilter.MinimumLengthFilter(100),                # Min length
    MaximumLengthFilter.MaximumLengthFilter(800000),             # Max length
    URLCountFilter.URLCountFilter(0, 1000),                      # URL count
    MetadataRegexpFilter.MetadataRegexpFilter('content_type', r'text/(x?html|plain)'),  # Content type
]
url_filters = [                                                  # Filters for URL rejection
    HTTPURLFilter.HTTPURLFilter(),
    PreciseDuplicateURLFilter.PreciseDuplicateURLFilter(corpus_table)
]
end_conditions = [                                               # End conditions
    CorpusSizeEndCondition.CorpusSizeEndCondition(100),
    RuntimeEndCondition.RuntimeEndCondition(3600),
    SampleEndCondition.SampleEndCondition()
]
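# Note: the main loop below relies only on a small interface from each of
# these components -- goodness(url) on the rank function, accept(body, metadata)
# on page filters, accept(url) on URL filters, and end(corpus_table, body, metadata)
# on end conditions -- plus a .name attribute used for logging.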
# ----------------------------------------------------------------
# Load initial URLs if a seed list is given
#
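# The seed list is assumed to be a plain-text file with one URL per line
# (e.g. "http://example.org/"), since each line is normalised and filtered
# individually below.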
if args.list is not None:
    log.info("Reading seed URLs from %s" % args.list)
    with open(args.list) as f:
        for line in f:
            url = url_normaliser.normalise(line.rstrip())
            accepted = [uf.accept(url) for uf in url_filters]
            log.debug("%s out of %s URL filters accepted the URL" % (sum(accepted), len(url_filters)))
            if sum(accepted) == len(url_filters):
                corpus_table.insert_url(url, url_rank_function.goodness(url))
# ----------------------------------------------------------------
# Main crawling loop
#
cont = True
while cont:
    # ------------------------------------------------------------
    # Summarise the state in the logs, and update counters
    #
    corpus_size = corpus_table.output_count()
    available_urls = corpus_table.url_count(False)
    log.info("%i URLs downloaded; %i available" % (corpus_size, available_urls))
    # ------------------------------------------------------------
    # If we can't read anything, quit.
    # This is the one hard-coded end condition
    if available_urls == 0:
        log.fatal("No available URLs found. Shutting down.")
        cont = False
        continue
    # ------------------------------------------------------------
    # Select the best URL from the database
    url, url_id, goodness, depth = corpus_table.best_url()
    log.info("URL chosen -- goodness: %s, depth: %s" % (goodness, depth))
    corpus_table.update_url(url_id)
    # ------------------------------------------------------------
    # Make an HTTP request for the page and contents
    log.info("Retrieving %s..." % url)
    page, body = spider.get_page(url, corpus_table)
    # If retrieval failed, continue on to the next URL
    if page is None or body is None:
        continue
    # ------------------------------------------------------------
    # Extract features from the page and request data
    log.info("Performing feature extraction...")
    body, metadata = feature_extractor.get_page_metadata(page, body)
    # ------------------------------------------------------------
    # Run filters to remove pages with undesirable content
    log.info("Applying accept/reject page filters...")
    accept = True
    for f in page_filters:
        accept = accept and f.accept(body, metadata)
        log.info("Filter %s -- accept? %s" % (f.name, accept))
        if not accept:
            log.warning("Rejected page by filter: %s" % f.name)
            break
    if not accept:
        continue
    log.info("Page accepted!")
    # ------------------------------------------------------------
    # Run filters to remove undesirable forward links
    log.info("Applying accept/reject URL filters...")
    for f in url_filters:
        count_pre = len(metadata['urls'])
        metadata['urls'] = list(filter(lambda u: f.accept(u), metadata['urls']))
        count_post = len(metadata['urls'])
        log.info("URL Filter %s rejected %i URLs" % (f.name, count_pre - count_post))
        if count_post == 0:
            break
    # ------------------------------------------------------------
    # Insert URLs into DB
    log.info("Inserting %i URLs" % len(metadata['urls']))
    urls = metadata.pop('urls')
    for url in urls:
        corpus_table.insert_url(url, url_rank_function.goodness(url), depth + 1)
    # ------------------------------------------------------------
    # Insert page data into DB and onto disk
    log.info("Inserting/writing page data...")
    metadata['url_id'] = url_id
    corpus_table.insert_page(metadata, body)
    # ------------------------------------------------------------
    # Test for end conditions
    for e in end_conditions:
        end = e.end(corpus_table, body, metadata)
        log.info("End by condition '%s'? %s" % (e.name, end))
        if end:
            log.info("End condition reached. Quitting...")
            cont = False
# Clean up database
log.debug("Disconnecting database...")
corpus_table.disconnect()
# Tell people we didn't crash
log.info("Done. Exiting under normal circumstances.")