// urls.go

package spider
import (
"net/url"
"strings"
"github.com/temoto/robotstxt"
)
// Seener is implemented by anything that can report whether a URL has been
// seen before.
type Seener interface {
	// Seen returns true when u has been seen.
	Seen(u *url.URL) bool
}
// urlPredicate is a function which reports whether a URL satisfies some
// condition.
type urlPredicate func(*url.URL) bool

// Seen calls the predicate with input and returns its result, which lets a
// urlPredicate be used wherever a value with a Seen method is expected.
func (fn urlPredicate) Seen(input *url.URL) bool {
	result := fn(input)
	return result
}
// filter returns the URLs for which predicate reports true, in their original
// order. The input slice is left untouched; the result is a newly allocated,
// non-nil slice with room for every input element.
func filter(predicate urlPredicate, urls []*url.URL) []*url.URL {
	kept := make([]*url.URL, 0, len(urls))
	for i := range urls {
		if !predicate(urls[i]) {
			continue
		}
		kept = append(kept, urls[i])
	}
	return kept
}
// createIsInternalPredicate creates a predicate which reports whether a URL is
// internal to the site identified by root.
//
// A URL whose hostname equals root's hostname is always internal. When
// followSubdomains is true, hostnames that end in "." followed by root's
// hostname (for example "blog.example.com" for a root of "example.com") are
// internal as well. The subdomain match is anchored on the dot so that an
// unrelated host which merely ends with the same characters, such as
// "evilexample.com", is not mistaken for a subdomain.
//
// Hostnames are compared as returned by (*url.URL).Hostname, so any port is
// ignored and no case folding is applied. A root with an empty hostname only
// matches URLs whose hostname is also empty.
func createIsInternalPredicate(root *url.URL, followSubdomains bool) urlPredicate {
	return func(input *url.URL) bool {
		rootHost := root.Hostname()
		host := input.Hostname()
		if host == rootHost {
			return true
		}
		// Without a root hostname the suffix would be just ".", which says
		// nothing about site membership, so only the exact match applies.
		if !followSubdomains || rootHost == "" {
			return false
		}
		return strings.HasSuffix(host, "."+rootHost)
	}
}
// createNotSeenPredicate creates a predicate which reports true for URLs that
// seener says it has not seen, and false for those it has. seener is consulted
// on every call of the returned predicate.
func createNotSeenPredicate(seener Seener) urlPredicate {
	return urlPredicate(func(candidate *url.URL) bool {
		alreadySeen := seener.Seen(candidate)
		return !alreadySeen
	})
}
// createShouldRequestByRobotsPredicate creates a predicate which reports whether
// the user agent ua may request a URL according to the robots.txt data r.
//
// If r is nil every URL is allowed. Otherwise the decision is delegated to
// r.TestAgent, which is given only the URL's Path field together with ua.
func createShouldRequestByRobotsPredicate(ua string, r *robotstxt.RobotsData) urlPredicate {
	if r == nil {
		// r is never reassigned, so the nil case can be settled once up front.
		return func(*url.URL) bool { return true }
	}
	return func(candidate *url.URL) bool {
		allowed := r.TestAgent(candidate.Path, ua)
		return allowed
	}
}
// urlTransform is a function which derives one URL from another.
type urlTransform func(*url.URL) *url.URL

// mapURLs applies f to every element of urls and returns the results in the
// same order. The returned slice is newly allocated, non-nil, and has the same
// length as urls.
func mapURLs(f urlTransform, urls []*url.URL) []*url.URL {
	mapped := make([]*url.URL, 0, len(urls))
	for _, u := range urls {
		mapped = append(mapped, f(u))
	}
	return mapped
}
// createAbsoluteTransformer creates a transform which resolves a URL reference
// against root by calling root.ResolveReference.
func createAbsoluteTransformer(root *url.URL) urlTransform {
	// The method value binds root, so calling it is root.ResolveReference(ref).
	return urlTransform(root.ResolveReference)
}