link_extractor.go
package crawler

import (
	"context"
	"net/url"
	"regexp"

	"github.com/PacktPublishing/Hands-On-Software-Engineering-with-Golang/Chapter07/pipeline"
)

var (
	// exclusionRegex matches URLs whose extension indicates non-HTML content.
	exclusionRegex = regexp.MustCompile(`(?i)\.(?:jpg|jpeg|png|gif|ico|css|js)$`)
	// baseHrefRegex captures the href value of a <base> tag.
	baseHrefRegex = regexp.MustCompile(`(?i)<base.*?href\s*?=\s*?"(.*?)\s*?"`)
	// findLinkRegex captures the href value of each <a> tag.
	findLinkRegex = regexp.MustCompile(`(?i)<a.*?href\s*?=\s*?"\s*?(.*?)\s*?".*?>`)
	// nofollowRegex detects a rel="nofollow" attribute inside an <a> tag.
	nofollowRegex = regexp.MustCompile(`(?i)rel\s*?=\s*?"?nofollow"?`)
)
// linkExtractor is a pipeline stage that scans fetched documents for links
// and appends them to the payload.
type linkExtractor struct {
	netDetector PrivateNetworkDetector
}

// newLinkExtractor returns a linkExtractor that uses netDetector to filter
// out links pointing to private networks.
func newLinkExtractor(netDetector PrivateNetworkDetector) *linkExtractor {
	return &linkExtractor{
		netDetector: netDetector,
	}
}
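
// The PrivateNetworkDetector dependency is declared elsewhere in this
// package; the only behaviour this file relies on is the
// IsPrivate(host string) (bool, error) call made in retainLink below.
// The stub that follows is a minimal sketch added for illustration (it is
// not part of the original file); the name allowAllDetector is hypothetical.
type allowAllDetector struct{}

// IsPrivate reports every host as public so that no link is filtered out.
func (allowAllDetector) IsPrivate(string) (bool, error) { return false, nil }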
// Process scans the payload contents for links, resolves them to absolute
// URLs and appends them to the payload's Links or NoFollowLinks slices.
func (le *linkExtractor) Process(ctx context.Context, p pipeline.Payload) (pipeline.Payload, error) {
	payload := p.(*crawlerPayload)

	relTo, err := url.Parse(payload.URL)
	if err != nil {
		return nil, err
	}

	content := payload.RawContent.String()

	// Search page content for a <base> tag and resolve it to an abs URL.
	if baseMatch := baseHrefRegex.FindStringSubmatch(content); len(baseMatch) == 2 {
		if base := resolveURL(relTo, ensureHasTrailingSlash(baseMatch[1])); base != nil {
			relTo = base
		}
	}

	// Find the unique set of links from the document, resolve them and
	// add them to the payload.
	seenMap := make(map[string]struct{})
	for _, match := range findLinkRegex.FindAllStringSubmatch(content, -1) {
		link := resolveURL(relTo, match[1])
		if !le.retainLink(relTo.Hostname(), link) {
			continue
		}

		// Truncate anchors and drop duplicates.
		link.Fragment = ""
		linkStr := link.String()
		if _, seen := seenMap[linkStr]; seen {
			continue
		}

		// Skip URLs that point to files that cannot contain HTML content.
		if exclusionRegex.MatchString(linkStr) {
			continue
		}

		seenMap[linkStr] = struct{}{}
		if nofollowRegex.MatchString(match[0]) {
			payload.NoFollowLinks = append(payload.NoFollowLinks, linkStr)
		} else {
			payload.Links = append(payload.Links, linkStr)
		}
	}

	return payload, nil
}
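
// The sketch below (added for illustration, not part of the original file)
// shows how Process could be exercised directly. It assumes crawlerPayload's
// RawContent field is a bytes.Buffer (this file only requires it to provide
// String()), and it reuses the hypothetical allowAllDetector stub defined
// above.
func exampleLinkExtraction() {
	le := newLinkExtractor(allowAllDetector{})

	payload := &crawlerPayload{URL: "https://example.com/blog/"}
	payload.RawContent.WriteString(`
		<base href="https://example.com/">
		<a href="news/today">Today</a>
		<a href="/about#team">About</a>
		<a href="https://other.org/x" rel="nofollow">External</a>
		<a href="/logo.png">Logo</a>`)

	out, err := le.Process(context.TODO(), payload)
	if err != nil {
		return // handle error
	}

	res := out.(*crawlerPayload)
	// res.Links now holds https://example.com/news/today and
	// https://example.com/about (fragment stripped, .png link excluded);
	// res.NoFollowLinks holds https://other.org/x.
	_ = res
}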
func (le *linkExtractor) retainLink(srcHost string, link *url.URL) bool {
	// Skip links that could not be resolved.
	if link == nil {
		return false
	}

	// Skip links with non http(s) schemes.
	if link.Scheme != "http" && link.Scheme != "https" {
		return false
	}

	// Keep links to the same host.
	if link.Hostname() == srcHost {
		return true
	}

	// Skip links that resolve to private networks.
	if isPrivate, err := le.netDetector.IsPrivate(link.Host); err != nil || isPrivate {
		return false
	}

	return true
}
// ensureHasTrailingSlash appends a trailing slash to s if one is missing so
// that relative links resolve against the full base path. Empty strings are
// returned as-is to avoid an out-of-range index.
func ensureHasTrailingSlash(s string) string {
	if s == "" || s[len(s)-1] == '/' {
		return s
	}
	return s + "/"
}
// resolveURL expands target into an absolute URL using the following rules:
//   - targets starting with '//' are treated as absolute URLs that inherit
//     the protocol from relTo.
//   - targets starting with '/' are absolute URLs that are appended to the
//     host from relTo.
//   - all other targets are assumed to be relative to relTo.
//
// If the target URL cannot be parsed, a nil URL will be returned.
func resolveURL(relTo *url.URL, target string) *url.URL {
	tLen := len(target)
	if tLen == 0 {
		return nil
	}

	if tLen >= 1 && target[0] == '/' {
		if tLen >= 2 && target[1] == '/' {
			target = relTo.Scheme + ":" + target
		}
	}

	if targetURL, err := url.Parse(target); err == nil {
		return relTo.ResolveReference(targetURL)
	}
	return nil
}