-
Notifications
You must be signed in to change notification settings - Fork 0
/
scrape.go
98 lines (79 loc) · 2.22 KB
/
scrape.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
package main
import (
	"fmt"
	"log"
	"net/http"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
)
// baseURL is the scheme and host prepended to the site-relative paths that
// the scraper requests.
const baseURL = "https://www.gov.uk"
// HighwayCode represents the root node of the scraped document. It holds
// every chapter in the order the chapters were found on the index page.
type HighwayCode struct {
	// Chapters is serialised under the XML name "Chapters". The tag value
	// must be quoted; an unquoted value is not a valid struct tag and is
	// ignored by reflection-based encoders.
	Chapters []Chapter `xml:"Chapters"`
}
// Chapter represents a chapter of the highway code.
//
// The field order and the xml tags determine the layout of the serialised
// element, so neither should be changed casually.
type Chapter struct {
// Title is the text of the first <span> inside the chapter's index link.
Title string `xml:"Title"`
// Summary is the text of the second <span> inside the chapter's index link.
Summary string `xml:"Summary"`
// URL is baseURL joined with the link's href; it is the page that the
// chapter's sections are scraped from.
URL string `xml:"Url"`
// Sections holds the sections scraped from URL, in page order.
Sections []Section `xml:"Sections"`
}
// Section represents a section of a chapter of the highway code.
type Section struct {
// Title is the text of the <h2> heading that opens the section.
Title string `xml:"SectionTitle"`
// Content wraps the section's raw HTML. The nested struct exists so that
// the ",cdata" option can be applied to the text inside <Content>.
Content struct {
// Inner is the concatenated outer HTML of the sibling elements that follow
// the heading; ",cdata" makes encoding/xml emit it inside CDATA markers
// instead of escaping the markup.
Inner string `xml:",cdata"`
} `xml:"Content"`
}
func getChapters() []Chapter {
doc, err := goquery.NewDocument(baseURL + "/guidance/the-highway-code")
if err != nil {
log.Fatal(err)
}
foundChapters := doc.Find("article#content ol.section-list li a")
chapters := make([]Chapter, foundChapters.Length())
foundChapters.Each(func(index int, item *goquery.Selection) {
link, _ := item.Attr("href")
chapters[index].URL = baseURL + link
spans := item.Find("span")
chapters[index].Title = spans.Eq(0).Text()
chapters[index].Summary = spans.Eq(1).Text()
})
return chapters
}
func getSections(link string) []Section {
doc, err := goquery.NewDocument(link)
if err != nil {
log.Fatal(err)
}
foundSections := doc.Find("div.gem-c-govspeak.govuk-govspeak h2")
sections := make([]Section, foundSections.Length())
foundSections.Each(func(index int, item *goquery.Selection) {
sections[index].Title = item.Text()
var untilNext *goquery.Selection
if index == foundSections.Length()-1 {
untilNext = item.NextAll()
} else {
untilNext = item.NextUntilSelection(foundSections.Eq(index + 1))
}
var htmlContent string
untilNext.Each(func(ind int, selectedItem *goquery.Selection) {
htmlSelected, _ := goquery.OuterHtml(selectedItem)
htmlContent += htmlSelected
})
sections[index].Content.Inner = htmlContent
})
return sections
}
// Scrape gets the full highway code from the gov uk website.
//
// It loads the chapter list first, then fetches the sections of each chapter
// in turn, sleeping briefly after every chapter request.
func Scrape() HighwayCode {
	// Delay inserted after each chapter fetch.
	const delay = 100 * time.Millisecond

	var code HighwayCode
	code.Chapters = getChapters()
	for idx := 0; idx < len(code.Chapters); idx++ {
		chapter := &code.Chapters[idx]
		chapter.Sections = getSections(chapter.URL)
		time.Sleep(delay)
	}
	return code
}