Commit ed9e419

in the middle of major refactor, working on URLContext

mna committed Mar 27, 2013
1 parent 8d554c6 commit ed9e419
Showing 9 changed files with 255 additions and 140 deletions.
2 changes: 2 additions & 0 deletions crawl_test.go
@@ -12,6 +12,8 @@ import (
"time"
)

+// TODO : Test Panic in visit, filter, etc.
+
func TestAllSameHost(t *testing.T) {
opts := NewOptions(nil)
opts.SameHostOnly = true
21 changes: 0 additions & 21 deletions crawler.go
@@ -9,15 +9,6 @@ import (
"sync"
)

-type Context struct {
-	URL                 *url.URL
-	NormalizedURL       *url.URL
-	Origin              EnqueueOrigin
-	SourceURL           *url.URL
-	NormalizedSourceURL *url.URL
-	State               interface{}
-}
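
The Context struct removed here is superseded by the new URLContext type, whose file is not shown on this page. A plausible sketch of its shape, assuming it keeps the removed struct's fields (the refactored Fetch further down dereferences ctx.URL, so at least that field carries over); everything below is an assumption, not the committed code:

```go
// Hypothetical sketch of the new URLContext (its file is not in this diff).
// Fields mirror the removed Context struct; URL is the field the refactored
// DefaultExtender.Fetch below calls String() on.
type URLContext struct {
	URL                 *url.URL
	NormalizedURL       *url.URL
	Origin              EnqueueOrigin
	SourceURL           *url.URL
	NormalizedSourceURL *url.URL
	State               interface{}
	// plus, presumably, the per-URL flags discussed later in this diff
	// (e.g. IsRobots, a head-request mode, a priority); assumptions only.
}
```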

// Communication from worker to the master crawler, about the crawling of a URL
type workerResponse struct {
host string
@@ -33,18 +24,6 @@ type workerCommand struct {
head bool
}

-// EnqueueOrigin indicates to the crawler and the Filter extender function
-// the origin of this URL.
-type EnqueueOrigin int
-
-const (
-	EoSeed        EnqueueOrigin = iota // Seed URLs have this source
-	EoHarvest                          // URLs harvested from a visit to a page have this source
-	EoRedirect                         // URLs enqueued from a fetch redirection have this source by default
-	EoError                            // URLs enqueued after an error
-	EoCustomStart                      // Custom EnqueueOrigins should start at this value instead of iota
-)

// Communication from extender to crawler about a URL to enqueue
type CrawlerCommand struct {
URL *url.URL
47 changes: 47 additions & 0 deletions enums.go
@@ -0,0 +1,47 @@
package gocrawl

// Enum indicating the reason why the crawling ended.
type EndReason uint8

const (
ErDone EndReason = iota
ErMaxVisits
ErError
)

// Enum indicating the kind of the crawling error.
type CrawlErrorKind uint8

const (
CekFetch CrawlErrorKind = iota
CekParseRobots
CekHttpStatusCode
CekReadBody
CekParseBody
CekParseSeed
CekParseNormalizedSeed
CekProcessLinks
CekParseRedirectUrl
)

// Enum indicating the head request override mode (the default mode is specified
// in the Options of the crawler).
type HeadRequestMode uint8

const (
HrmDefault HeadRequestMode = iota
HrmRequest
HrmIgnore
)

// Enum indicating to the crawler and the Filter extender function
// the origin of the URL to crawl.
type EnqueueOrigin int

const (
EoSeed EnqueueOrigin = iota // Seed URLs have this source
EoHarvest // URLs harvested from a visit to a page have this source
EoRedirect // URLs enqueued from a fetch redirection have this source by default
EoError // URLs enqueued after an error
EoCustomStart // Custom EnqueueOrigins should start at this value instead of iota
)
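
The comment on EoCustomStart implies that user-defined origins should build on it rather than on iota; a minimal sketch with made-up origin names (EoSitemap, EoRetry are illustrative, not part of the package):

```go
// Hypothetical application-defined origins, based at EoCustomStart as the
// enum's comment requires, so they never collide with the built-in values.
const (
	EoSitemap EnqueueOrigin = EoCustomStart + iota
	EoRetry
)
```
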
38 changes: 38 additions & 0 deletions errors.go
@@ -0,0 +1,38 @@
package gocrawl

import (
"errors"
)

// Crawl error information.
type CrawlError struct {
Ctx *URLContext
Err error
Kind CrawlErrorKind
msg string
}

var (
// The error returned when a redirection is requested, so that the
// worker knows that this is not an actual Fetch error, but a request to
// enqueue the redirect-to URL.
EnqueueRedirectError = errors.New("redirection not followed")
)

// Implementation of the error interface.
func (this CrawlError) Error() string {
if this.Err != nil {
return this.Err.Error()
}
return this.msg
}

// Create a new CrawlError based on a source error.
func newCrawlError(ctx *URLContext, e error, kind CrawlErrorKind) *CrawlError {
return &CrawlError{ctx, e, kind, ""}
}

// Create a new CrawlError with the specified message.
func newCrawlErrorMessage(ctx *URLContext, msg string, kind CrawlErrorKind) *CrawlError {
return &CrawlError{ctx, nil, kind, msg}
}
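
Since CrawlError now carries both the URLContext and a CrawlErrorKind, an extender's Error hook can dispatch on the kind; a sketch, assuming a hypothetical extender type named myExtender and the standard log package:

```go
// Hypothetical Error hook dispatching on the error kind.
func (this *myExtender) Error(err *CrawlError) {
	switch err.Kind {
	case CekFetch:
		log.Printf("fetch failed: %s", err.Error())
	case CekHttpStatusCode:
		log.Printf("unexpected status code: %s", err.Error())
	default:
		log.Printf("crawl error (kind %d): %s", err.Kind, err.Error())
	}
}
```
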
136 changes: 38 additions & 98 deletions ext.go
@@ -5,69 +5,9 @@ import (
"github.com/PuerkitoBio/goquery"
"log"
"net/http"
"net/url"
"time"
)

-// Flag indicating why the crawler ended.
-type EndReason uint8
-
-const (
-	ErDone EndReason = iota
-	ErMaxVisits
-	ErError
-)
-
-// Flag indicating the source of the crawl error.
-type CrawlErrorKind uint8
-
-const (
-	CekFetch CrawlErrorKind = iota
-	CekParseRobots
-	CekHttpStatusCode
-	CekReadBody
-	CekParseBody
-	CekParseSeed
-	CekParseNormalizedSeed
-	CekProcessLinks
-	CekParseRedirectUrl
-)
-
-// Flag indicating the head request override mode
-type HeadRequestMode uint8
-
-const (
-	HrmDefault HeadRequestMode = iota
-	HrmRequest
-	HrmIgnore
-)
-
-// Crawl error information.
-type CrawlError struct {
-	Err  error
-	Kind CrawlErrorKind
-	URL  *url.URL // TODO : Context?
-	msg  string
-}
-
-// Implementation of the error interface.
-func (this CrawlError) Error() string {
-	if this.Err != nil {
-		return this.Err.Error()
-	}
-	return this.msg
-}
-
-// Create a new CrawlError based on a source error.
-func newCrawlError(e error, kind CrawlErrorKind, u *url.URL) *CrawlError {
-	return &CrawlError{e, kind, u, ""}
-}
-
-// Create a new CrawlError with the specified message.
-func newCrawlErrorMessage(msg string, kind CrawlErrorKind, u *url.URL) *CrawlError {
-	return &CrawlError{nil, kind, u, msg}
-}

// Delay information: the Options delay, the Robots.txt delay, and the last delay used.
type DelayInfo struct {
OptsDelay time.Duration
@@ -78,42 +18,42 @@ type DelayInfo struct {
// Fetch information: the duration of the fetch, the returned status code, whether or
// not it was a HEAD request, and whether or not it was a robots.txt request.
 type FetchInfo struct {
+	Ctx           *URLContext
 	Duration      time.Duration
 	StatusCode    int
-	HeadRequest   bool
-	RobotsRequest bool
+	HeadRequest   bool // in Ctx? NO! This indicates whether this particular fetch was a HEAD request.
+	RobotsRequest bool // in Ctx? (IsRobots)
 }
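
The DelayInfo doc comment names three delays (Options, robots.txt, last used), though only OptsDelay is visible before the collapsed region; a hedged ComputeDelay override sketch, with RobotsDelay and LastDelay as assumed field names:

```go
// Hypothetical ComputeDelay override: back off after a 503, otherwise
// prefer the robots.txt delay over the Options delay. RobotsDelay and
// LastDelay are assumed field names (only OptsDelay is visible above).
func (this *myExtender) ComputeDelay(host string, di *DelayInfo, fi *FetchInfo) time.Duration {
	if fi != nil && fi.StatusCode == http.StatusServiceUnavailable {
		return di.LastDelay * 2
	}
	if di.RobotsDelay > 0 {
		return di.RobotsDelay
	}
	return di.OptsDelay
}
```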

 // Extension methods required to provide an extender instance.
 type Extender interface {
-	Start(map[*url.URL]interface{})
+	// Start, End, Error and Log are not related to a specific URL, so they don't
+	// receive a URLContext struct.
+	Start(interface{}) interface{}
 	End(EndReason)
 	Error(*CrawlError)
 	Log(LogFlags, LogFlags, string)
+
+	// ComputeDelay is related to a Host only, not to a URLContext, although the FetchInfo
+	// is related to a URLContext (holds a ctx field).
 	ComputeDelay(string, *DelayInfo, *FetchInfo) time.Duration
-	Fetch(*Context, string, bool) (*http.Response, error)
-	RequestGet(*Context, *http.Response) bool
-	RequestRobots(*Context, string) (bool, []byte)
-	FetchedRobots(*Context, *http.Response)
-
-	Filter(*Context, bool) (bool, int, HeadRequestMode)
-	Enqueued(*Context)
-	Visit(*Context, *http.Response, *goquery.Document) (map[*url.URL]interface{}, bool)
-	Visited(*Context, []*url.URL)
-	Disallowed(*Context)
-}
-
-// The error type returned when a redirection is requested, so that the
-// worker knows that this is not an actual Fetch error, but a request to
-// enqueue the redirect-to URL.
-type EnqueueRedirectError struct {
-	msg string
-}
+	// All other extender methods are executed in the context of a URL, and thus
+	// receive a URLContext struct as first argument.
+	Fetch(*URLContext, string, bool) (*http.Response, error)
+	RequestGet(*URLContext, *http.Response) bool
+	RequestRobots(*URLContext, string) ([]byte, bool)
+	FetchedRobots(*URLContext, *http.Response)
 
-// Implement the error interface
-func (this *EnqueueRedirectError) Error() string {
-	return this.msg
+	// TODO : Does it make sense to receive the priority here? Or set it on enqueue?
+	// Or can it be overridden here, but set on Visit() or when enqueuing via the chan?
+	// Better (?): set priority in the URLContext, can be changed/set in any call, and
+	// actually used only when enqueuing into the crawler enqueue channel. Same for head request mode?
+	Filter(*URLContext, bool) bool
+	Enqueued(*URLContext)
+	Visit(*URLContext, *http.Response, *goquery.Document) (interface{}, bool)
+	Visited(*URLContext, interface{})
+	Disallowed(*URLContext)
 }
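
With the new signatures, a user extender can embed DefaultExtender and override only what it needs; a sketch (myExtender and the host restriction are illustrative, and ctx.URL is assumed to expose the parsed URL, as the refactored Fetch below suggests):

```go
// Hypothetical extender: embed DefaultExtender for the default behavior
// and override Filter with the new single-bool signature.
type myExtender struct {
	DefaultExtender
}

func (this *myExtender) Filter(ctx *URLContext, isVisited bool) bool {
	// Enqueue unvisited URLs on one host only (arbitrary example rule).
	return !isVisited && ctx.URL.Host == "example.com"
}
```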

// The default HTTP client used by DefaultExtender's fetch requests (this is thread-safe).
@@ -133,7 +73,7 @@ var HttpClient = &http.Client{CheckRedirect: func(req *http.Request, via []*http
// For all other URLs, do NOT follow redirections, the default Fetch() implementation
// will ask the worker to enqueue the new (redirect-to) URL. Returning an error
// will make httpClient.Do() return a url.Error, with the URL field containing the new URL.
-	return &EnqueueRedirectError{"redirection not followed"}
+	return EnqueueRedirectError
}}
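
http.Client.Do wraps any CheckRedirect error in a *url.Error, so a caller can compare the wrapped error against the sentinel; a sketch of that detection (the surrounding worker code is assumed, not shown in this diff):

```go
// Sketch: telling the redirect sentinel apart from a real fetch error.
res, err := HttpClient.Do(req)
if err != nil {
	if ue, ok := err.(*url.Error); ok && ue.Err == EnqueueRedirectError {
		// Not a failure: ue.URL holds the redirect target to enqueue.
	} else {
		// A genuine fetch error.
	}
}
```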

// Default working implementation of an extender.
@@ -143,7 +83,7 @@ type DefaultExtender struct {

// Return the same seeds as those received (those that were passed
// to Run() initially).
-func (this *DefaultExtender) Start(seeds []string) []string {
+func (this *DefaultExtender) Start(seeds interface{}) interface{} {
return seeds
}

@@ -193,7 +133,7 @@ func (this *DefaultExtender) ComputeDelay(host string, di *DelayInfo, lastFetch
// while processing the original URL, so that it knows that there is no more
// redirection HTTP code, and another time when the actual destination URL is
// fetched to be visited).
-func (this *DefaultExtender) Fetch(u *url.URL, userAgent string, headRequest bool) (*http.Response, error) {
+func (this *DefaultExtender) Fetch(ctx *URLContext, userAgent string, headRequest bool) (*http.Response, error) {
var reqType string

// Prepare the request with the right user agent
@@ -202,43 +142,43 @@ func (this *DefaultExtender) Fetch(u *url.URL, userAgent string, headRequest boo
} else {
reqType = "GET"
}
-	req, e := http.NewRequest(reqType, u.String(), nil)
+	req, e := http.NewRequest(reqType, ctx.URL.String(), nil)
if e != nil {
return nil, e
}
-	req.Header["User-Agent"] = []string{userAgent}
+	req.Header.Set("User-Agent", userAgent)
return HttpClient.Do(req)
}

// Ask the worker to actually request the URL's body (issue a GET), unless
// the status code is not 2xx.
-func (this *DefaultExtender) RequestGet(headRes *http.Response) bool {
+func (this *DefaultExtender) RequestGet(ctx *URLContext, headRes *http.Response) bool {
return headRes.StatusCode >= 200 && headRes.StatusCode < 300
}

// Ask the worker to actually request (fetch) the Robots.txt document.
-func (this *DefaultExtender) RequestRobots(u *url.URL, robotAgent string) (request bool, data []byte) {
-	return true, nil
+func (this *DefaultExtender) RequestRobots(ctx *URLContext, robotAgent string) (data []byte, doRequest bool) {
+	return nil, true
}

// FetchedRobots is a no-op.
-func (this *DefaultExtender) FetchedRobots(res *http.Response) {}
+func (this *DefaultExtender) FetchedRobots(ctx *URLContext, res *http.Response) {}

// Enqueue the URL if it hasn't been visited yet.
-func (this *DefaultExtender) Filter(u *url.URL, from *url.URL, isVisited bool, origin EnqueueOrigin) (enqueue bool, priority int, headRequest HeadRequestMode) {
-	return !isVisited, 0, HrmDefault
+func (this *DefaultExtender) Filter(ctx *URLContext, isVisited bool) bool {
+	return !isVisited
}

// Enqueued is a no-op.
-func (this *DefaultExtender) Enqueued(u *url.URL, from *url.URL) {}
+func (this *DefaultExtender) Enqueued(ctx *URLContext) {}

// Ask the worker to harvest the links in this page.
-func (this *DefaultExtender) Visit(res *http.Response, doc *goquery.Document) (harvested []*url.URL, findLinks bool) {
+func (this *DefaultExtender) Visit(ctx *URLContext, res *http.Response, doc *goquery.Document) (harvested interface{}, findLinks bool) {
return nil, true
}

// Visited is a no-op.
-func (this *DefaultExtender) Visited(u *url.URL, harvested []*url.URL) {}
+func (this *DefaultExtender) Visited(ctx *URLContext, harvested interface{}) {}

// Disallowed is a no-op.
-func (this *DefaultExtender) Disallowed(u *url.URL) {}
+func (this *DefaultExtender) Disallowed(ctx *URLContext) {}
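
Taken together, the refactored API would be driven roughly as below; NewOptions appears in crawl_test.go above, while the crawler constructor and the Run signature are assumed to keep their pre-refactor shape:

```go
// Hypothetical end-to-end usage of the refactored API.
func main() {
	ext := new(myExtender) // embeds DefaultExtender, see the sketch above
	opts := NewOptions(ext)
	opts.SameHostOnly = true
	c := NewCrawlerWithOptions(opts) // constructor name assumed
	c.Run("http://example.com/")     // seeds flow through Start(interface{})
}
```
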
6 changes: 3 additions & 3 deletions popchannel.go
@@ -2,19 +2,19 @@ package gocrawl

// The pop channel is a stacked channel used by workers to pop the next URL(s)
// to process.
-type popChannel chan []*workerCommand
+type popChannel chan []*URLContext

// Constructor to create and initialize a popChannel
func newPopChannel() popChannel {
// The pop channel is stacked, so only a buffer of 1 is required
// see http://gowithconfidence.tumblr.com/post/31426832143/stacked-channels
-	return make(chan []*workerCommand, 1)
+	return make(chan []*URLContext, 1)
}

// The stack function ensures the specified URLs are added to the pop channel
// with minimal blocking (since the channel is stacked, it is virtually equivalent
// to an infinitely buffered channel).
-func (this popChannel) stack(cmd ...*workerCommand) {
+func (this popChannel) stack(cmd ...*URLContext) {
toStack := cmd
for {
select {
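
The stack body is cut off by the collapsed region above; per the linked stacked-channels post, the likely remainder pops any slice already in the buffer, merges it with the new one, and retries. A sketch of the presumed full method:

```go
// Presumed full stack method (the diff above is truncated): try to push;
// if the single buffer slot is occupied, take the existing slice out,
// append the new contexts to it, and loop to push the merged slice.
func (this popChannel) stack(cmd ...*URLContext) {
	toStack := cmd
	for {
		select {
		case this <- toStack:
			return
		case old := <-this:
			toStack = append(old, toStack...)
		}
	}
}
```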
