Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
In the middle of a major refactor; working on URLContext.
- Loading branch information
Showing
9 changed files
with
255 additions
and
140 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
package gocrawl | ||
|
||
// EndReason is an enum indicating the reason why the crawling ended.
type EndReason uint8

// The EndReason values. They are numbered sequentially from zero via iota,
// so the declaration order is significant and must not be changed.
const (
	ErDone EndReason = iota
	ErMaxVisits
	ErError
)
|
||
// CrawlErrorKind is an enum indicating the kind of the crawling error.
type CrawlErrorKind uint8

// The CrawlErrorKind values. They are numbered sequentially from zero via
// iota, so the declaration order is significant and must not be changed.
// The identifiers are kept as-is (including the Http/Url spelling) so that
// existing callers keep compiling.
const (
	CekFetch CrawlErrorKind = iota
	CekParseRobots
	CekHttpStatusCode
	CekReadBody
	CekParseBody
	CekParseSeed
	CekParseNormalizedSeed
	CekProcessLinks
	CekParseRedirectUrl
)
|
||
// HeadRequestMode is an enum indicating the head request override mode (the
// default mode is specified in the Options of the crawler).
type HeadRequestMode uint8

// The HeadRequestMode values. They are numbered sequentially from zero via
// iota, which makes HrmDefault the zero value of the type.
const (
	HrmDefault HeadRequestMode = iota
	HrmRequest
	HrmIgnore
)
|
||
// EnqueueOrigin is an enum indicating to the crawler and the Filter extender
// function the origin of the URL to crawl.
type EnqueueOrigin int

// The predefined EnqueueOrigin values, numbered sequentially from zero via
// iota. EoCustomStart must remain the last entry, since custom origins are
// expected to be numbered from it upwards.
const (
	EoSeed        EnqueueOrigin = iota // Seed URLs have this source
	EoHarvest                          // URLs harvested from a visit to a page have this source
	EoRedirect                         // URLs enqueued from a fetch redirection have this source by default
	EoError                            // URLs enqueued after an error
	EoCustomStart                      // Custom EnqueueOrigins should start at this value instead of iota
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
package gocrawl | ||
|
||
import ( | ||
"errors" | ||
) | ||
|
||
// Crawl error information. | ||
type CrawlError struct { | ||
Ctx *URLContext | ||
Err error | ||
Kind CrawlErrorKind | ||
msg string | ||
} | ||
|
||
var (
	// EnqueueRedirectError is the error returned when a redirection is
	// requested, so that the worker knows that this is not an actual Fetch
	// error, but a request to enqueue the redirect-to URL.
	//
	// NOTE(review): the idiomatic Go name would be ErrEnqueueRedirect; the
	// current name is kept so existing callers are not broken.
	EnqueueRedirectError = errors.New("redirection not followed")
)
|
||
// Implementation of the error interface. | ||
func (this CrawlError) Error() string { | ||
if this.Err != nil { | ||
return this.Err.Error() | ||
} | ||
return this.msg | ||
} | ||
|
||
// Create a new CrawlError based on a source error. | ||
func newCrawlError(ctx *URLContext, e error, kind CrawlErrorKind) *CrawlError { | ||
return &CrawlError{ctx, e, kind, ""} | ||
} | ||
|
||
// Create a new CrawlError with the specified message. | ||
func newCrawlErrorMessage(ctx *URLContext, msg string, kind CrawlErrorKind) *CrawlError { | ||
return &CrawlError{ctx, nil, kind, msg} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.