Skip to content

Commit

Permalink
- Fix batch handler
Browse files Browse the repository at this point in the history
- Support both form and JSON input for extract and feed endpoints
- Remove URL fragment when cleaning URLs
- Add some debug logging to help diagnose encoding issues with urls in JSON responses
  • Loading branch information
efixler committed Mar 13, 2024
1 parent 030a40d commit 32261b3
Show file tree
Hide file tree
Showing 10 changed files with 315 additions and 133 deletions.
5 changes: 2 additions & 3 deletions README.md
Expand Up @@ -257,7 +257,7 @@ there's an error fetching or parsing the requested content.
In all other cases, requests should return a 200 status code, and any errors received when fetching a resource
will be included in the returned JSON payload.

#### feed [GET]
#### feed [GET, POST]

Feed parses an RSS or Atom feed and returns the parsed results for each of the item links in the feed.

Expand All @@ -283,7 +283,7 @@ These params work for any endpoint

### Healthchecks

`scrape` currently supports two healthchecks
`scrape` has two healthchecks:

#### /.well-known/health

Expand Down Expand Up @@ -376,7 +376,6 @@ The `docker-run` make target will mount a local folder called `docker/data` and
- Outbound request pacing
- Expose outbound request options (headers, timeouts, etc)
- Headless fallback for pages that require JavaScript
- Update ServeMux usage for 1.22
- Explore performance optimizations, e.g.
- Batch request parallelization
- zstd compression for stored resources
Expand Down
61 changes: 43 additions & 18 deletions internal/server/middleware.go
Expand Up @@ -9,6 +9,7 @@ import (
"log/slog"
"net/http"
nurl "net/url"
"strings"
)

type middleware func(http.HandlerFunc) http.HandlerFunc
Expand All @@ -29,34 +30,58 @@ func Chain(h http.HandlerFunc, m ...middleware) http.HandlerFunc {
// MaxBytes returns a middleware that limits how much of the request body can
// be read to n bytes by wrapping r.Body in http.MaxBytesReader before invoking
// the next handler.
//
// NOTE(review): the body is wrapped unconditionally and then wrapped again for
// non-GET requests; this looks like the pre- and post-change lines of a diff
// shown together — confirm which of the two is intended.
func MaxBytes(n int64) middleware {
return func(next http.HandlerFunc) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
r.Body = http.MaxBytesReader(w, r.Body, n)
if r.Method != http.MethodGet {
r.Body = http.MaxBytesReader(w, r.Body, n)
}
next(w, r)
}
}
}

// isJSON reports whether the request body should be decoded as JSON.
//
// Anything that's not a GET and not a URL-encoded form is assumed to be JSON.
// This is imperfect, but it allows for requests that don't send a Content-Type
// header or inadvertently use text/plain.
func isJSON(r *http.Request) bool {
	if r.Method == http.MethodGet {
		return false
	}
	// Media types are case-insensitive and may carry parameters such as
	// "; charset=utf-8", so compare only the normalized type/subtype.
	mediaType, _, _ := strings.Cut(r.Header.Get("Content-Type"), ";")
	mediaType = strings.ToLower(strings.TrimSpace(mediaType))
	return mediaType != "application/x-www-form-urlencoded"
}

func ParseSingle() middleware {
return func(next http.HandlerFunc) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
url := r.FormValue("url")
if url == "" {
w.WriteHeader(http.StatusBadRequest)
w.Write([]byte("No URL provided"))
return
}
netUrl, err := nurl.ParseRequestURI(url)
if err != nil {
w.WriteHeader(http.StatusBadRequest)
w.Write([]byte(fmt.Sprintf("Invalid URL provided: %q, %s", url, err)))
return
}
slog.Debug("ParseSingle", "url", netUrl, "params", netUrl.Query(), "encoding", r.Header.Get("Content-Type"))
pp := r.FormValue("pp") == "1"
v := &singleRequest{
URL: netUrl,
PrettyPrint: pp,
v := new(singleURLRequest)
if isJSON(r) {
decoder := json.NewDecoder(r.Body)
decoder.DisallowUnknownFields()
err := decoder.Decode(v)
if !assertDecode(err, w) {
return
}
} else {
url := r.FormValue("url")
if url == "" {
w.WriteHeader(http.StatusBadRequest)
w.Write([]byte("No URL provided"))
return
}
netUrl, err := nurl.Parse(url)
if err != nil {
w.WriteHeader(http.StatusBadRequest)
w.Write([]byte(fmt.Sprintf("Invalid URL provided: %q, %s", url, err)))
return
}
v.URL = netUrl
}
if pp {
v.PrettyPrint = true
}
slog.Debug("ParseSingle", "url", v.URL, "pp", v.PrettyPrint, "encoding", r.Header.Get("Content-Type"))
r = r.WithContext(context.WithValue(r.Context(), payloadKey{}, v))
next(w, r)
}
Expand Down
75 changes: 73 additions & 2 deletions internal/server/middleware_test.go
Expand Up @@ -106,7 +106,7 @@ func Test413OnDecodeJSONBody(t *testing.T) {
}
}

func TestParseSingle(t *testing.T) {
func TestParseSingleGet(t *testing.T) {
t.Parallel()
tests := []struct {
name string
Expand All @@ -129,7 +129,7 @@ func TestParseSingle(t *testing.T) {
recorder := httptest.NewRecorder()
m := ParseSingle()
m(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
pp, ok := r.Context().Value(payloadKey{}).(*singleRequest)
pp, ok := r.Context().Value(payloadKey{}).(*singleURLRequest)
if !ok {
t.Fatalf("[%s] ParseSingle, expected payload, got %v", tt.name, pp)
}
Expand All @@ -143,3 +143,74 @@ func TestParseSingle(t *testing.T) {
}
}
}

func TestParseSingleJSON(t *testing.T) {
t.Parallel()
tests := []struct {
name string
body string
expectStatus int
}{
{
name: "valid",
body: `{"url":"http://example.com"}`,
expectStatus: 200,
},
{
name: "invalid",
body: `{"urls":["http://example.com"]}`,
expectStatus: 400,
},
}
for _, tt := range tests {
req := httptest.NewRequest("POST", "http://example.com", strings.NewReader(tt.body))
recorder := httptest.NewRecorder()
m := ParseSingle()
m(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
pp, ok := r.Context().Value(payloadKey{}).(*singleURLRequest)
if !ok {
t.Fatalf("[%s] ParseSingle, expected payload, got %v", tt.name, pp)
}
}))(recorder, req)
response := recorder.Result()
if response.StatusCode != tt.expectStatus {
t.Fatalf("[%s] ParseSingle, expected status %d, got %d", tt.name, tt.expectStatus, response.StatusCode)
}
}
}

func TestParseSinglePostForm(t *testing.T) {
t.Parallel()
tests := []struct {
name string
body string
expectStatus int
}{
{
name: "valid",
body: "url=http://example.com",
expectStatus: 200,
},
{
name: "invalid",
body: "urls=http://example.com",
expectStatus: 400,
},
}
for _, tt := range tests {
req := httptest.NewRequest("POST", "http://example.com", strings.NewReader(tt.body))
req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
recorder := httptest.NewRecorder()
m := ParseSingle()
m(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
pp, ok := r.Context().Value(payloadKey{}).(*singleURLRequest)
if !ok {
t.Fatalf("[%s] ParseSingle, expected payload, got %v", tt.name, pp)
}
}))(recorder, req)
response := recorder.Result()
if response.StatusCode != tt.expectStatus {
t.Fatalf("[%s] ParseSingle, expected status %d, got %d", tt.name, tt.expectStatus, response.StatusCode)
}
}
}
2 changes: 1 addition & 1 deletion internal/server/pages/index.html
Expand Up @@ -16,7 +16,7 @@
</head>
<body>
<p>
<form id="urlForm" action="/extract" method="GET" name="scrape" target="data_frame">
<form id="urlForm" action="/extract" method="POST" name="scrape" target="data_frame">
<label for="url">Enter a URL:</label>
<input type="submit" value="Hit It">
<input type="url" name="url" id="url" value="https://" size="96" maxlength="200" pattern="https?://.*" required title="URL">
Expand Down
42 changes: 42 additions & 0 deletions internal/server/payloads.go
@@ -0,0 +1,42 @@
package server

import (
"encoding/json"
"errors"
nurl "net/url"
)

// BatchRequest is a JSON payload carrying a list of URL strings under the
// "urls" key. The strings are stored as received; nothing in this type parses
// or validates them.
type BatchRequest struct {
Urls []string `json:"urls"`
}

// singleURLRequest is the payload for requests that carry a single URL.
type singleURLRequest struct {
	// URL is the target URL; when decoded from JSON it is always absolute.
	URL *nurl.URL `json:"url"`
	// PrettyPrint is read from the optional "pp" JSON field.
	PrettyPrint bool `json:"pp,omitempty"`
}

// errNoURL is returned by UnmarshalJSON when the "url" field is missing or empty.
var errNoURL = errors.New("URL is required")

// UnmarshalJSON decodes a JSON object whose "url" field is a string into a
// parsed, absolute URL.
//
// It returns errNoURL when "url" is missing or empty, the net/url parse error
// when the string cannot be parsed, and an error when the URL is not absolute.
// The receiver is only modified when decoding succeeds, so a failed decode
// never leaves a half-populated request behind.
func (sur *singleURLRequest) UnmarshalJSON(b []byte) error {
	// alias has the same fields as singleURLRequest but none of its methods,
	// so decoding into it does not recurse back into this UnmarshalJSON.
	type alias singleURLRequest
	// Decode on top of a scratch copy rather than the receiver itself.
	decoded := alias(*sur)
	raw := struct {
		// URL shadows alias.URL so "url" is first read as a plain string.
		URL string `json:"url"`
		*alias
	}{alias: &decoded}
	if err := json.Unmarshal(b, &raw); err != nil {
		return err
	}
	if raw.URL == "" {
		return errNoURL
	}
	parsed, err := nurl.Parse(raw.URL)
	if err != nil {
		return err
	}
	if !parsed.IsAbs() {
		return errors.New("URL must be absolute")
	}
	decoded.URL = parsed
	*sur = singleURLRequest(decoded)
	return nil
}
75 changes: 75 additions & 0 deletions internal/server/payloads_test.go
@@ -0,0 +1,75 @@
package server

import (
"encoding/json"
"strings"
"testing"
)

// TestUnmarshalSingleUrlRequest decodes each sample body twice: once by calling
// singleURLRequest.UnmarshalJSON directly and once through a json.Decoder with
// DisallowUnknownFields enabled. Both paths must agree on whether an error
// occurs, on the resulting URL string, and on the PrettyPrint flag.
func TestUnmarshalSingleUrlRequest(t *testing.T) {
	t.Parallel()
	type testCase struct {
		name            string
		body            string
		expectURLString string
		expectPP        bool
		expectErr       bool
	}
	cases := []testCase{
		{name: "valid", body: `{"url":"http://example.com"}`, expectURLString: "http://example.com"},
		{name: "missing url", body: `{"urls":["http://example.com"]}`, expectErr: true},
		{name: "non-absolute url", body: `{"url":"example/foo"}`, expectErr: true},
		{name: "url with fragment", body: `{"url":"http://example.com#fragment"}`, expectURLString: "http://example.com#fragment"},
	}
	for _, tc := range cases {
		// The URL is only compared when the case names an expected value.
		checkURL := tc.expectURLString != ""

		// First pass: call the method directly.
		direct := singleURLRequest{}
		err := direct.UnmarshalJSON([]byte(tc.body))
		if gotErr := err != nil; gotErr != tc.expectErr {
			t.Fatalf("[%s] UnmarshalSingleUrlRequest, expected error %v, got %v", tc.name, tc.expectErr, err)
		}
		if checkURL && direct.URL.String() != tc.expectURLString {
			t.Errorf("[%s] UnmarshalSingleUrlRequest, expected URL %s, got %s", tc.name, tc.expectURLString, direct.URL.String())
		}
		if direct.PrettyPrint != tc.expectPP {
			t.Errorf("[%s] UnmarshalSingleUrlRequest, expected PrettyPrint %v, got %v", tc.name, tc.expectPP, direct.PrettyPrint)
		}

		// Second pass: the same body through a strict json.Decoder.
		dec := json.NewDecoder(strings.NewReader(tc.body))
		dec.DisallowUnknownFields()
		streamed := &singleURLRequest{}
		err = dec.Decode(streamed)
		if gotErr := err != nil; gotErr != tc.expectErr {
			t.Fatalf("[%s] json.Decoder.Decode, expected error %v, got %v", tc.name, tc.expectErr, err)
		}
		if checkURL && streamed.URL.String() != tc.expectURLString {
			t.Errorf("[%s] json.Decoder.Decode, expected URL %s, got %s", tc.name, tc.expectURLString, streamed.URL.String())
		}
		if streamed.PrettyPrint != tc.expectPP {
			t.Errorf("[%s] json.Decoder.Decode, expected PrettyPrint %v, got %v", tc.name, tc.expectPP, streamed.PrettyPrint)
		}
	}
}

0 comments on commit 32261b3

Please sign in to comment.