/
watchdog.go
177 lines (152 loc) · 5.17 KB
/
watchdog.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
package main
import (
"errors"
"fmt"
"io"
"net"
"net/http"
"net/url"
"os"
"time"
)
/*
This module is here because we've had issues with proxy-chaining when
the child-parent link is using HTTP2 (ie: child with HTTPS
proxy_chain_child_url setting).
A watchdog is now set (the parent watches its child) in this exact situation.
After a while (e.g. ~10 days) the multiplexed HTTP2 connection between
the parent and one child may hang, as the parent becomes unable to
read child responses. The parent is then unable to serve ANY request for
this child (as all requests are multiplexed in the same HTTP2 link).
This is caused by deadlock in x/net/http2, see related golang issues:
* https://github.com/golang/go/issues/32388
* https://github.com/golang/go/issues/33425
* https://github.com/golang/go/issues/39812
The x/net/http2 API does not provide access to the underlying connections,
so we have no clean way to kill the faulty HTTP2 link.
The best (current) way for us to mitigate this situation is to detect
this deadlock ("child is responding [TCP] but not with HTTP2") and kill
the whole process. Systemd (or whatever service manager) will instantly
restart the proxy, causing a short downtime every two weeks or so. Better
than a full lock of a child :(
Since then a new symptom appeared, where the front itself is fully locked.
(dump is full of Lock() calls). We extend the watchdog to also check
the front itself.
*/
// dialTest attempts a TCP connection to the host in urlIn, using the
// default HTTP/HTTPS port number when the URL does not carry an explicit
// port. It returns nil if the connection succeeded (the probe connection
// is closed immediately), or the dial/parse error otherwise.
func dialTest(urlIn string, timeout time.Duration) error {
	urlObj, err := url.Parse(urlIn)
	if err != nil {
		return err
	}

	// url.URL already knows how to split host and port (including IPv6
	// bracket handling), no need for a manual net.SplitHostPort call.
	host := urlObj.Hostname()
	port := urlObj.Port()
	if port == "" {
		// port is implicit: derive it from the scheme
		port = "80"
		if urlObj.Scheme == ProtoHTTPS {
			port = "443"
		}
	}

	conn, err := net.DialTimeout("tcp", net.JoinHostPort(host, port), timeout)
	if err != nil {
		return err
	}
	if conn == nil {
		// defensive only: DialTimeout documents a non-nil Conn on nil error
		return errors.New("no error, but no connection either")
	}
	conn.Close()
	return nil
}
// watchProxy checks one proxy URL: first that its TCP port answers at all,
// then that a fresh HTTP(S) request gets a response within 10 seconds.
// It returns a non-nil error ONLY for the deadlock symptom we watch for
// ("TCP up, but HTTP2 request or body drain times out"); every other
// failure is logged and nil is returned, as it does not indicate the
// HTTP2 deadlock this watchdog exists to detect.
func watchProxy(url string, log *Log) error {
	// check if we get a TCP connection to this proxy
	err := dialTest(url, 5*time.Second)
	if err != nil {
		log.Errorf("watchdog: proxy %s seems down (%s)", url, err)
		return nil
	}

	// -- proxy seems up, so let's see if we can get an HTTP2 response
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		log.Errorf("watchdog: %s", err.Error())
		return nil
	}
	req.Header.Set(WatchDogHeaderName, "true")

	http2Client := &http.Client{
		// create our own connection (never reuse a possibly-deadlocked one)
		Transport: &http.Transport{
			DisableKeepAlives: true,
		},
		// up to a few times per day, answer needs ~5.01s (for unknown reasons yet)
		// so we set a 10s timeout
		Timeout: 10 * time.Second,
	}

	start := time.Now()
	response2, err := http2Client.Do(req)
	if err != nil {
		// use a distinct name: shadowing err with the assertion result is confusing
		if netErr, ok := err.(net.Error); ok && netErr.Timeout() { // is timeout?
			// OK, we have a REAL problem with this proxy, here!
			log.Infof("watchdog: HTTP2 timeout (%s) for %s", time.Since(start), url)
			return err
		}
		log.Errorf("watchdog: HTTP2: %s (%s)", err.Error(), url) // another error
		return nil
	}
	defer response2.Body.Close()

	// (might remove this trace in the future, it's here to investigate 5s timeouts)
	log.Tracef("watchdog: HTTP2 response for %s in %s", url, time.Since(start))

	// drain response
	_, err = io.ReadAll(response2.Body)
	if err != nil {
		// we've also seen timeout related issues* thru this drain, so we consider this as a fatal failure
		// * "context deadline exceeded (Client.Timeout or context cancellation while reading body)"
		// wrap with %w so callers could inspect the cause with errors.Is/As
		return fmt.Errorf("HTTP2 drain: %w", err)
	}

	// everything is OK for this proxy
	return nil
}
// watchProxies runs one watchdog pass: it probes every HTTPS child proxy
// and (when our own front is HTTPS) ourself. On any detected HTTP2
// deadlock it exits the whole process with status 200 so the service
// manager restarts us — see the module comment for why.
func watchProxies(ddb *DomainDatabase, selfUrl url.URL, log *Log) {
	// find all children proxies configured with HTTPS
	children := ddb.GetChildren()
	log.Tracef("watchdog: checking children (%d) + ourself", len(children))

	for _, childURL := range children {
		urlObj, err := url.Parse(childURL)
		if err != nil {
			// previously ignored: a malformed child URL would have
			// nil-dereferenced urlObj and crashed the watchdog goroutine
			log.Errorf("watchdog: invalid child URL %s (%s)", childURL, err)
			continue
		}
		if urlObj.Scheme == ProtoHTTPS {
			err := watchProxy(childURL, log)
			if err != nil {
				log.Errorf("watchdog: %s", err.Error())
				log.Errorf("FATAL: watchdog unable to contact child using HTTP2 while child seems up, possible chain deadlock! Exiting process for a forced restart.")
				os.Exit(200)
			}
		}
	}

	if selfUrl.Scheme == ProtoHTTPS {
		// remove port from selfUrl (we want to contact our proxy server, not the API)
		selfUrl.Host = selfUrl.Hostname()
		err := watchProxy(selfUrl.String(), log)
		if err != nil {
			log.Error(err.Error())
			log.Errorf("FATAL: watchdog unable to contact ourself using HTTP2, possible deadlock! Exiting process for a forced restart.")
			os.Exit(200)
		}
	}

	log.Trace("watchdog: end")
}
// InstallWatchdog starts the background watchdog that checks HTTP2 links
// to our children (as a parent proxy) and to ourself
// (we may increase frequency later).
func InstallWatchdog(ddb *DomainDatabase, selfUrl *url.URL, log *Log) {
	// delay between two passes — a pass itself may take a while, so this
	// is a pause after each run, not a fixed schedule
	const watchdogDelay = 30 * time.Second

	loop := func() {
		for {
			time.Sleep(watchdogDelay)
			watchProxies(ddb, *selfUrl, log)
		}
	}
	go loop()
}