-
Notifications
You must be signed in to change notification settings - Fork 5.5k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Trainer library discover master by etcd #2551
Changes from 5 commits
bff8d8d
d80bd05
475f77d
c1cfcdc
3c88fd2
cfeac09
ff3324c
83e7e45
77cbc55
f691bf6
e6492ff
ca6a114
f5ed683
0792515
d4c06ed
3b02067
0d8a0ed
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
CMakeCache.txt | ||
cmake_install.cmake | ||
CMakeFiles | ||
go |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -54,6 +54,13 @@ func (a addresser) Address() string { | |
return string(a) | ||
} | ||
|
||
//export paddle_new_etcd_master_client | ||
func paddle_new_etcd_master_client(etcdEndpoints *C.char, bufSize int) C.paddle_master_client { | ||
p := C.GoString(etcdEndpoints) | ||
c := master.NewEtcdClient(addresser(p), bufSize) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Maybe just There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. |
||
return add(c) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 感觉把 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. |
||
} | ||
|
||
//export paddle_new_master_client | ||
func paddle_new_master_client(addr *C.char, bufSize int) C.paddle_master_client { | ||
a := C.GoString(addr) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,14 +1,19 @@ | ||
package master | ||
|
||
import ( | ||
"context" | ||
"os" | ||
"strings" | ||
"time" | ||
|
||
"github.com/PaddlePaddle/Paddle/go/connection" | ||
"github.com/PaddlePaddle/recordio" | ||
"github.com/coreos/etcd/clientv3" | ||
log "github.com/sirupsen/logrus" | ||
) | ||
|
||
const masterAddrPath = "/master" | ||
|
||
// Addresser provide the address of the master server. | ||
type Addresser interface { | ||
Address() string | ||
|
@@ -20,6 +25,51 @@ type Client struct { | |
ch chan []byte | ||
} | ||
|
||
// MasterAddresser provide master address | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Comment function name must be the same to function name There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. |
||
type masterAddresser struct { | ||
client *clientv3.Client | ||
timeout time.Duration | ||
endpoints []string | ||
} | ||
|
||
// Address return the address | ||
func (m masterAddresser) Address() string { | ||
for { | ||
ctx, cancel := context.WithTimeout(context.Background(), m.timeout) | ||
resp, err := m.client.Get(ctx, masterAddrPath) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Clients should use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I do not think There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I originally thought client can poll etcd to get the latest master address. But I think @typhoonzero 's idea is better. func NewClient(addr Addresser, bufSize int) To: func NewClient(addrCh <-chan string, bufSize int) And maybe change moniterMaster to: func (c *Client) monitorMaster(addrCh <-chan string) {
lastMaster := ""
for curMaster := range addrCh {
// connect to the new address once address changed.
if curMaster != lastMaster {
if curMaster == "" {
err := c.conn.Close()
if err != nil {
log.Errorln(err)
}
} else {
err := c.conn.Connect(curMaster)
if err != nil {
log.Errorln(err)
// connect to addr failed, set
// to last known addr in order
// to retry next time.
curMaster = lastMaster
}
}
}
lastMaster = curMaster
}
} There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @Yancey1989 Thanks! type Store interface {
Save([]byte) error
Load() ([]byte, error)
} We have an implementation of |
||
cancel() | ||
if err != nil { | ||
log.Errorf("Fetch master addr failed, reconnecting to etcd, %v", err) | ||
err := m.client.Close() | ||
if err != nil { | ||
log.Errorln(err) | ||
time.Sleep(m.timeout) | ||
continue | ||
} | ||
// reconnect to etcd server | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It seem that etcd have a some kind of retry method for |
||
m.client, err = clientv3.New(clientv3.Config{ | ||
Endpoints: m.endpoints, | ||
DialTimeout: m.timeout, | ||
}) | ||
if err != nil { | ||
log.Errorf("Reconnecting etcd failed, sleep for %d seconds ...\n%v", m.timeout, err) | ||
time.Sleep(m.timeout) | ||
continue | ||
} | ||
continue | ||
} | ||
kvs := resp.Kvs | ||
if len(kvs) == 0 { | ||
log.Infoln("Waiting for master be ready ...") | ||
time.Sleep(m.timeout) | ||
continue | ||
} | ||
mAddr := kvs[0].Value | ||
log.Debugf("Fetched master address: %s\n", mAddr) | ||
return string(mAddr) | ||
} | ||
} | ||
|
||
// NewClient creates a new Client. | ||
// | ||
// bufSize is the record buffer size. NextRecord will read from this | ||
|
@@ -33,6 +83,32 @@ func NewClient(addr Addresser, bufSize int) *Client { | |
return c | ||
} | ||
|
||
// NewEtcdClient create a new master client by etcd | ||
// | ||
// endpoints is the endpoints for etcd and separated by ",", such as | ||
// "172.0.1.0:2379,172.0.1.1:2379" | ||
// timeout is the timeout for etcd calls | ||
// bufSize is the record buffer size. NextRecord will read from this buffer. | ||
func NewEtcdClient(endpoints string, timeout int, bufSize int) *Client { | ||
t := time.Second * time.Duration(timeout) | ||
ep := strings.Split(endpoints, ",") | ||
cli, err := clientv3.New(clientv3.Config{ | ||
Endpoints: ep, | ||
DialTimeout: t, | ||
}) | ||
if err != nil { | ||
log.Errorf("Init etcd connection failed: %v", err) | ||
panic(err) | ||
} | ||
log.Debugf("Connected to etcd: %s\n", endpoints) | ||
mAddresser := masterAddresser{ | ||
client: cli, | ||
timeout: t, | ||
endpoints: ep, | ||
} | ||
return NewClient(mAddresser, bufSize) | ||
} | ||
|
||
func (c *Client) getRecords() { | ||
for { | ||
t, err := c.getTask() | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
do we still need this files? Since the build result is now moved to build dir
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, done.