forked from dragonflyoss/Dragonfly2
/
training.go
98 lines (82 loc) · 2.63 KB
/
training.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
/*
* Copyright 2023 The Dragonfly Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package training
import (
"context"
"golang.org/x/sync/errgroup"
logger "github.com/XDTD/Dragonfly2/internal/dflog"
managerclient "github.com/XDTD/Dragonfly2/pkg/rpc/manager/client"
"github.com/XDTD/Dragonfly2/trainer/config"
"github.com/XDTD/Dragonfly2/trainer/storage"
)
//go:generate mockgen -destination mocks/training_mock.go -source training.go -package mocks
// Training is the contract for components that train the GNN and MLP models.
type Training interface {
	// Train starts training of the GNN and MLP models for the given ip and
	// hostname and reports a non-nil error when training does not succeed.
	Train(ctx context.Context, ip, hostname string) error
}
// training implements the Training interface. Its fields are the
// dependencies handed to New.
type training struct {
// config is the trainer service configuration.
config *config.Config
// storage is the storage backend passed to New; per the step comments in
// trainGNN and trainMLP, training data is intended to be read from it.
storage storage.Storage
// managerClient is the manager service client.
managerClient managerclient.V2
}
// New builds a Training from the trainer config, the manager service client
// and the storage backend. The arguments are stored as given; no validation
// is performed on them.
func New(cfg *config.Config, managerClient managerclient.V2, storage storage.Storage) Training {
	t := new(training)
	t.config = cfg
	t.managerClient = managerClient
	t.storage = storage
	return t
}
// Train runs GNN training and MLP training as two tasks in one errgroup and
// blocks until both have returned. If either task fails, the error reported
// by the group is logged and returned to the caller; otherwise Train returns
// nil.
func (t *training) Train(ctx context.Context, ip, hostname string) error {
	// Both tasks receive the group's derived context rather than the caller's.
	group, groupCtx := errgroup.WithContext(ctx)
	group.Go(func() error { return t.trainGNN(groupCtx, ip, hostname) })
	group.Go(func() error { return t.trainMLP(groupCtx, ip, hostname) })

	// Block until every training task has finished.
	err := group.Wait()
	if err == nil {
		// TODO Clean up training data.
		return nil
	}

	logger.Errorf("training failed: %v", err)
	return err
}
// trainGNN trains GNN model. It is currently a placeholder: it ignores its
// arguments, performs no work and always returns nil.
//
// TODO Add training GNN logic.
func (t *training) trainGNN(ctx context.Context, ip, hostname string) error {
// Planned steps (none implemented yet):
// 1. Get training data from storage.
// 2. Preprocess training data.
// 3. Train GNN model.
// 4. Upload GNN model to manager service.
return nil
}
// trainMLP trains MLP model. It is currently a placeholder: it ignores its
// arguments, performs no work and always returns nil.
//
// TODO Add training MLP logic.
func (t *training) trainMLP(ctx context.Context, ip, hostname string) error {
// Planned steps (none implemented yet):
// 1. Get training data from storage.
// 2. Preprocess training data.
// 3. Train MLP model.
// 4. Upload MLP model to manager service.
return nil
}