forked from dgraph-io/dgraph
/
merge_shards.go
107 lines (95 loc) · 2.75 KB
/
merge_shards.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
/*
* Copyright (C) 2017 Dgraph Labs, Inc. and Contributors
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
package bulk
import (
"fmt"
"os"
"path/filepath"
"sort"
"strings"
"github.com/dgraph-io/dgraph/x"
)
// mergeMapShardsIntoReduceShards creates opt.ReduceShards directories named
// "shard_<i>" under <opt.TmpDir>/shards and then moves every map shard
// directory returned by shardDirs into one of them.
//
// Any error from creating a directory or renaming a shard is passed to x.Check.
func mergeMapShardsIntoReduceShards(opt options) {
	// Collect the map shards before the reduce shard directories are created
	// in the same parent directory.
	sources := shardDirs(opt.TmpDir)

	var targets []string
	for n := 0; n < opt.ReduceShards; n++ {
		name := fmt.Sprintf("shard_%d", n)
		target := filepath.Join(opt.TmpDir, "shards", name)
		x.Check(os.MkdirAll(target, 0755))
		targets = append(targets, target)
	}

	// Heuristic: the map shards arrive largest first, and each one is placed
	// into whichever reduce shard is currently the smallest. Should be a good
	// approximation of an even split.
	for _, src := range sources {
		// Re-measure on every iteration: the previous move changed the sizes.
		// sortBySize orders biggest to smallest, so the last entry is the
		// smallest reduce shard.
		sortBySize(targets)
		smallest := targets[len(targets)-1]
		dst := filepath.Join(smallest, filepath.Base(src))
		x.Check(os.Rename(src, dst))
	}
}
// shardDirs lists the entries of <tmpDir>/shards and returns their paths
// (each joined onto tmpDir and "shards"), ordered by sortBySize, i.e. biggest
// first.
//
// Errors from opening or reading the directory are passed to x.Check.
func shardDirs(tmpDir string) []string {
	handle, err := os.Open(filepath.Join(tmpDir, "shards"))
	x.Check(err)
	names, err := handle.Readdirnames(0)
	x.Check(err)
	handle.Close()

	// Turn the bare entry names into paths, reusing the same slice.
	for idx := range names {
		names[idx] = filepath.Join(tmpDir, "shards", names[idx])
	}

	// Allow largest shards to be shuffled first.
	sortBySize(names)
	return names
}
// filenamesInTree walks the tree rooted at dir and returns, in walk order,
// every visited path that ends in ".map". The check is on the path string
// only. A walk error is passed to x.Check.
func filenamesInTree(dir string) []string {
	var mapFiles []string
	collect := func(p string, _ os.FileInfo, walkErr error) error {
		if walkErr != nil {
			return walkErr
		}
		if !strings.HasSuffix(p, ".map") {
			return nil
		}
		mapFiles = append(mapFiles, p)
		return nil
	}
	x.Check(filepath.Walk(dir, collect))
	return mapFiles
}
// sizedDir pairs a directory path with a size so that directories can be
// ordered by size; sortBySize builds one per input directory.
type sizedDir struct {
// dir is the directory path as given by the caller.
dir string
// sz is the size recorded for dir; sortBySize fills it from treeSize(dir).
sz int64
}
// sortBySize reorders dirs in place so that the directory with the largest
// treeSize comes first and the smallest comes last. The sort is stable, so
// directories of equal size keep their relative order. Each directory is
// measured exactly once per call.
func sortBySize(dirs []string) {
	entries := make([]sizedDir, 0, len(dirs))
	for _, d := range dirs {
		entries = append(entries, sizedDir{dir: d, sz: treeSize(d)})
	}

	// Descending order: a sorts before b when b is strictly smaller.
	sort.SliceStable(entries, func(a, b int) bool {
		return entries[b].sz < entries[a].sz
	})

	// Write the sorted paths back into the caller's slice.
	for idx, e := range entries {
		dirs[idx] = e.dir
	}
}
// treeSize returns the sum of Size() over every entry that filepath.Walk
// visits in the tree rooted at dir; no entry type is filtered out. A walk
// error is passed to x.Check.
func treeSize(dir string) int64 {
	var total int64
	accumulate := func(_ string, fi os.FileInfo, walkErr error) error {
		if walkErr == nil {
			total += fi.Size()
		}
		// Nil on success; otherwise hand the error back to stop the walk.
		return walkErr
	}
	x.Check(filepath.Walk(dir, accumulate))
	return total
}