forked from habeanf/yap
/
malearn.go
95 lines (83 loc) · 2.47 KB
/
malearn.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
package app
import (
// "yap/nlp/format/lattice"
// nlp "yap/nlp/types"
"yap/nlp/parser/ma"
// "yap/util"
// "fmt"
"log"
// "os"
"github.com/gonuts/commander"
"github.com/gonuts/flag"
)
// Package-level state for the malearn command. The values are bound to
// command-line flags in MALearnCmd and read by MALearn and MALearnConfigOut.
var (
	// Input/output paths, bound to -lattice, -raw, -conllu and -out.
	latFile    string
	rawFile    string
	conlluFile string
	dataFile   string

	// useConllU is derived in MALearn from whether conlluFile is non-empty;
	// it selects CoNLL-U input over lattice+raw input.
	// TODO(review): this is redundant with conlluFile and could be folded away.
	useConllU bool

	// Limits for OOV tokens, bound to -maxpos and -maxmsrperpos.
	maxPOS       int
	maxMSRPerPOS int
)
// MALearnConfigOut logs the configuration the malearn command will run with:
// the input source (either the CoNLL-U file, or the lattice and raw files),
// the training-set limit, and the output file.
func MALearnConfigOut() {
	log.Println("Configuration")

	// Exactly one input description is printed, chosen by useConllU.
	if !useConllU {
		log.Printf("Lattice:\t%s", latFile)
		log.Printf("Raw:\t\t%s", rawFile)
	} else {
		log.Printf("CoNLL-U:\t%s", conlluFile)
	}

	// limit is declared elsewhere in the package.
	log.Printf("Limit:\t%v", limit)
	log.Println()

	log.Printf("Output:\t%s", dataFile)
	log.Println()
}
// MALearn is the Run handler of the malearn command. It picks the input mode
// from the flags, logs the configuration, learns a data-driven morphological
// analysis dictionary from the chosen input and writes it to dataFile.
//
// The args parameter is not used. The returned error is the one reported by
// the learning step; nil is returned otherwise.
func MALearn(cmd *commander.Command, args []string) error {
	// CoNLL-U input is selected simply by supplying a -conllu path.
	useConllU = len(conlluFile) > 0

	// The set of mandatory flags depends on the input mode.
	requiredFlags := []string{"lattice", "raw", "out"}
	if useConllU {
		requiredFlags = []string{"conllu", "out"}
	}
	// VerifyFlags is defined elsewhere in the package.
	VerifyFlags(cmd, requiredFlags)

	MALearnConfigOut()
	log.Println("Starting learning for data-driven morphological analyzer")

	dict := new(ma.MADict)
	dict.Language = "Test"
	dict.MaxTopPOS = maxPOS
	dict.MaxMSRsPerPOS = maxMSRPerPOS

	var (
		learned  int
		learnErr error
	)
	switch {
	case useConllU:
		learned, learnErr = dict.LearnFromConllU(conlluFile, limit)
	default:
		learned, learnErr = dict.LearnFromLat(latFile, rawFile, limit)
	}
	if learnErr != nil {
		log.Println("Got error learning", learnErr)
		return learnErr
	}

	log.Println("Learned", learned, "new tokens")
	// NOTE(review): any result returned by WriteFile is discarded here, so a
	// failed write would go unreported — confirm WriteFile's signature.
	dict.WriteFile(dataFile)
	return nil
}
// MALearnCmd builds the commander.Command for the "malearn" subcommand and
// binds its flags to the package-level configuration variables. MALearn is
// installed as the command's Run handler.
func MALearnCmd() *commander.Command {
	command := &commander.Command{
		Run:       MALearn,
		UsageLine: "malearn <file options> [arguments]",
		Short:     "generate a data-driven morphological analysis dictionary for a set of files",
		Long: `
generate a data-driven morphological analysis dictionary for a set of files
$ ./yap malearn -lattice <lattice file> -raw <raw file> [options]
`,
		Flag: *flag.NewFlagSet("malearn", flag.ExitOnError),
	}
	flags := &command.Flag

	// File-path flags; all default to the empty string.
	pathFlags := []struct {
		target *string
		name   string
		usage  string
	}{
		{&latFile, "lattice", "Lattice-format input file"},
		{&rawFile, "raw", "raw sentences input file"},
		{&conlluFile, "conllu", "CoNLL-U-format input file"},
		{&dataFile, "out", "output file"},
	}
	for _, pf := range pathFlags {
		flags.StringVar(pf.target, pf.name, "", pf.usage)
	}

	// Numeric tuning flags. limit is declared elsewhere in the package.
	flags.IntVar(&maxMSRPerPOS, "maxmsrperpos", 5, "For OOV tokens, max MSRs per POS to add")
	flags.IntVar(&maxPOS, "maxpos", 5, "For OOV tokens, max POS to add")
	flags.IntVar(&limit, "limit", 0, "limit training set")

	return command
}