/
hansard.go
208 lines (191 loc) · 5.83 KB
/
hansard.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
package hansard
import (
"fmt"
"regexp"
"strings"
)
// HansardType identifies which kind of Hansard a document was detected to be.
type HansardType int
// Known HansardType values, numbered from zero via iota.
const (
// HANSARD_INVALID is the zero value; detectHansardType returns it when no
// row of the inspected page matches a known kind.
HANSARD_INVALID HansardType = iota
// HANSARD_SPOKEN is returned by detectHansardType for a row containing both
// "pertanyaan" and "mulut".
HANSARD_SPOKEN
// HANSARD_WRITTEN is returned by detectHansardType for a row containing both
// "pertanyaan" and "tulis" (and not "mulut").
HANSARD_WRITTEN
// HANSARD_DEBATE is declared but never produced by the detection code
// visible in this file.
HANSARD_DEBATE
)
// HansardQuestion records the question number and the page range of one
// question found while walking the pages of a PDF document.
type HansardQuestion struct {
// QuestionNum is the number extracted from the page text; NewHansardQuestions
// initialises it to the placeholder "0" and counts the question as bad if no
// number is found.
QuestionNum string
// PageNumStart is the PageNo of the page on which the question section starts.
PageNumStart int
// PageNumEnd is the PageNo of the last page visited before the next question
// section starts (or the last page of the document).
PageNumEnd int
}
// HansardDocument is the result of analysing one Hansard PDF: the session it
// belongs to, its detected type and the questions found in it.
type HansardDocument struct {
// StateAssemblySession is the session name supplied by the caller of
// NewHansardDocument; it is stored as given.
StateAssemblySession string
// HansardType is the kind detected from the first page of the PDF.
HansardType HansardType
// HansardQuestions lists the questions found across all pages, in page order.
HansardQuestions []HansardQuestion
}
// ErrorQuestionsHasInvalid is the error type used to report that some of the
// extracted questions have no recognisable question number.
type ErrorQuestionsHasInvalid struct {
	// badQuestionsCount is how many questions kept the placeholder number.
	badQuestionsCount int
}

// Error implements the error interface, rendering the number of bad
// questions into a fixed message.
func (qe *ErrorQuestionsHasInvalid) Error() string {
	count := qe.badQuestionsCount
	return fmt.Sprintf("Has %d bad Questions!", count)
}
// NewHansardDocument loads the PDF at pdfPath and builds a HansardDocument
// for the given session name from its content.
//
// It returns a nil document together with the error if either loading the PDF
// or analysing its content fails.
func NewHansardDocument(sessionName string, pdfPath string) (*HansardDocument, error) {
	// Read the PDF first; without it there is nothing to analyse.
	pdfDoc, loadErr := NewPDFDocument(pdfPath, nil)
	if loadErr != nil {
		return nil, loadErr
	}

	// Start from a document that only knows its session, then let the
	// content pass fill in the type and the questions.
	doc := new(HansardDocument)
	doc.StateAssemblySession = sessionName
	if fillErr := NewHansardDocumentContent(pdfDoc, doc); fillErr != nil {
		return nil, fillErr
	}

	return doc, nil
}
// detectHansardType classifies a page by scanning its text rows
// (PDFTxtSameLines) case-insensitively.
//
// The first row that contains "pertanyaan" together with "mulut" yields
// HANSARD_SPOKEN; together with "tulis" it yields HANSARD_WRITTEN. "mulut" is
// checked first, so it wins when a row contains both. If no row qualifies the
// result is HANSARD_INVALID.
//
// The keywords are fixed literals, so plain substring checks are used rather
// than compiling a regular expression for every row. The error result is
// therefore always nil; it is kept so existing callers compile unchanged.
func detectHansardType(firstPage PDFPage) (HansardType, error) {
	for _, rowContent := range firstPage.PDFTxtSameLines {
		normalizedContent := strings.ToLower(rowContent)

		// Only rows mentioning "pertanyaan" are candidates.
		if !strings.Contains(normalizedContent, "pertanyaan") {
			continue
		}

		// First matching keyword decides; return as soon as one is found.
		switch {
		case strings.Contains(normalizedContent, "mulut"):
			return HANSARD_SPOKEN, nil
		case strings.Contains(normalizedContent, "tulis"):
			return HANSARD_WRITTEN, nil
		}
	}

	// No row matched any known kind.
	return HANSARD_INVALID, nil
}
// NewHansardDocumentContent fills hansardDoc from the pages of pdfDoc: the
// Hansard type is detected from the first page and the questions are
// extracted from all pages.
//
// It returns an error if either argument is nil, if pdfDoc has no pages, or
// if type detection or question extraction fails. When question extraction
// fails (including the case where NewHansardQuestions reports questions
// without a number), hansardDoc.HansardType has already been set but
// hansardDoc.HansardQuestions is left untouched.
func NewHansardDocumentContent(pdfDoc *PDFDocument, hansardDoc *HansardDocument) error {
	// Guard against nil inputs instead of panicking on the first dereference.
	if pdfDoc == nil {
		return fmt.Errorf("pdfDoc is nil!")
	}
	if hansardDoc == nil {
		return fmt.Errorf("hansardDoc is nil!")
	}
	if len(pdfDoc.Pages) < 1 {
		return fmt.Errorf("Needs at least one page for valid Hansard!!! Found: %d", len(pdfDoc.Pages))
	}

	// The Hansard type is decided by the first page only.
	hansardType, derr := detectHansardType(pdfDoc.Pages[0])
	if derr != nil {
		return derr
	}
	hansardDoc.HansardType = hansardType

	// Collect the question metadata across all pages.
	hansardQuestions := make([]HansardQuestion, 0, 20)
	if hqerr := NewHansardQuestions(pdfDoc, &hansardQuestions); hqerr != nil {
		return hqerr
	}
	hansardDoc.HansardQuestions = hansardQuestions

	return nil
}
// isStartOfQuestionSection reports whether currentPage opens a new question
// section, which is currently defined as: detectHansardType recognises the
// page as any kind other than HANSARD_INVALID.
//
// It panics if detectHansardType returns an error, because the boolean
// result leaves no way to hand the error back to the caller.
func isStartOfQuestionSection(currentPage PDFPage) bool {
	// Reuses the same keyword detection that is applied to the first page.
	detected, detectErr := detectHansardType(currentPage)
	if detectErr != nil {
		panic(detectErr)
	}

	// Any recognised kind counts as a section start; finer checks (question
	// number, topic, asker) are not implemented.
	return detected != HANSARD_INVALID
}
// hansardQuestionNumRe captures the first run of digits on a line that is
// later followed by "bertanya kepada" (case-insensitive, any amount of
// whitespace between the two words). It is compiled once at package
// initialisation instead of on every call.
var hansardQuestionNumRe = regexp.MustCompile(`(?i)^.*?(\d+).*bertanya\s+kepada.*$`)

// extractQuestionNum returns the question number found in rowContent, or the
// empty string if the row does not match the expected pattern.
//
// The pattern does not let "." cross a newline and is anchored to the whole
// input, so content containing a line break (for example
// "\n\n\n 50 bertanya kepada ...") never matches.
//
// The error result is always nil.
func extractQuestionNum(rowContent string) (string, error) {
	sm := hansardQuestionNumRe.FindStringSubmatch(rowContent)
	if sm == nil {
		return "", nil
	}
	// sm[1] is the captured digit run.
	return sm[1], nil
}
// NewHansardQuestions walks the pages of pdfDoc in order and appends one
// HansardQuestion to *hansardQuestions for every question section found.
//
// A page for which isStartOfQuestionSection is true opens a new question;
// its number is taken from the first row of that page that
// extractQuestionNum recognises, otherwise it keeps the placeholder "0".
// Every page visited while a question is open moves that question's
// PageNumEnd forward, so a question spans up to the page before the next
// section start. Pages before the first section start are ignored.
//
// It returns an error if either argument is nil or if extractQuestionNum
// fails. If any appended question still has the placeholder number, it
// returns an error wrapping *ErrorQuestionsHasInvalid (retrievable with
// errors.As); the questions have been appended all the same, so the caller
// can decide how to treat it.
func NewHansardQuestions(pdfDoc *PDFDocument, hansardQuestions *[]HansardQuestion) error {
	if pdfDoc == nil {
		return fmt.Errorf("pdfDoc is nil!")
	}
	// The output slice is written through this pointer; refuse nil up front
	// rather than panicking on the first append.
	if hansardQuestions == nil {
		return fmt.Errorf("hansardQuestions is nil!")
	}

	var (
		current           *HansardQuestion
		badQuestionsCount int
	)

	// flush appends the question being built, if any, and counts it as bad
	// when no question number was found for it.
	flush := func() {
		if current == nil {
			return
		}
		if current.QuestionNum == "0" {
			badQuestionsCount++
		}
		*hansardQuestions = append(*hansardQuestions, *current)
	}

	for _, page := range pdfDoc.Pages {
		if isStartOfQuestionSection(page) {
			// A new section closes the previous question.
			flush()

			// Start with the placeholder number until a row provides one.
			current = &HansardQuestion{QuestionNum: "0", PageNumStart: page.PageNo}
			for _, rowContent := range page.PDFTxtSameLines {
				foundQuestionNum, exerr := extractQuestionNum(rowContent)
				if exerr != nil {
					return exerr
				}
				if foundQuestionNum != "" {
					current.QuestionNum = strings.TrimSpace(foundQuestionNum)
					break
				}
			}
		}

		// Extend the open question to cover this page.
		if current != nil {
			current.PageNumEnd = page.PageNo
		}
	}

	// The last question is closed by the end of the document.
	flush()

	// Not fatal: the questions were appended, but tell the caller some of
	// them have no usable number.
	if badQuestionsCount > 0 {
		return fmt.Errorf("NewHansardQuestions FAIL: %w", &ErrorQuestionsHasInvalid{badQuestionsCount})
	}

	return nil
}