-
Notifications
You must be signed in to change notification settings - Fork 0
/
extract-text.ps1
37 lines (28 loc) · 877 Bytes
/
extract-text.ps1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
using namespace iText.Layout
using namespace iText.Layout.Element
using namespace iText.Kernel.Pdf
using namespace iText.Kernel.Pdf.Canvas.Parser
using namespace iText.Kernel.Pdf.Canvas.Parser.Listener
#
#  Prints the text of every page of a PDF document to the host, using iText.
#
#  Parameters:
#    pdfName   Path of the PDF file whose text is to be extracted.
#
#  If the file does not exist, a message is printed and the script returns
#  without touching iText.
#
param (
   [string] $pdfName
)

#
#  Test-Path raises a binding error for an empty/missing path, so check for
#  that first in order to report the problem with the regular message.
#
if ([string]::IsNullOrEmpty($pdfName) -or -not (test-path -literalPath $pdfName)) {
   write-host "$pdfName does not exist"
   return
}

. ./add-types.ps1

#
#  .NET resolves relative paths against the process' working directory, which
#  is not necessarily PowerShell's current location. Hand an absolute path to
#  the PdfReader so that relative arguments work as the caller expects.
#
$pdfPath = convert-path -literalPath $pdfName

#
#  A PdfReader reads and parses a PDF document.
#
$pdfReader   = [PdfReader]::new($pdfPath)
$pdfDocument = $null

try {

   $pdfDocument = [iText.Kernel.Pdf.PdfDocument]::new($pdfReader)

   $totalPages = $pdfDocument.GetNumberOfPages()
   write-host "Number of pages: $totalPages"

   for ($p = 1; $p -le $totalPages; $p++) {

      write-host " page: $p"
      $page = $pdfDocument.GetPage($p)

    #
    # A SimpleTextExtractionStrategy accumulates everything it has seen.
    # Use a fresh instance per page so that the text of earlier pages is
    # not printed again with every following page.
    #
      [ITextExtractionStrategy] $strategy = [SimpleTextExtractionStrategy]::new()

      $text = [PdfTextExtractor]::GetTextFromPage($page, $strategy)
      write-host $text
   }
}
finally {
 #
 # Release the file even if extraction fails. Closing the document also
 # closes its reader; if the document could not be created, close the
 # reader directly.
 #
   if ($null -ne $pdfDocument) {
      $pdfDocument.Close()
   }
   else {
      $pdfReader.Close()
   }
}