Permalink
Browse files

Slightly more robust page linking, still plenty to do however

  • Loading branch information...
Palmr committed Oct 20, 2015
1 parent a22ff1c commit b3546ca164b69165a0506243bb161db9529905cd
@@ -6,4 +6,6 @@
target
graph-db/*
resources/tbontb-regular.pdf
resources/*
itext-rups/*
@@ -0,0 +1,84 @@
node {
diameter: 50px;
color: #A5ABB6;
border-color: #9AA1AC;
border-width: 2px;
text-color-internal: #FFFFFF;
font-size: 10px;
caption: 'Page {page_number}';
}
relationship {
color: #A5ABB6;
shaft-width: 1px;
font-size: 8px;
padding: 3px;
text-color-external: #000000;
text-color-internal: #FFFFFF;
caption: '<type>';
}
node.Page {
color: #68BDF6;
border-color: #5CA8DB;
text-color-internal: #FFFFFF;
caption: '{book_page_label}';
}
node.Ending {
color: #6DCE9E;
border-color: #60B58B;
text-color-internal: #FFFFFF;
caption: '{page_number}';
}
node.EndPage {
color: #FF756E;
border-color: #E06760;
text-color-internal: #FFFFFF;
}
node.Ignore {
color: #A5ABB6;
border-color: #9AA1AC;
text-color-internal: #FFFFFF;
}
node.ImagePage {
color: #6DCE9E;
border-color: #60B58B;
text-color-internal: #FFFFFF;
}
node.SubBook {
color: #FFD86E;
border-color: #EDBA39;
text-color-internal: #604A0E;
diameter: 50px;
}
node.SubBook.Ending {
color: #6DCE9E;
border-color: #60B58B;
text-color-internal: #FFFFFF;
caption: '{page_number}';
}
node.SubBook.EndPage {
color: #FF756E;
border-color: #E06760;
text-color-internal: #FFFFFF;
}
node.SubBook.Ignore {
color: #A5ABB6;
border-color: #9AA1AC;
text-color-internal: #FFFFFF;
}
node.SubBook.ImagePage {
color: #6DCE9E;
border-color: #60B58B;
text-color-internal: #FFFFFF;
}
@@ -1,113 +1,229 @@
package uk.co.palmr;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.PdfReaderContentParser;
import com.itextpdf.text.pdf.parser.SimpleTextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;
import com.itextpdf.text.factories.RomanAlphabetFactory;
import com.itextpdf.text.factories.RomanNumberFactory;
import com.itextpdf.text.pdf.*;
import com.itextpdf.text.pdf.parser.*;
import org.neo4j.graphdb.*;
import org.neo4j.graphdb.factory.GraphDatabaseFactory;
import org.neo4j.register.Register;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class App {
private enum RelationshipTypes implements RelationshipType {
ContinuesTo,
RedirectsTo
Continues,
Choice
}
private enum NodeTypes implements Label {
private enum PageTypes implements Label {
Page,
Ending
ImagePage,
EndPage,
SubBook,
Ignore
}
private static final int PAGE_OFFSET = 4;
private static final int SUB_BOOK_OFFSET = 24;
private static final String PDF_PAGE_NUMBER = "pdf_page_number";
private static final String BOOK_PAGE_LABEL = "book_page_label";
public static void main(String[] args) throws IOException {
PdfReader reader = new PdfReader("C:\\Users\\npalmer\\git-projects\\Adventurer\\resources\\tbontb-regular.pdf");
PdfReader reader = new PdfReader(App.class.getClassLoader().getResource("").getPath() + "\\..\\..\\resources\\tbontb-regular.pdf");
GraphDatabaseService graphDb = new GraphDatabaseFactory().newEmbeddedDatabaseBuilder("C:\\Users\\npalmer\\git-projects\\Adventurer\\graph-db").newGraphDatabase();
GraphDatabaseService graphDb = new GraphDatabaseFactory().newEmbeddedDatabaseBuilder(App.class.getClassLoader().getResource("").getPath() + "\\..\\..\\graph-db").newGraphDatabase();
registerShutdownHook(graphDb);
System.out.println("Attempting to clear database from last run");
try (Transaction ignored = graphDb.beginTx();
Result result = graphDb.execute("MATCH (n)\n" +
"OPTIONAL MATCH (n)-[r]-()\n" +
"DELETE n,r")) {
System.out.println(result.resultAsString());
ignored.success();
}
System.out.println("Attempting to populate database");
try (Transaction tx = graphDb.beginTx()) {
long[] pageNodeIdArray = new long[reader.getNumberOfPages() + 1];
for (int page = 1 + PAGE_OFFSET; page <= reader.getNumberOfPages(); page++) {
Node pageNode = graphDb.createNode(NodeTypes.Page);
pageNode.setProperty("page_number", resolveBookPageNumber(page));
pageNode.setProperty("pdf_page_number", page);
pageNodeIdArray[page] = pageNode.getId();
System.out.println("Creating page nodes");
String[] bookPageLabels = getPageLabels(reader);
Map<PdfObject, String> pdfPageToBookPageLabel = new HashMap<>(reader.getNumberOfPages());
for (int pdfPageNumber = 1; pdfPageNumber <= reader.getNumberOfPages(); pdfPageNumber++) {
Node pageNode = graphDb.createNode(PageTypes.Page);
pageNode.setProperty(PDF_PAGE_NUMBER, pdfPageNumber);
pageNode.setProperty(BOOK_PAGE_LABEL, bookPageLabels[pdfPageNumber - 1]);
pdfPageToBookPageLabel.put(reader.getPageN(pdfPageNumber), bookPageLabels[pdfPageNumber - 1]);
}
PdfReaderContentParser parser = new PdfReaderContentParser(reader);
TextExtractionStrategy strategy;
for (int page = 1 + PAGE_OFFSET; page <= reader.getNumberOfPages(); page++) {
strategy = parser.processContent(page, new SimpleTextExtractionStrategy());
System.out.println(strategy.getResultantText());
System.out.println("");
System.out.println("Looping over pages");
PdfReaderContentParser contentParser = new PdfReaderContentParser(reader);
Map<String, PdfObject> linkDestinations = reader.getNamedDestinationFromStrings();
Node thisPage = graphDb.getNodeById(pageNodeIdArray[page]);
for (int pdfPageNumber = 1; pdfPageNumber <= reader.getNumberOfPages(); pdfPageNumber++) {
// int pdfPageNumber = 433;
// {
Node thisPage = graphDb.findNode(PageTypes.Page, PDF_PAGE_NUMBER, pdfPageNumber);
if (thisPage == null) {
throw new RuntimeException("Couldn't find a node for PDF page: " + pdfPageNumber);
}
if (strategy.getResultantText().contains("THE END")) {
thisPage.removeLabel(NodeTypes.Page);
thisPage.addLabel(NodeTypes.Ending);
System.out.println("End Page");
String thisBookPageLabel = (String)thisPage.getProperty(BOOK_PAGE_LABEL);
boolean isSubBook = false;
if (thisBookPageLabel.startsWith("G")) {
// All of the sub book (The Murder of Gonzago) page labels start with G
thisPage.addLabel(PageTypes.SubBook);
}
if (!thisBookPageLabel.matches(".*\\d+.*")) {
// If there's no digits in the label, ignore the page
thisPage.addLabel(PageTypes.Ignore);
}
else {
// TODO: the choice part of the regex doesn't work properly across multiple lines; ideally use annotations and check for preceding text blocks?
Pattern pageLinkPattern = Pattern.compile("(.*?)turn to page ([0-9]+)", Pattern.CASE_INSENSITIVE);
Matcher pageLinkMatcher = pageLinkPattern.matcher(strategy.getResultantText());
boolean linksfound = false;
while (pageLinkMatcher.find()) {
System.out.println("Choice: " + pageLinkMatcher.group(1));
System.out.println("Links to: " + pageLinkMatcher.group(2));
Node targetPage = graphDb.getNodeById(pageNodeIdArray[resolvePDFPageNumber(Integer.valueOf(pageLinkMatcher.group(2)))]);
Relationship relationship = thisPage.createRelationshipTo(targetPage, RelationshipTypes.RedirectsTo);
relationship.setProperty("choice", pageLinkMatcher.group(1));
// Check for inter-page-links
PdfDictionary pageDict = reader.getPageN(pdfPageNumber);
PdfArray annotArray = pageDict.getAsArray(PdfName.ANNOTS);
boolean linksOut = false;
if (annotArray != null) {
for (int i = 0; i < annotArray.size(); i++) {
PdfDictionary annotDict = annotArray.getAsDict(i);
if (PdfName.LINK == annotDict.get(PdfName.SUBTYPE)) {
if (annotDict.contains(PdfName.A)) {
String lDestination = ((PdfDictionary) reader.getPdfObject(annotDict.getAsIndirectObject(PdfName.A))).getAsString(PdfName.D).toString();
if (linkDestinations.containsKey(lDestination)) {
PdfArray destinationInfoArray = (PdfArray) linkDestinations.get(lDestination);
PdfIndirectReference destinationReference = destinationInfoArray.getAsIndirectObject(0); // TODO, this could actually be an integer for the case of Remote Destinations
PdfObject targetPdfPage = PdfReader.getPdfObject(destinationReference);
// Create link if it hasn't already been made between these pages (Split-line links mean two annots with the same dest)
Node targetPage = graphDb.findNode(PageTypes.Page, BOOK_PAGE_LABEL, pdfPageToBookPageLabel.get(targetPdfPage));
boolean existingRelationship = false;
for (Relationship r : thisPage.getRelationships(Direction.BOTH, RelationshipTypes.Choice)) {
existingRelationship |= r.getEndNode().getId() == targetPage.getId();
}
if (!existingRelationship) {
thisPage.createRelationshipTo(targetPage, RelationshipTypes.Choice);
linksOut = true;
}
}
else {
System.out.println("Found link to unknown: " + lDestination);
}
}
else {
System.out.println("Unhandled");
}
}
}
}
System.out.println("Rel created from " + relationship.getStartNode().getProperty("page_number") + "[" + relationship.getStartNode().getProperty("pdf_page_number") + "] to " + relationship.getEndNode().getProperty("page_number") + "[" + relationship.getEndNode().getProperty("pdf_page_number") + "]");
System.out.println("");
linksfound = true;
// Do some context scanning to find page types
FontGroupingTextExtractionStrategy strategy = contentParser.processContent(pdfPageNumber, new FontGroupingTextExtractionStrategy(false));
if (strategy.getImageCount() > 0 && (strategy.getTextValues().size() == 0 || (strategy.getTextValues().size() == 1 && strategy.getTextValues().get(0).toString().equals(bookPageLabels[pdfPageNumber])))) {
// If the page is nothing but an image, label it as an image page (Typically an ending comic)
thisPage.addLabel(PageTypes.ImagePage);
}
else {
// If not an image page perhaps there's text to parse?
for (StringBuilder textBlock : strategy.getTextValues()) {
if (textBlock.toString().matches("THE END(!!)?")) {
thisPage.addLabel(PageTypes.EndPage);
}
}
if (!linksfound && page != reader.getNumberOfPages()) {
Node targetPage = graphDb.getNodeById(pageNodeIdArray[page + 1]);
thisPage.createRelationshipTo(targetPage, RelationshipTypes.ContinuesTo);
}
// Link to next page if no other relationships from this node
if (!linksOut && !thisPage.hasLabel(PageTypes.EndPage)) {
Node nextPage = graphDb.findNode(PageTypes.Page, PDF_PAGE_NUMBER, pdfPageNumber+1);
if (nextPage != null) {
thisPage.createRelationshipTo(nextPage, RelationshipTypes.Continues);
}
}
System.out.println("");
System.out.println("------------");
System.out.println("");
}
reader.close();
tx.success();
}
graphDb.shutdown();
// Query e.g. MATCH (p:Page {page_number: 22})-[*..10]-(p2) RETURN p, p2
}
reader.close();
graphDb.shutdown();
/**
 * Converts a PDF page number into the corresponding book page number.
 *
 * PAGE_OFFSET is always subtracted; PDF pages after page 414 additionally
 * have SUB_BOOK_OFFSET subtracted.
 *
 * @param pdfPageNumber page number as counted in the PDF file
 * @return the book page number derived from the offsets above
 */
private static int resolveBookPageNumber(int pdfPageNumber) {
    // Only pages beyond PDF page 414 carry the extra sub book shift.
    final int subBookShift = (pdfPageNumber > 414) ? SUB_BOOK_OFFSET : 0;
    return pdfPageNumber - PAGE_OFFSET - subBookShift;
    // Query e.g. MATCH r=(s:SubBook {book_page_label: "Gi"})-[*..10]->(e:SubBook :EndPage) RETURN r
}
private static int resolvePDFPageNumber(int bookPageNumber) {
if (bookPageNumber <= 414) {
return bookPageNumber + PAGE_OFFSET;
}
else {
return bookPageNumber + PAGE_OFFSET + SUB_BOOK_OFFSET;
/**
 * Builds the page label of every page in a PDF, in page order.
 *
 * The labels are derived from the /PageLabels number tree in the document
 * catalog. Each entry in that tree is keyed by a zero-based page index and
 * starts a new labelling range: it may set the first counter value (/ST,
 * defaulting to 1), a prefix (/P, defaulting to the empty string) and a
 * numbering style (/S). A range without /S is labelled with its prefix only.
 *
 * @param reader a PdfReader object that has the page labels you want to retrieve
 * @return one label per page, or <code>null</code> if no page labels are present
 */
public static String[] getPageLabels(PdfReader reader) {
    final int totalPages = reader.getNumberOfPages();
    final PdfDictionary catalog = reader.getCatalog();
    final PdfDictionary pageLabelRoot =
            (PdfDictionary) PdfReader.getPdfObjectRelease(catalog.get(PdfName.PAGELABELS));
    if (pageLabelRoot == null) {
        return null;
    }

    final String[] result = new String[totalPages];
    final HashMap<Integer, PdfObject> rangeStarts = PdfNumberTree.readTree(pageLabelRoot);

    // State of the labelling range currently in effect; decimal numbering
    // from 1 with no prefix applies until the first range entry is met.
    int counter = 1;
    String labelPrefix = "";
    char style = 'D';

    int pageIndex = 0;
    while (pageIndex < totalPages) {
        if (rangeStarts.containsKey(pageIndex)) {
            // A new range begins on this page: reset every setting from its dictionary.
            final PdfDictionary range =
                    (PdfDictionary) PdfReader.getPdfObjectRelease(rangeStarts.get(pageIndex));

            counter = range.contains(PdfName.ST)
                    ? ((PdfNumber) range.get(PdfName.ST)).intValue()
                    : 1;

            // NP - See page 596 of pdf ref, prefix is for that range only
            labelPrefix = range.contains(PdfName.P)
                    ? ((PdfString) range.get(PdfName.P)).toUnicodeString()
                    : "";

            // The style is the character following the leading '/' of the name;
            // 'e' marks a range that has no numbering style at all.
            style = range.contains(PdfName.S)
                    ? ((PdfName) range.get(PdfName.S)).toString().charAt(1)
                    : 'e';
        }

        result[pageIndex] = formatPageLabel(labelPrefix, style, counter);
        counter++;
        pageIndex++;
    }
    return result;
}

/**
 * Renders a single page label from the settings of its labelling range.
 *
 * @param labelPrefix text placed in front of the numeric part
 * @param style       'R'/'r' for upper/lower case roman numerals, 'A'/'a' for
 *                    upper/lower case letters, 'e' for prefix only; any other
 *                    value yields a decimal number
 * @param counter     numeric value of the page within its range
 * @return the finished label
 */
private static String formatPageLabel(String labelPrefix, char style, int counter) {
    switch (style) {
        case 'R':
            return labelPrefix + RomanNumberFactory.getUpperCaseString(counter);
        case 'r':
            return labelPrefix + RomanNumberFactory.getLowerCaseString(counter);
        case 'A':
            return labelPrefix + RomanAlphabetFactory.getUpperCaseString(counter);
        case 'a':
            return labelPrefix + RomanAlphabetFactory.getLowerCaseString(counter);
        case 'e':
            return labelPrefix;
        default:
            return labelPrefix + counter;
    }
}
private static void registerShutdownHook(final GraphDatabaseService graphDb) {
Oops, something went wrong.

0 comments on commit b3546ca

Please sign in to comment.