Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,12 @@
</exclusion>
</exclusions>
</dependency>

<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.11</version>
</dependency>

<dependency>
<groupId>org.mockito</groupId>
Expand Down
8 changes: 8 additions & 0 deletions src/main/java/edu/tamu/app/model/Resource.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

import static edu.tamu.app.Initialization.HOST;

import java.io.File;

import static edu.tamu.app.Initialization.ASSETS_PATH;

import javax.persistence.Column;
import javax.persistence.Entity;
import javax.persistence.FetchType;
Expand Down Expand Up @@ -82,4 +86,8 @@ public void setMimeType(String mimeType) {
this.mimeType = mimeType;
}

/**
 * Resolves this resource to a file on the local filesystem.
 *
 * @return a File whose pathname is ASSETS_PATH concatenated with this resource's path
 */
public File getFile() {
    final String location = ASSETS_PATH + path;
    return new File(location);
}

}
81 changes: 71 additions & 10 deletions src/main/java/edu/tamu/app/service/suggestor/NALTSuggestor.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,15 @@

import org.apache.commons.io.FileUtils;
import org.apache.commons.io.IOUtils;
import org.apache.log4j.Logger;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.springframework.beans.factory.annotation.Autowired;

import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;

import edu.tamu.app.controller.DocumentController;
import edu.tamu.app.model.Document;
import edu.tamu.app.model.ProjectSuggestor;
import edu.tamu.app.model.Resource;
Expand All @@ -32,6 +36,8 @@ public class NALTSuggestor implements Suggestor {

private ProjectSuggestor projectSuggestor;

private static final Logger logger = Logger.getLogger(NALTSuggestor.class);

/**
 * Creates a suggestor bound to the given project suggestor.
 *
 * @param projectSuggestor
 *            the project-level configuration this instance reads its setting values from
 */
public NALTSuggestor(final ProjectSuggestor projectSuggestor) {
    this.projectSuggestor = projectSuggestor;
}
Expand All @@ -40,19 +46,13 @@ public NALTSuggestor(ProjectSuggestor projectSuggestor) {
public List<Suggestion> suggest(Document document) {

List<Suggestion> suggestions = new ArrayList<Suggestion>();

String fullText = getFullText(document);

// TODO: throw exception and handle using controller advice
try {
StringBuilder textBuilder = new StringBuilder();

for (Resource resource : resourceRepo.findAllByDocumentProjectNameAndDocumentNameAndMimeType(document.getProject().getName(), document.getName(), "text/plain")) {
File file = File.createTempFile(resource.getName(), Long.toString(System.nanoTime()));
file.deleteOnExit();
FileUtils.copyURLToFile(new URL(resource.getUrl()), file);
textBuilder.append(FileUtils.readFileToString(file, StandardCharsets.UTF_8).toLowerCase());
textBuilder.append("\n\n");
}

JsonNode payloadNode = objectMapper.readTree(fetchNALTSuggestions(textBuilder.toString())).get("payload");
JsonNode payloadNode = objectMapper.readTree(fetchNALTSuggestions(fullText)).get("payload");

JsonNode termOccurrenceArrayNode = payloadNode.get("ArrayList<TermOccurrence>") != null ? payloadNode.get("ArrayList<TermOccurrence>") : payloadNode.get("ArrayList");

Expand Down Expand Up @@ -111,4 +111,65 @@ public String getSubjectLabel() {
return projectSuggestor.getSettingValues("subjectLabel").get(0);
}

/**
 * Collects the full text of a document for submission to the NALT suggestion service.
 *
 * <p>
 * Plain-text resources ("text/plain") are preferred: each one is downloaded from its URL, lower-cased and appended,
 * followed by a blank line. Only when the document has no plain-text resources is text extracted from its PDF
 * resources ("application/pdf"). A resource that fails with an I/O error is logged and skipped so that the remaining
 * resources still contribute.
 *
 * @param document
 *            the document whose resources are looked up by project name and document name
 * @return the concatenated text; empty when nothing could be retrieved
 * @throws IllegalArgumentException
 *             if the file behind a PDF resource is unreadable (raised by {@link #getDocument(Resource)})
 */
private String getFullText(Document document) {
    String projectName = document.getProject().getName();
    String documentName = document.getName();
    StringBuilder textBuilder = new StringBuilder();

    List<Resource> textResources = resourceRepo.findAllByDocumentProjectNameAndDocumentNameAndMimeType(projectName, documentName, "text/plain");

    if (!textResources.isEmpty()) {
        logger.info("Retrieving fulltext of Document " + documentName + " from " + textResources.size() + " plaintext file(s).");
        for (Resource resource : textResources) {
            appendPlainText(resource, textBuilder);
        }
        return textBuilder.toString();
    }

    logger.info("No plaintext found for Document " + documentName + " - trying to retrieve from PDFs");
    List<Resource> pdfResources = resourceRepo.findAllByDocumentProjectNameAndDocumentNameAndMimeType(projectName, documentName, "application/pdf");

    if (pdfResources.isEmpty()) {
        logger.info("No PDFs found for Document " + documentName + " - unable to retrieve fulltext.");
        return textBuilder.toString();
    }

    try {
        // One stripper is reused for every PDF of the document.
        PDFTextStripper textStripper = new PDFTextStripper();
        for (Resource pdfResource : pdfResources) {
            appendPdfText(textStripper, pdfResource, textBuilder);
        }
    } catch (IOException e) {
        logger.error("Unable to initialize PDF text extraction for Document " + documentName, e);
    }

    return textBuilder.toString();
}

/**
 * Downloads one plain-text resource through a temporary file and appends its lower-cased content plus a blank line.
 * I/O failures are logged and leave the builder untouched for this resource.
 */
private void appendPlainText(Resource resource, StringBuilder textBuilder) {
    File file = null;
    try {
        file = File.createTempFile(resource.getName(), Long.toString(System.nanoTime()));
        FileUtils.copyURLToFile(new URL(resource.getUrl()), file);
        // Locale.ROOT keeps the casing independent of the server's default locale.
        textBuilder.append(FileUtils.readFileToString(file, StandardCharsets.UTF_8).toLowerCase(java.util.Locale.ROOT));
        textBuilder.append("\n\n");
    } catch (IOException e) {
        logger.error("Unable to read plaintext resource " + resource.getName(), e);
    } finally {
        // Remove the temp file right away instead of accumulating files until JVM exit; null-safe.
        FileUtils.deleteQuietly(file);
    }
}

/**
 * Extracts the text of one PDF resource and appends it, always closing the loaded document. I/O failures are logged
 * and the resource is skipped.
 */
private void appendPdfText(PDFTextStripper textStripper, Resource pdfResource, StringBuilder textBuilder) {
    // try-with-resources tolerates a null resource, so the null check can live inside the block.
    try (PDDocument pdf = getDocument(pdfResource)) {
        if (pdf == null) {
            logger.warn("No file available for PDF resource " + pdfResource.getName() + " - skipping.");
            return;
        }
        // NOTE(review): plaintext is lower-cased before submission but PDF text is not - confirm whether that is intended.
        String pdfText = textStripper.getText(pdf);
        textBuilder.append(pdfText);
        if (logger.isDebugEnabled()) {
            // Log only this PDF's text; the accumulated buffer grows with every resource.
            logger.debug("Got PDF text " + pdfText);
        }
    } catch (IOException e) {
        logger.error("Unable to extract text from PDF resource " + pdfResource.getName(), e);
    }
}

/**
 * Loads the PDDocument for a PDF resource from its file on disk.
 *
 * <p>
 * The returned document is opened by this method and not closed here; the caller is responsible for closing it.
 *
 * @param pdfResource
 *            the resource whose backing file, as returned by its getFile(), is loaded
 * @return the PDDocument for the PDF on disk, or null when the resource provides no file
 * @throws IllegalArgumentException
 *             - if the file exists in the resource but cannot be read
 * @throws IOException
 *             - if an I/O problem occurs
 */
public PDDocument getDocument(Resource pdfResource) throws IOException {
    File file = pdfResource.getFile();
    if (file == null) {
        return null;
    }
    if (!file.canRead()) {
        // Separator added so the path does not run into the message text.
        throw new IllegalArgumentException("PDF document is unreadable: " + file.getPath());
    }
    return PDDocument.load(file);
}
}
Loading