Skip to content

Commit

Permalink
patch for tesseract-ocr#660 textonly_pdf
Browse files Browse the repository at this point in the history
  • Loading branch information
Wikinaut committed Jan 20, 2017
1 parent a979494 commit 5e80891
Show file tree
Hide file tree
Showing 7 changed files with 40 additions and 24 deletions.
4 changes: 2 additions & 2 deletions api/capi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,9 +64,9 @@ TESS_API TessResultRenderer* TESS_CALL TessHOcrRendererCreate2(const char* outpu
return new TessHOcrRenderer(outputbase, font_info);
}

TESS_API TessResultRenderer* TESS_CALL TessPDFRendererCreate(const char* outputbase, const char* datadir)
TESS_API TessResultRenderer* TESS_CALL TessPDFRendererCreate(const char* outputbase, const char* datadir, bool textonly)
{
return new TessPDFRenderer(outputbase, datadir);
return new TessPDFRenderer(outputbase, datadir, textonly);
}

TESS_API TessResultRenderer* TESS_CALL TessUnlvRendererCreate(const char* outputbase)
Expand Down
2 changes: 1 addition & 1 deletion api/capi.h
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ TESS_API void TESS_CALL TessDeleteBlockList(BLOCK_LIST* block_list);
TESS_API TessResultRenderer* TESS_CALL TessTextRendererCreate(const char* outputbase);
TESS_API TessResultRenderer* TESS_CALL TessHOcrRendererCreate(const char* outputbase);
TESS_API TessResultRenderer* TESS_CALL TessHOcrRendererCreate2(const char* outputbase, BOOL font_info);
TESS_API TessResultRenderer* TESS_CALL TessPDFRendererCreate(const char* outputbase, const char* datadir);
TESS_API TessResultRenderer* TESS_CALL TessPDFRendererCreate(const char* outputbase, const char* datadir, bool textonly);
TESS_API TessResultRenderer* TESS_CALL TessUnlvRendererCreate(const char* outputbase);
TESS_API TessResultRenderer* TESS_CALL TessBoxTextRendererCreate(const char* outputbase);

Expand Down
41 changes: 26 additions & 15 deletions api/pdfrenderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -178,10 +178,12 @@ const int kCharWidth = 2;
* PDF Renderer interface implementation
**********************************************************************/

TessPDFRenderer::TessPDFRenderer(const char* outputbase, const char *datadir)
TessPDFRenderer::TessPDFRenderer(const char *outputbase, const char *datadir,
bool textonly)
: TessResultRenderer(outputbase, "pdf") {
obj_ = 0;
datadir_ = datadir;
textonly_ = textonly;
offsets_.push_back(0);
}

Expand Down Expand Up @@ -326,7 +328,11 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api,
pdf_str.add_str_double("", prec(width));
pdf_str += " 0 0 ";
pdf_str.add_str_double("", prec(height));
pdf_str += " 0 0 cm /Im1 Do Q\n";
pdf_str += " 0 0 cm";
// "/Im1 Do" is the operator sequence that draws the page image; it is
// omitted when the renderer was created with textonly set.
if (!textonly_) {
pdf_str += " /Im1 Do";
}
pdf_str += " Q\n";

int line_x1 = 0;
int line_y1 = 0;
Expand Down Expand Up @@ -837,6 +843,7 @@ bool TessPDFRenderer::imageToPDFObj(Pix *pix,
bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) {
size_t n;
char buf[kBasicBufSize];
char buf2[kBasicBufSize];
Pix *pix = api->GetInputImage();
char *filename = (char *)api->GetInputName();
int ppi = api->GetSourceYResolution();
Expand All @@ -845,6 +852,9 @@ bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) {
double width = pixGetWidth(pix) * 72.0 / ppi;
double height = pixGetHeight(pix) * 72.0 / ppi;

// Resource entry that maps the name /Im1 to the image object (obj_ + 2).
// PDF dictionary keys are name objects and need the leading '/', so the
// key must be written as "/XObject".
snprintf(buf2, sizeof(buf2), "/XObject << /Im1 %ld 0 R >>\n", obj_ + 2);
// In text-only mode the entry is left out of the page's /Resources.
const char *xobject = (textonly_) ? "" : buf2;

// PAGE
n = snprintf(buf, sizeof(buf),
"%ld 0 obj\n"
Expand All @@ -855,19 +865,18 @@ bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) {
" /Contents %ld 0 R\n"
" /Resources\n"
" <<\n"
" /XObject << /Im1 %ld 0 R >>\n"
" %s"
" /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n"
" /Font << /f-0-0 %ld 0 R >>\n"
" >>\n"
">>\n"
"endobj\n",
obj_,
2L, // Pages object
width,
height,
obj_ + 1, // Contents object
obj_ + 2, // Image object
3L); // Type0 Font
2L, // Pages object
width, height,
obj_ + 1, // Contents object
xobject, // Image object
3L); // Type0 Font
if (n >= sizeof(buf)) return false;
pages_.push_back(obj_);
AppendPDFObject(buf);
Expand Down Expand Up @@ -904,13 +913,15 @@ bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) {
objsize += strlen(b2);
AppendPDFObjectDIY(objsize);

char *pdf_object;
if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize)) {
return false;
// The image object is only produced when images are wanted; with textonly_
// set this whole step is skipped.
if (!textonly_) {
char *pdf_object = nullptr;
// Give up on this page if imageToPDFObj reports failure.
if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize)) {
return false;
}
// Pass the object bytes on, then do the per-object bookkeeping.
AppendData(pdf_object, objsize);
AppendPDFObjectDIY(objsize);
// pdf_object is released with delete[] here once it has been appended.
delete[] pdf_object;
}
AppendData(pdf_object, objsize);
AppendPDFObjectDIY(objsize);
delete[] pdf_object;
return true;
}

Expand Down
8 changes: 4 additions & 4 deletions api/renderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ class TESS_API TessPDFRenderer : public TessResultRenderer {
public:
// datadir is the location of the TESSDATA. We need it because
// we load a custom PDF font from this location.
TessPDFRenderer(const char *outputbase, const char *datadir);
TessPDFRenderer(const char* outputbase, const char* datadir, bool textonly);

protected:
virtual bool BeginDocumentHandler();
Expand All @@ -196,20 +196,20 @@ class TESS_API TessPDFRenderer : public TessResultRenderer {
private:
// We don't want to have every image in memory at once,
// so we store some metadata as we go along producing
// PDFs one page at a time. At the end that metadata is
// PDFs one page at a time. At the end, that metadata is
// used to make everything that isn't easily handled in a
// streaming fashion.
long int obj_; // counter for PDF objects
GenericVector<long int> offsets_; // offset of every PDF object in bytes
GenericVector<long int> pages_; // object number for every /Page object
const char *datadir_; // where to find the custom font
bool textonly_; // skip images if set
// Bookkeeping only. DIY = Do It Yourself.
void AppendPDFObjectDIY(size_t objectsize);
// Bookkeeping + emit data.
void AppendPDFObject(const char *data);
// Create the /Contents object for an entire page.
static char* GetPDFTextObjects(TessBaseAPI* api,
double width, double height);
char* GetPDFTextObjects(TessBaseAPI* api, double width, double height);
// Turn an image into a PDF object. Only transcode if we have to.
static bool imageToPDFObj(Pix *pix, char *filename, long int objnum,
char **pdf_object, long int *pdf_object_size);
Expand Down
6 changes: 4 additions & 2 deletions api/tesseractmain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -348,8 +348,10 @@ void PreloadRenderers(

api->GetBoolVariable("tessedit_create_pdf", &b);
if (b) {
renderers->push_back(
new tesseract::TessPDFRenderer(outputbase, api->GetDatapath()));
// Start from false (regular PDF with images) so the value is well defined
// even if the lookup below does not write to its output argument.
// NOTE(review): confirm GetBoolVariable's behavior for unknown names.
bool textonly = false;
api->GetBoolVariable("textonly_pdf", &textonly);
renderers->push_back(new tesseract::TessPDFRenderer(
outputbase, api->GetDatapath(), textonly));
}

api->GetBoolVariable("tessedit_write_unlv", &b);
Expand Down
2 changes: 2 additions & 0 deletions ccmain/tesseractclass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -389,6 +389,8 @@ Tesseract::Tesseract()
this->params()),
BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",
this->params()),
BOOL_MEMBER(textonly_pdf, false, "Invisible text only for PDF",
this->params()),
STRING_MEMBER(unrecognised_char, "|",
"Output char for unidentified blobs", this->params()),
INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()),
Expand Down
1 change: 1 addition & 0 deletions ccmain/tesseractclass.h
Original file line number Diff line number Diff line change
Expand Up @@ -1026,6 +1026,7 @@ class Tesseract : public Wordrec {
BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
BOOL_VAR_H(tessedit_create_tsv, false, "Write .tsv output file");
BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
BOOL_VAR_H(textonly_pdf, false, "Invisible text only for PDF");
STRING_VAR_H(unrecognised_char, "|",
"Output char for unidentified blobs");
INT_VAR_H(suspect_level, 99, "Suspect marker level");
Expand Down

0 comments on commit 5e80891

Please sign in to comment.