In [15]:
!pip install pytesseract
!pip install jiwer
!pip install pdf2image
!apt install poppler-utils

!apt install tesseract-ocr # Install Tesseract OCR
!apt install libtesseract-dev

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.6).
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 29 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.

In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, Dataset
import numpy as np
import cv2
import pytesseract
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
import os # Import the os module
from pdf2image import convert_from_path

# Dataset Class
class OCRDataset(Dataset):
    def __init__(self, image_paths, labels, transform=None):
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        image = cv2.imread(self.image_paths[idx], cv2.IMREAD_GRAYSCALE)
        image = cv2.resize(image, (128, 32))
        if self.transform:
            image = self.transform(image)
        label = self.labels[idx]
        return image, label

# Model: CRNN
class CRNN(nn.Module):
    def __init__(self, num_classes):
        super(CRNN, self).__init__()
        self.cnn = nn.Sequential(
            nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        self.rnn = nn.LSTM(128, 256, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(512, num_classes)

    def forward(self, x):
        x = self.cnn(x)
        x = x.squeeze(2).permute(0, 2, 1)
        x, _ = self.rnn(x)
        x = self.fc(x)
        return x

# Transformer OCR
class TransformerOCR:
    def __init__(self):
        self.processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
        self.model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

    def recognize_text(self, image):
        inputs = self.processor(images=image, return_tensors="pt").pixel_values
        generated_ids = self.model.generate(inputs)
        text = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        return text

# Evaluation Metrics
from jiwer import wer, cer

def evaluate(preds, labels):
    wer_score = wer(labels, preds)
    cer_score = cer(labels, preds)
    return wer_score, cer_score

# Training (Placeholder)
def train():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = CRNN(num_classes=100).to(device)
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CTCLoss()
    for epoch in range(10):
        # Training loop placeholder
        pass
    return model

# OCR Prediction with Tesseract
def tesseract_ocr(image_path):
    # Check if the image file exists
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image file not found: {image_path}") # Raise a FileNotFoundError if the image file does not exist

    image = cv2.imread(image_path)

    # Check if the image was loaded successfully
    if image is None:
        raise ValueError(f"Could not read image file: {image_path}") # Raise a ValueError if the image file could not be read

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    text = pytesseract.image_to_string(gray)
    return text

# Function to convert PDF to images and perform OCR
def process_pdf(pdf_path):
    # Convert PDF to images
    images = convert_from_path(pdf_path)

    all_text = ""  # Store all extracted text

    for i, image in enumerate(images):
        # Save the image temporarily
        image_path = f"temp_page_{i}.jpg"
        image.save(image_path, "JPEG")

        # Perform OCR on the image
        try:
            tesseract_text = tesseract_ocr(image_path)
            trocr = TransformerOCR()
            transformer_text = trocr.recognize_text(cv2.imread(image_path))

            print(f"Page {i + 1}:")
            print("Tesseract OCR Output:", tesseract_text)
            print("Transformer OCR Output:", transformer_text)
            all_text += tesseract_text + transformer_text  # Append to all_text

        except (FileNotFoundError, ValueError) as e:
            print(f"Error processing page {i + 1}: {e}")

        finally:
            # Remove the temporary image file
            os.remove(image_path)

    return all_text  # Return all extracted text


# Example Usage
if __name__ == "__main__":
    pdf_path = "/content/Buendia - Instruccion.pdf"  # Replace with your PDF file path
    extracted_text = process_pdf(pdf_path)
    print("\nAll Extracted Text:\n", extracted_text)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/224 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": false,
  "transformers_version": "4.48.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder

generation_config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Page 1:
Tesseract OCR Output: Lowe

KN
INFINITAM ‘AMABLE
' NINO TESUS.

| SNCS Vos , Dulcifsimo Nifio
GANS | Jesus , queno folo OS Ex1/ai.33:
(GemYe)|| dignafteis de llamaros 38.

| Do@or de los Nifios, ©*4“* *
fino tambien de afsif- *”

tir como Nifio entre los Do&ores,

fe confagra humilde efta pequefa
Iaftruccion de los Nifios. Es afsi,

que ella tambien fe dirige a laju-

ventud ; pero aefta, como recuer-

do delo que aprendio, alos Ni

ios , como precifa explicacion de

lo que deben eftudiar. Por efte fo-
lagitulo.es muy vueftra 5 sy por

fer para Nifios, que confiais a la
educacion de vueftra Compafia,

lo es mucho mas. En Vos, ( Divi-

no Exemplar de todas las virta-

des ) tienen abreviado el mas {e-

q 2 gure

 

 
 

Transformer OCR Output: 0 0


Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": false,
  "transformers_version": "4.48.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder

Page 2:
Tesseract OCR Output: i nstruccion de christiana y politica cortesania con

Luc. ibid.

P fal.114.6.
em 118.130,
», &i8.8.
Mattb.19.
14.
Marci, 10,
14.

Matt. 18.
2. PC.

Dios y con los hombres

guro diffeno ag edad : Ia Reli-
gion paracon Dios en la devora
afsiftécia 4 los Templos;la piedad
con los Padres en la obediencia
mas rendida; gy: modettia, y de~
feo de faber con los mayores,
guftando mas de olr, y pregun-
tar,que de definir,y refolver.Bien
que efto en vuettra infinita Sabi-.
duria fue foberana dignacion , y
en la natural ignorancia de los
Nifios es indifpenfable necefsi-
dad.

Ni tienen folamente en Vos
el diffefio , la luz ,y el exemplo,
fino tambien el amor, y protec-

  
  

cion. Vos, como fingularsMaef-

tro dé los Nifios , les dais enten-
dimiento , y comunicais la fabi-

. duria. Vos les prometeis el Reyno.
de los Cielos , y os indignais con

quien les aparta de Vos, y les

proponeis por norma del can-

dor , inocencia , y chriftiana hu-
3

‘mildad. Vueftro amo

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": false,
  "transformers_version": "4.48.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder

Page 3:
Tesseract OCR Output: erianza ‘dé la Nifiéz: ‘Afsi fea;
Divinifsimo Nifo , por vueftra
pracia, afsi fea, a yueftra mas
yor gloria. Amen.

CENSUS

KN
CENSURA DEL. R. P, ANTONIO CO,
* dornia de la Compania de Fefus , Maef-
tro que fué de Theologta , Examinador,
Synodal de los ObiPados de Gerona , Ura
gel,y Barcelona, Oc,

E orden del Iluftre Sefor Don Frans
cifco de Baftéro y de Vilana, Dr. en
ambos Drechos , Canonigo , y Saeriftan
Dignidad de la Santa Iglefia de Gerona, y;
Vicario General por el lluftrifsimo Sefor,
D. Balthafar de Baftéro y Lledo , Obifpal
de Gerona,del Confejo de fu Mageftad,&c4
He vifto un Librico, cuyo tituloes: In/4
truccion de Chriftiana ,y Politica Corte/aa
nia, &c. Su Author D. Faufto Aguftin dé
Buendia , Colegial que fue en el Imperial
de Cordellas, &c. Y brevemente digo,
no folo que nada contiene contra lake, yj
buenas coftumbres , fino que muy atento,
el Author con entrambas , defcribe ,y en-
fefiatanculta, y difcreta la Virtud, cos
mo fanta la Policia

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": false,
  "transformers_version": "4.48.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder

Page 4:
Tesseract OCR Output: Caballero muy Santos. Por todo lo qual,
a mas de la licencia , que folicita ,’ meu
rece el:Cortefano Zelo del Author muchas
gracias de quantos fe intereffan en‘ tan
‘primorofa , como neceffaria educacion
de la primera edad de los'‘Nobles : de cu-
yo acierto fe deriva la primera utilidad,
y decoro 4 la Republica. Afsi lo ‘fien-
to, falvo, &c. En efte Colegio de San
Martin de Ja Compafia de Jefus de Gero-
na, a 15. de Julio de 1740.

Antonio Codornin, de Ia *
Compania de Fefus.

Die 15. Fulii, 1740.
« Imprimatur.

‘De Baftéro Vic, Gen. & Offic.

CENSUs

OK N |
CENSURA DEL R, P;: MARIANO ALBE-
rich de laCompaiiia de Fefus, Catbedra-
~tico antes de Theologia en el Colegio de
Barcelona, ( oy Prefecto de fus Eftudios )
Retor que fue del Colegio de Cordellas , y
de Gerona , Calificador del Santo Oficio,

Examinador Synodal de muchos Obifpa-
dos, Orc. A a , :

M, P. S.

E orden de V. A. he vifto el Librito,

que conel titulo de Politica,y Chri/-

tiana Cortefania,

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": false,
  "transformers_version": "4.48.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder

Page 5:
Tesseract OCR Output: i nstruccion de christiana y politica cortesania con Dios y con los hombres

hemos de dar en otro mas fevero tribus
nal. Quien a efte pequeio volumen mis
diere por el cuerpo, no le dara la eftima4
cion, que fe merece: pero quien atendie4
re ala mucha alma, que encierra , forma-
ra el debido concepto de fu preciofidad.
Cofas ay , que fe celebran por lo agigan4
tado , como el Colofo : y las ay tambien,
como el Nifo de Bruto , que tuvieron
igual aplaufo, por aver reducido cafi 4
puntos indivifibles todo el artificio de
cofas grandes. Afsi ay Authores , que
han Ilenado de libros grandes los eftan-
tes, acreditandofe Hafta con el oro , que
luce en fus cubiertas : otros , con no
menor acierto,han dado a luz libritos mas
familiares, que fe dexan tratar a todas
horas , companeros infeparables de fus
duefios : los quales ya en las cubiertas
llevan aquella infcripcion , 0 elogio , que
did el acuminofo Marcial a fus efcritos:
Me manus una capit. Y como feria ne-
ceda

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 384,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": false,
  "transformers_version": "4.48.3"
}

Config of the decoder: <class 'transformers.models.trocr.modeling_trocr.TrOCRForCausalLM'> is overwritten by shared decoder config: TrOCRConfig {
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_cross_attention": true,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "classifier_dropout": 0.0,
  "cross_attention_hidden_size": 768,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder

Page 6:
Tesseract OCR Output: los hombres

_ SUMA DEL PRIVILEGIO.

Ts licencia de fu Mageftad Don
Faufto Aguftin de Buendia para que
por una vez pueda imprimir efte Libro,
intitulado : Inffruccion de Chriftiana , y
Politica Cortefania , como mas largamen-
te confta. de fu original. Madrid , y Julio
a 8. de 1740.

D. Pedro Manuel de Contreras,
TASSA., Lm:
“‘Affaron los Sefores del Confejo efte
, Libro , intitulado: Inffruccion de
Chriftiana , y Politica Cortefania, 4 {eis
maravedis cada pliego , como contfta de fu
original. Madrid,y Agofto 4 22. de 1740.

 

 

7 .

D. Pedro Manuel de Contreras,
FRE DE ERRATAS.
Ste Libro, inticulado: Inftruccion de
_s Chriftiana,y Politica Cortefania, cor-
re{ponde en todo cof fu original. Madrid
a 20. de Agofto de 1740.

 

 

 

_ OO ee

Lic. D. Manuel Licardo de Ribera,
Correct. Gen. por fu Mageftad.
| MOTI-

MOTIVO DEL AUTHOR, Y RAZON

de la Obra.

A devocion , y afec&to, que defde mis
tiernos aos profefse a la Sagrada
Religion de la Compania de Jes