In [1]:
!pip install -q transformers torch pillow

In [1]:
from transformers import AutoProcessor, AutoModelForCausalLM
from PIL import Image
import torch
import json
import os
from google.colab import drive

In [2]:
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Usando dispositivo: {device}")

Usando dispositivo: cuda


In [4]:
print("Cargando modelo GIT...")
processor = AutoProcessor.from_pretrained("microsoft/git-large-textcaps")
model = AutoModelForCausalLM.from_pretrained("microsoft/git-large-textcaps")
model.to(device)
print("Modelo cargado exitosamente!")

Cargando modelo GIT...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/503 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


tokenizer_config.json:   0%|          | 0.00/453 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

Modelo cargado exitosamente!


In [5]:
OCR_path = '/content/drive/MyDrive/Imagenes Proyecto/OCR'
IC_path = '/content/drive/MyDrive/Imagenes Proyecto/IC'
annotations_path = '/content/drive/MyDrive/Imagenes Proyecto/annotations.json'

# Rutas de guardado separadas
results_ocr_path = '/content/drive/MyDrive/Imagenes Proyecto/results_ocr_git.json'
results_ic_path = '/content/drive/MyDrive/Imagenes Proyecto/results_ic_git.json'

In [6]:
# Si genera texto coherente = está preentrenado
from PIL import Image

image_path = '/content/drive/MyDrive/Imagenes Proyecto/IC/02.jpg'
img = Image.open(image_path).convert("RGB")

inputs = processor(images=img, return_tensors="pt").to(device)
generated = model.generate(pixel_values=inputs.pixel_values, max_length=30)
caption = processor.batch_decode(generated[0], skip_special_tokens=True)

print(caption)

['', 'a', 'train', 'track', 'with', 'a', 'building', 'in', 'the', 'background', 'and', 'a', 'blue', 'sky', 'with', 'clouds', '.', '']


In [7]:
with open(annotations_path, 'r', encoding='utf-8') as f:
    annotations = json.load(f)

In [14]:
def process_ocr_images(image_folder, annotations_dict, output_path):
    """
    Procesa imágenes para extracción de texto (OCR)

    Args:
        image_folder: carpeta con imágenes OCR
        annotations_dict: diccionario con anotaciones manuales
        output_path: donde guardar resultados

    Returns:
        dict con resultados de OCR
    """
    # Obtener imágenes
    images = sorted([os.path.join(image_folder, img)
                    for img in os.listdir(image_folder)
                    if img.lower().endswith(('.jpg', '.jpeg', '.png'))])

    print(f"\n{'='*60}")
    print(f"PROCESANDO OCR - {len(images)} imágenes")
    print(f"{'='*60}\n")

    # Prompts para OCR
    ocr_prompts = [
        "",
        "the text says"
    ]

    ocr_results = {}

    for idx, image_path in enumerate(images, 1):
        image_name = os.path.basename(image_path)
        print(f"[{idx}/{len(images)}] {image_name}")

        try:
            # Cargar imagen
            img = Image.open(image_path).convert("RGB")

            # Obtener anotación manual
            manual_ocr = annotations_dict.get(image_name, {}).get("ocr", "")

            # Procesar con ambos prompts
            prompt_outputs = []

            for prompt in ocr_prompts:
                # Preparar inputs
                inputs = processor(images=img, text=prompt, return_tensors="pt").to(device)

                # Generar
                generated_ids = model.generate(
                    pixel_values=inputs.pixel_values,
                    max_length=100,
                    num_beams=4,
                    early_stopping=True
                )

                # Decodificar
                output = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

                prompt_outputs.append({
                    "prompt": prompt,
                    "output": output
                })

                print(f"'{prompt[:30]}...'")
                print(f"Output: {output}")

            # Guardar resultado
            ocr_results[image_name] = {
                "image_path": image_path,
                "manual_ocr": manual_ocr,
                "prompt_outputs": prompt_outputs
            }

            # Guardar después de cada imagen (seguridad)
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump({
                    "model": "microsoft/git-large-textcaps",
                    "task": "OCR",
                    "total_images": len(images),
                    "processed_images": idx,
                    "results": ocr_results
                }, f, indent=4, ensure_ascii=False)

            print(f"Guardado en {output_path}\n")

        except Exception as e:
            print(f"ERROR: {e}\n")
            ocr_results[image_name] = {
                "error": str(e)
            }
            continue

    print(f"\n{'='*60}")
    print(f"OCR COMPLETADO: {len(ocr_results)}/{len(images)} imágenes")
    print(f"Guardado en: {output_path}")
    print(f"{'='*60}\n")

    return ocr_results


In [9]:
def process_ic_images(image_folder, annotations_dict, output_path):
    """
    Procesa imágenes para generación de descripciones (IC)

    Args:
        image_folder: carpeta con imágenes IC
        annotations_dict: diccionario con anotaciones manuales
        output_path: donde guardar resultados

    Returns:
        dict con resultados de IC
    """
    # Obtener imágenes
    images = sorted([os.path.join(image_folder, img)
                    for img in os.listdir(image_folder)
                    if img.lower().endswith(('.jpg', '.jpeg', '.png'))])

    print(f"\n{'='*60}")
    print(f"PROCESANDO IMAGE CAPTIONING - {len(images)} imágenes")
    print(f"{'='*60}\n")

    # Prompts para IC en 3 niveles
    ic_prompts = {
        "basic": {
            "prompt": "",  # Sin prompt para básico
            "max_length": 30
        },
        "intermediate": {
            "prompt": "describe this image in detail",
            "max_length": 60
        },
        "advanced": {
            "prompt": "provide a detailed and comprehensive description of this image, including objects, actions, colors, and context",
            "max_length": 100
        }
    }

    ic_results = {}

    for idx, image_path in enumerate(images, 1):
        image_name = os.path.basename(image_path)
        print(f"[{idx}/{len(images)}] {image_name}")

        try:
            # Cargar imagen
            img = Image.open(image_path).convert("RGB")

            # Obtener anotaciones manuales
            manual_ic = annotations_dict.get(image_name, {}).get("ic", {})

            # Procesar en 3 niveles
            prompt_outputs = []

            for level, config in ic_prompts.items():
                prompt = config["prompt"]
                max_len = config["max_length"]

                # Preparar inputs
                if prompt:
                    inputs = processor(images=img, text=prompt, return_tensors="pt").to(device)
                else:
                    inputs = processor(images=img, return_tensors="pt").to(device)

                # Generar
                generated_ids = model.generate(
                    pixel_values=inputs.pixel_values,
                    max_length=max_len,
                    num_beams=4,
                    early_stopping=True
                )

                # Decodificar
                output = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

                prompt_outputs.append({
                    "level": level,
                    "prompt": prompt if prompt else "[no prompt - basic captioning]",
                    "output": output
                })

                print(f"  ├─ {level.upper()}: {output}")

            # Guardar resultado
            ic_results[image_name] = {
                "image_path": image_path,
                "manual_ic": manual_ic,
                "prompt_outputs": prompt_outputs
            }

            # Guardar después de cada imagen (seguridad)
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump({
                    "model": "microsoft/git-large-textcaps",
                    "task": "Image_Captioning",
                    "total_images": len(images),
                    "processed_images": idx,
                    "results": ic_results
                }, f, indent=4, ensure_ascii=False)

            print(f"Guardado en {output_path}\n")

        except Exception as e:
            print(f"ERROR: {e}\n")
            ic_results[image_name] = {
                "error": str(e)
            }
            continue

    print(f"\n{'='*60}")
    print(f"IC COMPLETADO: {len(ic_results)}/{len(images)} imágenes")
    print(f"Guardado en: {output_path}")
    print(f"{'='*60}\n")

    return ic_results


In [15]:
print("\nINICIANDO PROCESAMIENTO DE OCR...\n")
ocr_results = process_ocr_images(OCR_path, annotations, results_ocr_path)


INICIANDO PROCESAMIENTO DE OCR...


PROCESANDO OCR - 15 imágenes

[1/15] 20220101_035959.jpg
'...'
Output: a blue sign on a white wall that says " for order del. com ".
'the text says...'
Output: a blue sign on a white wall that says " for order del. com ".
Guardado en /content/drive/MyDrive/Imagenes Proyecto/results_ocr_git.json

[2/15] 20230214_122422.jpg
'...'
Output: a large sign on a building that says " el museum ".
'the text says...'
Output: a large sign on a building that says " el museum ".
Guardado en /content/drive/MyDrive/Imagenes Proyecto/results_ocr_git.json

[3/15] 20230430_150424.jpg
'...'
Output: a sign that says ' smilodon ' on it
'the text says...'
Output: a sign that says ' smilodon ' on it
Guardado en /content/drive/MyDrive/Imagenes Proyecto/results_ocr_git.json

[4/15] 20230511_133754.jpg
'...'
Output: a black and white informational plaque with the words memory historica on it.
'the text says...'
Output: a black and white informational plaque with the words memo

In [11]:
print("\n INICIANDO PROCESAMIENTO DE IC...\n")
ic_results = process_ic_images(IC_path, annotations, results_ic_path)


 INICIANDO PROCESAMIENTO DE IC...


PROCESANDO IMAGE CAPTIONING - 15 imágenes

[1/15] 01.jpg
  ├─ BASIC: a gold coin that says 500 pesos on it
  ├─ INTERMEDIATE: a gold coin that says 500 pesos on it
  ├─ ADVANCED: a gold coin that says 500 pesos on it
Guardado en /content/drive/MyDrive/Imagenes Proyecto/results_ic_git.json

[2/15] 02.jpg
  ├─ BASIC: an empty railroad track in the middle of a desert.
  ├─ INTERMEDIATE: an empty railroad track in the middle of a desert.
  ├─ ADVANCED: an empty railroad track in the middle of a desert.
Guardado en /content/drive/MyDrive/Imagenes Proyecto/results_ic_git.json

[3/15] 03.jpg
  ├─ BASIC: an older woman ' s hands are resting on a metal bar.
  ├─ INTERMEDIATE: an older woman ' s hands are resting on a metal bar.
  ├─ ADVANCED: an older woman ' s hands are resting on a metal bar.
Guardado en /content/drive/MyDrive/Imagenes Proyecto/results_ic_git.json

[4/15] 04.jpg
  ├─ BASIC: the sun is setting over the water and the sun is setting.
  ├─ INT