In [1]:
!pip install transformers torch Pillow tqdm
!git clone https://github.com/deepseek-ai/DeepSeek-VL2.git
%cd DeepSeek-VL2
!pip install -e .
%cd ..

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [4]:
%cd /content/DeepSeek-VL2

/content/DeepSeek-VL2


In [9]:
import os
import torch
import PIL.Image
from tqdm import tqdm
from transformers import AutoModelForCausalLM
from deepseek_vl2.models import DeepseekVLV2Processor, DeepseekVLV2ForCausalLM
from google.colab import files
import glob

class HandwrittenTextExtractor:
    def __init__(self, model_path: str, chunk_size: int = 512, device: str = "cuda"):
        """Initialize the text extractor with DeepSeek VL2 model."""
        self.device = device
        self.dtype = torch.bfloat16
        self.chunk_size = chunk_size

        print(f"Loading model and processor from {model_path}...")
        self.processor = DeepseekVLV2Processor.from_pretrained(
            model_path, trust_remote_code=True
        )
        self.model = DeepseekVLV2ForCausalLM.from_pretrained(
            model_path, trust_remote_code=True, torch_dtype=self.dtype
        ).to(device).eval()

        self.tokenizer = self.processor.tokenizer
        print("Model loaded successfully!")

    def process_image(self, image_path: str) -> str:
        """Extract handwritten text from an image."""
        try:
            pil_image = PIL.Image.open(image_path).convert("RGB")

            conversation = [
                {
                    "role": "<|User|>",
                    "content": "<image>\nPlease transcribe all handwritten text from this image.",
                    "images": [image_path]
                },
                {"role": "<|Assistant|>", "content": ""}
            ]

            prepare_inputs = self.processor(
                conversations=conversation,
                images=[pil_image],
                force_batchify=True,
                system_prompt=""
            ).to(self.device)

            with torch.no_grad():
                inputs_embeds, past_key_values = self.model.incremental_prefilling(
                    input_ids=prepare_inputs.input_ids,
                    images=prepare_inputs.images,
                    images_seq_mask=prepare_inputs.images_seq_mask,
                    images_spatial_crop=prepare_inputs.images_spatial_crop,
                    attention_mask=prepare_inputs.attention_mask,
                    chunk_size=self.chunk_size
                )

                outputs = self.model.generate(
                    inputs_embeds=inputs_embeds,
                    input_ids=prepare_inputs.input_ids,
                    images=prepare_inputs.images,
                    images_seq_mask=prepare_inputs.images_seq_mask,
                    images_spatial_crop=prepare_inputs.images_spatial_crop,
                    attention_mask=prepare_inputs.attention_mask,
                    past_key_values=past_key_values,
                    pad_token_id=self.tokenizer.eos_token_id,
                    bos_token_id=self.tokenizer.bos_token_id,
                    eos_token_id=self.tokenizer.eos_token_id,
                    max_new_tokens=512,
                    do_sample=True,
                    temperature=0.4,
                    top_p=0.9,
                    repetition_penalty=1.1,
                    use_cache=True,
                )

            response = self.tokenizer.decode(
                outputs[0][len(prepare_inputs.input_ids[0]):].cpu().tolist(),
                skip_special_tokens=True
            )
            return response.strip()

        except Exception as e:
            print(f"Error processing image {image_path}: {str(e)}")
            return f"Error processing image: {str(e)}"

def process_directory(input_dir: str, output_dir: str, model_path: str, chunk_size: int):
    """Process all images in a directory and save extracted text to output directory."""
    os.makedirs(output_dir, exist_ok=True)
    extractor = HandwrittenTextExtractor(model_path, chunk_size)

    image_files = [f for f in os.listdir(input_dir)
                   if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.webp'))]

    if not image_files:
        print(f"No image files found in {input_dir}")
        return

    print(f"Found {len(image_files)} images to process")

    for image_file in tqdm(image_files, desc="Processing images"):
        image_path = os.path.join(input_dir, image_file)
        output_path = os.path.join(output_dir, f"{os.path.splitext(image_file)[0]}.txt")

        extracted_text = extractor.process_image(image_path)

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(extracted_text)

        print(f"Processed {image_file} -> {os.path.basename(output_path)}")

# === STEP 1: SETUP DIRECTORIES ===
input_dir = "/content/data"
output_dir = "/content/output"
os.makedirs(input_dir, exist_ok=True)
os.makedirs(output_dir, exist_ok=True)

# === STEP 2: UPLOAD IMAGES ===
print("Please upload your images:")
uploaded_files = files.upload()

for filename in uploaded_files.keys():
    os.rename(filename, os.path.join(input_dir, filename))

print(f"Uploaded {len(uploaded_files)} files.")

# === STEP 3: RUN THE MODEL ===
model_path = "deepseek-ai/deepseek-vl2-tiny"  # Change if using a different model
chunk_size = 512  # Adjust based on available GPU memory

print("Starting text extraction...")
process_directory(input_dir, output_dir, model_path, chunk_size)

# === STEP 4: DOWNLOAD EXTRACTED TEXT FILES ===
print("Downloading extracted text files...")
for file in glob.glob(f"{output_dir}/*.txt"):
    files.download(file)

print("Processing complete! ✅")


Please upload your images:


Saving 2.png to 2.png
Uploaded 1 files.
Starting text extraction...
Loading model and processor from deepseek-ai/deepseek-vl2-tiny...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/165k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/6.27M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


processor_config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

Add pad token = ['<｜▁pad▁｜>'] to the tokenizer
<｜▁pad▁｜>:2
Add image token = ['<image>'] to the tokenizer
<image>:128815
Add grounding-related tokens = ['<|ref|>', '<|/ref|>', '<|det|>', '<|/det|>', '<|grounding|>'] to the tokenizer with input_ids
<|ref|>:128816
<|/ref|>:128817
<|det|>:128818
<|/det|>:128819
<|grounding|>:128820
Add chat tokens = ['<|User|>', '<|Assistant|>'] to the tokenizer with input_ids
<|User|>:128821
<|Assistant|>:128822



The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


config.json:   0%|          | 0.00/2.29k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/247k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/1 [00:00<?, ?it/s]

model-00001-of-000001.safetensors:   0%|          | 0.00/6.74G [00:00<?, ?B/s]

Model loaded successfully!
Found 1 images to process


Processing images:   0%|          | 0/1 [00:00<?, ?it/s]You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Processing images: 100%|██████████| 1/1 [00:14<00:00, 14.04s/it]

Error processing image /content/data/2.png: No operator found for `memory_efficient_attention_forward` with inputs:
     query       : shape=(10, 729, 16, 72) (torch.bfloat16)
     key         : shape=(10, 729, 16, 72) (torch.bfloat16)
     value       : shape=(10, 729, 16, 72) (torch.bfloat16)
     attn_bias   : <class 'NoneType'>
     p           : 0.0
`decoderF` is not supported because:
    attn_bias type is <class 'NoneType'>
    bf16 is only supported on A100+ GPUs
`flshattF@v2.3.0` is not supported because:
    requires device with capability > (8, 0) but your GPU has capability (7, 5) (too old)
    bf16 is only supported on A100+ GPUs
`tritonflashattF` is not supported because:
    requires device with capability > (8, 0) but your GPU has capability (7, 5) (too old)
    bf16 is only supported on A100+ GPUs
    operator wasn't built - see `python -m xformers.info` for more info
    triton is not available
    requires GPU with sm80 minimum compute capacity, e.g., A100/H100/L4
`c




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Processing complete! ✅
