In [1]:
from docling.datamodel.base_models import InputFormat
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode
from docling_core.transforms.chunker import HierarchicalChunker
from typing import List, Dict, Any, Optional
from dataclasses import dataclass
from pathlib import Path
import json
from tqdm import tqdm
import time
from concurrent.futures import ThreadPoolExecutor
import logging
# PDF pipeline settings: table-structure recognition is switched on and the
# TableFormer mode is set to ACCURATE.
pipeline_options = PdfPipelineOptions(do_table_structure=True)
pipeline_options.table_structure_options.mode = TableFormerMode.ACCURATE

# Converter that applies the options above to PDF inputs.
doc_converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)},
)

In [None]:
from transformers import AutoTokenizer
from huggingface_hub import login
from PIL import Image
import torch
from colpali_engine import ColPali
from transformers.models.paligemma.configuration_paligemma import PaliGemmaConfig
import gradio as gr

def setup_models():
    """Build the ColPali model and load Qwen2-VL together with its tokenizer.

    Returns:
        tuple: ``(colpali, qwen_model, qwen_tokenizer)``, in that order.

    Raises:
        Exception: any error raised while constructing or loading the models
            is printed and then re-raised unchanged.
    """
    qwen_model_id = "Qwen/Qwen2-VL-7B"

    try:
        pali_config = PaliGemmaConfig(
            vocab_size=32000,
            hidden_size=4096,
            intermediate_size=11008,
            num_hidden_layers=32,
            num_attention_heads=32,
            max_position_embeddings=8192,
            rms_norm_eps=1e-6,
            use_cache=True,
            pad_token_id=0,
            bos_token_id=1,
            eos_token_id=2,
            tie_word_embeddings=False,
            use_memory_efficient_attention=True,
            hidden_act="silu"
        )

        # Initialise ColPali.
        # NOTE(review): the model is built from a hand-written config rather
        # than via from_pretrained, so presumably no pretrained weights are
        # loaded here — confirm this is intended.
        colpali = ColPali(config=pali_config)

        # Load Qwen2-VL directly from transformers. The generation class is
        # exported as Qwen2VLForConditionalGeneration; importing the
        # previously used name Qwen2VLForCausalLM fails with ImportError.
        from transformers import Qwen2VLForConditionalGeneration

        qwen_model = Qwen2VLForConditionalGeneration.from_pretrained(
            qwen_model_id,
            device_map="auto",
            trust_remote_code=True,
            torch_dtype=torch.float16
        )

        # NOTE(review): process_image calls this object with `images=...`;
        # a plain tokenizer may not accept image input, in which case an
        # AutoProcessor would be needed — confirm against the caller.
        qwen_tokenizer = AutoTokenizer.from_pretrained(
            qwen_model_id,
            trust_remote_code=True
        )

        return colpali, qwen_model, qwen_tokenizer

    except Exception as e:
        # Report the failure, then let the caller see the original exception.
        print(f"Ошибка при загрузке моделей: {e}")
        raise

def process_image(image, colpali, qwen_model, qwen_tokenizer):
    """Run one image through ColPali and Qwen2-VL and return a text report.

    Args:
        image: the image to analyse. ``None`` (nothing supplied) is reported
            as an error message instead of being passed to the models.
        colpali: object whose ``process_image(image)`` result is placed first
            in the report.
        qwen_model: model exposing ``device`` and ``generate(...)``.
        qwen_tokenizer: callable that encodes the prompt and image, and that
            exposes ``decode(...)``.

    Returns:
        str: the ColPali result and the generated description separated by a
        blank line, or a human-readable error message if anything fails.
        Exceptions are not propagated.
    """
    if image is None:
        return "Ошибка: изображение не загружено."

    try:
        # ColPali pass.
        # NOTE(review): this assumes the object passed as `colpali` provides
        # a `process_image` method — verify against what setup_models returns.
        colpali_result = colpali.process_image(image)

        # Qwen2-VL pass.
        qwen_inputs = qwen_tokenizer(
            text="Describe this image in detail:",
            images=image,
            return_tensors="pt"
        ).to(qwen_model.device)

        with torch.no_grad():
            qwen_outputs = qwen_model.generate(
                **qwen_inputs,
                max_new_tokens=100,
                num_return_sequences=1,
                do_sample=True,
                temperature=0.7,
                top_p=0.9
            )

        # The generated sequence starts with the prompt tokens; drop them so
        # that only the newly generated text is decoded.
        prompt_length = qwen_inputs["input_ids"].shape[-1]
        generated_ids = qwen_outputs[0][prompt_length:]
        qwen_result = qwen_tokenizer.decode(generated_ids, skip_special_tokens=True)

        return f"\n{colpali_result}\n\n{qwen_result}\n"

    except Exception as e:
        # Return text (not the exception object) so a text output can show it.
        return f"Ошибка при обработке изображения: {type(e).__name__}: {e}"

def main():
    """Log in, load the models and launch the Gradio demo.

    Calls ``login_to_hf()`` (not defined in this part of the notebook; it has
    to be available before this runs), loads the three objects returned by
    ``setup_models()``, and serves ``process_image`` through a Gradio
    interface launched with ``share=True``.
    """
    login_to_hf()

    retriever, vlm, vlm_tokenizer = setup_models()

    # UI components: a PIL image in, a labelled text box out.
    image_input = gr.Image(type="pil")
    text_output = gr.Textbox(label="Результат")

    app = gr.Interface(
        # Bind the loaded models so the UI callback only needs the image.
        fn=lambda img: process_image(img, retriever, vlm, vlm_tokenizer),
        inputs=image_input,
        outputs=text_output,
        title="ColPali + Qwen2-VL Demo",
        description="Загрузите изображение для анализа обеими моделями",
    )

    app.launch(share=True)

# Start the demo only when this code runs as the main module.
if __name__ == "__main__":
    main()