# DeHaDO-AI || NCVPRIPG-2025
[Challenge Website](https://sites.google.com/view/dehado-ai)

## Load all the Libraries

In [None]:
import os
import json
import tqdm
import random

import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

In [None]:
from prompt import load_prompt
from internvl import *

## Define all the relevant paths

In [None]:
# Root directory that holds every dataset folder of the challenge.
BASE_PATH = '/DATA/gyan/GP/ncvpripg2025/dehado/'

# Dataset folders below the root: the two training phases and the test images.
PATH_PHASE_1 = os.path.join(BASE_PATH, 'DEHADO-AI_TRAINING_DATASET')
PATH_PHASE_2 = os.path.join(BASE_PATH, 'DEHADO-AI_TRAINING_DATASET_PHASE_II')
PATH_PHASE_TEST = os.path.join(BASE_PATH, 'IMAGES')

### Select the image folder we want to work with

In [None]:
# Must be one of PATH_PHASE_1, PATH_PHASE_2 or PATH_PHASE_TEST; any other value
# makes the following cells raise.
SELECTED_IMAGE_FOLDER = PATH_PHASE_TEST # CHANGE THE SELECTION HERE

In [None]:
# Each selectable image folder has its own directory for the OCR results.
output_dirs_by_folder = {
    PATH_PHASE_1: '/DATA/gyan/GP/ncvpripg2025/dehado/output_phase_3_01',
    PATH_PHASE_2: '/DATA/gyan/GP/ncvpripg2025/dehado/output_phase_3_02',
    PATH_PHASE_TEST: '/DATA/gyan/GP/ncvpripg2025/dehado/output_phase_test',
}

# Reject anything that is not one of the known folders before going further.
if SELECTED_IMAGE_FOLDER not in output_dirs_by_folder:
    raise Exception('Wrong choice of Image Folder!')

output_path = output_dirs_by_folder[SELECTED_IMAGE_FOLDER]

print(f'The Output Path is: {output_path}')

### Choose the Image Folder

In [None]:
# The two training phases keep their images in an 'IMAGES_750' sub-folder,
# whereas the test folder contains the images directly.
if SELECTED_IMAGE_FOLDER in (PATH_PHASE_1, PATH_PHASE_2):
    IMG_PATH = os.path.join(SELECTED_IMAGE_FOLDER, 'IMAGES_750')
elif SELECTED_IMAGE_FOLDER == PATH_PHASE_TEST:
    IMG_PATH = SELECTED_IMAGE_FOLDER
else:
    raise Exception('Error! Wrong folder is selected.')

### Get all the images

In [None]:
# os.listdir returns the entries in arbitrary order; sort them so that every
# run walks the images in the same, reproducible order.
all_images = sorted(os.listdir(IMG_PATH))
print(f'There are {len(all_images)} images present.')

## Load the Prompt File for the InternVL

In [None]:
# load_prompt is provided by the local `prompt` module imported above.
prompt = load_prompt()

### Build the question for the InternVL

In [None]:
# Prefix the prompt with the '<image>' tag on its own line; presumably this is
# the placeholder InternVL expects for the image input (confirm against the
# model card). The final question is printed for a quick visual check.
question = '<image>\n' + prompt
print(question)

## Load the InternVL Model

In [None]:
# Hugging Face identifier of the checkpoint used for both model and tokenizer.
path = 'OpenGVLab/InternVL3-8B'

# Load the weights in bfloat16, then switch to inference mode and move the
# model to the GPU.
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True,
)
model = model.eval()
model = model.cuda()

# The tokenizer is loaded from the same checkpoint, with the fast tokenizer disabled.
tokenizer = AutoTokenizer.from_pretrained(
    path,
    trust_remote_code=True,
    use_fast=False,
)

## Which images have already been OCRed?
1. Each output is saved as a separate `.txt` file.
2. Before running, we always check which images have already been processed.
3. This way, we only have to run the images that are left, even if the for loop breaks due to any issue.

In [None]:
# Make sure the output folder exists: on a first run it has not been created
# yet, and both the listing below and the later result writes depend on it.
os.makedirs(output_path, exist_ok=True)

# The files found here are treated as results of earlier runs, so the matching
# images can be skipped.
ocr_done = os.listdir(output_path)
print(f'OCR is already done for : {len(ocr_done)} images.')

In [None]:
def get_remaining_images(all_images, ocr_done):
    """Return the images that do not have an OCR result yet.

    Images and results are matched on the file name without its last
    extension, so 'scan_01.jpg' counts as done when 'scan_01.txt' exists.

    Args:
        all_images: File names of all candidate images.
        ocr_done: File names of the results that already exist.

    Returns:
        The entries of ``all_images`` (original names, original order) whose
        base name does not appear among the base names of ``ocr_done``.
    """
    # Base names of everything that has been processed already.
    finished = set()
    for done_name in ocr_done:
        stem, _ext = os.path.splitext(done_name)
        finished.add(stem)

    # Keep every image whose base name is not among the finished ones.
    pending = []
    for image_name in all_images:
        if os.path.splitext(image_name)[0] in finished:
            continue
        pending.append(image_name)

    return pending


### List all the images that still have to be passed to InternVL

In [None]:
# Images for which no result file exists yet.
left_images = get_remaining_images(
    all_images=all_images,
    ocr_done=ocr_done,
)
print(f'We have a total of {len(left_images)} images which are left to be OCRed.')

In [None]:
# Names of the images that failed at any stage (loading, generation or saving).
error_files = []

# The generation settings do not depend on the image, so they are built once.
# max_new_tokens can be changed; the eos and pad token ids are set explicitly
# to resolve the generation warnings.
generation_config = dict(max_new_tokens=1024,
                         do_sample=True,
                         eos_token_id=151645,
                         pad_token_id=151645)

for sel_image in left_images:

    try:
        sel_img_path = os.path.join(IMG_PATH, sel_image)

        # set the max number of tiles in `max_num`
        pixel_values = load_image(sel_img_path,
                                  max_num=12).to(torch.bfloat16).cuda()

        # Generate the response from InternVL. A fresh copy of the settings is
        # handed over so every call starts from the same values even if the
        # callee modifies the dict it receives.
        response = model.chat(tokenizer, pixel_values, question,
                              dict(generation_config))

        # Save the output as a .txt file named after the image
        nm, _ = os.path.splitext(sel_image)
        txt_filename = nm + '.txt'
        txt_path = os.path.join(output_path, txt_filename)

        with open(txt_path, "w", encoding="utf-8") as f:
            f.write(response)
    except Exception as exc:
        # One bad image must not stop the whole run, but the failure is
        # recorded and reported instead of being silently dropped.
        # KeyboardInterrupt/SystemExit are not caught, so the loop can still
        # be stopped by hand.
        error_files.append(sel_image)
        print(f'Failed to process {sel_image}: {exc!r}')

In [None]:
# Summary of the run: number of images collected in error_files by the loop above.
print(f'we have {len(error_files)} errored files.')