In [2]:
import textwrap
import torch
from transformers import AutoTokenizer
from PIL import Image
import importlib.util
import sys
from decord import VideoReader, cpu  # For video handling

# Local checkout that holds the model weights and its Python sources.
# It is appended to sys.path so the directory is importable.
model_dir = '/home/raw/Desktop/Coding/military_int_icc/shakti-2B-041224'
sys.path.append(model_dir)

# Source files that define the custom configuration and model classes
config_module_path = f"{model_dir}/configuration_shakti.py"
model_module_path = f"{model_dir}/modeling_shakti.py"


def _load_source_module(module_name, source_path):
    """Import the Python file at ``source_path`` under the name ``module_name``.

    The module is registered in ``sys.modules`` before its code is executed,
    which is what lets a later ``from <module_name> import ...`` find it.
    Returns the ``(spec, module)`` pair.
    """
    spec = importlib.util.spec_from_file_location(module_name, source_path)
    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module
    spec.loader.exec_module(module)
    return spec, module


# Configuration first, then the model module, in the same order as before.
spec_config, config_module = _load_source_module("shaktiConfig", config_module_path)
spec_model, model_module = _load_source_module("shaktiModel", model_module_path)

from shaktiConfig import shaktiConfig
from shaktiModel import shaktiModel

# Read the custom configuration from the checkout
config = shaktiConfig.from_pretrained(model_dir)

# Build the model from that configuration: SDPA attention, half-precision dtype
model = shaktiModel.from_pretrained(
    model_dir,
    config=config,
    attn_implementation='sdpa',
    torch_dtype=torch.half,
)
# Evaluation mode, then move to the CUDA device
model.eval().cuda()

# The processor is created by the model itself from the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_dir)
processor = model.init_processor(tokenizer)

# Video processing parameters
MAX_NUM_FRAMES = 16  # upper bound on frames sampled from a video
video_path = '/home/raw/Downloads/Unbelievable Camouflage Skills.mp4'  # Update with the actual video path

def encode_video(video_path, max_num_frames=None):
    """Decode a video file into a list of sampled PIL images.

    Candidate frames are picked at a stride equal to the rounded average FPS
    of the clip (about one frame per second of footage). If that yields more
    candidates than the frame budget, they are thinned to exactly the budget,
    spread uniformly across the clip.

    Args:
        video_path: Path of the video file, passed to decord's ``VideoReader``.
        max_num_frames: Upper bound on the number of frames returned.
            Defaults to the module-level ``MAX_NUM_FRAMES``.

    Returns:
        A list of ``PIL.Image.Image`` objects, at most ``max_num_frames`` long.

    Raises:
        ValueError: If ``max_num_frames`` is smaller than 1.
    """
    if max_num_frames is None:
        max_num_frames = MAX_NUM_FRAMES
    if max_num_frames < 1:
        raise ValueError(f"max_num_frames must be >= 1, got {max_num_frames}")

    def uniform_sample(items, n):
        # Split ``items`` into ``n`` equal-width bins and take the element at
        # the centre of each bin.
        gap = len(items) / n
        return [items[int(i * gap + gap / 2)] for i in range(n)]

    vr = VideoReader(video_path, ctx=cpu(0))
    # Clamp the stride to 1: an average FPS below 0.5 rounds to 0, and
    # range() rejects a zero step.
    sample_fps = max(1, round(vr.get_avg_fps()))
    frame_idx = list(range(0, len(vr), sample_fps))
    if len(frame_idx) > max_num_frames:
        frame_idx = uniform_sample(frame_idx, max_num_frames)
    frames = vr.get_batch(frame_idx).asnumpy()
    frames = [Image.fromarray(v.astype('uint8')) for v in frames]
    print(f"Encoded {len(frames)} frames from video: {video_path}")
    return frames

# Sample the frames once; every query below reuses them.
video_frames = encode_video(video_path)

# Questions to ask about the video
queries = [
    "Give me every detail of camouflage soldier if any in detail?",
]

# Extra entries merged into the processor output before generation
generation_options = {
    'tokenizer': tokenizer,
    'max_new_tokens': 200,
    'decode_text': True,
}
# Printed after each answer to separate consecutive queries
separator = "\n" + "-" * 80 + "\n"

for idx, query in enumerate(queries, start=1):
    print(f"Query {idx}: {query}")

    # One user turn carrying the video placeholder plus the question,
    # followed by an empty assistant turn.
    user_turn = {"role": "user", "content": f"""<|video|> {query}"""}
    assistant_turn = {"role": "assistant", "content": ""}
    messages = [user_turn, assistant_turn]

    # Build the inputs for this query and move them to the GPU
    inputs = processor(messages, images=None, videos=[video_frames])
    inputs.to('cuda')
    inputs.update(generation_options)

    outputs = model.generate(**inputs)
    print("Answer:")
    # Print the first result one fragment at a time (split on '.'), each
    # wrapped to 100 columns; blank fragments are skipped.
    fragments = (piece.strip() for piece in outputs[0].split('.'))
    for sentence in filter(None, fragments):
        print(textwrap.fill(sentence, width=100))
    print(separator)


Encoded 16 frames from video: /home/raw/Downloads/Unbelievable Camouflage Skills.mp4
Query 1: Give me every detail of camouflage soldier if any in detail?
Answer:
The video shows a group of soldiers in camouflage uniforms, which are designed to blend in with
their surroundings
The uniforms are primarily dark in color, with some lighter shades used for the patterns and
markings
The soldiers are wearing helmets and carrying weapons, which are not clearly visible in the video
The camouflage is effective in hiding the soldiers from the viewer, making it difficult to discern
their exact location or activity

--------------------------------------------------------------------------------

