In [1]:
import warnings
# Suppress every warning category for the remainder of the kernel session.
# NOTE(review): this also hides deprecation/runtime warnings from the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
import cv2
import numpy as np
import os

class VideoProcessor:
    """Split a video into per-second groups of frames and dump every frame to disk.

    Frames are read with ``cv2.VideoCapture`` and grouped into lists holding one
    second worth of frames each (``round(fps)`` frames per group; the final group
    may be shorter). Every frame is also written to ``output_folder`` as
    ``"<second>,<index>.npy"``.
    """

    def __init__(self, video_path, output_folder):
        """Remember the paths and create ``output_folder`` if it is missing."""
        self.video_path = video_path
        self.output_folder = output_folder

        # Ensure the output directory exists
        os.makedirs(output_folder, exist_ok=True)

    def process_video(self):
        """Read the whole video and return its frames grouped per second.

        Returns:
            list[list]: one inner list per second, each holding the frames
            returned by ``VideoCapture.read`` in playback order.

        Raises:
            ValueError: if the file cannot be opened, or if the container
                reports a frame rate that is not a positive number (which
                previously surfaced as ``ZeroDivisionError``).
        """
        video_cap = cv2.VideoCapture(self.video_path)

        # try/finally guarantees the capture handle is released even when
        # reading or saving a frame fails half-way through.
        try:
            if not video_cap.isOpened():
                raise ValueError("Error opening video file")

            raw_fps = video_cap.get(cv2.CAP_PROP_FPS)
            # `not (x > 0)` also rejects NaN, which some containers report.
            if not raw_fps > 0:
                raise ValueError(f"Video reports an invalid frame rate: {raw_fps!r}")

            # Round instead of truncating so e.g. 29.97 fps groups 30 frames
            # per second rather than 29; never go below one frame per group.
            fps = max(1, int(round(raw_fps)))
            total_frames = int(video_cap.get(cv2.CAP_PROP_FRAME_COUNT))  # Total number of frames
            duration = int(total_frames / raw_fps)  # Duration of the video in seconds

            print(f"FPS: {fps}, Total Frames: {total_frames}, Duration: {duration} seconds")

            all_frames_per_second = []  # List to hold lists of frames for each second
            frame_list = []  # Frames collected for the second currently being filled

            while video_cap.isOpened():
                ret, frame = video_cap.read()
                if not ret:
                    break

                frame_list.append(frame)

                # Once we've collected 'fps' frames (i.e., 1 second worth of frames)
                if len(frame_list) == fps:
                    all_frames_per_second.append(frame_list)
                    self._save_frames(frame_list, len(all_frames_per_second) - 1)
                    frame_list = []  # Start a fresh list for the next second

            # Handling the last set of frames (if video doesn't perfectly divide into seconds)
            if frame_list:
                all_frames_per_second.append(frame_list)
                self._save_frames(frame_list, len(all_frames_per_second) - 1)
        finally:
            video_cap.release()

        return all_frames_per_second

    def _save_frames(self, frame_list, current_sec):
        """Write each frame of one second to ``output_folder`` as a ``.npy`` file.

        Args:
            frame_list: frames belonging to the second ``current_sec``.
            current_sec: zero-based index of the second, used in the file name.
        """
        for idx, frame in enumerate(frame_list):
            filename = f"{self.output_folder}/{current_sec},{idx}.npy"
            np.save(filename, frame)
            print(f"Saved frame {idx} of second {current_sec} to {filename}")

In [3]:
# Split the sample video into per-second frame groups; process_video also writes
# every frame to the 'output2' folder as a .npy file and prints one line per frame.
n1 = VideoProcessor('src/vid1.mp4', 'output2').process_video()


FPS: 25, Total Frames: 1149, Duration: 45 seconds
Saved frame 0 of second 0 to output2/0,0.npy
Saved frame 1 of second 0 to output2/0,1.npy
Saved frame 2 of second 0 to output2/0,2.npy
Saved frame 3 of second 0 to output2/0,3.npy
Saved frame 4 of second 0 to output2/0,4.npy
Saved frame 5 of second 0 to output2/0,5.npy
Saved frame 6 of second 0 to output2/0,6.npy
Saved frame 7 of second 0 to output2/0,7.npy
Saved frame 8 of second 0 to output2/0,8.npy
Saved frame 9 of second 0 to output2/0,9.npy
Saved frame 10 of second 0 to output2/0,10.npy
Saved frame 11 of second 0 to output2/0,11.npy
Saved frame 12 of second 0 to output2/0,12.npy
Saved frame 13 of second 0 to output2/0,13.npy
Saved frame 14 of second 0 to output2/0,14.npy
Saved frame 15 of second 0 to output2/0,15.npy
Saved frame 16 of second 0 to output2/0,16.npy
Saved frame 17 of second 0 to output2/0,17.npy
Saved frame 18 of second 0 to output2/0,18.npy
Saved frame 19 of second 0 to output2/0,19.npy
Saved frame 20 of second 0 to 

In [4]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.models import Model
from PIL import Image
import os

class ReduceRedunant:
    """Thin out a group of frames using MobileNetV2 embeddings.

    Every image is embedded with an ImageNet-pretrained MobileNetV2, a cosine
    similarity matrix is computed, and the ``num_images`` frames whose mean
    similarity to the rest is lowest (i.e. the most distinctive ones) are kept.
    """

    # Feature extractor, built on first use and then reused by this instance so
    # that calling reduce_images in a loop does not reload the network each time.
    _model = None

    def load_mobilenetv2(self):
        """Build MobileNetV2 (no classifier head, global average pooling)."""
        base_model = MobileNetV2(alpha=1.0, include_top=False, weights='imagenet', pooling='avg')
        return Model(inputs=base_model.input, outputs=base_model.output)

    def extract_features(self, img, model):
        """Return the flattened embedding ``model`` predicts for one image.

        Args:
            img: a single H x W x 3 image array.
                NOTE(review): frames read by cv2 are BGR while the ImageNet
                weights presumably expect RGB - confirm whether a channel flip
                is wanted before this call.
            model: the network returned by ``load_mobilenetv2``.
        """
        # MobileNetV2 has its own preprocessing; the module-level
        # `preprocess_input` in this notebook is the ResNet50 variant, which
        # applies a different normalisation and must not be used here.
        from tensorflow.keras.applications.mobilenet_v2 import (
            preprocess_input as mobilenet_preprocess,
        )

        # np.array(...) copies, so in-place preprocessing of float inputs can
        # never modify the caller's frame.
        batch = np.array(img, dtype=np.float32)[np.newaxis, ...]
        batch = mobilenet_preprocess(batch)
        features = model.predict(batch, verbose=0)
        return features.flatten()

    @staticmethod
    def _select_least_similar(similarity_scores, num_images):
        """Indices of the ``num_images`` lowest scores, returned in ascending index order."""
        lowest = np.argsort(similarity_scores, kind="stable")[:num_images]
        # Sorting the chosen indices keeps the surviving frames chronological.
        return np.sort(lowest)

    def reduce_images(self, images: np.ndarray, num_images: int = 5):
        """Keep the ``num_images`` most distinctive images, in their original order.

        Args:
            images: sequence (list or array) of images.
            num_images: how many images to keep.

        Returns:
            list: the selected images. When ``images`` already holds at most
            ``num_images`` items they are all returned without running the model.
        """
        if num_images <= 0 or len(images) == 0:
            return []
        if len(images) <= num_images:
            return list(images)

        if self._model is None:
            self._model = self.load_mobilenetv2()
        model = self._model

        # Extract features for all images
        image_features = [self.extract_features(img, model) for img in images]

        # Mean cosine similarity of every image to all images (itself included)
        similarity_matrix = cosine_similarity(image_features)
        similarity_scores = np.mean(similarity_matrix, axis=1)

        keep = self._select_least_similar(similarity_scores, num_images)
        return [images[idx] for idx in keep]

    @staticmethod
    def _to_uint8(img):
        """Convert an image to uint8 without overflowing.

        Floating-point images are treated as normalised to [0, 1] and scaled by
        255; integer images are assumed to already be in the 0-255 range.
        """
        arr = np.asarray(img)
        if np.issubdtype(arr.dtype, np.floating):
            return (np.clip(arr, 0.0, 1.0) * 255).astype(np.uint8)
        return np.clip(arr, 0, 255).astype(np.uint8)

    def save_images(self, images, save_path='reduced_images', num_images=5):
        """Write ``images`` to ``save_path`` as ``image_1.png``, ``image_2.png``, ...

        Args:
            images: iterable of image arrays (float in [0, 1] or integer 0-255).
            save_path: destination directory, created when missing.
            num_images: unused; kept so existing callers keep working.
        """
        os.makedirs(save_path, exist_ok=True)

        for i, img in enumerate(images):
            # NOTE(review): PIL interprets 3-channel arrays as RGB; frames coming
            # straight from cv2 would be saved with red and blue swapped.
            img_pil = Image.fromarray(self._to_uint8(img))
            img_pil.save(os.path.join(save_path, f'image_{i+1}.png'))

    def generate_test_images(self, num_images=24, image_size=(224, 224, 3)):
        """Create random float images for experimenting with ``reduce_images``.

        The first ten images are drawn from a brighter range (0.5-1.0 plus up to
        0.1 of extra noise); the remaining ones are uniform noise in [0, 1).
        """
        images = []
        for i in range(num_images):
            if i < 10:
                base_image = np.random.rand(*image_size) * 0.5 + 0.5
                variation = np.random.rand(*image_size) * 0.1
                images.append(base_image + variation)
            else:
                random_image = np.random.rand(*image_size)
                images.append(random_image)

        return np.array(images)



2024-10-15 19:49:46.386991: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-10-15 19:49:46.405570: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-10-15 19:49:46.411254: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-10-15 19:49:46.425118: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [None]:
# Reduce every per-second frame group in n1 to at most 5 frames.
# n2 ends up as a list with one entry (a list of frames) per second of video.
reduced_images = ReduceRedunant()
n2= []
count=0
for i in range(len(n1)):
    n21 = reduced_images.reduce_images(n1[i], 5)
    n2.append(n21)
#     reduced_images.save_images(images=n21, save_path='reduced_images', num_images=5)
    # Progress indicator: number of seconds processed so far.
    count+=1
    print(count)

In [None]:
def save_images(image_list, output_folder):
    """Write a nested list of images to ``output_folder`` as numbered JPEG files.

    Files are named ``image_1.jpg``, ``image_2.jpg``, ... with the counter
    running across all sub-lists in iteration order.

    Args:
        image_list: iterable of iterables of image arrays.
        output_folder: destination directory, created when missing.

    Raises:
        OSError: if OpenCV reports that a file could not be written.
    """
    os.makedirs(output_folder, exist_ok=True)

    image_counter = 1

    for img_sublist in image_list:
        for img in img_sublist:
            img_filename = os.path.join(output_folder, f'image_{image_counter}.jpg')

            frame = np.asarray(img)
            if np.issubdtype(frame.dtype, np.floating):
                # Float images with values up to 1 are treated as normalised
                # and scaled to 0-255; any float image is then clipped and cast
                # so OpenCV always receives 8-bit data.
                if frame.max() <= 1:
                    frame = frame * 255
                img_8bit = np.clip(frame, 0, 255).astype(np.uint8)
            else:
                # Integer frames (e.g. uint8 from cv2) are already 0-255. The
                # range heuristic must not be applied to them: a nearly black
                # frame with max value 1 would otherwise be blown up to 255.
                img_8bit = frame

            # cv2.imwrite signals failure through its return value, not an exception.
            if not cv2.imwrite(img_filename, img_8bit):
                raise OSError(f"Could not write image to {img_filename}")

            image_counter += 1
            
# Persist every reduced frame from n2 into the 'reduced_images' folder.
save_images(n2, 'reduced_images')


In [None]:
pip install decord transformers einops timm accelerate>=0.26.0

In [2]:
#pip install decord transformers einops timm accelerate>=0.26.0 sentencepiece -q
#pip install sentencepiece
#importing libraries
import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer
import os
# Per-channel mean and standard deviation handed to T.Normalize in
# ImageProcessor.build_transform.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
class ImageProcessor:
    """Caption groups of image files with a vision-language model.

    The checkpoint named by ``path`` is loaded through ``transformers``
    (``trust_remote_code=True``) in bfloat16 and moved to the GPU. Images are
    cut into square tiles, normalised, and passed to the model's ``chat`` method.
    """

    def __init__(self, path='OpenGVLab/InternVL2-8B'):
        """Load the model and tokenizer for the checkpoint ``path``."""
        self.path = path

        self.model = AutoModel.from_pretrained(
            self.path,
            torch_dtype=torch.bfloat16,
            low_cpu_mem_usage=True,
            use_flash_attn=True,
            trust_remote_code=True).eval().cuda()
        self.tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

    def build_transform(self, input_size):
        """Return the torchvision pipeline applied to every tile.

        Converts to RGB, resizes to ``input_size`` x ``input_size`` (bicubic),
        converts to a tensor and normalises with IMAGENET_MEAN / IMAGENET_STD.
        """
        MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
        transform = T.Compose([
            T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
            T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
            T.ToTensor(),
            T.Normalize(mean=MEAN, std=STD)
        ])
        return transform

    def find_closest_aspect_ratio(self, aspect_ratio, target_ratios, width, height, image_size):
        """Pick the (columns, rows) grid whose ratio is closest to ``aspect_ratio``.

        On an exact tie, the later candidate replaces the current best only if
        the image area exceeds half the area of that candidate's tile grid.
        """
        best_ratio_diff = float('inf')
        best_ratio = (1, 1)
        area = width * height
        for ratio in target_ratios:
            target_aspect_ratio = ratio[0] / ratio[1]
            ratio_diff = abs(aspect_ratio - target_aspect_ratio)
            if ratio_diff < best_ratio_diff:
                best_ratio_diff = ratio_diff
                best_ratio = ratio
            elif ratio_diff == best_ratio_diff:
                if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                    best_ratio = ratio
        return best_ratio

    def dynamic_preprocess(self, image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
        """Resize ``image`` to a tile grid and cut it into square tiles.

        Args:
            image: PIL image (only ``size``, ``resize`` and ``crop`` are used).
            min_num: minimum number of tiles a candidate grid may have.
            max_num: maximum number of tiles a candidate grid may have.
            image_size: side length of each tile in pixels.
            use_thumbnail: when True and more than one tile was produced, also
                append the whole image resized to a single tile.

        Returns:
            list: the tiles in row-major order (plus the optional thumbnail).
        """
        orig_width, orig_height = image.size
        aspect_ratio = orig_width / orig_height

        # Enumerate every (columns, rows) grid with min_num..max_num tiles,
        # ordered by tile count.
        target_ratios = set(
            (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
            i * j <= max_num and i * j >= min_num)
        target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

        # find the closest aspect ratio to the target
        target_aspect_ratio = self.find_closest_aspect_ratio(
            aspect_ratio, target_ratios, orig_width, orig_height, image_size)

        # calculate the target width and height
        target_width = image_size * target_aspect_ratio[0]
        target_height = image_size * target_aspect_ratio[1]
        blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

        # resize the image
        resized_img = image.resize((target_width, target_height))
        columns = target_width // image_size
        processed_images = []
        for i in range(blocks):
            # Tile i sits at column (i % columns), row (i // columns).
            box = (
                (i % columns) * image_size,
                (i // columns) * image_size,
                ((i % columns) + 1) * image_size,
                ((i // columns) + 1) * image_size
            )
            # split the image
            split_img = resized_img.crop(box)
            processed_images.append(split_img)
        assert len(processed_images) == blocks
        if use_thumbnail and len(processed_images) != 1:
            thumbnail_img = image.resize((image_size, image_size))
            processed_images.append(thumbnail_img)
        return processed_images

    def load_image(self, image_file, input_size=448, max_num=12):
        """Load one image file and return its tiles as a stacked tensor.

        Returns:
            torch.Tensor: one entry per tile along dimension 0.
        """
        # The context manager closes the file handle once the pixels are loaded.
        with Image.open(image_file) as opened:
            image = opened.convert('RGB')
        transform = self.build_transform(input_size=input_size)
        images = self.dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
        pixel_values = [transform(tile) for tile in images]
        pixel_values = torch.stack(pixel_values)
        return pixel_values

    def frame_iter(self, img_list, max_images=5, max_num=12, skip_start=6, skip_end=6):
        """Caption ``img_list`` in consecutive chunks of ``max_images`` files.

        Args:
            img_list: image file paths, in the order they should be captioned.
            max_images: number of images sent to the model per request.
            max_num: maximum number of tiles per image (forwarded to
                ``load_image``). Lowering it shrinks the prompt and GPU memory
                use when several images are combined in one request.
            skip_start: number of leading images that are not captioned.
            skip_end: chunk start positions stop this many images before the
                end of the list.
                NOTE(review): the defaults of 6 reproduce the previous
                hard-coded range; the reason for skipping is not evident here.

        Returns:
            list: one model response per chunk.

        Raises:
            ValueError: if ``max_images`` is smaller than 1.
        """
        if max_images < 1:
            raise ValueError("max_images must be at least 1")

        generation_config = dict(max_new_tokens=1024, do_sample=True)
        question = '<image>\nPlease describe what happened in the image shortly.'
        concatenated_responses = []

        for start in range(skip_start, len(img_list) - skip_end, max_images):
            current_chunk = img_list[start:start + max_images]
            # Load every image of the chunk (however many there are) instead of
            # indexing five fixed positions.
            tiles = [
                self.load_image(image_file, max_num=max_num).to(torch.bfloat16).cuda()
                for image_file in current_chunk
            ]
            pixel_values = torch.cat(tiles, dim=0)
            response = self.model.chat(self.tokenizer, pixel_values, question, generation_config)
            concatenated_responses.append(response)

        return concatenated_responses


In [3]:
def _frame_index(file_name):
    """Sort key: the number embedded in the file name, then the name itself."""
    digits = ''.join(ch for ch in os.path.splitext(file_name)[0] if ch.isdigit())
    return (int(digits) if digits else -1, file_name)


# os.listdir returns entries in arbitrary order, and a plain string sort would
# put image_10 before image_2. Sort by the embedded counter so the captioner
# receives the frames in the order they were saved.
paths_list = []
for img_path in sorted(os.listdir('reduced_images'), key=_frame_index):
    paths_list.append(os.path.join('reduced_images', img_path))

In [4]:
# Load the default checkpoint and caption the files listed in paths_list.
# NOTE(review): the recorded run below reports a token sequence of 11577 against
# a maximum of 8192, followed by a CUDA out-of-memory error.
captions_fps = ImageProcessor().frame_iter(paths_list)

InternLM2ForCausalLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


FlashAttention2 is not installed.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (11577 > 8192). Running this sequence through the model will result in indexing errors


OutOfMemoryError: CUDA out of memory. Tried to allocate 7.99 GiB. GPU 0 has a total capacity of 21.99 GiB of which 3.65 GiB is free. Including non-PyTorch memory, this process has 18.33 GiB memory in use. Of the allocated memory 16.59 GiB is allocated by PyTorch, and 1.43 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [7]:
# Number of responses returned by frame_iter.
# NOTE(review): the stored output below is a list of caption strings rather than
# an integer, so it appears to come from a different execution of this cell.
len(captions_fps)

['This image shows a busy urban street scene. Multiple lanes are visible, filled with cars, motorcycles, and other forms of public transportation such as buses and trucks. There is a mix of older and newer vehicles, with a significant number of commercial vehicles including trucks and buses. Pedestrians can be seen crossing the streets or walking along the sidewalks. The image includes overhead markings, road signs, and traffic lights. The street is flanked by buildings and there is greenery around the area. The date and time stamp at the top of the image indicate it was taken on Tuesday, December 12, 2023, at 15:45:49.',
 'The image depicts a street scene in a city, captured by a security camera. The timestamp indicates the date as December 12, 2023, at 3:47 PM. The location is Krishna Kanthi Nethaji Bridge, according to the text in the image. \n\nThe road is divided into several lanes for vehicular traffic. On the left side of the image, there is a sidewalk with pedestrians walking. 

In [None]:
# Importing the required libraries
import os
import openai
from langchain import OpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate
from langchain.docstore.document import Document
from langchain.chat_models import ChatOpenAI

# NOTE(review): nothing in this cell assigns openai.api_key; presumably the
# openai package fills it from the OPENAI_API_KEY environment variable - confirm
# for the installed version.
# Load GPT-3.5-turbo via LangChain
llm = ChatOpenAI(openai_api_key=openai.api_key, model_name="gpt-3.5-turbo", temperature=0.5)

# Summarizer built on the module-level GPT-3.5-turbo `llm`
class LangchainSummarizer:
    """Summarize text with a LangChain ``map_reduce`` summarize chain."""

    def summarize_with_langchain(self, list_text):
        """Return a summary of ``list_text``.

        Args:
            list_text: either a single string or an iterable of strings; an
                iterable is joined with single spaces before summarizing.

        Returns:
            str: the chain's summary, or ``""`` when there is no text at all
            (no request is made in that case).
        """
        # A bare string must not go through " ".join(...): joining a string
        # iterates its characters and yields "l i k e   t h i s".
        if isinstance(list_text, str):
            text = list_text
        else:
            text = " ".join(list_text)

        if not text.strip():
            return ""

        document = Document(page_content=text)
        chain = load_summarize_chain(llm, chain_type="map_reduce")
        summary = chain.run([document])
        return summary



In [None]:

class Video2Text():
    """End-to-end pipeline: video -> frames -> reduced frames -> captions -> summaries."""

    # Folder the reduced frames are written to and later captioned from.
    REDUCED_DIR = 'reduced_images'

    def __init__(self, video_path, output_folder, model_path='OpenGVLab/InternVL2-8B'):
        """Create the four pipeline stages (this loads the captioning model)."""
        self.video_path = video_path
        self.output_folder = output_folder
        self.model_path = model_path
        self.video_processor = VideoProcessor(video_path, output_folder)
        self.image_processor = ImageProcessor(model_path)
        self.langchain_summarizer = LangchainSummarizer()
        self.reduce_redunant = ReduceRedunant()

    def process_video(self):
        """Return the video's frames grouped per second."""
        frames_1sec = self.video_processor.process_video()
        print("Video to Frames conversion completed successfully")
        return frames_1sec

    @staticmethod
    def _as_uint8(frame):
        """Convert a frame to uint8; float frames are treated as normalised to [0, 1]."""
        arr = np.asarray(frame)
        if np.issubdtype(arr.dtype, np.floating):
            return (np.clip(arr, 0.0, 1.0) * 255).astype(np.uint8)
        return arr

    def remove_redundant(self, images):
        """Reduce frames to at most 5 per group and save them to ``REDUCED_DIR``.

        Args:
            images: either a list of frame groups (the output of
                ``process_video``) or a single group of frames.

        Returns:
            A list of reduced groups, or a single reduced group when a single
            group was passed in.

        Raises:
            OSError: if OpenCV reports that a frame could not be written.
        """
        # A 3-D array as first element means `images` is itself one group of
        # frames; otherwise every element is a group (one per second).
        single_group = (
            len(images) > 0
            and isinstance(images[0], np.ndarray)
            and images[0].ndim == 3
        )
        groups = [images] if single_group else images

        os.makedirs(self.REDUCED_DIR, exist_ok=True)

        reduced_groups = []
        image_counter = 1  # runs across groups so later seconds never overwrite earlier files
        for group in groups:
            kept = self.reduce_redunant.reduce_images(group, 5)
            for frame in kept:
                target = os.path.join(self.REDUCED_DIR, f'image_{image_counter}.png')
                # cv2.imwrite expects BGR channel order, which is what frames
                # decoded by cv2.VideoCapture use.
                if not cv2.imwrite(target, self._as_uint8(frame)):
                    raise OSError(f"Could not write image to {target}")
                image_counter += 1
            reduced_groups.append(kept)

        print("Redundant frames removed successfully")
        return reduced_groups[0] if single_group else reduced_groups

    def captioning(self, images):
        """Caption the image files found in the directory ``images``.

        Files are ordered by the number embedded in their name so that
        ``image_2`` precedes ``image_10`` (``os.listdir`` order is arbitrary).
        """
        def frame_index(file_name):
            digits = ''.join(ch for ch in os.path.splitext(file_name)[0] if ch.isdigit())
            return (int(digits) if digits else -1, file_name)

        paths_list = [
            os.path.join(images, img_path)
            for img_path in sorted(os.listdir(images), key=frame_index)
        ]
        captions_fps = self.image_processor.frame_iter(paths_list)
        print("Captioned every 5 frames successfully")
        return captions_fps

    def summarize_text(self, captions):
        """Summarize each caption separately and return the list of summaries."""
        summaries_1sec = []
        for caption in captions:
            # Wrap the caption in a list: the summarizer joins its argument
            # with spaces, which would split a bare string into characters.
            summary = self.langchain_summarizer.summarize_with_langchain([caption])
            summaries_1sec.append(summary)
        print("Text Summarized successfully")
        return summaries_1sec

    def forward(self):
        """Run the whole pipeline and return the list of summaries."""
        frames_1sec = self.process_video()
        self.remove_redundant(frames_1sec)
        # Captioning works on files, so hand it the folder the reduced frames
        # were written to rather than the in-memory frames.
        captions_fps = self.captioning(self.REDUCED_DIR)
        summaries_1sec = self.summarize_text(captions_fps)
        return summaries_1sec



In [None]:
# Build the pipeline object (this constructs every stage, including the captioning model).
# NOTE(review): forward() is never called in this cell, so `sums` holds a
# Video2Text instance rather than the summaries its name suggests.
sums = Video2Text(video_path='src/vid1.mp4', output_folder="outputs")