load BERT and Tokenizer

# Video Processing: Object Detection and Audio Transcription

In this notebook, we will process several video clips to:

- Perform object detection using different models.
- Transcribe audio using different transcription tools.
- Measure performance metrics such as processing speed, detection accuracy, and transcription quality.
- Compare the performance of different tools.

**Tools Used:**

- **Object Detection Models**:
  - YOLOv8 (Ultralytics)
  - YOLOv5 (Ultralytics) 
  - BLIP

Install the necessary packages for video processing, YOLO object detection, BLIP, and data analysis.

In [ ]:
# Install Ultralytics for YOLOv8 and YOLOv5
!pip install ultralytics

# Install OpenCV for video processing
!pip install opencv-python

# Install pandas for data manipulation
!pip install pandas

# Install matplotlib and seaborn for visualization
!pip install matplotlib seaborn

# Install PyTorch and related libraries for deep learning
!pip install torch torchvision torchaudio

# Install Hugging Face Transformers for NLP and computer vision models
!pip install transformers

# Install Hugging Face Datasets for data handling
!pip install datasets

# Install tqdm for progress bars
!pip install tqdm

Import the necessary libraries and AI models for object detection, video processing, and data analysis.

In [ ]:
import os
import torch
import cv2
import time
import pandas as pd
import numpy as np
from PIL import Image
from tqdm import tqdm
from ultralytics import YOLO
from transformers import BlipProcessor, BlipForConditionalGeneration
import matplotlib.pyplot as plt
import seaborn as sns

Data for analysing

In [ ]:
# List of video files to process.
# The driver loop looks up each file's basename in `video_user_objects`
# to decide which object class to search for.
video_files = [
    'fruit-and-vegetable-detection.mp4',  # High resolution, good lighting
    'traffic-mini.mp4',  # Medium resolution, low lighting
]

Function to extract frames from the video

In [ ]:
def extract_frames(video_path, frame_interval=30):
    """Sample frames from a video at a fixed interval.

    Args:
        video_path: Path of the video, passed to ``cv2.VideoCapture``.
        frame_interval: Keep one frame out of every ``frame_interval``,
            starting with the first frame read. Must be at least 1.

    Returns:
        list: The kept frames exactly as returned by ``cap.read()``, in
        reading order. Empty when no frame could be read.

    Raises:
        ValueError: If ``frame_interval`` is smaller than 1.
    """
    # Reject bad intervals up front: 0 would raise ZeroDivisionError in the
    # modulo below and a negative value has no sensible meaning.
    if frame_interval < 1:
        raise ValueError(
            f"frame_interval must be a positive integer, got {frame_interval!r}"
        )

    cap = cv2.VideoCapture(video_path)
    frames = []
    count = 0

    try:
        while True:
            success, frame = cap.read()
            if not success:
                break
            if count % frame_interval == 0:
                frames.append(frame)
            count += 1
    finally:
        # Always give the capture handle back, even if reading raised.
        cap.release()

    return frames

# Example usage
# Uses the default frame_interval (one frame kept out of every 30).
# The resulting global `frames` is what the BLIP cells below operate on.
video_path = 'fruit-and-vegetable-detection.mp4'  # Update with your video path
frames = extract_frames(video_path)

Load the BLIP processor and model for Video Question Answering

In [ ]:
# "Salesforce/blip-vqa-base" is a visual-question-answering checkpoint, so it
# is loaded with BlipForQuestionAnswering. Loading it into the captioning class
# (BlipForConditionalGeneration) does not match the checkpoint's architecture.
from transformers import BlipForQuestionAnswering

processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

# Set the model to evaluation mode
model.eval()

# Function to perform Video Question Answering using BLIP
def blip_video_qa(frames, question):
    """Ask the same question about every frame using the global BLIP model.

    Each frame is converted from OpenCV's BGR layout to an RGB PIL image,
    encoded together with ``question`` by the module-level ``processor``,
    and answered by the module-level ``model``.

    Args:
        frames: Iterable of BGR frames (as produced by ``cv2``).
        question: Question text paired with every frame.

    Returns:
        list: ``(frame_index, answer)`` tuples in frame order; the index is
        the position within ``frames``. Each answer is also printed.
    """
    answers = []

    for i, bgr_frame in enumerate(frames):
        # BLIP's processor is fed a PIL image, so swap BGR -> RGB first.
        rgb_frame = cv2.cvtColor(bgr_frame, cv2.COLOR_BGR2RGB)
        pil_image = Image.fromarray(rgb_frame)

        encoded = processor(pil_image, question, return_tensors="pt")
        generated_ids = model.generate(**encoded)
        answer = processor.decode(generated_ids[0], skip_special_tokens=True)

        answers.append((i, answer))
        print(f"Frame {i} Answer: {answer}")

    return answers

# Define your question
# The same question is asked independently for every sampled frame.
question = "When in the video does it show the apple for the first time?"

# Perform Video QA using BLIP
# `frames` is the global produced by the extract_frames example cell.
blip_answers = blip_video_qa(frames, question)

 Measure processing time for BLIP

In [ ]:
import time

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() follows the wall clock and can jump if the system
# time is adjusted mid-run.
start_time = time.perf_counter()
blip_answers = blip_video_qa(frames, question)
blip_processing_time = time.perf_counter() - start_time

print(f"BLIP Processing Time: {blip_processing_time:.2f} seconds")

Function for Object Detection in Videos Using YOLOv8

In [ ]:
def yolo_v8_detection(video_path, user_object, frame_interval=5):
    """Find the timestamps at which YOLOv8 detects ``user_object`` in a video.

    Every ``frame_interval``-th frame (counting from 1) is run through the
    ``yolov8s.pt`` model; a timestamp is recorded for each sampled frame in
    which at least one box has the class name ``user_object``.

    Args:
        video_path: Path of the video, passed to ``cv2.VideoCapture``.
        user_object: Class name to look for, compared against ``model.names``.
        frame_interval: Run detection on one frame out of every
            ``frame_interval``. Must be at least 1.

    Returns:
        dict: ``'timestamps'`` (sorted unique seconds), ``'processing_time'``
        (mean per-frame inference time in seconds, NaN when no frame was
        processed) and ``'frames_processed'`` (``frame_count // frame_interval``).

    Raises:
        ValueError: If ``frame_interval`` is smaller than 1.
    """
    # Validate before loading the model: 0 would raise ZeroDivisionError in
    # the modulo below and a negative value gives a negative frame count.
    if frame_interval < 1:
        raise ValueError(
            f"frame_interval must be a positive integer, got {frame_interval!r}"
        )

    model = YOLO('yolov8s.pt')  # Load the YOLOv8 model
    class_names = model.names
    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    timestamps = []
    processing_times = []

    try:
        fps = cap.get(cv2.CAP_PROP_FPS)

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frame_count += 1

            if frame_count % frame_interval != 0:
                continue

            # Time only the model call, using a monotonic clock.
            start_time = time.perf_counter()
            results = model(frame)
            processing_times.append(time.perf_counter() - start_time)

            # One timestamp per frame is enough, so stop at the first match.
            found = any(
                class_names[int(box.cls[0])] == user_object
                for result in results
                for box in result.boxes
            )
            if not found:
                continue

            if fps > 0:
                timestamp = frame_count / fps
            else:
                # Some containers report no FPS; fall back to the capture's own
                # position instead of dividing by zero.
                # NOTE(review): accuracy of POS_MSEC is backend-dependent.
                timestamp = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000.0
            timestamps.append(timestamp)
    finally:
        # Release the capture even if inference raised.
        cap.release()

    # np.mean([]) returns NaN but also emits a RuntimeWarning; return the
    # NaN directly when nothing was processed.
    if processing_times:
        avg_processing_time = np.mean(processing_times)
    else:
        avg_processing_time = float("nan")

    return {
        'timestamps': sorted(set(timestamps)),
        'processing_time': avg_processing_time,
        'frames_processed': frame_count // frame_interval
    }

Function for Object Detection in Videos Using YOLOv5

In [ ]:
def yolo_v5_detection(video_path, user_object, frame_interval=5):
    """Find the timestamps at which YOLOv5 detects ``user_object`` in a video.

    Every ``frame_interval``-th frame (counting from 1) is run through the
    ``yolov5s`` model loaded via ``torch.hub``; a timestamp is recorded for
    each sampled frame with at least one detection named ``user_object``.

    Args:
        video_path: Path of the video, passed to ``cv2.VideoCapture``.
        user_object: Class name to look for, compared against ``model.names``.
        frame_interval: Run detection on one frame out of every
            ``frame_interval``. Must be at least 1.

    Returns:
        dict: ``'timestamps'`` (sorted unique seconds), ``'processing_time'``
        (mean per-frame inference time in seconds, NaN when no frame was
        processed) and ``'frames_processed'`` (``frame_count // frame_interval``).

    Raises:
        ValueError: If ``frame_interval`` is smaller than 1.
    """
    # Validate before loading the model: 0 would raise ZeroDivisionError in
    # the modulo below and a negative value gives a negative frame count.
    if frame_interval < 1:
        raise ValueError(
            f"frame_interval must be a positive integer, got {frame_interval!r}"
        )

    # Use the cached hub checkout; force_reload=True would re-download the
    # repository on every call (i.e. once per video).
    model = torch.hub.load('ultralytics/yolov5', 'yolov5s')
    class_names = model.names
    cap = cv2.VideoCapture(video_path)
    frame_count = 0
    timestamps = []
    processing_times = []

    try:
        fps = cap.get(cv2.CAP_PROP_FPS)

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frame_count += 1

            if frame_count % frame_interval != 0:
                continue

            # The YOLOv5 hub wrapper treats numpy input as RGB, while OpenCV
            # decodes frames as BGR; convert outside the timed region.
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

            # Time only the model call, using a monotonic clock.
            start_time = time.perf_counter()
            results = model(rgb_frame)
            processing_times.append(time.perf_counter() - start_time)

            # Parse detection results: last column of xyxy holds the class id.
            labels = results.xyxy[0][:, -1].cpu().numpy()
            found = any(
                class_names[int(label)] == user_object for label in labels
            )
            if not found:
                continue

            if fps > 0:
                timestamp = frame_count / fps
            else:
                # Some containers report no FPS; fall back to the capture's own
                # position instead of dividing by zero.
                # NOTE(review): accuracy of POS_MSEC is backend-dependent.
                timestamp = cap.get(cv2.CAP_PROP_POS_MSEC) / 1000.0
            timestamps.append(timestamp)
    finally:
        # Release the capture even if inference raised.
        cap.release()

    # np.mean([]) returns NaN but also emits a RuntimeWarning; return the
    # NaN directly when nothing was processed.
    if processing_times:
        avg_processing_time = np.mean(processing_times)
    else:
        avg_processing_time = float("nan")

    return {
        'timestamps': sorted(set(timestamps)),
        'processing_time': avg_processing_time,
        'frames_processed': frame_count // frame_interval
    }

Script for Running Object Detection and Video Question Answering with YOLOv8, YOLOv5, and BLIP

In [ ]:
# Object class to search for in each video, keyed by file basename.
video_user_objects = {
    'fruit-and-vegetable-detection.mp4': 'apple',
    'traffic-mini.mp4': 'truck',
    # Add more mappings if needed
}
# List to store results
results_list = []

for video in video_files:
    print(f"Processing {video}...")

    video_basename = os.path.basename(video)
    user_object = video_user_objects.get(video_basename)
    if user_object is None:
        print(f"No user_object specified for {video_basename}. Skipping this video.")
        continue

    # YOLOv8 Detection ('processing_time' is the mean time per processed frame)
    yolo_v8_results = yolo_v8_detection(video, user_object)

    results_list.append({
        'Video': video_basename,
        'Tool': 'YOLOv8',
        'Processing Time (s)': yolo_v8_results['processing_time'],
        'Frames Processed': yolo_v8_results['frames_processed']
    })

    # YOLOv5 Detection ('processing_time' is the mean time per processed frame)
    yolo_v5_results = yolo_v5_detection(video, user_object)

    results_list.append({
        'Video': video_basename,
        'Tool': 'YOLOv5',
        'Processing Time (s)': yolo_v5_results['processing_time'],
        'Frames Processed': yolo_v5_results['frames_processed']
    })

    # BLIP Detection (using Video QA)
    # Extract this video's frames before starting the clock, so decoding is
    # not counted as BLIP time and the frame count matches this video rather
    # than the global `frames` from the example cell.
    video_frames = extract_frames(video)
    # Ask about this video's object instead of always asking about the apple.
    blip_question = f"When in the video does it show the {user_object} for the first time?"

    start_time = time.perf_counter()
    blip_answers = blip_video_qa(video_frames, blip_question)
    blip_processing_time = time.perf_counter() - start_time

    # The YOLO rows report a per-frame mean, so report BLIP per frame too.
    if video_frames:
        blip_time_per_frame = blip_processing_time / len(video_frames)
    else:
        blip_time_per_frame = float("nan")

    results_list.append({
        'Video': video_basename,
        'Tool': 'BLIP',
        'Processing Time (s)': blip_time_per_frame,
        'Frames Processed': len(video_frames)
    })

# Build the table once, after the loop; naming the columns keeps the frame
# well-formed even when every video was skipped.
results_df = pd.DataFrame(
    results_list,
    columns=['Video', 'Tool', 'Processing Time (s)', 'Frames Processed'],
)

Script for Displaying and Visualizing Object Detection and Video QA Results

In [ ]:
# Widen pandas' console output so the whole results table prints on one line.
pd.set_option('display.max_columns', None)  # Ensure all columns are displayed
pd.set_option('display.expand_frame_repr', False)  # Avoid line breaks in the table display
pd.set_option('display.colheader_justify', 'center')  # Center-align column headers

# Print the results DataFrame
print("\nSummary of Processing Results:")
print(results_df.to_string(index=False))  # Display the DataFrame in a readable format

# Plot Processing Time Comparison for YOLOv8, YOLOv5, and BLIP
# One group of bars per tool, one bar per video (hue).
# The pyplot calls below act on the current figure, so their order matters.
plt.figure(figsize=(12, 6))
sns.barplot(data=results_df, x='Tool', y='Processing Time (s)', hue='Video')
plt.title('Processing Time Comparison for YOLOv8, YOLOv5, and BLIP')
plt.xlabel('Detection Tool')
plt.ylabel('Processing Time (seconds)')
plt.legend(title='Video File', loc='upper right')
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.tight_layout()  # Adjust layout for better fit
plt.show()

In [4]:
from transformers import BertTokenizer, BertModel
import torch

# Load pre-trained BERT model and tokenizer
# NOTE: this rebinds the global name `model`. get_cls_embedding reads it,
# and so does blip_video_qa, so the BLIP cells must not be run after this one
# without reloading the BLIP model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Set the model to evaluation mode
model.eval()




BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

Extract embeddings from BERT using the [CLS] token, which will serve as the sentence embedding.

In [5]:
# Function to get the [CLS] embedding for a given sentence
def get_cls_embedding(sentence):
    """Return BERT's last-layer hidden state at the first ([CLS]) position.

    Uses the module-level ``tokenizer`` and ``model``. Input longer than
    512 tokens is truncated.

    Args:
        sentence: Text to embed.

    Returns:
        Tensor sliced as ``last_hidden_state[:, 0, :]``, i.e. one row per
        batch item.
    """
    encoded = tokenizer(
        sentence,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding=True,
    )

    # Inference only: no gradients needed.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Position 0 of every sequence is the [CLS] token.
    return hidden_states[:, 0, :]


Compute the cosine similarity between different queries to show how BERT handles semantic similarity.

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

# Two deep-learning queries and one unrelated (course registration) query
query1 = "Explain the role of backpropagation in deep learning."
query2 = "How do convolutional neural networks work?"
query3 = "What is the process to register for courses at the university?"

# Embed the three queries in order
embedding1, embedding2, embedding3 = (
    get_cls_embedding(text) for text in (query1, query2, query3)
)

# Cosine similarity of query 1 against each of the other two
similarity_1_2 = cosine_similarity(embedding1, embedding2)
similarity_1_3 = cosine_similarity(embedding1, embedding3)

print(f"Similarity between query 1 and query 2: {similarity_1_2[0][0]}")
print(f"Similarity between query 1 and query 3: {similarity_1_3[0][0]}")


Similarity between query 1 and query 2: 0.8579674959182739
Similarity between query 1 and query 3: 0.8106188774108887


Text retrieval using BERT: we embed each document with its [CLS] token, which represents the sentence, and then compute the similarity between the query and each document.
Since the query is about backpropagation, documents 1 and 3 should have a higher similarity than document 2, which is about the VPN.

In [7]:
# Three short documents: two about neural networks, one about the VPN
doc1 = "Neural networks are computing systems inspired by the biological neural networks."
doc2 = "To connect to the university VPN, you need to configure your VPN client."
doc3 = "Backpropagation is a fundamental algorithm in training deep learning models."

# Embed the documents in order
doc_embedding1, doc_embedding2, doc_embedding3 = (
    get_cls_embedding(text) for text in (doc1, doc2, doc3)
)

# Embed the query
query = "How does backpropagation work in neural networks?"
query_embedding = get_cls_embedding(query)

# Cosine similarity of the query against each document
similarity_doc1, similarity_doc2, similarity_doc3 = (
    cosine_similarity(query_embedding, embedding)
    for embedding in (doc_embedding1, doc_embedding2, doc_embedding3)
)

# Show results
print(f"Similarity with Document 1: {similarity_doc1[0][0]}")
print(f"Similarity with Document 2: {similarity_doc2[0][0]}")
print(f"Similarity with Document 3: {similarity_doc3[0][0]}")


Similarity with Document 1: 0.8547440767288208
Similarity with Document 2: 0.8254550695419312
Similarity with Document 3: 0.8463909029960632


In [8]:
# Example documents
doc1 = "Artificial Intelligence (AI) has transformed the healthcare industry by offering new ways to diagnose, treat, and manage diseases. AI algorithms, particularly deep learning, are being used to analyze medical images, predict disease outbreaks, and personalize treatment plans. The integration of AI in healthcare has reduced human error, improved accuracy, and increased the efficiency of medical professionals. AI-powered robots are assisting surgeons in complex procedures, while predictive analytics is helping doctors make more informed decisions. Despite these advances, there are challenges such as data privacy and the need for comprehensive validation of AI models before widespread adoption."
doc2 = "Cloud computing has revolutionized the way businesses operate, offering flexible and scalable infrastructure that can adjust to their needs. By moving to the cloud, companies no longer need to invest heavily in on-premise hardware. Instead, they can access powerful computing resources over the internet, enabling them to focus on innovation and growth. Businesses use cloud services for data storage, application hosting, and collaboration, benefiting from reduced costs, enhanced security, and improved accessibility. However, concerns about data breaches and vendor lock-in persist, as companies need to carefully select cloud providers to ensure long-term sustainability."
doc3 = "Neural networks, a fundamental building block of artificial intelligence, have evolved significantly since their inception. Initially inspired by the human brain, neural networks are designed to mimic the way neurons in the brain process information. Over the years, advances in deep learning, a subset of neural networks, have made it possible for AI systems to achieve unprecedented levels of accuracy in tasks like image recognition, natural language processing, and autonomous driving. Neural networks are composed of layers of interconnected nodes, where each node represents a neuron. The training of neural networks involves adjusting weights based on input data, allowing the model to learn patterns and make predictions. Despite their success, training large neural networks requires significant computational power and data."

# Embed the three long documents in order
doc_embedding1, doc_embedding2, doc_embedding3 = (
    get_cls_embedding(text) for text in (doc1, doc2, doc3)
)

# Embed the query
query = "How do neural networks function in artificial intelligence, and what are the challenges of training them?"
query_embedding = get_cls_embedding(query)

# Cosine similarity of the query against each document
similarity_doc1, similarity_doc2, similarity_doc3 = (
    cosine_similarity(query_embedding, embedding)
    for embedding in (doc_embedding1, doc_embedding2, doc_embedding3)
)

# Show results
print(f"Similarity with Document 1: {similarity_doc1[0][0]}")
print(f"Similarity with Document 2: {similarity_doc2[0][0]}")
print(f"Similarity with Document 3: {similarity_doc3[0][0]}")


Similarity with Document 1: 0.7179515957832336
Similarity with Document 2: 0.5887858271598816
Similarity with Document 3: 0.6379864811897278


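As you can see, Document 2 (cloud computing) scores lowest, as expected. However, Document 1 (AI in healthcare) scores higher than Document 3 (neural networks), even though Document 3 is the most relevant to the query.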

In [9]:
# Import necessary libraries
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained BERT tokenizer and model
# Same checkpoint as the earlier BERT cell; this rebinds the globals
# `tokenizer` and `model` that get_cls_embedding reads.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Set the model to evaluation mode
model.eval()

# Function to get the [CLS] embedding for a given sentence
def get_cls_embedding(sentence):
    """Return BERT's last-layer hidden state at the first ([CLS]) position.

    Uses the module-level ``tokenizer`` and ``model``. Input longer than
    512 tokens is truncated.

    Args:
        sentence: Text to embed.

    Returns:
        Tensor sliced as ``last_hidden_state[:, 0, :]``, i.e. one row per
        batch item.
    """
    encoded = tokenizer(
        sentence,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding=True,
    )

    # Inference only: no gradients needed.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    # Position 0 of every sequence is the [CLS] token.
    return hidden_states[:, 0, :]

# Example documents (longer text)
doc1 = """
Artificial Intelligence (AI) has transformed the healthcare industry by offering new ways to diagnose, 
treat, and manage diseases. AI algorithms, particularly deep learning, are being used to analyze medical 
images, predict disease outbreaks, and personalize treatment plans. The integration of AI in healthcare has 
reduced human error, improved accuracy, and increased the efficiency of medical professionals. AI-powered 
robots are assisting surgeons in complex procedures, while predictive analytics is helping doctors make 
more informed decisions. Despite these advances, there are challenges such as data privacy and the need 
for comprehensive validation of AI models before widespread adoption.
"""

doc2 = """
Cloud computing has revolutionized the way businesses operate, offering flexible and scalable infrastructure 
that can adjust to their needs. By moving to the cloud, companies no longer need to invest heavily in 
on-premise hardware. Instead, they can access powerful computing resources over the internet, enabling them 
to focus on innovation and growth. Businesses use cloud services for data storage, application hosting, and 
collaboration, benefiting from reduced costs, enhanced security, and improved accessibility. However, concerns 
about data breaches and vendor lock-in persist, as companies need to carefully select cloud providers to 
ensure long-term sustainability.
"""

doc3 = """
Neural networks, a fundamental building block of artificial intelligence, have evolved significantly since their 
inception. Initially inspired by the human brain, neural networks are designed to mimic the way neurons in 
the brain process information. Over the years, advances in deep learning, a subset of neural networks, have made 
it possible for AI systems to achieve unprecedented levels of accuracy in tasks like image recognition, natural 
language processing, and autonomous driving. Neural networks are composed of layers of interconnected nodes, 
where each node represents a neuron. The training of neural networks involves adjusting weights based on input 
data, allowing the model to learn patterns and make predictions. Despite their success, training large neural 
networks requires significant computational power and data.
"""

doc4 = """
As the world faces the growing threat of climate change, sustainable energy has become a major focus of global 
efforts. Renewable energy sources such as solar, wind, and hydropower are being developed to reduce dependence 
on fossil fuels. Clean technologies are playing a critical role in achieving sustainability goals, with innovations 
in energy storage, electric vehicles, and smart grids leading the way. Governments and private companies alike are 
investing heavily in research and development to create more efficient and cost-effective solutions. While the 
transition to sustainable energy presents challenges, including the initial cost of infrastructure and the need for 
reliable energy storage, it also offers immense benefits in terms of reducing greenhouse gas emissions and creating 
new economic opportunities.
"""

# Query to compare with the documents
query = "How do neural networks function in artificial intelligence, and what are the challenges of training them?"

# Embed the four documents in order, then the query
doc_embedding1, doc_embedding2, doc_embedding3, doc_embedding4 = (
    get_cls_embedding(text) for text in (doc1, doc2, doc3, doc4)
)
query_embedding = get_cls_embedding(query)

# Cosine similarity of the query against each document
similarity_doc1, similarity_doc2, similarity_doc3, similarity_doc4 = (
    cosine_similarity(query_embedding, embedding)
    for embedding in (
        doc_embedding1,
        doc_embedding2,
        doc_embedding3,
        doc_embedding4,
    )
)

# Show similarity results
print(f"Similarity with Document 1 (AI in Healthcare): {similarity_doc1[0][0]}")
print(f"Similarity with Document 2 (Cloud Computing): {similarity_doc2[0][0]}")
print(f"Similarity with Document 3 (Neural Networks in AI): {similarity_doc3[0][0]}")
print(f"Similarity with Document 4 (Sustainable Energy): {similarity_doc4[0][0]}")




Similarity with Document 1 (AI in Healthcare): 0.7179515957832336
Similarity with Document 2 (Cloud Computing): 0.5887858271598816
Similarity with Document 3 (Neural Networks in AI): 0.6379864811897278
Similarity with Document 4 (Sustainable Energy): 0.6569733023643494


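The issue with BERT's [CLS] embeddings is that the similarity appears to be driven by the words themselves and by which ones occur most often, rather than by the actual meaning of the text.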