<a href="https://colab.research.google.com/github/Patfarmurs/Activity/blob/main/AI_agent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install gradio
!pip install opencv-python
!pip install transformers
!pip install torch
!pip install datasets

Collecting gradio
  Downloading gradio-5.1.0-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0 (from gradio)
  Downloading fastapi-0.115.2-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.4.0 (from gradio)
  Downloading gradio_client-1.4.0-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting huggingface-hub>=0.25.1 (from gradio)
  Downloading huggingface_hub-0.26.0-py3-none-any.whl.metadata (13 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata

In [2]:
import cv2
from transformers import BlipProcessor, BlipForConditionalGeneration

# Image-captioning setup: both objects are loaded from the same checkpoint.
# `processor` turns a frame into model inputs and decodes generated token IDs back to text;
# `model_blip` generates the caption token IDs.
_BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(_BLIP_CHECKPOINT)
model_blip = BlipForConditionalGeneration.from_pretrained(_BLIP_CHECKPOINT)

def process_video(video_path, fallback_fps=30):
    """Caption roughly one frame per second of a video.

    Args:
        video_path: Path of the video file, passed to ``cv2.VideoCapture``.
        fallback_fps: Sampling interval in frames used when the capture does
            not report a usable frame rate (zero, negative, NaN or infinite).

    Returns:
        dict mapping ``frame_count // fps`` (the approximate timestamp in
        seconds) to the caption generated for the frame sampled at that
        point. The dict is empty when no frame could be read.
    """
    cap = cv2.VideoCapture(video_path)
    frame_descriptions = {}

    try:
        reported_fps = cap.get(cv2.CAP_PROP_FPS)

        # A missing or sub-1 frame rate used to truncate to 0 and make
        # `frame_count % fps` raise ZeroDivisionError. Rounding (instead of
        # truncating) also keeps 29.97 fps material sampled every 30 frames.
        if 0 < reported_fps < float("inf"):
            fps = max(1, round(reported_fps))
        else:
            fps = max(1, int(fallback_fps))

        frame_count = 0
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            # Caption only the first frame of every `fps`-frame window.
            if frame_count % fps == 0:
                # OpenCV delivers BGR; the captioning processor is fed RGB.
                rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

                inputs = processor(images=rgb_frame, return_tensors="pt")
                outputs = model_blip.generate(**inputs)

                # Decode the generated token IDs into readable text.
                caption = processor.decode(outputs[0], skip_special_tokens=True)

                frame_descriptions[frame_count // fps] = caption

            frame_count += 1
    finally:
        # Release the capture even if captioning raised part-way through.
        cap.release()

    return frame_descriptions

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/287 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/506 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/4.56k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [3]:
from transformers import pipeline

# Summarization pipeline backed by the facebook/bart-large-cnn checkpoint; called by summarize_text.
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")
def summarize_text(text, max_length=10, min_length=2):
    """Summarize ``text`` with the module-level ``summarizer`` pipeline.

    Args:
        text: Text to condense.
        max_length: Upper bound on the generated summary length, forwarded
            to the pipeline.
        min_length: Lower bound on the generated summary length, forwarded
            to the pipeline.

    Returns:
        The summary string, or ``""`` when ``text`` is empty or only
        whitespace (the model is not called in that case).
    """
    # With nothing to summarize the model would only invent content.
    if not text or not text.strip():
        return ""

    summary = summarizer(
        text,
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
        # Captions from a long video can exceed the model's input limit;
        # truncate instead of failing inside the model.
        truncation=True,
    )
    return summary[0]["summary_text"]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



In [4]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import numpy as np
import torch

# Which entry of the speaker-embedding dataset to use as the voice.
# Change it to change the speaker sound; accepted values are 0 to 7930.
speaker_index = 0

# Text-to-speech components: the processor prepares text for the model, the model
# generates the speech, and the vocoder is handed to it when generating audio.
processor_speech = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model_speech = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

# Dataset of speaker embeddings (x-vectors).
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

# Pick the chosen x-vector, turn it into a tensor and add a leading batch dimension.
speaker_embeddings = torch.tensor(embeddings_dataset[speaker_index]["xvector"]).unsqueeze(0)

def speak(text_input):
    """Synthesize speech for ``text_input``.

    Uses the module-level ``processor_speech``, ``model_speech``,
    ``speaker_embeddings`` and ``vocoder`` objects.

    Args:
        text_input: Text to be spoken.

    Returns:
        The generated speech converted to a NumPy array, ready to be handed
        to Gradio.
    """
    # Encode the text the way the TTS model expects it.
    encoded = processor_speech(text=text_input, return_tensors="pt")

    # Generate the speech for the configured speaker, using the vocoder.
    waveform = model_speech.generate_speech(
        encoded["input_ids"],
        speaker_embeddings=speaker_embeddings,
        vocoder=vocoder,
    )

    # Move to CPU and convert to NumPy so it can be fed to Gradio directly.
    return waveform.cpu().numpy()

preprocessor_config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/232 [00:00<?, ?B/s]

spm_char.model:   0%|          | 0.00/238k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/40.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/585M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/50.7M [00:00<?, ?B/s]

0000.parquet:   0%|          | 0.00/21.3M [00:00<?, ?B/s]

Generating validation split:   0%|          | 0/7931 [00:00<?, ? examples/s]

In [5]:
def process_video_and_generate_speech(video_path):
    """Caption a video, summarize the captions and speak the summary.

    Chains ``process_video`` -> ``summarize_text`` -> ``speak``.

    Args:
        video_path: Path of the video file to process.

    Returns:
        Tuple ``(16000, spoken_summary)``: the sampling rate followed by the
        speech array returned by ``speak``.

    Raises:
        ValueError: If ``process_video`` produced no captions (for example
            because no frame could be read from the video).
    """
    # Step 1: caption the sampled frames.
    frame_descriptions = process_video(video_path)
    if not frame_descriptions:
        # Summarizing an empty string would only make the model invent text.
        raise ValueError("No frames could be read from the video; nothing to describe.")

    # One caption per line. The separator is a real newline; the previous
    # "\\n" inserted a literal backslash and 'n' into the text.
    frame_descriptions_text = "\n".join(
        f"{caption}," for caption in frame_descriptions.values()
    )

    # Step 2: condense the captions.
    summary = summarize_text(frame_descriptions_text)

    # Step 3: turn the summary into speech.
    spoken_summary = speak(summary)

    # 16000 is the sampling rate handed to Gradio with the audio
    # (presumably the rate of the TTS output; confirm against the model).
    return 16000, spoken_summary

In [6]:
import gradio as gr

# Build the Gradio UI: a title, then one row holding the video uploader and the audio player.
with gr.Blocks() as interface:
    gr.Markdown("# Video Frame Caption Generator")

    with gr.Row():
        video_input = gr.Video(label="Upload a video file")
        audio_output = gr.Audio(type="numpy", label="Speech Output")

    # When a video is uploaded, run the caption -> summary -> speech function
    # and send its result to the audio component.
    video_input.upload(
        fn=process_video_and_generate_speech,
        inputs=video_input,
        outputs=audio_output,
    )

# Start serving the interface.
interface.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://bbb93a57edd1e703d6.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


