In [6]:
%%capture
# Install the notebook's dependencies into the kernel's environment;
# the %%capture cell magic above silences pip's output.
%pip install -r requirements.txt
# CLIP is installed directly from its GitHub repository.
# NOTE(review): this install is unpinned — consider pinning to a commit so reruns are reproducible.
%pip install git+https://github.com/openai/CLIP.git

In [1]:
import os
from dotenv import load_dotenv 
# Load variables from a local .env file, then read the API key that is
# handed to the OpenAI client at the end of the notebook.
# `key` is None when the variable is not set.
load_dotenv()
key = os.environ.get("key")
# Disabled override of the IMAGEIO_FFMPEG_EXE environment variable;
# uncomment to point at a system ffmpeg binary.
# os.environ["IMAGEIO_FFMPEG_EXE"] = "/usr/bin/ffmpeg"

In [2]:
from moviepy.editor import VideoFileClip
from pathlib import Path
import speech_recognition as sr
from pytube import YouTube
from pprint import pprint
from PIL import Image
import matplotlib.pyplot as plt


In [None]:
# Show the kernel's working directory; the relative "content/..." paths
# used below are resolved against it.
os.getcwd()

In [6]:
# URL of the source video to process.
video_url="https://youtu.be/3dhcmeOTZ_Q"
# Directory the downloaded video is saved into.
output_video_path = "content/video_data/"

In [7]:
# Everything extracted from the video (frames, audio track, transcript)
# is collected in this folder.
output_folder = "content/mixed_data/"
# WAV file the video's audio track is exported to.
output_audio_path = "content/mixed_data/output_audio.wav"

In [None]:
# Full path of the downloaded video inside output_video_path.
# os.path.join keeps the path correct whether or not output_video_path
# ends with a path separator.
filepath = os.path.join(output_video_path, "input_vid.mkv")
filepath

In [None]:
# download the video 
import yt_dlp

def download_video(url, output_path, filename="input_vid.mkv"):
    """Download a video with yt-dlp and return basic metadata about it.

    Args:
        url: URL of the video to download.
        output_path: Directory the video file is saved into.
        filename: Name of the saved file inside ``output_path``.

    Returns:
        dict with the keys "Author", "Title" and "Views"; each falls back
        to a placeholder string when the field is not reported.
    """
    ydl_opts = {
        # Built with os.path.join so a trailing separator on output_path
        # does not produce a doubled "//" in the output template.
        'outtmpl': os.path.join(output_path, filename),  # Path where to save
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        result = ydl.extract_info(url, download=True)

    # Fall back to an empty mapping so the placeholder defaults below still
    # apply if no info dict was returned.
    info = result or {}

    return {
        "Author": info.get("uploader", "Unknown"),
        "Title": info.get("title", "Unknown Title"),
        "Views": info.get("view_count", "Unknown Views"),
    }

# Download the video and keep its metadata; the metadata is serialised
# into the LLM prompt later in the notebook.
metadata_vid = download_video(video_url, output_video_path)
metadata_vid

In [None]:
# converting video to image
def video_to_image(video_path, output_folder, fps=0.2):
    """Extract still frames from a video and save them as PNG files.

    Frames are written into ``output_folder`` using the file name pattern
    ``frame%04d.png``.

    Args:
        video_path: Path of the video file to read.
        output_folder: Directory the frames are written to; created if it
            does not exist yet.
        fps: Frames saved per second of video. The default of 0.2 keeps
            one frame for every five seconds of footage.
    """
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    clip = VideoFileClip(video_path)
    try:
        clip.write_images_sequence(
            os.path.join(output_folder, "frame%04d.png"), fps=fps
        )
    finally:
        # Release the clip's resources even when writing the frames fails.
        clip.close()

# Extract frames from the downloaded video into the shared output folder.
video_to_image(filepath, output_folder)

In [None]:
# converting video to audio 
def video_to_audio(video_path, output_audio_path):
    """Export the audio track of a video to an audio file.

    Args:
        video_path: Path of the video file to read.
        output_audio_path: Path of the audio file to write; its parent
            directory is created if it does not exist yet.

    Raises:
        ValueError: If the video has no audio track.
    """
    clip = VideoFileClip(video_path)
    try:
        audio = clip.audio
        if audio is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError(f"No audio track found in {video_path!r}")

        parent_dir = os.path.dirname(output_audio_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        audio.write_audiofile(output_audio_path)
    finally:
        # Release the clip's resources even when the export fails.
        clip.close()

# Export the downloaded video's audio track to the WAV file configured above.
video_to_audio(filepath, output_audio_path)

In [None]:
# converting audio to text 
def audio_to_text(audio_path):
    """Transcribe an audio file to text with the recognizer's Whisper backend.

    Args:
        audio_path: Path of the audio file to transcribe.

    Returns:
        The recognized text, or an empty string when the speech could not
        be understood.
    """
    recognizer = sr.Recognizer()

    # The file only needs to stay open while its content is read.
    with sr.AudioFile(audio_path) as source:
        audio_data = recognizer.record(source)

    try:
        # recognize the speech
        return recognizer.recognize_whisper(audio_data)
    except sr.UnknownValueError:
        print("Speech recognition could not understand the audio")
        # Return a defined value; previously this path fell through to a
        # `return text` with `text` never assigned.
        return ""

# Transcribe the exported audio track.
text_data = audio_to_text(output_audio_path)

In [None]:
# Display the transcript for a quick visual check.
text_data

In [None]:
# Save the transcript into the output folder, the directory that is later
# loaded for indexing.
# UTF-8 is requested explicitly because the transcript may contain
# non-ASCII characters and the platform default encoding varies.
with open(os.path.join(output_folder, "output_text.txt"), "w", encoding="utf-8") as file:
    file.write(text_data)

# The with-block has already closed the file, so no explicit close() is needed.
print("Text data saved to the file.")

# Multimodal RAG

In [3]:
from llama_index.core.indices import MultiModalVectorStoreIndex
from llama_index.core import SimpleDirectoryReader
from llama_index.core import StorageContext
from llama_index.vector_stores.lancedb import LanceDBVectorStore
import lancedb

In [4]:
# Two LanceDB tables under the same "lancedb" URI: one for text, one for images.
text_store = LanceDBVectorStore(uri = "lancedb", table_name="text_collection")
image_store = LanceDBVectorStore(uri = "lancedb", table_name="image_collection")

In [9]:
# Storage context that pairs the text store with the image store.
storage_context = StorageContext.from_defaults(vector_store=text_store, image_store=image_store)
# Load only the extracted frames (.png) and the transcript (.txt).
# The exported audio file is written into the same folder and must not be
# loaded as a document, so the reader is restricted by extension.
documents = SimpleDirectoryReader(output_folder, required_exts=[".png", ".txt"]).load_data()

In [None]:
# Build the multimodal index from the loaded documents, backed by the
# text and image stores held in storage_context.
index = MultiModalVectorStoreIndex.from_documents(documents, storage_context=storage_context)

In [10]:
# Retriever configured for the top 1 text match and the top 3 image matches per query.
retriever_engine = index.as_retriever(similarity_top_k = 1, image_similarity_top_k = 3)

In [11]:
from llama_index.core.response.notebook_utils import display_source_node
from llama_index.core.schema import ImageNode

In [12]:
def retrieve(retriever_engine, query_str):
    """Run a query against the retriever and split the hits by modality.

    Non-image hits are rendered in the notebook through
    ``display_source_node`` as they are encountered.

    Args:
        retriever_engine: Object exposing ``retrieve(query_str)``.
        query_str: The question to search for.

    Returns:
        A tuple ``(image_paths, texts)``: the ``file_path`` metadata of
        every retrieved image node, and the text of every other node,
        both in retrieval order.
    """
    image_paths, text_chunks = [], []

    for hit in retriever_engine.retrieve(query_str):
        if not isinstance(hit.node, ImageNode):
            # Text hit: show a 200-character preview and keep its text.
            display_source_node(hit, source_length=200)
            text_chunks.append(hit.text)
            continue
        image_paths.append(hit.node.metadata["file_path"])

    return image_paths, text_chunks

In [13]:
# Question used to retrieve matching text and frames from the index.
query = "can you tell me what is linear regression? Explain equation of multiple linear regression?"

In [None]:
# Run retrieval: `img` holds the file paths of matching frames,
# `text` the text of the remaining retrieved nodes.
img, text = retrieve(retriever_engine, query)

In [16]:
import matplotlib.pyplot as plt
def plot_images(images_path, max_images=5):
    """Show images from disk in a grid with three columns.

    Args:
        images_path: Iterable of image file paths; entries that are not
            existing files are skipped.
        max_images: Maximum number of images drawn (default 5).
    """
    columns = 3
    # Enough rows to hold max_images; the default of 5 gives a 2 x 3 grid.
    rows = max(1, -(-max_images // columns))

    images_shown = 0
    plt.figure(figsize=(16, 9))
    for img_path in images_path:
        if images_shown >= max_images:
            break
        if not os.path.isfile(img_path):
            continue

        # The context manager closes the file handle once the image has
        # been handed to matplotlib.
        with Image.open(img_path) as image:
            plt.subplot(rows, columns, images_shown + 1)
            plt.imshow(image)
            plt.xticks([])
            plt.yticks([])

        images_shown += 1

In [None]:
# Show the frames returned by retrieval.
plot_images(img)

In [None]:
import json

# Serialise the video metadata so it can be embedded in the prompt.
metadata_str = json.dumps(metadata_vid)
# NOTE(review): retrieval used `query`, while the prompt asks this
# differently worded question — confirm that is intended.
query_str = "Tell me the equation linear regression?"
# Separate retrieved text chunks with a newline so adjacent chunks do not
# run together.
context_str = "\n".join(text)

# Prompt template filled in with context_str, metadata_str and query_str.
# Adjacent string literals are used instead of a backslash continuation
# inside the string, which embedded the next line's indentation in the prompt.
qa_tmpl_str = (
    "Based on the provided information, including relevant images and retrieved context from the video, "
    "accurately and precisely answer the query without any additional prior knowledge.\n"

    "---------------------\n"
    "Context: {context_str}\n"
    "Metadata for video: {metadata_str} \n"

    "---------------------\n"
    "Query: {query_str}\n"
    "Answer: "
)

In [None]:
from llama_index.multi_modal_llms.openai import OpenAIMultiModal

# Load the retrieved frames as image documents. The reader rejects an empty
# file list, so fall back to no images when retrieval returned none.
image_documents = SimpleDirectoryReader(input_files=img).load_data() if img else []
# NOTE(review): confirm the "gpt-4-vision-preview" model id is still served
# by OpenAI; preview models are retired over time.
openai_mm_llm = OpenAIMultiModal(model="gpt-4-vision-preview", api_key=key, max_new_tokens=1500)

# Ask the multimodal LLM to answer from the retrieved text, the video
# metadata and the retrieved frames.
result = openai_mm_llm.complete(
    prompt=qa_tmpl_str.format(
        query_str=query_str, metadata_str=metadata_str, context_str=context_str
    ),
    image_documents=image_documents,
)

pprint(result.text)