# Video Search App

## Prerequisites

### Install Dependencies

In [1]:
# !pip install -q --upgrade google-cloud-aiplatform 
!pip install -q --upgrade google-cloud-dlp python-docx --upgrade google-auth
!pip install -q --upgrade google-cloud-videointelligence
!pip install -q bigframes==0.26
!pip install -q --upgrade moviepy
!pip install -q --upgrade ffmpeg
!pip install -q unidecode
!pip install -q --upgrade youtube-dl
!pip install -q --upgrade pytubefix
!pip install -q langchain_google_vertexai
!pip install -U -q langchain langchainhub langchain-openai

import IPython
import time

app = IPython.Application.instance()
app.kernel.do_shutdown(True)

time.sleep(10)

print("Installation done")

Installation done


### Setup Environment Variables

#### Once Kernel restarts, run this cell and all below
##### Run > Run selected cell and all below

In [36]:
from langchain_google_vertexai import VertexAIEmbeddings

In [37]:
import socket
import re

PROJECT_ID = !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_ID[0]

SVC_ACC = !(gcloud config get-value core/account)
SVC_ACC = SVC_ACC[0]

PROJECT_NUMBER=str(re.search(r'\d+', SVC_ACC).group())

LOCATION="us-central1"

UNIQUE_PREFIX = socket.gethostname()
UNIQUE_PREFIX = re.sub('[^A-Za-z0-9]+', '', UNIQUE_PREFIX)

UID = "JTC"

BUCKET_NAME = f"{PROJECT_ID}-{UNIQUE_PREFIX}-{LOCATION}"

BUCKET_URI = f"gs://{BUCKET_NAME}"

! gcloud config set project $PROJECT_ID
! gcloud storage buckets create {BUCKET_URI} --project={PROJECT_ID} --location={LOCATION}

!rm -r -f content
!mkdir content
!mkdir content/clips/
!mkdir content/frames/

Updated property [core/project].
Creating gs://ai-sb-test-fluxtest-us-central1/...
[1;31mERROR:[0m (gcloud.storage.buckets.create) HTTPError 409: Your previous request to create the named bucket succeeded and you already own it.


### Initialize Libraries

In [38]:
import sys
import vertexai
from vertexai.language_models import TextEmbeddingModel
from vertexai.vision_models import Image as vision_model_Image
from vertexai.vision_models import MultiModalEmbeddingModel
import glob
import os
import time
from typing import Any, Dict, Iterable, List, Optional, Tuple, Union
from IPython.display import display
import PIL
import numpy as np
import pandas as pd
import requests
from vertexai.generative_models import (
    GenerationConfig,
    HarmBlockThreshold,
    HarmCategory,
    Image,
)
from vertexai.language_models import TextEmbeddingModel
from vertexai.vision_models import Image as vision_model_Image
from vertexai.vision_models import MultiModalEmbeddingModel
from moviepy.editor import VideoFileClip
from google.api_core.client_options import ClientOptions

### Download and upload videos to a bucket

#### Helper Functions

In [39]:
import pytube
from google.cloud import storage
import unidecode
from pytubefix import YouTube

# -1.1) Fetching from youtube
def download_video(video_url, download_folder):
    # Create a YouTube object
    yt = YouTube(video_url)
    
    # Filter to get the highest resolution stream that includes both video and audio
    stream = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first()
    
    # Download the video
    if stream:
        print(f"Going to download {video_url}")
        
        # Create download path with filename
        download_path = os.path.join(download_folder)

        # Download the video to the specified path
        stream.download(output_path=download_path)

        print(f"Downloaded into {download_path}")
        return stream.title
    else:
        print("No suitable stream found")
        return None


def upload_file_to_gcs(bucket_name, source_file_path, destination_blob_name):
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(source_file_path)
    return blob

def upload_file_to_bucket(bucket_name, source_file_path_name): 
    source_file_path = f"./{source_file_path_name}"
    destination_blob_name = source_file_path_name.replace(' ', '_')
    destination_blob_name = unidecode.unidecode(destination_blob_name)
    uploaded_blob         = upload_file_to_gcs(bucket_name, source_file_path, destination_blob_name)
    print(f"File [{source_file_path}] \nUploaded to [{uploaded_blob.name}] \nIn bucket [{bucket_name}]")
    return destination_blob_name

### Download Videos from youtube

In [40]:
# Define the video URL
video_url_list = [
    "https://www.youtube.com/watch?v=WKsgKA-YHmc",
    "https://www.youtube.com/watch?v=i4osPznQ5d0",
    "https://www.youtube.com/watch?v=XzipIUW8vF4",
    "https://www.youtube.com/watch?v=mfoj_hulL8Q",
    "https://www.youtube.com/watch?v=75E_xOkWeXY",
    "https://www.youtube.com/watch?v=iLQx08RkHvc",
    "https://www.youtube.com/watch?v=KQXrabmy6Ws",
    
]

downloaded_videos = []

for video in video_url_list:
    video_name = download_video(video,"./content/videos")
    downloaded_videos.append(re.sub(r"[^a-zA-Z0-9\.\s]", "", video_name))
    

Going to download https://www.youtube.com/watch?v=WKsgKA-YHmc
Downloaded into ./content/videos
Going to download https://www.youtube.com/watch?v=i4osPznQ5d0
Downloaded into ./content/videos
Going to download https://www.youtube.com/watch?v=XzipIUW8vF4
Downloaded into ./content/videos
Going to download https://www.youtube.com/watch?v=mfoj_hulL8Q
Downloaded into ./content/videos
Going to download https://www.youtube.com/watch?v=75E_xOkWeXY
Downloaded into ./content/videos
Going to download https://www.youtube.com/watch?v=iLQx08RkHvc
Downloaded into ./content/videos
Going to download https://www.youtube.com/watch?v=KQXrabmy6Ws
Downloaded into ./content/videos


### Create csv file

In [41]:
import csv

with open('shots.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    field = ["id", "video_id", "video_title", "start_time", "end_time", "clip_name", "frame_name", "associated_text", "associated_speech", "associated_object", "description"]
    writer.writerow(field)

## Process videos

##### Generate video descriptions

In [82]:
import base64
import time
import vertexai
from vertexai.preview.generative_models import GenerativeModel, Part
import vertexai.preview.generative_models as generative_models
from vertexai.generative_models import Tool
from vertexai.generative_models import grounding

#Generate video description using AI Model
#Model used is customisable (default = gemini-1.0-pro)

desc_context = """You are provided with a short video. First, watch the whole video in full. Then, 
    describe the contents of the video in chronological order.
    Go into as much detail as possible."""

text_context = """You are provided with a short video. Register every text visible in chronological order.
    Do not include audio. Only include what you can see visually."""

speech_context = """You are provided with a short video. Watch the video in full, and listen to the audio.
    Register everything you hear in chronological order. Convert everything into text, word for word."""

object_context = """You are provided with a short video. If there are people or animals in the video, describe all the different people or animals that appear throughout the video. Associate them with
    a name if mentioned.
    Include their appearance, emotions, actions if possible. Do so in great detail. If there are no people or animals, simply state that there aren't any. """

def generate(prompt, video):
    vertexai.init(project=PROJECT_ID, location="us-central1")
    model = GenerativeModel("gemini-1.5-flash-002")
    max_retry = 5
    retry = 0
    success = False
    for i in range(5):
        try:
            responses = model.generate_content(
            [prompt, video],
            generation_config={
                "max_output_tokens": 2048,
                "temperature": 0,
                "top_p": 0.4,
                "top_k": 32
            },
            safety_settings={
                  generative_models.HarmCategory.HARM_CATEGORY_HATE_SPEECH: generative_models.HarmBlockThreshold.BLOCK_NONE,
                  generative_models.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: generative_models.HarmBlockThreshold.BLOCK_NONE,
                  generative_models.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: generative_models.HarmBlockThreshold.BLOCK_NONE,
                  generative_models.HarmCategory.HARM_CATEGORY_HARASSMENT: generative_models.HarmBlockThreshold.BLOCK_NONE,
            },
            stream=False,
            )
            return responses
        except Exception as e:
            print(e)
            time.sleep(5)
            print("retrying")
    raise Exception("Failed to generate response after 5 retries")

## Split Videos into 1 minute clips

In [83]:
import moviepy.editor as mpe
import pandas as pd
import math

from vertexai.language_models import TextEmbeddingModel
text_embedding_model = TextEmbeddingModel.from_pretrained("textembedding-gecko@003")

shots_list = []

shots_df = pd.read_csv('shots.csv')

id_list = []
unnamed_id_list = []
video_title_list = []
start_time_list = []
end_time_list = []
clip_name_list = []
frame_name_list = []
associated_desc=[]
associated_speech=[]
associated_text=[]
associated_object=[]
embeddingsList = []

videoPath = "./content/videos/"

file_number = 0
clip_id = -1

for videoFile in os.listdir(videoPath):
    
    if os.path.isfile(os.path.join(videoPath, videoFile)):  # Check if it's a file
        video_name = (re.sub(r"[^a-zA-Z0-9\.\s]", "", videoFile))[:-4]
        try:
            video = mpe.VideoFileClip(videoPath + videoFile)
            
            # e.g. Cut into 60s clips (Change clip_duration to set the desired duration of subclips)
            clip_duration = 60
            clip_no = math.ceil(video.duration/clip_duration)
            clip_no = int(clip_no)

            file_number += 1    

            for i in range(clip_no):
                start_time = i * clip_duration
                end_time = min(start_time + clip_duration, video.duration)

                text = ""
                clip_name = f"content/clips/{video_name}_clip_"+str(i)+".mp4"
                video.subclip(start_time, end_time).write_videofile(clip_name, logger=None)
                time = (start_time  + end_time)/2

                # saving a frame at 2 second
                frame_name = "content/frames/frame_"+str(i)+".png"
                video.save_frame(frame_name, t = float(time))

                id_list.append(clip_id)
                unnamed_id_list.append(file_number)
                video_title_list.append(video_name)
                start_time_list.append(start_time)
                end_time_list.append(end_time)
                clip_name_list.append(clip_name)
                frame_name_list.append(frame_name)

                # Process the clips

                with open(clip_name, "rb") as f:
                    text = base64.b64encode(f.read())
                    clip = Part.from_data(data=base64.b64decode(text), mime_type="video/mp4")

                    desc = generate(desc_context, clip).text
                    associated_desc.append(desc)
                    speechDesc = generate(speech_context, clip).text
                    associated_speech.append(speechDesc)
                    textDesc = generate(text_context, clip).text
                    associated_text.append(textDesc)
                    objDesc = generate(object_context, clip).text
                    associated_object.append(objDesc)

                    string = desc + "\n" + speechDesc + "\n" + textDesc + "\n" +  objDesc
                    embeddings = text_embedding_model.get_embeddings([string])
                    text_embedding = [embedding.values for embedding in embeddings][0]
                    embeddingsList.append(text_embedding)

                clip_id += 1
                
                print(f"\n{clip_name} has been processed.")
        except OSError:
            print(f"Error processing {videoFile}. Skipping.")
            continue

shots_df['id'] = id_list        
shots_df['video_id'] = unnamed_id_list
shots_df['video_title'] = video_title_list
shots_df['start_time'] = start_time_list
shots_df['end_time'] = end_time_list
shots_df['clip_name'] = clip_name_list
shots_df['frame_name'] = frame_name_list

shots_df['description'] = associated_desc
shots_df['associated_speech'] = associated_speech
shots_df['associated_text'] = associated_text
shots_df['associated_object'] = associated_object
shots_df["embedding"] = embeddingsList
shots_df.head(5)


content/clips/Open Digital Platform A smart district operating system_clip_0.mp4 has been processed.

content/clips/Open Digital Platform A smart district operating system_clip_1.mp4 has been processed.

content/clips/Open Digital Platform A smart district operating system_clip_2.mp4 has been processed.

content/clips/Open Digital Platform A smart district operating system_clip_3.mp4 has been processed.

content/clips/FIABCI delegates visit Punggol Digital District_clip_0.mp4 has been processed.

content/clips/FIABCI delegates visit Punggol Digital District_clip_1.mp4 has been processed.

content/clips/JID Day Insights Pushing the Boundaries of Sustainable Design_clip_0.mp4 has been processed.

content/clips/JID Day Insights Pushing the Boundaries of Sustainable Design_clip_1.mp4 has been processed.

content/clips/JID Day Insights Pushing the Boundaries of Sustainable Design_clip_2.mp4 has been processed.

content/clips/JID Day Insights Pushing the Boundaries of Sustainable Design_cli

Unnamed: 0,id,video_id,video_title,start_time,end_time,clip_name,frame_name,associated_text,associated_speech,associated_object,description,embedding
0,-1,1,Open Digital Platform A smart district operati...,0,60.0,content/clips/Open Digital Platform A smart di...,content/frames/frame_0.png,Security Counter\nOur digital landscape is pow...,Our digital landscape is powered by data.\nAnd...,Here is a description of the people in the vid...,Here is a description of the video in chronolo...,"[0.023411588743329048, 0.020503206178545952, -..."
1,0,1,Open Digital Platform A smart district operati...,60,120.0,content/clips/Open Digital Platform A smart di...,content/frames/frame_1.png,Here is a list of the text visible in the vide...,"Developed by JTC and GovTech, in collaboration...",Here is a description of the people in the vid...,Here is a description of the video in chronolo...,"[0.020479103550314903, 0.0032895293552428484, ..."
2,1,1,Open Digital Platform A smart district operati...,120,180.0,content/clips/Open Digital Platform A smart di...,content/frames/frame_2.png,Here is a list of the text visible in the vide...,the platform can then tap on these data sets f...,Here is a description of the people in the pro...,Here is a description of the video in chronolo...,"[-0.01953345350921154, 0.0030750450678169727, ..."
3,2,1,Open Digital Platform A smart district operati...,180,186.36,content/clips/Open Digital Platform A smart di...,content/frames/frame_3.png,Here's a list of the text visible in the video...,Here's a transcription of the audio from the p...,There are no people or animals in this video.,The video begins with a QR code and a website ...,"[-0.00630419235676527, -0.006567784119397402, ..."
4,3,2,FIABCI delegates visit Punggol Digital District,0,60.0,content/clips/FIABCI delegates visit Punggol D...,content/frames/frame_0.png,Here is a listing of all the text visible in t...,Here's a transcription of the audio from the p...,Here is a description of the people in the pro...,Here is a description of the video in chronolo...,"[0.013792615383863449, 0.014185289852321148, -..."


## Embeddings

### Convert Dataframe into CSV and store in bucket

In [84]:
shots_df.to_csv('content/shots.csv')
# Copy the file to our new bucket.
!gsutil cp content/shots.csv {BUCKET_URI}/shots.csv

Copying file://content/shots.csv [Content-Type=text/csv]...
- [1 files][433.6 KiB/433.6 KiB]                                                
Operation completed over 1 objects/433.6 KiB.                                    


### Convert the Dataframe into a JSON file, which will be uploaded into Vector Search Index

In [85]:
from datetime import datetime

BUCKET_URI_ME=f"{BUCKET_URI}/embeddings/"
LOCATION = 'us-central1'

from google.cloud import aiplatform

aiplatform.init(project=PROJECT_ID, location=LOCATION)

jsonl_string = shots_df[['id','video_id','video_title','start_time','end_time','clip_name', 'frame_name', "associated_text", "associated_speech", "associated_object", "description", "embedding"]].to_json(orient="records", lines=True)
with open(f"./videodata.json", "w") as f:
    f.write(jsonl_string)

In [86]:
! gsutil cp videodata.json {BUCKET_URI_ME}

Copying file://videodata.json [Content-Type=application/json]...
- [1 files][308.3 KiB/308.3 KiB]                                                
Operation completed over 1 objects/308.3 KiB.                                    


## Creating Vector Search Index

In [87]:
# create Index
my_index = aiplatform.MatchingEngineIndex.create_tree_ah_index(
    display_name=f"vs_index_{UID}",
    contents_delta_uri=BUCKET_URI_ME,
    dimensions=768,
    approximate_neighbors_count=10,
    project = PROJECT_ID
)

Creating MatchingEngineIndex
Create MatchingEngineIndex backing LRO: projects/849276838208/locations/us-central1/indexes/1509246834889981952/operations/2378069550085177344
MatchingEngineIndex created. Resource name: projects/849276838208/locations/us-central1/indexes/1509246834889981952
To use this MatchingEngineIndex in another session:
index = aiplatform.MatchingEngineIndex('projects/849276838208/locations/us-central1/indexes/1509246834889981952')


### Create Index Endpoint and deploy the Index
To use the Index, you need to create an Index Endpoint. It works as a server instance accepting query requests for your Index.


In [88]:
# create IndexEndpoint
my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint.create(
    display_name=f"vs_endpoint_{UID}", public_endpoint_enabled=True
)

Creating MatchingEngineIndexEndpoint
Create MatchingEngineIndexEndpoint backing LRO: projects/849276838208/locations/us-central1/indexEndpoints/9115967343007105024/operations/7956481376025313280
MatchingEngineIndexEndpoint created. Resource name: projects/849276838208/locations/us-central1/indexEndpoints/9115967343007105024
To use this MatchingEngineIndexEndpoint in another session:
index_endpoint = aiplatform.MatchingEngineIndexEndpoint('projects/849276838208/locations/us-central1/indexEndpoints/9115967343007105024')


In [90]:
DEPLOYED_INDEX_ID = f"vs_deployed_{UID}"
# deploy the Index to the Index Endpoint
my_index_endpoint.deploy_index(index=my_index, deployed_index_id=DEPLOYED_INDEX_ID)

Deploying index MatchingEngineIndexEndpoint index_endpoint: projects/849276838208/locations/us-central1/indexEndpoints/9115967343007105024
Deploy index MatchingEngineIndexEndpoint index_endpoint backing LRO: projects/849276838208/locations/us-central1/indexEndpoints/9115967343007105024/operations/1801608797781753856
MatchingEngineIndexEndpoint index_endpoint Deployed index. Resource name: projects/849276838208/locations/us-central1/indexEndpoints/9115967343007105024


<google.cloud.aiplatform.matching_engine.matching_engine_index_endpoint.MatchingEngineIndexEndpoint object at 0x7f553b6030a0> 
resource name: projects/849276838208/locations/us-central1/indexEndpoints/9115967343007105024

#### Go to your Vertex AI console and check that the index is CREATED successfully 

### Get an existing Index

In [91]:
import tqdm
import time

In [92]:
REGION = LOCATION = "us-central1"

aiplatform.init(project=PROJECT_ID, location=LOCATION)
vertexai.init(project=PROJECT_ID, location=LOCATION)

In [93]:
# Uncomment below to manually insert Index 
# my_index_id = "8368299436119425024"
# my_index = aiplatform.MatchingEngineIndex(my_index_id)

my_index_name = my_index._gca_resource.name
my_index_display_name = my_index.display_name
my_index_id = my_index.name.split('/')[-1]

my_index_endpoint_name = my_index_endpoint._gca_resource.name
my_index_endpoint_display_name = my_index_endpoint.display_name
my_index_endpoint_id = my_index_endpoint.name.split('/')[-1]
my_index_endpoint_public_domain = my_index_endpoint.public_endpoint_domain_name

my_index = aiplatform.MatchingEngineIndex(my_index_name)

my_index_endpoint = aiplatform.MatchingEngineIndexEndpoint(my_index_endpoint_id)

In [94]:
# Set variables for the current deployed index.
API_ENDPOINT=my_index_endpoint_public_domain
INDEX_ENDPOINT=my_index_endpoint_name

In [95]:
from langchain_google_vertexai import VertexAIEmbeddings
from google.cloud import aiplatform_v1

embeddings = VertexAIEmbeddings(model="textembedding-gecko@003")

text_embedding_model = embeddings #TextEmbeddingModel.from_pretrained("textembedding-gecko@003")

### Querying using Vector Search

In [96]:
#Set your query
query = "Did FIABCI delegates visit Punggol"

#Set the number of results you want
neighbor_count = 3

test_embeddings = embeddings.embed_query(query)

In [97]:
API_ENDPOINT

'1369639239.us-central1-849276838208.vdb.vertexai.goog'

In [98]:
# Configure Vector Search client
client_options = {
  "api_endpoint": API_ENDPOINT
}
vector_search_client = aiplatform_v1.MatchServiceClient(
  client_options=client_options,
)
# Build FindNeighborsRequest object
datapoint = aiplatform_v1.IndexDatapoint(
  feature_vector=test_embeddings
)

query = aiplatform_v1.FindNeighborsRequest.Query(
  datapoint=datapoint,
  # The number of nearest neighbors to be retrieved
  neighbor_count=neighbor_count
)

request = aiplatform_v1.FindNeighborsRequest(
  index_endpoint=INDEX_ENDPOINT,
  deployed_index_id=DEPLOYED_INDEX_ID,
  # Request can have multiple queries
  queries=[query],
  return_full_datapoint=False,
)

# Execute the request
response = vector_search_client.find_neighbors(request)

df_new = pd.DataFrame()
# print('neighbor_count', neighbor_count)

shots_df['distance'] = None

for i in range(0,neighbor_count):
    x=response.nearest_neighbors[0]

    df_match = shots_df.loc[shots_df['id'] == int(x.neighbors[i].datapoint.datapoint_id) ]
    df_match['distance'] = x.neighbors[i].distance

    # Append the matching rows to the new DataFrame
    df_new = pd.concat([df_new, df_match])
    

# Print the new DataFrame
df_sorted = df_new.sort_values(by="distance", ascending=True)
display(df_sorted)

#Export DataFrame to CSV file for reference
df_new.to_csv("results.csv")

Unnamed: 0,id,video_id,video_title,start_time,end_time,clip_name,frame_name,associated_text,associated_speech,associated_object,description,embedding,distance
12,11,4,10 Sustainable Design and Infrastructure Featu...,120,157.18,content/clips/10 Sustainable Design and Infras...,content/frames/frame_2.png,Here is a listing of all the text visible in t...,Certainly! Here is a transcription of the audi...,Here is a description of the people and animal...,Here is a description of the video in chronolo...,"[-0.002668655477464199, -0.0018522889586165547...",0.632236
4,3,2,FIABCI delegates visit Punggol Digital District,0,60.0,content/clips/FIABCI delegates visit Punggol D...,content/frames/frame_0.png,Here is a listing of all the text visible in t...,Here's a transcription of the audio from the p...,Here is a description of the people in the pro...,Here is a description of the video in chronolo...,"[0.013792615383863449, 0.014185289852321148, -...",0.665848
5,4,2,FIABCI delegates visit Punggol Digital District,60,91.32,content/clips/FIABCI delegates visit Punggol D...,content/frames/frame_1.png,Here is a list of the text visible in the vide...,"of how to use the data to organise the living,...","Certainly, here is a description of the people...",Here is a description of the video in chronolo...,"[0.004918518476188183, 0.011962203308939934, -...",0.669026


In [99]:
results = []

for result in response.nearest_neighbors:
    for neighbor in result.neighbors:
        clip_id = int(neighbor.datapoint.datapoint_id)
        distance = neighbor.distance
        df_match = shots_df.loc[shots_df.index == clip_id]
        if not df_match.empty:
            match_info = df_match.iloc[0].to_dict()
            match_info['distance'] = distance
            results.append(match_info)

In [100]:
clipNames = []

for i in df_sorted['clip_name']:
    clipNames.append(i)
    
#Display the clips     
print(clipNames)

['content/clips/10 Sustainable Design and Infrastructure Features at Punggol Digital District_clip_2.mp4', 'content/clips/FIABCI delegates visit Punggol Digital District_clip_0.mp4', 'content/clips/FIABCI delegates visit Punggol Digital District_clip_1.mp4']


### Display Top 3 Relevant Clips

In [101]:
import IPython

for i in clipNames:
    IPython.display.display(IPython.display.Video(i, width=600))

In [102]:
import re

PROJECT_ID = !(gcloud config get-value core/project)
PROJECT_ID = PROJECT_ID[0]

SVC_ACC = !(gcloud config get-value core/account)
SVC_ACC = SVC_ACC[0]

PROJECT_NUMBER=str(re.search(r'\d+', SVC_ACC).group())

FOLDER_NAME="."

dataset_id = "video_rag_dataset"
table_id = "video_embeddings"
LOCATION="us-central1"
# full_table_id = '.'.join([PROJECT_ID,dataset_id,table_id])
file_path = "videodata.json"


from google.cloud import bigquery
client = bigquery.Client(project=PROJECT_ID)
from datetime import datetime

In [103]:
def create_bigquery_dataset():
    dataset = client.dataset(dataset_id)
    dataset = client.create_dataset(dataset, timeout=30)  # Make an API request.
    print("Created dataset {}.{}".format(client.project, dataset.dataset_id))

def create_bigquery_table():    
    # Define the schema for the table
    schema = [
        bigquery.SchemaField("id", "STRING"),
        bigquery.SchemaField("video_id", "STRING"),
        bigquery.SchemaField("video_title", "STRING"),
        bigquery.SchemaField("start_time", "STRING"),
        bigquery.SchemaField("end_time", "STRING"),
        bigquery.SchemaField("clip_name", "STRING"),
        bigquery.SchemaField("frame_name", "STRING"),
        bigquery.SchemaField("associated_text", "STRING"),
        bigquery.SchemaField("associated_speech", "STRING"),
        bigquery.SchemaField("associated_object", "STRING"),
        bigquery.SchemaField("description", "STRING"),
        bigquery.SchemaField("embedding", "FLOAT", mode="REPEATED")
    ]
    
    # Define the table reference
    table_ref = client.dataset(dataset_id).table(table_id)
    
    # Define the table object
    table = bigquery.Table(table_ref, schema=schema)
    
    # Create the table in BigQuery
    try:
        client.create_table(table)
        print("Table created successfully.")
    except Exception as e:
        print(f"Error creating table: {e}")

def load_data_into_bigquery_table():
    job_config = bigquery.LoadJobConfig(
        source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON, autodetect=False)
    
    table_ref = client.dataset(dataset_id).table(table_id)
    
    with open(file_path, "rb") as source_file:
        job = client.load_table_from_file(source_file, table_ref, job_config=job_config)
    
    job.result()  # Waits for the job to complete.
    
    table = client.get_table(table_ref)  # Make an API request.
    print("Loaded {} rows and {} columns to {}".format(
        table.num_rows, len(table.schema), table_ref))
    
def read_bq_table_dataframe():  
    table_ref = client.dataset(dataset_id).table(table_id)
    dataframe = client.list_rows(table_ref).to_dataframe(create_bqstorage_client=True)
    
    return dataframe

In [104]:
# Call the function to create the table
create_bigquery_dataset()
create_bigquery_table()
load_data_into_bigquery_table()

Created dataset ai-sb-test.video_rag_dataset
Table created successfully.
Loaded 20 rows and 12 columns to ai-sb-test.video_rag_dataset.video_embeddings


In [105]:
dataframe = read_bq_table_dataframe()
dataframe

Unnamed: 0,id,video_id,video_title,start_time,end_time,clip_name,frame_name,associated_text,associated_speech,associated_object,description,embedding
0,-1,1,Open Digital Platform A smart district operati...,0,60.0,content/clips/Open Digital Platform A smart di...,content/frames/frame_0.png,Security Counter\nOur digital landscape is pow...,Our digital landscape is powered by data.\nAnd...,Here is a description of the people in the vid...,Here is a description of the video in chronolo...,"[0.0234115887, 0.0205032062, -0.0235713497, 0...."
1,3,2,FIABCI delegates visit Punggol Digital District,0,60.0,content/clips/FIABCI delegates visit Punggol D...,content/frames/frame_0.png,Here is a listing of all the text visible in t...,Here's a transcription of the audio from the p...,Here is a description of the people in the pro...,Here is a description of the video in chronolo...,"[0.0137926154, 0.0141852899, -0.0281700697, 0...."
2,5,3,JID Day Insights Pushing the Boundaries of Sus...,0,60.0,content/clips/JID Day Insights Pushing the Bou...,content/frames/frame_0.png,Here is a list of the text visible in the vide...,My name is Praveen Hassan Chandrashekhar. I'm ...,The video shows one person:\n\nPraveen Hassan ...,Here is a description of the video in chronolo...,"[-0.0067363228, -0.0022997283, -0.0177673753, ..."
3,9,4,10 Sustainable Design and Infrastructure Featu...,0,60.0,content/clips/10 Sustainable Design and Infras...,content/frames/frame_0.png,Here is a list of the text visible in the vide...,Punggol Digital District is Singapore's first ...,Here is a description of the people in the vid...,Here is a description of the video in chronolo...,"[-0.0081472127, -0.0133477803, -0.0354543589, ..."
4,12,5,8 hours of work done in just 2.5 hours Thats t...,0,60.0,content/clips/8 hours of work done in just 2.5...,content/frames/frame_0.png,Here is a list of the text visible in the vide...,Here's a transcription of the audio from the v...,Here is a description of the people in the vid...,Here is a description of the video in chronolo...,"[0.0193700604, 0.0265663471, -0.039097216, 0.0..."
5,14,6,Whats Happening at SWITCH onenorth,0,60.0,content/clips/Whats Happening at SWITCH oneno...,content/frames/frame_0.png,Here is a list of the text visible in the vide...,"Alright, is this the right spot?\n\nCan I star...",Here is a description of the people and animal...,Here is a description of the video in chronolo...,"[-0.0022940466, -0.0116759641, -0.0432454385, ..."
6,16,7,Punggol Digital District The future is yours t...,0,60.0,content/clips/Punggol Digital District The fut...,content/frames/frame_0.png,THE FU\nTHE FUTURE IS\nTHE FUTURE IS YOURS TO\...,Here is a transcription of the audio from the ...,Here is a description of the people and animal...,Here is a description of the video in chronolo...,"[-0.0100430017, 0.0097944997, -0.0406552441, 0..."
7,0,1,Open Digital Platform A smart district operati...,60,120.0,content/clips/Open Digital Platform A smart di...,content/frames/frame_1.png,Here is a list of the text visible in the vide...,"Developed by JTC and GovTech, in collaboration...",Here is a description of the people in the vid...,Here is a description of the video in chronolo...,"[0.0204791036, 0.0032895294, -0.0439610444, 0...."
8,4,2,FIABCI delegates visit Punggol Digital District,60,91.32,content/clips/FIABCI delegates visit Punggol D...,content/frames/frame_1.png,Here is a list of the text visible in the vide...,"of how to use the data to organise the living,...","Certainly, here is a description of the people...",Here is a description of the video in chronolo...,"[0.0049185185, 0.0119622033, -0.0332286321, 0...."
9,6,3,JID Day Insights Pushing the Boundaries of Sus...,60,120.0,content/clips/JID Day Insights Pushing the Bou...,content/frames/frame_1.png,Here is a transcription of the visible text in...,Here's a transcription of the audio from the p...,"The video shows one person, a man who appears ...",Here is a description of the video in chronolo...,"[-0.0045801108, -0.0206682421, -0.010879931, 0..."


## Querying using Gemini Model instead of Vector Search

### Defining Functions for Gemini

In [106]:
import os
import vertexai
from vertexai.preview.generative_models import GenerativeModel, Part
from vertexai.preview.language_models import TextGenerationModel

def generate(input_prompt):
    model = GenerativeModel("gemini-ultra")
    responses = model.generate_content(
        input_prompt ,
    generation_config={
        "max_output_tokens": 2048,
        "temperature": 0.2,
        "top_p": 1,
        "top_k": 32
    },
        safety_settings=[],
        stream=True,
    )
    
    all_response  = []
    
    for response in responses:
        # print(response.text, end="")
        all_response.append(response.text)
    
    return(" ".join(all_response))
    

def generate_pro(input_prompt):
    model = GenerativeModel("gemini-1.5-flash-002")
    responses = model.generate_content(
    input_prompt,
    generation_config={
        "max_output_tokens": 2048,
        "temperature": 0.2,
        "top_p": 1
    },stream=True,)
    
    all_response  = []
    
    for response in responses:
        all_response.append(response.text)

    return(" ".join(all_response))


### Query using Gemini

In [107]:
#Insert with your own query
query = "Summarise the PDD project"

In [108]:
import pandas as pd

def combine_column_to_string(df, column_name):

    column_values = df[column_name].tolist()
    combined_string = ', '.join(column_values)
    return combined_string

In [109]:
import pandas as pd
shots_df = pd.read_csv('./results.csv') 
shots_df

Unnamed: 0.1,Unnamed: 0,id,video_id,video_title,start_time,end_time,clip_name,frame_name,associated_text,associated_speech,associated_object,description,embedding,distance
0,5,4,2,FIABCI delegates visit Punggol Digital District,60,91.32,content/clips/FIABCI delegates visit Punggol D...,content/frames/frame_1.png,Here is a list of the text visible in the vide...,"of how to use the data to organise the living,...","Certainly, here is a description of the people...",Here is a description of the video in chronolo...,"[0.004918518476188183, 0.011962203308939934, -...",0.669026
1,4,3,2,FIABCI delegates visit Punggol Digital District,0,60.0,content/clips/FIABCI delegates visit Punggol D...,content/frames/frame_0.png,Here is a listing of all the text visible in t...,Here's a transcription of the audio from the p...,Here is a description of the people in the pro...,Here is a description of the video in chronolo...,"[0.013792615383863449, 0.014185289852321148, -...",0.665848
2,12,11,4,10 Sustainable Design and Infrastructure Featu...,120,157.18,content/clips/10 Sustainable Design and Infras...,content/frames/frame_2.png,Here is a listing of all the text visible in t...,Certainly! Here is a transcription of the audi...,Here is a description of the people and animal...,Here is a description of the video in chronolo...,"[-0.002668655477464199, -0.0018522889586165547...",0.632236


In [110]:
import pandas as pd
import re

System_Prompts = """ You are an expert in screening video descriptions and understanding the context and contents of the video.
Only answer based on the description of the video provided here: 
"""

Question_Prompts = """ -- Based on the video description provided, answer the following query as accurately as possible:
"""

videoDesc = combine_column_to_string(shots_df,'description') + combine_column_to_string(shots_df,'associated_text') + combine_column_to_string(shots_df,'associated_speech') + combine_column_to_string(shots_df,'associated_object')

combined_prompt = System_Prompts + ' ' + videoDesc + ' ' + Question_Prompts + query


print("Your prompt: \n" + combined_prompt)


Your prompt: 
 You are an expert in screening video descriptions and understanding the context and contents of the video.
Only answer based on the description of the video provided here: 
 Here is a description of the video in chronological order:

The video opens with a woman speaking. She is standing in front of a large screen displaying an image of a city and the words, “Singapore’s Most Highly Anticipated Smart District.” She discusses how data is used to organize living and working spaces, and to determine how people spend their time in the district.

Next, a pop-up box appears on the screen, asking, “Switch off lights and AC?” with “Yes” and “No” options. Then, two graphs appear, showing data on cooling tower fan speed and total power saved. The numbers on the graphs change, indicating energy savings.

A man is then shown speaking. He is identified as László Gönczi from FIABCI Hungary. He comments on the masterplan for the district, noting that it is very nice and includes a lot 

In [111]:
print("Response from AI model: \n")
print(generate_pro(combined_prompt))

Response from AI model: 

The  Punggol Digital District (PDD) in Singapore is a smart and sustainable  development project.  Key features highlighted include:

* **Smart Technology Integration:**  The  district utilizes data to optimize living and working spaces, including energy management (demonstrated by a system that suggests switching off lights and AC, and graphs showing energy savings ), waste management (a pneumatic waste conveyance system), and air quality monitoring.  A digital platform monitors and manages electrical, cooling, and waste systems, allowing for  real-time resource optimization.

* **Sustainable Design:**  The project incorporates sustainable practices such as rainwater harvesting for irrigation, and building orientation to utilize natural winds and reduce energy consumption from air conditioning.  The goal is a net-zero  future.

* **Community Focus:**  The design emphasizes community building, fostering interaction between companies and attracting talent.  Ame

## Method 2: Agent Builder App

### Create Datastore

Give 'Discovery Engine Admin' permission access to the service account

In [None]:
print(f"service account: {SVC_ACC}")

In [None]:
gcs_uri = "gs://ai-sb-test/video-rag/"

In [None]:
def create_data_store(
    project_id: str, location: str, data_store_name: str, data_store_id: str
):
    # Create a client
    client_options = (
        ClientOptions(api_endpoint=f"{location}-discoveryengine.googleapis.com")
        if location != "global"
        else None
    )
    client = discoveryengine.DataStoreServiceClient(client_options=client_options)

    # Initialize request argument(s)
    data_store = discoveryengine.DataStore(
        display_name=data_store_name,
        industry_vertical="GENERIC",
        content_config="NO_CONTENT",
    )

    request = discoveryengine.CreateDataStoreRequest(
        parent=discoveryengine.DataStoreServiceClient.collection_path(
            project_id, location, "default_collection"
        ),
        data_store=data_store,
        data_store_id=data_store_id,
    )
    operation = client.create_data_store(request=request)

    # Make the request
    # The try block is necessary to prevent execution from haulting due to an error being thrown when the datastore takes a while to instantiate
    try:
        response = operation.result(timeout=90)
    except:
        print("long-running operation")

In [None]:
# The datastore name can only contain lowercase letters, numbers, and hyphens
DATASTORE_NAME = f"{UNIQUE_PREFIX}-datastore"
DATASTORE_ID = f"{DATASTORE_NAME}-id"
LOCATION = 'global'

# print variables for verification
print(f"Datastore name: {DATASTORE_NAME}")
print(f"Datastore ID: {DATASTORE_ID}")

# Create the datastore
try:
    create_data_store(PROJECT_ID, LOCATION, DATASTORE_NAME, DATASTORE_ID)
    print(f"Datastore {DATASTORE_ID} successfully created")
except:
    print("Datastore may already exist")

### Import csv from bucket to datastore

This section may take up to 5 - 10 mins to complete the import to the datastore

In [None]:
# Helper Function to import documents from GCS bucket into datastore
def import_documents(
    project_id: str,
    location: str,
    data_store_id: str,
    gcs_uri: str,
):
    # Create a client
    client_options = (
        ClientOptions(api_endpoint=f"{location}-discoveryengine.googleapis.com")
        if location != "global"
        else None
    )
    client = discoveryengine.DocumentServiceClient(client_options=client_options)

    # The full resource name of the search engine branch.
    # e.g. projects/{project}/locations/{location}/dataStores/{data_store_id}/branches/{branch}
    parent = client.branch_path(
        project=project_id,
        location=location,
        data_store=data_store_id,
        branch="default_branch",
    )

    source_documents = [f"{gcs_uri}/*"]

    request = discoveryengine.ImportDocumentsRequest(
        parent=parent,
        gcs_source=discoveryengine.GcsSource(
            input_uris=source_documents, data_schema="csv"
        ),
        auto_generate_ids = True,
        # Options: `FULL`, `INCREMENTAL`
        reconciliation_mode=discoveryengine.ImportDocumentsRequest.ReconciliationMode.INCREMENTAL,
    )

    # Make the request
    operation = client.import_documents(request=request)

    response = operation.result()

    # Once the operation is complete,
    # get information from operation metadata
    metadata = discoveryengine.ImportDocumentsMetadata(operation.metadata)

    # Handle the response
    return operation.operation.name

In [None]:
import_documents(PROJECT_ID, LOCATION, DATASTORE_ID, BUCKET_URI)

## Create a Search Engine for your datastore

In [None]:
# Helper function to create a Vertex Search Engine
def create_engine(
    project_id: str, location: str, data_store_name: str, data_store_id: str
):
    # Create a client
    client_options = (
        ClientOptions(api_endpoint=f"{location}-discoveryengine.googleapis.com")
        if location != "global"
        else None
    )
    client = discoveryengine.EngineServiceClient(client_options=client_options)

    # Initialize request argument(s)
    config = discoveryengine.Engine.SearchEngineConfig(
        search_tier="SEARCH_TIER_ENTERPRISE", search_add_ons=["SEARCH_ADD_ON_LLM"]
    )

    engine = discoveryengine.Engine(
        display_name=data_store_name,
        solution_type="SOLUTION_TYPE_SEARCH",
        industry_vertical="GENERIC",
        data_store_ids=[data_store_id],
        search_engine_config=config,
    )

    request = discoveryengine.CreateEngineRequest(
        parent=discoveryengine.DataStoreServiceClient.collection_path(
            project_id, location, "default_collection"
        ),
        engine=engine,
        engine_id=engine.display_name,
    )

    # Make the request
    operation = client.create_engine(request=request)
    response = operation.result(timeout=90)

Wait for 5 mins, to allow both the datastore and app to load properly

In [None]:
import time
try:
    create_engine(PROJECT_ID, LOCATION, DATASTORE_NAME, DATASTORE_ID)
except:
    print("App may already be created")
time.sleep(300)
print(f"{DATASTORE_NAME} app created")

### Updating the config for the App

This part will be done manually.
1. Go to [Agent Builder](https://console.cloud.google.com/gen-app-builder/engines?hl=en-GB) and select your app
2. On the left panel, select 'Data'
3. Select the 'Schema'tab
4. If any of the fields in the 'Retrievable' column are disabled, click 'Edit' and enable it.
5. On the left panel, select 'Congfigurations'
6. Select the 'UI' tab
7. Under 'Data display options' open 'Configure fields in result'
8. Customise answer UI 
9. Click 'SAVE AND PUBLISH'

## Previewing App

On the left panel, select 'Preview' and try searching for something to test the app

## The End