# Data Preparation

In this notebook, we retrieve the videos and their captions from 2 different datasets, join their information and index them.
The notebook covers: 

## Selecting 10 videos (maybe we should have them all)
This section creates a json file of the top 10 videos from the activitynet videos dataset.

In [2]:
import json
from pprint import pprint

#[('o1WPnnvs00I', {'duration': 229.86, 'subset': 'training', 'resolution': '640x480',
data:list

with open('activity_net.v1-3.min.json', 'r') as json_data:
    data = json.load(json_data)
    
    # 'database' is a <key, valu> pair -> <video_id, video_info>
    videos = data['database']
    
    # Sort the list by number of annotations (video moments)
    sorted_list = sorted(videos.items(), key= lambda x: len(x[1]['annotations']), reverse = True)

    # Select the top 10 videos 
    top_10_videos = sorted_list[:10]

    # Convert the list of tuples to a dictionary before dumping
    top_10_dict = {video_id: video_info for video_id, video_info in top_10_videos}

    # Check the video id and number of moments of the items in the list
    for video_id, video_info in top_10_videos:
        print(f"{video_id} - {len(video_info['annotations'])} moments")

    #print(top_10_dict.keys) # Each key is a video id

with open('top10.json', 'w') as file: # Gotta use the full relative path if running on a python notebook
    json.dump(top_10_dict, file, indent=2)





o1WPnnvs00I - 23 moments
oGwn4NUeoy8 - 23 moments
VEDRmPt_-Ms - 20 moments
qF3EbR8y8go - 19 moments
DLJqhYP-C0k - 18 moments
t6f_O8a4sSg - 18 moments
6gyD-Mte2ZM - 18 moments
jBvGvVw3R-Q - 18 moments
PJ72Yl0B1rY - 17 moments
QHn9KyE-zZo - 17 moments


## Video Metadata Indexing

In this section we process the selected videos from our json file.
We first start by creating the OpenSearch index.

In [3]:
## New Index Mappings for k-nn vectors and embeddings
## (embeddings are the means from the words extracted from the captions)

from opensearchpy import OpenSearch
import requests
from opensearchpy import helpers

host = 'api.novasearch.org'
port = 443

user = 'user13' 
password = 'rumoao+20' 
index_name = user # We can only have an index with the same name has our user name.

# Create the client with SSL/TLS enabled, but hostname verification disabled.
client = OpenSearch(
    hosts = [{'host': host, 'port': port}],
    http_compress = True, # enables gzip compression for request bodies
    http_auth = (user, password),
    use_ssl = True,
    url_prefix = 'opensearch_v2',
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False
)

# The fields and how they are searched and how important they are, are defined in the mappings
index_body = {
   "settings":{
      "index":{
         "number_of_replicas":0,
         "number_of_shards":4,
         "refresh_interval":"-1",
         "knn":"true"
      }
   },
   "mappings":{
       "dynamic":      "strict", # Prevents accidental addition of new fields to the index. This way indexed documents must match the index mapping.
       "properties":{
         "video_id":{
            "type":"keyword"
         },
         "title":{
            "type":"text",
            "analyzer":"english",
            "similarity":"BM25"
         },
         "video_path":{
            "type":"text"
         },
         "duration":{
            "type":"float"
         },
         "description":{  # The description field is a text field of the join from the en_captions field that is an array of strings.
            "type":"text",
            "analyzer":"english",
            "similarity":"BM25"
         },
        "description_embedding":{
            "type":"knn_vector",
            "dimension": 768,
            "method":{
               "name":"hnsw",
               "space_type":"innerproduct", # cosinesimil > innerproduct  because the captions are normalized and this provides better semantic similarity
               "engine":"faiss",
               "parameters":{
                  "ef_construction":256,
                  "m":48
               }
            }
        },
        "annotations": {
                "type": "nested",
                "properties": {
                    "segment": {"type": "float"},
                    "label": {"type": "text"},
                    "is_answer": {"type": "boolean"},
                    "confidence": {"type": "float"}
                }
        },
      }
   }
}

# Create the index with the specified mappings and settings
response = client.indices.create(index=index_name, body=index_body)

# Check if the index creation was successful
if response['acknowledged']:
    print(f"Index '{index_name}' created successfully!")
else:
    print(f"Failed to create index: {response}")

RequestError: RequestError(400, 'resource_already_exists_exception', 'index [user13/GrTTB-RiTiyyvhqn2vwULA] already exists')

In [None]:
# Importing the dataset and indexing its data

from datasets import load_dataset

# Load the dataset, trust_remote_code=True is needed to load the dataset from the remote repository.
dataset = load_dataset('dataset-download.py', trust_remote_code=True) 

doc_list = []

index_number_id = 0 # Index number to use as document ID (0, 1, 2, ...)

with open('C:/Git Repositories/MPDW-Project/top10.json', 'r') as data:
    data = json.load(data).items()

    # Check the video id and number of moments of the items in the list
    for video_id, video_info in data:
        # Creating the document to be indexed from the video in the dataset
        doc = {
            'video_id': video_id, # Document ID
            'title': video_info['annotations'][0]['label'], # Title
            'video_path': video_info['url'], # Video path
            'description': "",
            'duration': video_info['duration'],
            "annotations": video_info['annotations']
        }

        doc_list.append(doc)

for split in ['train', 'test', 'validation']:
    for video in dataset[split]:
        # Iterate through the documents in doc_list
        for doc in doc_list:
            #print(video['video_id'].replace("v_", ""))
            video['video_id'] = video['video_id'].replace("v_", "") # clean the video_key from the captions dataset, it comes with the format v_<key> instead of just <key>

            if doc['video_id'] == video['video_id']:  # Check if the video_id matches
                print("doc found")
                
                description = ""  # Initialize the description string

                # Combine all the captions into the description
                for caption in video['en_captions']:
                    description += f" {caption}"

                # Update the document's description
                doc['description'] = description


# Sending the docs to the opensearch index
for doc in doc_list:
    response = client.index(index = index_name, id= index_number_id, body = doc)
    
    print(response)
    
    index_number_id+= 1

# Refresh the index so the docs are searchable (we weren't getting hits because of this...)
client.indices.refresh(index = 'user13')


doc found
Updated description for video_id: DLJqhYP-C0k
doc found
Updated description for video_id: qF3EbR8y8go
doc found
Updated description for video_id: oGwn4NUeoy8
doc found
Updated description for video_id: VEDRmPt_-Ms
doc found
Updated description for video_id: o1WPnnvs00I
doc found
Updated description for video_id: jBvGvVw3R-Q
doc found
Updated description for video_id: t6f_O8a4sSg
doc found
Updated description for video_id: 6gyD-Mte2ZM
doc found
Updated description for video_id: QHn9KyE-zZo
doc found
Updated description for video_id: PJ72Yl0B1rY
doc found
Updated description for video_id: t6f_O8a4sSg
doc found
Updated description for video_id: 6gyD-Mte2ZM
doc found
Updated description for video_id: QHn9KyE-zZo
doc found
Updated description for video_id: PJ72Yl0B1rY
{'_index': 'user13', '_id': '0', '_version': 1, 'result': 'created', '_shards': {'total': 1, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1}
{'_index': 'user13', '_id': '1', '_version': 1, 'result': 

# 4 Query Search Types:
- text based search
- embeddings based search
- boolean filters alone (?)
- search with boolean filters (?)

In [None]:
# Simple Text Based Search

prompt = "a woman appears"

query_bm25 = {
  'size': 20, # Number of max results to return
  '_source': ['video_id', 'title', 'description'], # Index Fields to return
  'query': {
    'multi_match': {
      'query': prompt,
      'fields': ['description', 'title'] # Index Fields to search
    }
  }
}

response = client.search(
    body = query_bm25,
    index = 'user13'
)

#print(response['hits']['hits']) -> how to view the list of hits

# Print each hit
for hit in response['hits']['hits']:
    print(hit['_source']['video_id'], hit['_score'], hit['_source']['title'], hit['_source']['description'])

qF3EbR8y8go 1.4283175 Painting  woman is painting in a white paper green leaves in a chinese tree.  a red paint is shown and woman put a stamp on the corner of the paper.  woman is painting a blue ad purple chinese flower.  a red and black flowers are painted on a white paper with very detail for the same woman in a dark room.  woman used some black painting for make details, put the red stamp on the corner and finished the painting with yellow and reddetails on the flowers.
oGwn4NUeoy8 0.9529365 Playing congas  A small group of people are seen on a stage getting their instruments ready.  A woman begins playing the drums while another plays piano and the others watch.  The two continue to play their instruments and others on the side watch.
PJ72Yl0B1rY 0.664531 Beach soccer  A group of athletes play beach soccer in several different games and locations surrounded by audiences in bleachers and dancing cheerleaders.  A group of soccer players play beach soccer while an audience claps for

In [None]:
# Embeddings Based Search

