# Opening an Index
Criação de um index + configurações do index

In [40]:
from opensearchpy import OpenSearch
import requests
from opensearchpy import helpers

host = 'api.novasearch.org'
port = 443

user = 'user13' 
password = 'rumoao+20' 
index_name = user # We can only have an index with the same name has our user name.

# Create the client with SSL/TLS enabled, but hostname verification disabled.
client = OpenSearch(
    hosts = [{'host': host, 'port': port}],
    http_compress = True, # enables gzip compression for request bodies
    http_auth = (user, password),
    use_ssl = True,
    url_prefix = 'opensearch_v2',
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False
)

index_body = {
   "settings":{
      "index":{
         "number_of_replicas":0,
         "number_of_shards":4,
         "refresh_interval":"-1",
         "knn":"true"
      }
   },
   "mappings":{
       "dynamic":      "strict",
       "properties":{
         "doc_id":{
            "type":"keyword"
         },
         "tags":{
            "type":"keyword"
         },
         "json":{
            "type":"flat_object"
         },
         "contents":{
            "type":"text",
            "analyzer":"standard",
            "similarity":"BM25"
         }
      }
   }
}

if client.indices.exists(index=index_name):
    print("Index already exists.")
else:        
    response = client.indices.create(index_name, body=index_body)
    print('\nCreating index:')
    print(response)



Creating index:
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'user13'}


In [41]:
## INDEX CREATION CHECK

import pprint as pp

print('\n----------------------------------------------------------------------------------- INDEX SETTINGS')
index_settings = {
    "settings":{
      "index":{
         "refresh_interval" : "1s"
      }
   }
}
pp.pprint(client.indices.get_alias("*"))

client.indices.put_settings(index = index_name, body = index_settings)
settings = client.indices.get_settings(index = index_name)
pp.pprint(settings)

print('\n----------------------------------------------------------------------------------- INDEX MAPPINGS')
mappings = client.indices.get_mapping(index = index_name)
pp.pprint(mappings)

print('\n----------------------------------------------------------------------------------- INDEX #DOCs')
print(client.count(index = index_name))



----------------------------------------------------------------------------------- INDEX SETTINGS


AuthorizationException: AuthorizationException(403, 'security_exception', 'no permissions for [indices:admin/aliases/get] and User [name=user13, backend_roles=[own_index], requestedTenant=null]')

In [None]:
# Index Deletion

This line is here to prevent you from inadvertently deleting data.

if client.indices.exists(index=index_name):
    # Delete the index.
    response = client.indices.delete(
        index = index_name
    )
    print('\nDeleting index:')
    print(response)


Deleting index:
{'acknowledged': True}


# Processing Embeddings

Extract embeddings from video descriptions or transcripts.
Step 1- Use a transformer model to generate embeddings
Step 2 - Index a sample video moment

Embeddings are a way to represent words, sentences, or other types of data (like images or videos) in a vector space. These vectors are usually real-valued arrays, where each number represents a particular feature or characteristic of the data being represented.
Capturing semantic relationships allows us to capture the meaning of the data.
Embeddings are generates using models like CLIP. CLIP is a multimodal model that works wih both images and text, learning joint embeddings for both.

Embeddings are dense, real-valued vectors that represent data (like text, images, or video moments) in a way that preserves semantic relationships.

They allow you to perform tasks like semantic search by measuring the distance or similarity between vectors.

In your project, you'll generate video moment embeddings using models like CLIP and store them in OpenSearch for efficient retrieval based on user input.

For this, we are going to use CLIP

-- End of Stuff For the Project -- 

-- Below is all experimental stuff --


## Simple Text and Document Indexing

Short example to visualize the basic purpose of the project. From transcripts (texts) we must get the meaning from words and feed it to a model.
Since we are generating meaning from videos, we need to extract their data such as: captions, descriptions and feed that json to the opensearch index.

In [26]:
# Text Analyzer

anls = {
  "analyzer": "standard",
  "text": "the quick brown fox"
}
client.indices.analyze(body=anls, index=index_name)

{'tokens': [{'token': 'the',
   'start_offset': 0,
   'end_offset': 3,
   'type': '<ALPHANUM>',
   'position': 0},
  {'token': 'quick',
   'start_offset': 4,
   'end_offset': 9,
   'type': '<ALPHANUM>',
   'position': 1},
  {'token': 'brown',
   'start_offset': 10,
   'end_offset': 15,
   'type': '<ALPHANUM>',
   'position': 2},
  {'token': 'fox',
   'start_offset': 16,
   'end_offset': 19,
   'type': '<ALPHANUM>',
   'position': 3}]}

In [27]:
import json

docs = ["Around 9 Million people live in London", "London is known for its financial district"]
dd = '{"name":"John", "age":30, "car":null}'
parsed_dd = json.loads(dd) # We have to parse the json string to a python dictionary, otherwise it will not match what the index is expecting.

# The json fields of the document must match the json fields of the openframeworks index creation
doc = {
    'doc_id': 'documentA', # Document ID
    'tags': ['red', 'blue'], # Tags
    'json': parsed_dd, # JSON field with content name, age and car
    'contents': docs[0] # Text field with content
}
resp = client.index(index=index_name, id=1, body=doc)
print(resp['result'])

doc = {
    'doc_id': 'documentB',
    'tags': ['red'],
    'json': parsed_dd,
    'contents': docs[1]
}
resp = client.index(index=index_name, id=2, body=doc)
print(resp['result'])

updated
updated


In [28]:
resp = client.index(index=index_name, id=1, body=doc)
print(resp['result'])

updated


In [None]:
# Simple Search

prompt = "How many people live in London?"

query_bm25 = {
  'size': 5, # Number of results to return
  '_source': ['_tags'], # We can specify which fields we want to return. We can also use '_source' to return all fields.
#  '_source': ['doc_id'],
#  '_source': '',
  'query': {
    'multi_match': {
      'query': prompt,
      'fields': ['contents'] # Fields to search
    }
  }
}

response = client.search(
    body = query_bm25,
    index = index_name
)

print('\nSearch results:')
pp.pprint(response)



Search results:
{'_shards': {'failed': 0, 'skipped': 0, 'successful': 4, 'total': 4},
 'hits': {'hits': [],
          'max_score': None,
          'total': {'relation': 'eq', 'value': 0}},
 'timed_out': False,
 'took': 5}


# Task 2.2 da Phase 1.

Já dá para fazer de acordo com o exemplo curto acima.
Apenas temos de extrair as captions do dataset do professor e dar feed dos documentos criados a partir disso ao open search

TODO: 
- Parse the ActivityNet Captions dataset and create the OpenSearch index mappings for the recipe tittle and description
- Index the dataset and test the search functionality
- Try to optimize the search results

## Loading the ActivityNet Captions dataset

To load the dataset, we store the download script and call it using the datasets library.

Dataset Fields:

video_id : str unique identifier for the video
video_path: str Path to the video file -duration: float32 Duration of the video
captions_starts: List_float32 List of timestamps denoting the time at which each caption starts
captions_ends: List_float32 List of timestamps denoting the time at which each caption ends
en_captions: list_str List of english captions describing parts of the video

In [None]:
from datasets import load_dataset

# Load the dataset, trust_remote_code=True is needed to load the dataset from the remote repository.
dataset = load_dataset('dataset-download.py', trust_remote_code=True) 

print(dataset['train'][0])  # First training example
print(dataset['validation'][0])  # First validation example
print(dataset['test'][0])  # First test example

{'video_id': 'v_QOlSCBRmfWY', 'video_path': 'https://www.youtube.com/watch?v=QOlSCBRmfWY', 'duration': 82.7300033569336, 'captions_starts': [0.8299999833106995, 17.3700008392334, 56.2599983215332], 'captions_ends': [19.860000610351562, 60.810001373291016, 79.41999816894531], 'en_captions': ['A young woman is seen standing in a room and leads into her dancing.', ' The girl dances around the room while the camera captures her movements.', ' She continues dancing around the room and ends by laying on the floor.']}
{'video_id': 'v_uqiMw7tQ1Cc', 'video_path': 'https://www.youtube.com/watch?v=uqiMw7tQ1Cc', 'duration': 55.150001525878906, 'captions_starts': [0.2800000011920929, 13.789999961853027], 'captions_ends': [55.150001525878906, 54.31999969482422], 'en_captions': ['A weight lifting tutorial is given.', '  The coach helps the guy in red with the proper body placement and lifting technique.']}
{'video_id': 'v_uqiMw7tQ1Cc', 'video_path': 'https://www.youtube.com/watch?v=uqiMw7tQ1Cc', 'dur

In [None]:
from datasets import load_dataset

# Load the dataset, trust_remote_code=True is needed to load the dataset from the remote repository.
dataset = load_dataset('dataset-download.py', trust_remote_code=True) 

index_number_id = 0 # Index number to use as document ID (0, 1, 2, ...)

# Indexing the training dataset (should we also index the validation and test datasets?)
# We can use the video_id as the document ID or we can use the index of the dataset as the document ID
for doc in dataset['train']:
    resp = client.index(index = index_name, id = index_number_id, body = doc)
    index_number_id = index_number_id + 1
    print(resp['result'])

for doc in dataset['validation']: # previous iteration used a for int instead of a for doc, this new way is simpler
    resp = client.index(index = index_name, id = index_number_id, body = doc)
    index_number_id = index_number_id + 1
    print(resp['result'])

#doc = dataset['train'][0]  # First training example

#doc['video_id'] # Video ID (works like documentID)

#resp = client.index(index=index_name, id=1, body=doc)
#print(resp['result'])


TransportError: TransportError(503, '<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">\n<html><head>\n<title>503 Service Unavailable</title>\n</head><body>\n<h1>Service Unavailable</h1>\n<p>The server is temporarily unable to service your\nrequest due to maintenance downtime or capacity\nproblems. Please try again later.</p>\n</body></html>\n')

In [45]:
doc

{'video_id': 'v_QOlSCBRmfWY',
 'video_path': 'https://www.youtube.com/watch?v=QOlSCBRmfWY',
 'duration': 82.7300033569336,
 'captions_starts': [0.8299999833106995, 17.3700008392334, 56.2599983215332],
 'captions_ends': [19.860000610351562, 60.810001373291016, 79.41999816894531],
 'en_captions': ['A young woman is seen standing in a room and leads into her dancing.',
  ' The girl dances around the room while the camera captures her movements.',
  ' She continues dancing around the room and ends by laying on the floor.']}

In [58]:
# Search for a specific video 
client.index(index=index_name, body=doc)
client.indices.refresh(index=index_name)  # <--- very important if refresh_interval = -1

query = {
  "query": {
    "match": {
      "en_captions": "a lion appears" # Search for a specific caption
    }
  }
}

results = client.search(index=index_name, body=query)
print('\nSearch results:')
for hit in results['hits']['hits']:
    print(hit['_source']['video_id'], "-", hit['_score'])

TransportError: TransportError(503, '<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML 2.0//EN">\n<html><head>\n<title>503 Service Unavailable</title>\n</head><body>\n<h1>Service Unavailable</h1>\n<p>The server is temporarily unable to service your\nrequest due to maintenance downtime or capacity\nproblems. Please try again later.</p>\n</body></html>\n')

In [1]:
from opensearchpy import OpenSearch
import requests
from opensearchpy import helpers

host = 'api.novasearch.org'
port = 443

user = 'user13' 
password = 'rumoao+20' 
index_name = user # We can only have an index with the same name has our user name.

# Create the client with SSL/TLS enabled, but hostname verification disabled.
client = OpenSearch(
    hosts = [{'host': host, 'port': port}],
    http_compress = True, # enables gzip compression for request bodies
    http_auth = (user, password),
    use_ssl = True,
    url_prefix = 'opensearch_v2',
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False
)

index_body = {
   "settings":{
      "index":{
         "number_of_replicas":0,
         "number_of_shards":4,
         "refresh_interval":"-1",
         "knn":"true"
      }
   },
   "mappings":{
       "dynamic":      "strict", # Prevents accidental addition of new fields to the index. This way indexed documents must match the index mapping.
       "properties":{
         "video_id":{
            "type":"keyword"
         },
         "video_path":{
            "type":"text"
         },
         "captions_starts":{
            "type":"float" # OpenSearch treats this as an array of floats automatically
         },
         "captions_ends":{
            "type":"float" # OpenSearch treats this as an array of floats automatically
         },
         "en_captions":{
            "type":"text", # OpenSearch treats this as an array of strings automatically
            "analyzer":"standard",
            "similarity":"BM25"
         },
         
      }
   }
}

doc = {
    "video_id": "v_QOlSCBRmfWY",
    "video_path": "https://www.youtube.com/watch?v=QOlSCBRmfWY",
    "duration": 82.73,
    "captions_starts": [...],
    "captions_ends": [...],
    "en_captions": [...],
    "contents": " ".join(en_captions)  # <--- this is critical
}

chatgpt suggestion

Example use cases:
✅ Text search applications (like yours) – where users input natural language (e.g., "where a woman appears").

✅ Document search engines – indexing entire articles, PDFs, captions, transcripts, etc.

✅ Chatbot context search, Q&A systems, semantic search, etc.

🔍 So when is it useful?
You usually include a contents field when:

You want to allow simple search across all the textual parts of a document.

You want to match keywords, phrases, or semantic concepts.

You’re using match, match_phrase, query_string, or knn_vector queries.

If you have multiple text fields (like title, en_captions, etc.), you can either:

Concatenate them into contents, or

Use a copy_to directive in the mapping:

"en_captions": {
  "type": "text",
  "copy_to": "contents"
}