## Install and import packages

In [None]:
!pip install opensearch-py
!pip install sentence-transformers
!pip install pandas

In [None]:
import json
import pandas as pd
import time
import math
from opensearchpy import OpenSearch
from opensearchpy import helpers
from sentence_transformers import SentenceTransformer
# Sentence-embedding model used further down to vectorize both the video
# documents and the user's search query. The index mapping created later
# declares the `vector` field with dimension 384, so the embeddings produced
# by this model must have that size.
transformer_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

## Connect to OpenSearch

In [None]:
# Connection variables
# NOTE(review): these are hard-coded demo credentials for a local node;
# load them from the environment before pointing this at a shared cluster.
host = 'localhost'
port = '9200'
auth = ('admin', 'admin')

# Connect
client = OpenSearch(
    # Generous timeout (seconds) so that large requests do not time out
    timeout = 300,
    hosts = [{'host': host, 'port': port}],
    http_compress = True,
    http_auth = auth,
    use_ssl = False,
    # Fixed typo: the keyword is `verify_certs`; the misspelled `verify_cers`
    # was not the documented option and therefore never took effect.
    verify_certs = False,
)

# Last expression of the cell: shows whether the cluster answered the ping
client.ping()

## Read JSON file and add vector column

In [None]:
# Read the cleansed JSON file into a pandas dataframe.
# The context manager guarantees the file handle is closed, and the explicit
# UTF-8 encoding keeps non-ASCII text intact regardless of the platform's
# default locale encoding.
with open('../../data/cleansing/data.json', encoding='utf-8') as file:
    data = json.load(file)

df = pd.DataFrame(data)
df

In [None]:
# Work on a copy of the loaded dataframe and give it an (initially empty)
# `vector` column that a later cell fills with the embeddings.
df_bk = df.copy().assign(vector="")
df_bk

In [None]:
# Create the vector values: one embedding per row, computed from the title,
# the description and every tag joined into a single space-separated string.
vectors = []

for title, description, tags in zip(df_bk['title'], df_bk['description'], df_bk['tags']):
    # Create a single string with all the text columns included
    bundle = ' '.join([title, description, *tags])

    # Transform the single string into vector
    vectors.append(transformer_model.encode(bundle))

# Assign the whole column at once. The previous per-cell chained assignment
# (`df_bk['vector'][index] = ...`) triggers SettingWithCopyWarning and does
# not write back to the dataframe when pandas Copy-on-Write is active.
df_bk['vector'] = vectors

df_bk

## Create OpenSearch index

In [None]:
# Name of the index that stores the video documents
index_name = 'videos'

# k-NN algorithm parameters.
# Reference: https://opensearch.org/docs/latest/search-plugins/knn/knn-index#method-definitions
# (the original note listed 512 as the default for ef_search -- verify for your version)
knn_algo_params = {
    # Higher values lead to more accurate but slower searches.
    'ef_search': 256,
    # Used during graph creation
    'ef_construction': 256,
    # Bidirectional links for each element
    'm': 4,
}

index_settings = {
    'index': {
        'number_of_shards': 20,
        'number_of_replicas': 1,
        'knn': {'algo_param': knn_algo_params},
    },
    'knn': 'true',
}

# Plain text fields; `tags` holds an array of strings, which the text type
# can store as well.
text_fields = ('url', 'thumbnail', 'title', 'description', 'tags')

index_mappings = {
    'properties': {
        **{field: {'type': 'text'} for field in text_fields},
        # Embedding of the bundled text, searched with k-NN
        'vector': {'type': 'knn_vector', 'dimension': 384},
    }
}

index_body = {
    'settings': index_settings,
    'mappings': index_mappings,
}

# Start from a clean slate: drop the index when it already exists
if client.indices.exists(index=index_name):
    client.indices.delete(index=index_name)

# Create the index
reply = client.indices.create(index=index_name, body=index_body)
print(reply)

In [None]:
# Build the list of bulk actions so that multiple documents can be indexed
# in one request. Every action names the target index and carries the
# document fields copied from the dataframe row.
document_fields = ('url', 'thumbnail', 'title', 'description', 'tags', 'vector')

data = [
    {'_index': index_name, **{field: df_bk[field][row] for field in document_fields}}
    for row in df_bk.index
]

In [None]:
# Send all prepared actions to OpenSearch through the bulk helper,
# allowing up to 5 retries; the helper's result is kept in `reply`.
reply = helpers.bulk(
    client=client,
    actions=data,
    max_retries=5,
)

In [None]:
# Ask for a free-text query and embed it with the same model that was used
# for the indexed documents.
query = input('Enter your query: ')
query_vector = transformer_model.encode(query)

# k-NN clause against the `vector` field, asking for the 24 nearest neighbours
knn_clause = {
    'knn': {
        'vector': {
            'vector': query_vector,
            'k': 24,
        }
    }
}

open_search_query = {
    'size': 24,
    # Fields that will be sent back in the response
    '_source': ['url', 'thumbnail', 'title', 'tags'],
    # Filter
    'query': {
        'bool': {
            'must': [knn_clause],
        }
    },
}

response = client.search(
    index=index_name,
    size=24,
    body=open_search_query,
    request_timeout=64,
)

# Keep only the stored document fields of every hit
videos = []
for hit in response['hits']['hits']:
    videos.append(hit['_source'])
videos

In [None]:
# Test query that fetches random videos.
# From: https://stackoverflow.com/questions/25887850/random-document-in-elasticsearch

# Seed derived from the current time (rounded up to a whole second)
random_seed = math.ceil(time.time())

random_score_function = {
    'random_score': {
        'seed': random_seed,
    }
}

open_search_random_query = {
    'size': 24,
    # Fields that will be sent back in the response
    '_source': ['url', 'thumbnail', 'title', 'tags'],
    'query': {
        'function_score': {
            'functions': [random_score_function],
        }
    },
}

response = client.search(
    index=index_name,
    body=open_search_random_query,
    request_timeout=64,
)

# Keep only the stored document fields of every hit
videos = []
for hit in response['hits']['hits']:
    videos.append(hit['_source'])
videos