<img src="https://relevance.ai/wp-content/uploads/2021/11/logo.79f303e-1.svg" width="150" alt="Relevance AI" />
<h5> Developer-first vector platform for ML teams </h5>

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/RelevanceAI/workflows/blob/main/workflows/vectorize/Vectorize_Your_Data_with_Relevance_AI_form.ipynb)


In [None]:
#@title After filling this form, press the top left button.
# You can grab your token here https://cloud.relevance.ai/sdk/api
# Every form value below is a string, so each field is declared as a string field.

token = "<copy paste from https://cloud.relevance.ai/sdk/api>"  #@param  {type:"string"}
dataset_id = "<your dataset ID here>"  #@param {type:"string"}
encode_type = "<choose from text|image_urls>"  #@param {type:"string"}
model_id = "<choose from mpnet|multiqampnet|bit|clip or sentence-transformers/<model_url> from https://huggingface.co/sentence-transformers>"  #@param {type:"string"}
fields =  "<your fields to vectorize here eg. product_title, product_description>"    #@param {type:"string"}

# The token is read as "<project>:<api_key>:<region>" (extra trailing parts are
# ignored). Fail early with a readable message instead of an IndexError when
# the token does not have at least those three parts.
_token_parts = token.split(':')
if len(_token_parts) < 3:
    raise ValueError(
        'Token must have the form <project>:<api_key>:<region>. '
        'Copy it from https://cloud.relevance.ai/sdk/api'
    )
project, api_key, region = _token_parts[:3]


def strip_empty_string(list):
    """Return the entries of ``list`` stripped of whitespace, dropping blank ones.

    Args:
        list: Iterable of strings, e.g. the pieces of a comma-separated value.
            The name shadows the builtin; it is kept so existing callers that
            pass it by keyword keep working.

    Returns:
        A new list with the stripped, non-empty strings in their original order.
    """
    # Strip before testing for emptiness so whitespace-only entries
    # (such as the middle piece of "a, ,b") are dropped rather than kept as ''.
    stripped = (string.strip() for string in list)
    return [string for string in stripped if string]

# Turn the comma-separated form value into a clean list of field names.
fields = strip_empty_string(fields.split(','))

### Field Validation ###

if encode_type not in ['text', 'image_urls']:
    print(f'Encode type {encode_type} is not supported. Choose from text or image_urls.')

# A model ID is acceptable when it is one of the built-in short names OR a
# sentence-transformers/<model> path, so only warn when it is neither.
if (model_id not in ['mpnet', 'multiqampnet', 'bit', 'clip']) and ('sentence-transformers/' not in model_id):
    print(f'Model ID {model_id} is not supported. Choose from mpnet, multiqampnet, bit, clip or sentence-transformers/<model_url> from https://huggingface.co/sentence-transformers.')

!pip install -q RelevanceAI==2.3.2

from relevanceai import Client 
client = Client(token=token)

ds = client.Dataset(dataset_id)


if (fields not in ['mpnet', 'multiqampnet', 'bit', 'clip']) or ('sentence-transformers/' not in model_id):
    print(f'Model ID {model_id} is not supported. Choose from mpnet, multiqampnet, bit, clip or sentence-transformers/<model_url> from https://huggingface.co/sentence-transformers.')

# Everything the vectorization steps below need, gathered in one place.
# project, api_key and region were already parsed from the token earlier in
# this file, so they are not parsed a second time here.
config = {
    "dataset_id": dataset_id,
    "model_id": model_id,
    "encode_type": encode_type,
    "fields": fields,
    "region": region,
    "project": project,
    "api_key": api_key,
    "authorizationToken": token,
}

show_warnings_in_logs = False #@param {type:"boolean"}
#@markdown Once the form is filled and you've clicked run, monitor below for logs of it running

import base64
import json
import warnings
# Ignore every Python warning from this point on (process-wide filter).
warnings.filterwarnings('ignore')

# NOTE(review): RelevanceAI is already installed and imported earlier in this
# file by the time this message prints — confirm the message is still wanted.
print("Installing RelevanceAI ...")

# Used by install_package below to run pip.
import subprocess

def install_package(package):
    """Install ``package`` with pip and raise if the installation fails.

    Args:
        package: A pip requirement specifier, e.g. ``"vectorhub[clip]"``.

    Raises:
        ValueError: If pip cannot be started, or if it exits with a non-zero
            status. In the latter case the message carries pip's captured
            stdout and stderr.
    """
    # Local import: this function is called before the file-level
    # ``import sys`` further down has been executed.
    import sys

    # Prefer running pip through the current interpreter so the package is
    # installed into the environment this code is actually running in.
    pip_command = [sys.executable, '-m', 'pip'] if sys.executable else ['pip']
    try:
        process = subprocess.Popen(pip_command + ['install', package],
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE)
        stdout, stderr = process.communicate()
    except OSError as error:
        # pip could not even be launched; there is no captured output to show.
        raise ValueError(f'Error installing {package}. {error}') from error

    # A finished pip process is not necessarily a successful one.
    if process.returncode != 0:
        out_text = stdout.decode(errors='replace')
        err_text = stderr.decode(errors='replace')
        raise ValueError(f'Error installing {package}. {out_text} {err_text}')

# Progress message printed before the model packages are installed further below.
print("Installing machine learning models and dependencies to vectorize data. Takes ~2mins.")

# Used later in this file for contextlib.redirect_stdout.
import contextlib

class DevNull:
    """Write-only sink: whatever is passed to ``write`` is discarded."""

    def write(self, msg):
        """Accept ``msg`` and do nothing with it."""
        return None

def _named_encoder_class(base_cls):
    """Return a subclass of ``base_cls`` whose instances report the configured model ID as ``__name__``."""
    class Model(base_cls):
        @property
        def __name__(self):
            return config['model_id']
    return Model


# Model names handed to SentenceTransformer2Vec for each supported short model ID.
_SENTENCE_TRANSFORMER_MODELS = {
    'mpnet': "all-mpnet-base-v2",
    'multiqampnet': "multi-qa-mpnet-base-dot-v1",
}

# Install the dependencies for the selected model, then build the encoder `enc`.
# Each branch installs first and imports second, because the import only works
# once the matching vectorhub extra is present.
try:
    _model_id = config['model_id']
    _encode_type = config['encode_type'].lower()

    if _model_id == 'clip' and _encode_type in ('image_urls', 'text'):
        install_package("vectorhub[clip]")
        from vectorhub.bi_encoders.text_image.torch import Clip2Vec
        Model = _named_encoder_class(Clip2Vec)
        enc = Model()
        # CLIP encodes both modalities; expose the one matching the data as `encode`.
        enc.encode = enc.encode_image if _encode_type == 'image_urls' else enc.encode_text

    elif _model_id in _SENTENCE_TRANSFORMER_MODELS:
        install_package("vectorhub[encoders-text-sentence-transformers]")
        from vectorhub.encoders.text.sentence_transformers import SentenceTransformer2Vec
        Model = _named_encoder_class(SentenceTransformer2Vec)
        enc = Model(_SENTENCE_TRANSFORMER_MODELS[_model_id])

    elif _model_id == 'bit':
        install_package("vectorhub[encoders-image-tfhub]")
        from vectorhub.encoders.image.tfhub import BitMedium2Vec
        Model = _named_encoder_class(BitMedium2Vec)
        enc = Model()

    else:
        # Without this branch `enc` would stay undefined and the run would only
        # fail later, inside encode_documents, with an unrelated-looking error.
        raise ValueError(
            f"No encoder is available for model_id={config['model_id']!r} "
            f"with encode_type={config['encode_type']!r}."
        )

except Exception as e:
    print(e)
    # Show the configuration for debugging, but never echo credentials back
    # into the notebook output.
    _safe_config = {
        key: ('<redacted>' if key in ('api_key', 'authorizationToken') else value)
        for key, value in config.items()
    }
    # Nothing in the block above uses the token, so report what actually
    # failed: installing or loading the encoder model.
    raise ValueError(
        f'Failed to set up the encoder model. {json.dumps(_safe_config, indent=2)}'
    ) from e

print("Finished installing machine learning models and dependencies to vectorize data.")
# enc.__name__ = config['model_id']

import os
import sys
import warnings

# Discard everything written to stderr unless the user asked to see warnings
# in the logs. The handle is deliberately left open: it backs sys.stderr for
# the rest of the run.
if not show_warnings_in_logs:
    f = open(os.devnull, 'w')
    sys.stderr = f

def encode_documents(docs):
    """Vectorize the configured fields of ``docs`` with the loaded encoder.

    Args:
        docs: The documents to encode. This function is passed as the update
            callback to ``client.pull_update_push``, which supplies them.

    Returns:
        Whatever ``enc.encode_documents(config['fields'], docs)`` returns.

    Raises:
        Exception: If encoding fails. The original error is chained as the
            cause so the real reason stays visible in the traceback.
    """
    try:
        if show_warnings_in_logs:
            # The user asked to see the encoder's output: run it untouched.
            return enc.encode_documents(config['fields'], docs)
        # Quiet mode: hide anything the encoder prints and any warnings it
        # raises. A real file sink is used (rather than None) so code that
        # calls sys.stdout.write directly keeps working.
        with open(os.devnull, 'w') as sink, \
                contextlib.redirect_stdout(sink), \
                warnings.catch_warnings():
            warnings.simplefilter("ignore")
            return enc.encode_documents(config['fields'], docs)
    except Exception as error:
        # `except Exception` (not a bare except) lets Ctrl-C / interrupts through.
        raise Exception("===TRY RESTARTING COLAB NOTEBOOK! Click 'Runtime' > 'Restart runtime'===") from error

print("Starting to vectorize your data.")
# Make `client.logger.warn` an alias of `client.logger.warning`
# (presumably the SDK still calls the older `warn` name — verify).
client.logger.warn = client.logger.warning

# Pull documents in chunks of 100, restricted by one 'exists' filter per
# configured field, run them through encode_documents and push the result back.
client.pull_update_push(
    config['dataset_id'],
    encode_documents,
    show_progress_bar=True,
    filters=[
        {
            'field': field_name,
            'filter_type': 'exists',
            "condition": "==",
            "condition_value": "",
        }
        for field_name in config['fields']
    ],
    select_fields=config['fields'],
    retrieve_chunk_size=100,
)

print("Finished vectorizing your data with, you may close this window.")