In [None]:
import os

# RelevanceAI connection settings; all three are left blank in this notebook.
api_key = project = region = ""
# Take the token from TEST_ACTIVATION_TOKEN when that variable is set to a
# non-empty value; otherwise fall back to an empty string.
authorizationToken = os.getenv('TEST_ACTIVATION_TOKEN') or ""

# Dataset to vectorize, the kind of data in it, the model to use and the
# document fields that get encoded.
dataset_id = "dummy-ecommerce-clean"
encode_type = "text"
model_id = "sentence-transformers/all-mpnet-base-v2"
fields = ["product_title"]


In [None]:
# Form field (`#@param`): the workflow token that is passed to
# decode_workflow_token further down to obtain the authorization token.
token = "" #@param {type:"string"}

# Form field (`#@param`): consulted by encode_documents further down when it
# decides whether to wrap the encoder call in output/warning suppression.
show_warnings_in_logs = False #@param {type:"boolean"}

import base64
import json
import warnings
# Ignore every Python warning raised from this point on.
warnings.filterwarnings('ignore')
print("Installing RelevanceAI ...")

import subprocess

def install_package(package):
    """Install or upgrade ``package`` by running ``pip install -U``.

    pip's output is captured rather than streamed, so nothing is printed on
    success.

    Args:
        package: Requirement specifier handed to pip; extras are allowed,
            e.g. ``"vectorhub[clip]"``.

    Raises:
        ValueError: If pip cannot be started or exits with a non-zero status.
    """
    try:
        result = subprocess.run(
            ['pip', 'install', '-U', package],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
    except OSError as err:
        # pip could not be launched at all (e.g. it is not on PATH), so there
        # is no captured output to report -- surface the launch error itself.
        raise ValueError(f'Error installing {package}. {err}') from err

    if result.returncode != 0:
        # A failed install must not pass silently: the import that follows
        # would otherwise fail with a far less helpful message.
        details = result.stderr.decode(errors='replace').strip()
        raise ValueError(f'Error installing {package}. {details}')

# IPython shell escape (not plain Python): quietly (-q) installs the pinned
# RelevanceAI release that the imports further down rely on.
!pip install -q RelevanceAI==2.3.4

print("Installing machine learning models and dependencies to vectorize data. Takes ~2mins.")

import contextlib

class DevNull:
    """Write-only sink: whatever is passed to ``write`` is thrown away."""

    def write(self, msg):
        """Accept ``msg`` and discard it; always returns ``None``."""
        return None

# Imported only now because the package is installed by the `!pip` line above.
from relevanceai import Client 
from relevanceai.utils import decode_workflow_token
# Decode the form's workflow token; the result is indexed like a mapping and
# must provide an 'authorizationToken' entry (other keys -- TODO confirm).
config = decode_workflow_token(token)
# SDK client used for every dataset call below.
client = Client(token=config['authorizationToken'])

# vectorhub shorthand ids mapped to the sentence-transformers weights they load.
_SENTENCE_TRANSFORMER_WEIGHTS = {
    'mpnet': "all-mpnet-base-v2",
    'multiqampnet': "multi-qa-mpnet-base-dot-v1",
}


def _build_encoder(name, kind):
    """Install and instantiate the vectorhub encoder selected by ``name``.

    Args:
        name: Shorthand model id: 'clip', 'mpnet', 'multiqampnet' or 'bit'.
        kind: Lower-cased encode type; for 'clip' it selects the image
            ('image_urls') or text ('text') encoding method.

    Returns:
        An encoder instance whose ``__name__`` property reports ``name``.

    Raises:
        ValueError: If the combination of ``name`` and ``kind`` is not
            supported, or if installing the required package fails.
    """
    # Each branch installs its extra first; the import only works afterwards.
    if name == 'clip' and kind in ('image_urls', 'text'):
        install_package("vectorhub[clip]")
        from vectorhub.bi_encoders.text_image.torch import Clip2Vec
        base_cls, init_args = Clip2Vec, ()
    elif name in _SENTENCE_TRANSFORMER_WEIGHTS:
        install_package("vectorhub[encoders-text-sentence-transformers]")
        from vectorhub.encoders.text.sentence_transformers import SentenceTransformer2Vec
        base_cls, init_args = SentenceTransformer2Vec, (_SENTENCE_TRANSFORMER_WEIGHTS[name],)
    elif name == 'bit':
        install_package("vectorhub[encoders-image-tfhub]")
        from vectorhub.encoders.image.tfhub import BitMedium2Vec
        base_cls, init_args = BitMedium2Vec, ()
    else:
        # Without this, an unknown id left no encoder defined and the failure
        # only surfaced later as an unrelated "restart the notebook" error.
        raise ValueError(f"Unsupported model_id {name!r} for encode_type {kind!r}")

    class Model(base_cls):
        # Report the workflow's model id as the encoder name.
        @property
        def __name__(self):
            return name

    encoder = Model(*init_args)
    if name == 'clip':
        # Clip handles both modalities; route `encode` to the requested one.
        encoder.encode = encoder.encode_image if kind == 'image_urls' else encoder.encode_text
    return encoder


# Normalised once so both branches below compare the encode type the same way.
encode_kind = encode_type.lower()

if 'sentence-transformers/' not in model_id:
    # Shorthand ids are encoded locally with vectorhub models.
    try:
        enc = _build_encoder(model_id, encode_kind)
    except Exception as e:
        print(e)
        # Report what actually failed; the authorization token is deliberately
        # kept out of the message.
        raise ValueError(
            f"Could not load model {model_id!r} for encode_type {encode_type!r}: {e}"
        ) from e

    print("Finished installing machine learning models and dependencies to vectorize data.")

    import os
    import warnings

    def encode_documents(docs):
        """Add vectors for ``fields`` to ``docs`` using the loaded encoder.

        Unless ``show_warnings_in_logs`` is set, stdout and Python warnings
        are silenced for the duration of the encoder call.

        Args:
            docs: The chunk of documents handed over by ``pull_update_push``.

        Returns:
            Whatever ``enc.encode_documents(fields, docs)`` returns.

        Raises:
            Exception: If encoding fails; the original error is chained as
                the cause.
        """
        try:
            if show_warnings_in_logs:
                return enc.encode_documents(fields, docs)
            # Redirect to a real sink rather than None so that library code
            # calling sys.stdout.write()/flush() keeps working.
            with open(os.devnull, 'w') as sink, \
                    contextlib.redirect_stdout(sink), \
                    warnings.catch_warnings():
                warnings.simplefilter("ignore")
                return enc.encode_documents(fields, docs)
        except Exception as err:
            raise Exception("===TRY RESTARTING COLAB NOTEBOOK! Click 'Runtime' > 'Restart runtime'===") from err

    print("Starting to vectorize your data.")
    # Workaround kept from the original: alias `warn` to `warning` on the
    # client's logger (presumably the SDK still calls `logger.warn`).
    client.logger.warn = client.logger.warning
    # stderr is silenced only while vectorizing, then restored and the sink
    # closed, instead of being replaced for the rest of the session.
    with open(os.devnull, 'w') as stderr_sink, contextlib.redirect_stderr(stderr_sink):
        client.pull_update_push(
            dataset_id,
            encode_documents,
            show_progress_bar=True,
            # One 'exists' filter per field that is going to be encoded.
            filters=[
                {
                    'field': field,
                    'filter_type': 'exists',
                    "condition": "==",
                    "condition_value": "",
                }
                for field in fields
            ],
            select_fields=fields,
            retrieve_chunk_size=100,
        )
else:
    # sentence-transformers ids are handed to the SDK's own vectorizers.
    print(f"Encoding with {model_id} ... ")
    ds = client.Dataset(dataset_id)
    if encode_kind == 'text':
        # Pass the configured model id itself, not the literal string 'model_id'.
        ds.vectorize_text(fields=fields, models=[model_id])
    elif encode_kind == 'image':
        ds.vectorize_image(fields=fields, models=[model_id])
    else:
        # Fail loudly instead of reporting success without encoding anything.
        raise ValueError(f"Unsupported encode_type {encode_type!r} for model {model_id!r}")

print("Finished vectorizing your data with, you may close this window.")