[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/RelevanceAI/workflows/blob/main/workflows/cluster/Cluster_Your_Data_with_Relevance_AI.ipynb)

# Instructions

1. Paste the token that was copied to your clipboard from the 'Cluster' Workflow dashboard into the form below.
2. Click the ▶️  on the left or go to "Runtime" -> "Run All" and click "Run anyway" on the warning that pops up.
3. You should see a progress bar underneath the form. Keep this window open and active until the progress bar is complete; otherwise the run will terminate.

Note: 
- For the fastest clustering speed, go to "Runtime" -> "Change runtime type" and set "Hardware accelerator" to "GPU".

In [None]:
#@title Paste token below and press ▶️  button to the left of this title { display-mode: "form" }
# %tb

token = "" #@param {type:"string"}

show_warnings_in_logs = False #@param {type:"boolean"}
#@markdown Once the form is filled and you've clicked run, monitor below for logs of it running

import base64
import json
import warnings

# Honour the form checkbox: warnings are only silenced when the user has not
# asked to see them (the default, unchecked, keeps them hidden).
if not show_warnings_in_logs:
    warnings.filterwarnings('ignore')

# Fail early with an actionable message instead of an opaque JSON decode
# error when the form was run without a token.
if not token.strip():
    raise ValueError(
        "No token provided: paste the token from the 'Cluster' Workflow "
        "dashboard into the form and run again."
    )

# The token is base64-encoded JSON. Extra "=" padding is appended so a token
# whose trailing padding was stripped still decodes; the decoder ignores any
# surplus padding.
config = json.loads(base64.b64decode(token + "==="))

import subprocess

def install_package(package):
    """Install ``package`` with pip in a child process.

    pip is invoked as ``<current interpreter> -m pip`` so the package lands in
    the environment this code is running in, rather than whichever ``pip``
    happens to be first on PATH. pip's output is captured instead of being
    streamed; if pip exits with a non-zero status, its stderr is printed so
    the failure is visible.

    Args:
        package: Requirement specifier passed to ``pip install`` unchanged
            (e.g. ``"name"`` or ``"name==1.2.3"``).

    Returns:
        None.
    """
    import sys  # local import: only this helper needs the interpreter path

    result = subprocess.run(
        [sys.executable, '-m', 'pip', 'install', package],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    if result.returncode != 0:
        # Best-effort helper: report the failure rather than raising.
        print(f"Failed to install {package} (pip exit code {result.returncode}):")
        print(result.stderr.decode(errors='replace'))
    return
print(config)

!pip install -q -U RelevanceAI==2.1.4
print("Installing RelevanceAI")

import contextlib

class DevNull:
    """Sink object whose ``write`` discards whatever it is given."""

    def write(self, msg):
        """Accept ``msg`` and drop it; always returns ``None``."""
        return None

# --- Create the Relevance AI client from the credential in the config ---
from relevanceai import Client

client = Client(token=config['authorizationToken'])

# --- Reject any requested field whose name lacks the '_vector_' marker ---
for v in config['vector_fields']:
    if '_vector_' in v:
        continue
    raise ValueError(f"'{v}' is not a valid vector field")


df = client.Dataset(config['dataset_id'])
try:
  if config['clusteringType'] == 'community-detection':
    cluster_method  = "community_detection"
    !pip install -q sentence-transformers==2.2.0
    df.cluster(
        cluster_method,
        cluster_config={"threshold": config['cutoff']},
        vector_fields=config['vector_fields']
    )
  elif config['clusteringType'] == 'kmeans':
    if df.shape[0] < 3000:
      df.cluster(
          config['clusteringType'],
          cluster_config={"n_clusters": config['n_clusters']},
          vector_fields=config['vector_fields']
      )
    else:
      cluster_method = 'minibatchkmeans'
      df.cluster(
          cluster_method,
          cluster_config={"n_clusters": config['n_clusters']},
          vector_fields=config['vector_fields']
      )
  else:
    cluster_method = "kmeans"
    df.cluster(
        cluster_method,
        cluster_config={"n_clusters": config['n_clusters']},
        vector_fields=config['vector_fields']
    )
except Exception as e:
  raise ValueError('Incorrect token provided')

print("Finished clustering your data, you may close this window.")
