[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/RelevanceAI/workflows/blob/main/workflows/cluster/Cluster_Your_Data_with_Relevance_AI.ipynb)

# Instructions

1. Paste the token that was copied to your clipboard from the 'Subcluster' Workflow dashboard.
2. Click the ▶️  on the left or go to "Runtime" -> "Run All" and click "Run anyway" on the warning that pops up.
3. You should see a progress bar underneath the form. Keep this window open and active until the progress bar is complete; otherwise the run will terminate.

Note: 
- For the fastest clustering speed, go to "Runtime" -> "Change runtime type" and set "Hardware accelerator" to "GPU".

In [None]:
#@title Paste token below and press ▶️  button to the left of this title { display-mode: "form" }
# %tb

# config = {
#   "dataset_id": "advanced_search_example",
#   "n_clusters": 10,
#   "vector_fields": [
#     "product_title_clip_vector_"
#   ],
#   "cutoff": 0.75,
#   "clusteringType": "community-detection",
#   "region": "ap-southeast-2",
#   "project": "xxx",
#   "api_key": "xxx",
#   "authorizationToken": "xxx:xxx:ap-southeast-2:xxx"
# }

import base64
import json

token = "" #@param {type:"string"}

show_warnings_in_logs = False #@param {type:"boolean"}
#@markdown Once the form is filled and you've clicked run, monitor below for logs of it running

config = json.loads(base64.b64decode(token + "==="))

# print(json.dumps(config, indent=2))

print("Installing RelevanceAI")

!pip install -q RelevanceAI==2.1.8
## Instantiate client ###
from relevanceai import Client 
client = Client(token=config['authorizationToken'])

## Checking valid vector field ###
# Every requested field name must contain '_vector_'; abort on the first one
# that does not, before any clustering work is started.
for v in config['vector_fields']:
  if '_vector_' not in v:
    raise ValueError(f"'{v}' is not a valid vector field")


df = client.Dataset(config['dataset_id'])
try:
  if config['clusteringType'] == 'community-detection':
    cluster_method  = "community_detection"
    !pip install -q sentence-transformers==2.2.0
    df.cluster(
        cluster_method,
        cluster_config={"threshold": config['cutoff']},
        vector_fields=config['vector_fields']
    )
  elif config['clusteringType'] == 'kmeans':
    if df.shape[0] < 3000:
      df.cluster(
          config['clusteringType'],
          cluster_config={"n_clusters": config['n_clusters']},
          vector_fields=config['vector_fields']
      )
    else:
      cluster_method = 'minibatchkmeans'
      df.cluster(
          cluster_method,
          cluster_config={"n_clusters": config['n_clusters']},
          vector_fields=config['vector_fields']
      )
  else:
    cluster_method = "kmeans"
    df.cluster(
        cluster_method,
        cluster_config={"n_clusters": config['n_clusters']},
        vector_fields=config['vector_fields']
    )
except Exception as e:
  raise ValueError('Incorrect token provided')

if cluster_method == "community-detection":
    print(f"Finished subclustering your data with {cluster_method} with cutoff={config['cutoff']}, you may close this window.")
elif cluster_method == "kmeans":
    print(f"Finished subclustering your data with {cluster_method} with n_clusters={config['n_clusters']}, you may close this window.")
