[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/RelevanceAI/workflows/blob/main/workflows/cluster/Cluster_Your_Data_with_Relevance_AI.ipynb)

# Instructions

1. Paste the token that was copied to your clipboard from the 'Subcluster' Workflow dashboard.
2. Click the ▶️  on the left or go to "Runtime" -> "Run All" and click "Run anyway" on the warning that pops up.
3. You should see a progress bar underneath the form. Keep this window open and active until the progress bar is complete; otherwise the run will terminate.

Note: 
- For the fastest clustering speed, go to "Runtime" -> "Change runtime type" and set "Hardware accelerator" to "GPU".

In [None]:
#@title Paste token below and press ▶️  button to the left of this title { display-mode: "form" }
# %tb

# config = {
#   "dataset_id": "advanced_search_example",
#   "n_clusters": 10,
#   "vector_fields": [
#     "product_title_clip_vector_"
#   ],
#   "cutoff": 0.75,
#   "clusteringType": "community-detection",
#   "region": "ap-southeast-2",
#   "project": "xxx",
#   "api_key": "xxx",
#   "authorizationToken": "xxx:xxx:ap-southeast-2:xxx"
# }

import base64
import json

token = "" #@param {type:"string"}

show_warnings_in_logs = False #@param {type:"boolean"}
#@markdown Once the form is filled and you've clicked run, monitor below for logs of it running

config = json.loads(base64.b64decode(token + "==="))

# print(json.dumps(config, indent=2))

print("Installing RelevanceAI")

!pip install -q -U RelevanceAI==3.0.4
## Instantiate client ###
from relevanceai import Client 
client = Client(token=config['authorizationToken'])

# A single vector field may be supplied as a bare string; wrap it in a list so
# the rest of the cell can always treat `vector_fields` as a list of names.
vector_fields = config['vector_fields']
vector_fields = [vector_fields] if isinstance(vector_fields, str) else vector_fields

## Checking valid vector field ###
# Every field name must contain the "_vector_" marker; stop at the first one
# that does not and report it.
invalid_field = next(
    (field_name for field_name in vector_fields if '_vector_' not in field_name),
    None,
)
if invalid_field is not None:
    raise ValueError(f"'{invalid_field}' is not a valid vector field")


df = client.Dataset(config['dataset_id'])
cluster_method = "kmeans" # setting to default
n_clusters = int(config.get('n_clusters', 25))
try:
  if config['clusteringType'] == 'community-detection':
    cluster_method  = "community_detection"
    !pip install -q sentence-transformers==2.2.0
    df.cluster(
        model=cluster_method,
        model_kwargs={"threshold": config['cutoff']},
        vector_fields=config['vector_fields']
    )
  elif config['clusteringType'] == 'kmeans':
    if df.shape[0] < 10000:
      from sklearn.cluster import KMeans
      cluster_method = 'kmeans'
      model = KMeans(n_clusters=n_clusters, random_state=42)
      alias = f"{cluster_method}-{n_clusters}"
      df.cluster(
          model=model,
          model_kwargs={"n_clusters": n_clusters},
          vector_fields=config['vector_fields'],
          alias=alias
      )
    else:
      cluster_method = 'minibatchkmeans'
      from sklearn.cluster import MiniBatchKMeans
      cluster_method = 'kmeans'
      model = MiniBatchKMeans(n_clusters=n_clusters,random_state=42)
      alias = f"{cluster_method}-{n_clusters}"

      # Ensure chunksize is larger than number of clusters or else algorithm error
      chunksize = n_clusters + 100
      df.batch_cluster(
          model=model,
          model_kwargs={"n_clusters": n_clusters},
          vector_fields=config['vector_fields'],
          alias=alias,
          chunksize=chunksize
      )
  elif config['clusteringType'] == 'kmedoids':
    !pip install -q scikit-learn-extra==0.2.0
    from sklearn_extra.cluster import KMedoids
    cluster_method = "kmedoids"
    model = KMedoids(n_clusters=n_clusters, random_state=42, init="k-medoids++")
    alias = f"{cluster_method}-{n_clusters}"
    df.cluster(
        model=model,
        model_kwargs={"n_clusters": n_clusters},
        vector_fields=config['vector_fields'],
        alias=alias
    )
  else:
    cluster_method = "kmeans"
    df.cluster(
        model=cluster_method,
        model_kwargs={"n_clusters": n_clusters, "random_state": 42},
        vector_fields=config['vector_fields'],
    )
except Exception as e:
  raise ValueError('Incorrect token provided')

if cluster_method == "community-detection":
    print(f"Finished clustering your data with {cluster_method} with cutoff={config['cutoff']}, you may close this window.")
elif cluster_method == "kmeans":
    print(f"Finished clustering your data with {cluster_method} with n_clusters={n_clusters}, you may close this window.")
