## Neural DB distributed training

In this notebook, you will be able to train ThirdAI's Neural DB on 10k Amazon datapoints in a distributed fashion.

In [None]:
!pip3 install thirdai --upgrade
!pip3 install "thirdai[neural_db]"  # You may have to unquote this
!pip3 install pyarrow
!pip3 install "ray[all]>=2.7.0"  # You may have to unquote this
!pip3 install torch

In [None]:
from thirdai import neural_db as ndb
import thirdai.distributed_bolt as dist

## Activate your license keys
Email us at contact@thirdai.com for your license key.

In [None]:
from thirdai import licensing
import os

## Please request for a trial license @ https://www.thirdai.com/try-bolt/
# The key is read from the THIRDAI_KEY environment variable when it is set;
# otherwise the empty-string fallback below is used (enter your ThirdAI key there).
# The trailing `;` keeps the call's return value out of the cell output.
licensing.activate(os.environ.get("THIRDAI_KEY", ""));

## Download the training data file


In [None]:
import os
import subprocess

# Local path the training CSV is saved to; the document cell below reads it.
filename = "amazon_10k.csv"

# Fetch the dataset with wget. The arguments are passed as a list, so no shell
# is involved and nothing in the URL needs quoting. `check=True` raises
# CalledProcessError on a non-zero exit status, so a failed download stops the
# notebook here instead of surfacing later as a confusing CSV error.
subprocess.run(
    [
        "wget",
        "-O",
        filename,
        "https://www.dropbox.com/scl/fi/97utx7ukp0rb37f8d98ia/amazon_10k.csv?rlkey=aq8yq42o54tcj62u9q3op0m80&dl=0",
    ],
    check=True,
)

In [None]:
# Any username works here; in the future, this username will let you push
# models to the model bazaar.
db = ndb.NeuralDB("my_user")

# Column configuration for the CSV document, gathered in one place and then
# forwarded to ndb.CSV as keyword arguments.
csv_columns = dict(
    id_column="id",
    strong_columns=["TITLE", "BULLET_POINTS"],
    weak_columns=["DESCRIPTION"],
    reference_columns=["TITLE"],
)
doc = ndb.CSV(filename, **csv_columns)

## Insert the document to be indexed
`train` should be set to `False` for distributed pretraining.

In [None]:
# Add the document to the database without training on it at this point
# (train=False).
db.insert(
    sources=[doc],
    train=False,
)

## Ray Cluster Initialization
For the purpose of this demo, we will initialize a mock Ray cluster of 2 nodes here.

In [None]:
import ray
from ray.train import ScalingConfig, RunConfig

# Split the machine's CPUs evenly between the two mock worker nodes after
# setting one CPU aside (presumably for the trainer, which requests one CPU in
# `trainer_resources` below).
cpus_per_node = (dist.get_num_cpus() - 1) // 2

# Start a local Ray instance (a repeated call is tolerated thanks to
# `ignore_reinit_error`). OMP_NUM_THREADS is passed to the Ray runtime
# environment as a string, as environment variables must be.
ray.init(
    ignore_reinit_error=True,
    runtime_env={"env_vars": {"OMP_NUM_THREADS": str(cpus_per_node)}},
)

# Two CPU-only workers, each requesting `cpus_per_node` CPUs, plus one CPU for
# the trainer itself.
scaling_config = ScalingConfig(
    num_workers=2,
    use_gpu=False,
    trainer_resources={"CPU": 1},
    resources_per_worker={"CPU": cpus_per_node},
    placement_strategy="PACK",
)

# We need to specify `storage_path` in `RunConfig` which must be a networked file system
# or cloud storage path accessible by all workers. (Ray 2.7.0 onwards)
# For the purpose of this demo, a directory under the home folder works fine
# since both workers run on the same machine. `~` is expanded explicitly so an
# absolute path is handed over instead of relying on downstream code to
# interpret the tilde.
run_config = RunConfig(
    name="NeuralDB_ray_storage",
    storage_path=os.path.expanduser("~/ray_results"),
)

## Run distributed training on the document

In [None]:
# Launch distributed pretraining on the document with the scaling and run
# configuration built above: 15 epochs, reporting the "loss" metric.
db.pretrain_distributed(
    documents=[doc],
    scaling_config=scaling_config,
    run_config=run_config,
    epochs=15,
    metrics=["loss"],
)

### Let's ask!

In [None]:
# Query the database with a free-text product description, requesting top_k=3
# results.
search_results = db.search(
    query="Macbook pro 13 inches laptop cover",
    top_k=3,
)

In [None]:
# Print the text of every search result, each followed by a separator line.
for hit in search_results:
    print(hit.text, "**************", sep="\n")

## Terminate the ray cluster

In [None]:
# Shut down the Ray instance that was started with ray.init earlier in this notebook.
ray.shutdown()