diff --git a/docs/hr/content/docs/reusable_snippets/choose_input_key_from_listener_outputs.ipynb b/docs/hr/content/docs/reusable_snippets/choose_input_key_from_listener_outputs.ipynb index c4447a066a..d323c355f7 100644 --- a/docs/hr/content/docs/reusable_snippets/choose_input_key_from_listener_outputs.ipynb +++ b/docs/hr/content/docs/reusable_snippets/choose_input_key_from_listener_outputs.ipynb @@ -37,7 +37,7 @@ "source": [ "# \n", "input_key = listener.outputs\n", - "select = table_or_collection.outputs(listener.predict_id).select('y', input_key)\n" + "select = table_or_collection.outputs(listener.predict_id).select(target_key, input_key)\n" ] } ], diff --git a/docs/hr/content/docs/reusable_snippets/choose_input_key_from_listener_outputs.md b/docs/hr/content/docs/reusable_snippets/choose_input_key_from_listener_outputs.md index 617b8f0e5e..31c3d0e781 100644 --- a/docs/hr/content/docs/reusable_snippets/choose_input_key_from_listener_outputs.md +++ b/docs/hr/content/docs/reusable_snippets/choose_input_key_from_listener_outputs.md @@ -17,5 +17,5 @@ select = table_or_collection.find() ```python # input_key = listener.outputs -select = table_or_collection.outputs(listener.predict_id).select('y', input_key) +select = table_or_collection.outputs(listener.predict_id).select(target_key, input_key) ``` diff --git a/docs/hr/content/use_cases/transfer_learning.ipynb b/docs/hr/content/use_cases/transfer_learning.ipynb index 4f28e55c8d..2754afeaec 100644 --- a/docs/hr/content/use_cases/transfer_learning.ipynb +++ b/docs/hr/content/use_cases/transfer_learning.ipynb @@ -1 +1 @@ -{"cells":[{"cell_type":"markdown","id":"c288025e-2326-4e8b-ab52-6fb8a5f9560f","metadata":{},"source":["\n","# Transfer learning"]},{"cell_type":"markdown","id":"f7a4aab8-86eb-4e1c-9200-0a16ba75b2e6","metadata":{},"source":["\n","## Configure your production system"]},{"cell_type":"markdown","id":"81e7cd59-67d0-4776-aea1-4864aa768f95","metadata":{},"source":[":::note\n","If you would like to use the production features \n","of SuperDuperDB, then you should set the relevant \n","connections and configurations in a configuration \n","file. 
Otherwise you are welcome to use \"development\" mode \n","to get going with SuperDuperDB quickly.\n",":::"]},{"cell_type":"code","execution_count":null,"id":"62014646-ccd4-4d10-ac26-1c470f88f2f2","metadata":{},"outputs":[],"source":["import os\n","\n","os.makedirs('.superduperdb', exist_ok=True)\n","os.environ['SUPERDUPERDB_CONFIG'] = '.superduperdb/config.yaml'"]},{"cell_type":"code","execution_count":null,"id":"8e50edd2-438d-44ab-9da0-0b72197df262","metadata":{},"outputs":[],"source":["# \n","CFG = '''\n","data_backend: mongodb://127.0.0.1:27017/documents\n","artifact_store: filesystem://./artifact_store\n","cluster:\n"," cdc:\n"," strategy: null\n"," uri: ray://127.0.0.1:20000\n"," compute:\n"," uri: ray://127.0.0.1:10001\n"," vector_search:\n"," backfill_batch_size: 100\n"," type: in_memory\n"," uri: http://127.0.0.1:21000\n","'''"]},{"cell_type":"code","execution_count":null,"id":"1ad9ee67-6402-45ea-8311-3efb039b5df3","metadata":{},"outputs":[],"source":["# \n","CFG = '''\n","artifact_store: filesystem://\n","cluster: \n"," compute: ray://\n"," cdc: \n"," uri: http://:\n"," vector_search:\n"," uri: http://:\n"," type: native\n","databackend: mongodb+srv://:@:27017/documents\n","'''"]},{"cell_type":"code","execution_count":null,"id":"9c9e8351-b17f-4882-bda6-5ad51dbc7e1f","metadata":{},"outputs":[],"source":["# \n","CFG = '''\n","artifact_store: filesystem://\n","cluster: \n"," compute: ray://\n"," cdc: \n"," uri: http://:\n"," vector_search:\n"," uri: http://:\n","databackend: sqlite://.db\n","'''"]},{"cell_type":"code","execution_count":null,"id":"d16c66bb-6ff2-4cea-b11c-0a65bf86c7ad","metadata":{},"outputs":[],"source":["# \n","CFG = '''\n","artifact_store: filesystem://\n","cluster: \n"," compute: ray://\n"," cdc: \n"," uri: http://:\n"," vector_search:\n"," uri: http://:\n","databackend: mysql://:@:/database\n","'''"]},{"cell_type":"code","execution_count":null,"id":"9b7ac715-712c-4ec7-be90-0aaa22518977","metadata":{},"outputs":[],"source":["# \n","CFG = '''\n","artifact_store: filesystem://\n","cluster: \n"," compute: ray://\n"," cdc: \n"," uri: http://:\n"," vector_search:\n"," uri: http://:\n","databackend: mssql://:@:\n","'''"]},{"cell_type":"code","execution_count":null,"id":"f21fad9c-cc0e-4cf5-83f0-41a3a614c6af","metadata":{},"outputs":[],"source":["# \n","CFG = '''\n","artifact_store: filesystem://\n","cluster: \n"," compute: ray://\n"," cdc: \n"," uri: http://:\n"," vector_search:\n"," uri: http://:\n","databackend: postgres://:@:\n","'''"]},{"cell_type":"code","execution_count":null,"id":"1badb5a3-823c-4463-ab79-6f4f9239dabe","metadata":{},"outputs":[],"source":["# \n","CFG = '''\n","artifact_store: filesystem://\n","metadata_store: sqlite://.db\n","cluster: \n"," compute: ray://\n"," cdc: \n"," uri: http://:\n"," vector_search:\n"," uri: http://:\n","databackend: snowflake://:@/\n","'''"]},{"cell_type":"code","execution_count":null,"id":"ae7807d9-9fc1-4c18-8027-a512f827783d","metadata":{},"outputs":[],"source":["# \n","CFG = '''\n","artifact_store: filesystem://\n","metadata_store: sqlite://.db\n","cluster: \n"," compute: ray://\n"," cdc: \n"," uri: http://:\n"," vector_search:\n"," uri: http://:\n","databackend: clickhouse://:@:\n","'''"]},{"cell_type":"code","execution_count":null,"id":"fc40c13b-9bc5-47ac-86d6-ef7a379c45ee","metadata":{},"outputs":[],"source":["with open(os.environ['SUPERDUPERDB_CONFIG'], 'w') as f:\n"," f.write(CFG)"]},{"cell_type":"markdown","metadata":{},"source":["\n","## Start your 
cluster"]},{"cell_type":"markdown","metadata":{},"source":[":::note\n","Starting a SuperDuperDB cluster is useful in production and model development\n","if you want to enable scalable compute, access to the models by multiple users for collaboration, \n","monitoring.\n","\n","If you don't need this, then it is simpler to start in development mode.\n",":::"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# \n","!python -m superduperdb local-cluster up"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# \n","!make testenv_image\n","!make testenv_init"]},{"cell_type":"markdown","id":"32f8484d-2e35-472a-9b24-1a30ec1d144b","metadata":{},"source":["\n","## Connect to SuperDuperDB"]},{"cell_type":"markdown","id":"06d66021-ce62-4021-a2c5-158dee92b3bb","metadata":{},"source":[":::note\n","Note that this is only relevant if you are running SuperDuperDB in development mode.\n","Otherwise refer to \"Configuring your production system\".\n",":::"]},{"cell_type":"code","execution_count":null,"id":"61976f44-8139-41c0-a73e-569c6d16c4b1","metadata":{},"outputs":[],"source":["# \n","from superduperdb import superduper\n","\n","db = superduper('mongodb://localhost:27017/documents')"]},{"cell_type":"code","execution_count":null,"id":"e981a457","metadata":{},"outputs":[],"source":["# \n","from superduperdb import superduper\n","db = superduper('sqlite://my_db.db')"]},{"cell_type":"code","execution_count":null,"id":"19ecf7c0-b730-4503-9b5d-e97697b3bcee","metadata":{},"outputs":[],"source":["# \n","from superduperdb import superduper\n","\n","user = 'superduper'\n","password = 'superduper'\n","port = 3306\n","host = 'localhost'\n","database = 'test_db'\n","\n","db = superduper(f\"mysql://{user}:{password}@{host}:{port}/{database}\")"]},{"cell_type":"code","execution_count":null,"id":"df208e8c-4fd0-438f-af29-22a763a2aebd","metadata":{},"outputs":[],"source":["# \n","from superduperdb import superduper\n","\n","user = 'sa'\n","password = 'Superduper#1'\n","port = 1433\n","host = 'localhost'\n","\n","db = superduper(f\"mssql://{user}:{password}@{host}:{port}\")"]},{"cell_type":"code","execution_count":null,"id":"d2297295","metadata":{},"outputs":[],"source":["# \n","!pip install psycopg2\n","from superduperdb import superduper\n","\n","user = 'postgres'\n","password = 'postgres'\n","port = 5432\n","host = 'localhost'\n","database = 'test_db'\n","db_uri = f\"postgres://{user}:{password}@{host}:{port}/{database}\"\n","\n","db = superduper(db_uri, metadata_store=db_uri.replace('postgres://', 'postgresql://'))"]},{"cell_type":"code","execution_count":null,"id":"cc6c8517","metadata":{},"outputs":[],"source":["# \n","from superduperdb import superduper\n","\n","user = \"superduperuser\"\n","password = \"superduperpassword\"\n","account = \"XXXX-XXXX\" # ORGANIZATIONID-USERID\n","database = \"FREE_COMPANY_DATASET/PUBLIC\"\n","\n","snowflake_uri = f\"snowflake://{user}:{password}@{account}/{database}\"\n","\n","db = superduper(\n"," snowflake_uri, \n"," metadata_store='sqlite:///your_database_name.db',\n",")"]},{"cell_type":"code","execution_count":null,"id":"05da45e3-d9e4-49ca-b9ee-db1b8bf4eb44","metadata":{},"outputs":[],"source":["# \n","from superduperdb import superduper\n","\n","user = 'default'\n","password = ''\n","port = 8123\n","host = 'localhost'\n","\n","db = superduper(f\"clickhouse://{user}:{password}@{host}:{port}\", 
metadata_store=f'mongomock://meta')"]},{"cell_type":"code","execution_count":null,"id":"0e89c8dd-d845-423a-9acc-97e3360d370c","metadata":{},"outputs":[],"source":["# \n","from superduperdb import superduper\n","\n","db = superduper('duckdb://mydb.duckdb')"]},{"cell_type":"code","execution_count":null,"id":"2de71562","metadata":{},"outputs":[],"source":["# \n","from superduperdb import superduper\n","\n","db = superduper(['my.csv'], metadata_store=f'mongomock://meta')"]},{"cell_type":"code","execution_count":null,"id":"cb029a5e-fedf-4f07-8a31-d220cfbfbb3d","metadata":{},"outputs":[],"source":["# \n","from superduperdb import superduper\n","\n","db = superduper('mongomock:///test_db')"]},{"cell_type":"markdown","id":"032c2e7b-3f54-4263-b778-0fef60596efb","metadata":{},"source":["\n","## Get useful sample data"]},{"cell_type":"code","execution_count":null,"id":"854358f9-a2f4-4a53-818f-f03819d5fb03","metadata":{},"outputs":[],"source":["from superduperdb.backends.ibis import dtype\n"]},{"cell_type":"code","execution_count":null,"id":"4e7902bd","metadata":{},"outputs":[],"source":["# \n","!curl -O https://superduperdb-public-demo.s3.amazonaws.com/text_classification.json\n","import json\n","\n","with open(\"text_classification.json\", \"r\") as f:\n"," data = json.load(f)\n","sample_datapoint = data[-1]"]},{"cell_type":"code","execution_count":null,"id":"0828031a","metadata":{},"outputs":[],"source":["# \n","!curl -O https://superduperdb-public-demo.s3.amazonaws.com/images_classification.zip && unzip images.zip\n","import json\n","from PIL import Image\n","\n","with open('images/images.json', 'r') as f:\n"," data = json.load(f)\n","\n","data = [{'x': Image.open(d['image_path']), 'y': d['label']} for d in data]\n","sample_datapoint = data[-1]"]},{"cell_type":"markdown","metadata":{},"source":["\n","## Setup tables or collections"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# \n","# Note this is an optional step for MongoDB\n","# Users can also work directly with `DataType` if they want to add\n","# custom data\n","from superduperdb import Schema, DataType\n","from superduperdb.backends.mongodb import Collection\n","\n","table_or_collection = Collection('documents')\n","USE_SCHEMA = False\n","\n","if USE_SCHEMA and isinstance(datatype, DataType):\n"," schema = Schema(fields={'x': datatype})\n"," db.apply(schema)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# \n","from superduperdb.backends.ibis import Table\n","from superduperdb import Schema, DataType\n","from superduperdb.backends.ibis.field_types import dtype\n","\n","datatype = \"str\"\n","\n","if isinstance(datatype, DataType):\n"," schema = Schema(identifier=\"schema\", fields={\"id\": dtype(\"str\"), \"x\": datatype})\n","else:\n"," schema = Schema(\n"," identifier=\"schema\", fields={\"id\": dtype(\"str\"), \"x\": dtype(datatype)}\n"," )\n","\n","table_or_collection = Table('documents', schema=schema)\n","\n","db.apply(table_or_collection)"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":[]},{"cell_type":"markdown","metadata":{},"source":["\n","## Insert data\n","\n","In order to create data, we need to create a `Schema` for encoding our special `Datatype` column(s) in the databackend."]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# \n","from superduperdb import Document, DataType\n","\n","def do_insert(data, schema = None):\n"," \n"," if schema is None and (datatype is None or 
isinstance(datatype, str)):\n"," data = [Document({'x': x['x'], 'y': x['y']}) if isinstance(x, dict) and 'x' in x and 'y' in x else Document({'x': x}) for x in data]\n"," db.execute(table_or_collection.insert_many(data))\n"," elif schema is None and datatype is not None and isinstance(datatype, DataType):\n"," data = [Document({'x': datatype(x['x']), 'y': x['y']}) if isinstance(x, dict) and 'x' in x and 'y' in x else Document({'x': datatype(x)}) for x in data]\n"," db.execute(table_or_collection.insert_many(data))\n"," else:\n"," data = [Document({'x': x['x'], 'y': x['y']}) if isinstance(x, dict) and 'x' in x and 'y' in x else Document({'x': x}) for x in data]\n"," db.execute(table_or_collection.insert_many(data, schema=schema))\n"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# \n","from superduperdb import Document\n","\n","def do_insert(data):\n"," db.execute(table_or_collection.insert([Document({'id': str(idx), 'x': x['x'], 'y': x['y']}) if isinstance(x, dict) and 'x' in x and 'y' in x else Document({'id': str(idx), 'x': x}) for idx, x in enumerate(data)]))\n"]},{"cell_type":"code","execution_count":null,"id":"b5ba80bb-73c3-4894-b193-7ef05b22d3fb","metadata":{},"outputs":[],"source":["do_insert(data[:-len(data) // 4])"]},{"cell_type":"markdown","id":"9e703b58-a46d-4b1f-98fd-f50d46b168fe","metadata":{},"source":["\n","## Compute features"]},{"cell_type":"code","execution_count":null,"id":"ae2e1588-fec8-45a6-b678-fef05fc7b57f","metadata":{},"outputs":[],"source":["# \n","\n","key = 'txt'\n","\n","import sentence_transformers\n","from superduperdb import vector, Listener\n","from superduperdb.ext.sentence_transformers import SentenceTransformer\n","\n","superdupermodel = SentenceTransformer(\n"," identifier=\"embedding\",\n"," object=sentence_transformers.SentenceTransformer(\"sentence-transformers/all-MiniLM-L6-v2\"),\n"," datatype=vector(shape=(384,)),\n"," postprocess=lambda x: x.tolist(),\n",")\n","\n","jobs, listener = db.apply(\n"," Listener(\n"," model=superdupermodel,\n"," select=select,\n"," key=key,\n"," identifier=\"features\"\n"," )\n",")"]},{"cell_type":"code","execution_count":null,"id":"17de589c-4d75-4483-b2ca-77d5c25c2fb8","metadata":{},"outputs":[],"source":["# \n","\n","key = 'image'\n","\n","import torchvision.models as models\n","from torchvision import transforms\n","from superduperdb.ext.torch import TorchModel\n","from superduperdb import Listener\n","from PIL import Image\n","\n","class TorchVisionEmbedding:\n"," def __init__(self):\n"," # Load the pre-trained ResNet-18 model\n"," self.resnet = models.resnet18(pretrained=True)\n"," \n"," # Set the model to evaluation mode\n"," self.resnet.eval()\n"," \n"," def preprocess(self, image_array):\n"," # Preprocess the image\n"," image = Image.fromarray(image_array.astype(np.uint8))\n"," preprocess = preprocess = transforms.Compose([\n"," transforms.Resize(256),\n"," transforms.CenterCrop(224),\n"," transforms.ToTensor(),\n"," transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n"," ])\n"," tensor_image = preprocess(image)\n"," return tensor_image\n"," \n","model = TorchVisionEmbedding()\n","superdupermodel = TorchModel(identifier='my-vision-model-torch', object=model.resnet, preprocess=model.preprocess, postprocess=lambda x: x.numpy().tolist())\n","\n","jobs, listener = db.apply(\n"," Listener(\n"," model=superdupermodel,\n"," select=select,\n"," key=key,\n"," identifier=\"features\"\n"," 
)\n",")"]},{"cell_type":"markdown","id":"8bf1dc1c","metadata":{},"source":["## Choose input key from listener outputs"]},{"cell_type":"markdown","metadata":{},"source":[":::note\n","This is useful if you have performed a first step, such as pre-computing \n","features, or chunking your data. You can use this query to \n","choose the input key for further models such as classification models.\n",":::"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# \n","input_key = listener.outputs\n","select = table_or_collection.find()"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# \n","input_key = listener.outputs\n","select = table_or_collection.outputs(listener.predict_id).select('y', input_key)\n"]},{"cell_type":"markdown","id":"c2da0ab6-8fc0-41fc-b8c9-0f8a127d9e8d","metadata":{},"source":["\n","## Build and train classifier"]},{"cell_type":"code","execution_count":null,"id":"d3b94fca-3a0b-433f-88cf-aab5b71b8596","metadata":{},"outputs":[],"source":["# \n","from sklearn.linear_model import LogisticRegression\n","from superduperdb.ext.sklearn.model import SklearnTrainer, Estimator\n","\n","# Create a Logistic Regression model\n","model = LogisticRegression()\n","model = Estimator(\n"," object=model,\n"," identifier='my-model',\n"," trainer=SklearnTrainer(\n"," key=(input_key, 'y'),\n"," select=select,\n"," )\n",")"]},{"cell_type":"code","execution_count":null,"id":"5256e0fb-db16-411e-a1c1-8d44feb26c29","metadata":{},"outputs":[],"source":["# \n","from torch import nn\n","from superduperdb.ext.torch.model import TorchModel\n","from superduperdb.ext.torch.training import TorchTrainer\n","\n","\n","class SimpleModel(nn.Module):\n"," def __init__(self, input_size=16, hidden_size=32, num_classes=3):\n"," super(SimpleModel, self).__init__()\n"," self.fc1 = nn.Linear(input_size, hidden_size)\n"," self.relu = nn.ReLU()\n"," self.fc2 = nn.Linear(hidden_size, num_classes)\n","\n"," def forward(self, x):\n"," out = self.fc1(x)\n"," out = self.relu(out)\n"," out = self.fc2(out)\n"," return out\n","\n","# Loss function\n","def my_loss(X, y):\n"," return torch.nn.functional.binary_cross_entropy_with_logits(\n"," X[:, 0], y.type(torch.float)\n"," )\n","\n","\n","# Create a Logistic Regression model\n","model = SimpleModel()\n","model = TorchModel(\n"," identifier='my-model',\n"," object=model, \n"," trainer=TorchTrainer(\n"," key=(input_key, 'y'),\n"," identifier='my_trainer',\n"," objective=my_loss,\n"," loader_kwargs={'batch_size': 10},\n"," max_iterations=100,\n"," validation_interval=10,\n"," select=select,\n"," ),\n",")"]},{"cell_type":"markdown","id":"ac6fbe06-37d8-451c-a7ed-6ab217f73b7e","metadata":{},"source":["The following command adds the model to the system and trains the model in one command."]},{"cell_type":"code","execution_count":null,"id":"decad591-5934-45b6-a332-a47fc61a0aa8","metadata":{},"outputs":[],"source":["db.apply(model)"]}],"metadata":{"kernelspec":{"display_name":".venv","language":"python","name":".venv"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.9"}},"nbformat":4,"nbformat_minor":5} +{"cells":[{"cell_type":"markdown","id":"c288025e-2326-4e8b-ab52-6fb8a5f9560f","metadata":{},"source":["\n","# Transfer learning"]},{"cell_type":"markdown","id":"f7a4aab8-86eb-4e1c-9200-0a16ba75b2e6","metadata":{},"source":["\n","## Configure your production 
system"]},{"cell_type":"markdown","id":"81e7cd59-67d0-4776-aea1-4864aa768f95","metadata":{},"source":[":::note\n","If you would like to use the production features \n","of SuperDuperDB, then you should set the relevant \n","connections and configurations in a configuration \n","file. Otherwise you are welcome to use \"development\" mode \n","to get going with SuperDuperDB quickly.\n",":::"]},{"cell_type":"code","execution_count":null,"id":"62014646-ccd4-4d10-ac26-1c470f88f2f2","metadata":{},"outputs":[],"source":["import os\n","\n","os.makedirs('.superduperdb', exist_ok=True)\n","os.environ['SUPERDUPERDB_CONFIG'] = '.superduperdb/config.yaml'"]},{"cell_type":"code","execution_count":null,"id":"8e50edd2-438d-44ab-9da0-0b72197df262","metadata":{},"outputs":[],"source":["# \n","CFG = '''\n","data_backend: mongodb://127.0.0.1:27017/documents\n","artifact_store: filesystem://./artifact_store\n","cluster:\n"," cdc:\n"," strategy: null\n"," uri: ray://127.0.0.1:20000\n"," compute:\n"," uri: ray://127.0.0.1:10001\n"," vector_search:\n"," backfill_batch_size: 100\n"," type: in_memory\n"," uri: http://127.0.0.1:21000\n","'''"]},{"cell_type":"code","execution_count":null,"id":"1ad9ee67-6402-45ea-8311-3efb039b5df3","metadata":{},"outputs":[],"source":["# \n","CFG = '''\n","artifact_store: filesystem://\n","cluster: \n"," compute: ray://\n"," cdc: \n"," uri: http://:\n"," vector_search:\n"," uri: http://:\n"," type: native\n","databackend: mongodb+srv://:@:27017/documents\n","'''"]},{"cell_type":"code","execution_count":null,"id":"9c9e8351-b17f-4882-bda6-5ad51dbc7e1f","metadata":{},"outputs":[],"source":["# \n","CFG = '''\n","artifact_store: filesystem://\n","cluster: \n"," compute: ray://\n"," cdc: \n"," uri: http://:\n"," vector_search:\n"," uri: http://:\n","databackend: sqlite://.db\n","'''"]},{"cell_type":"code","execution_count":null,"id":"d16c66bb-6ff2-4cea-b11c-0a65bf86c7ad","metadata":{},"outputs":[],"source":["# \n","CFG = '''\n","artifact_store: filesystem://\n","cluster: \n"," compute: ray://\n"," cdc: \n"," uri: http://:\n"," vector_search:\n"," uri: http://:\n","databackend: mysql://:@:/database\n","'''"]},{"cell_type":"code","execution_count":null,"id":"9b7ac715-712c-4ec7-be90-0aaa22518977","metadata":{},"outputs":[],"source":["# \n","CFG = '''\n","artifact_store: filesystem://\n","cluster: \n"," compute: ray://\n"," cdc: \n"," uri: http://:\n"," vector_search:\n"," uri: http://:\n","databackend: mssql://:@:\n","'''"]},{"cell_type":"code","execution_count":null,"id":"f21fad9c-cc0e-4cf5-83f0-41a3a614c6af","metadata":{},"outputs":[],"source":["# \n","CFG = '''\n","artifact_store: filesystem://\n","cluster: \n"," compute: ray://\n"," cdc: \n"," uri: http://:\n"," vector_search:\n"," uri: http://:\n","databackend: postgres://:@:\n","'''"]},{"cell_type":"code","execution_count":null,"id":"1badb5a3-823c-4463-ab79-6f4f9239dabe","metadata":{},"outputs":[],"source":["# \n","CFG = '''\n","artifact_store: filesystem://\n","metadata_store: sqlite://.db\n","cluster: \n"," compute: ray://\n"," cdc: \n"," uri: http://:\n"," vector_search:\n"," uri: http://:\n","databackend: snowflake://:@/\n","'''"]},{"cell_type":"code","execution_count":null,"id":"ae7807d9-9fc1-4c18-8027-a512f827783d","metadata":{},"outputs":[],"source":["# \n","CFG = '''\n","artifact_store: filesystem://\n","metadata_store: sqlite://.db\n","cluster: \n"," compute: ray://\n"," cdc: \n"," uri: http://:\n"," vector_search:\n"," uri: http://:\n","databackend: 
clickhouse://:@:\n","'''"]},{"cell_type":"code","execution_count":null,"id":"fc40c13b-9bc5-47ac-86d6-ef7a379c45ee","metadata":{},"outputs":[],"source":["with open(os.environ['SUPERDUPERDB_CONFIG'], 'w') as f:\n"," f.write(CFG)"]},{"cell_type":"markdown","metadata":{},"source":["\n","## Start your cluster"]},{"cell_type":"markdown","metadata":{},"source":[":::note\n","Starting a SuperDuperDB cluster is useful in production and model development\n","if you want to enable scalable compute, access to the models by multiple users for collaboration, \n","monitoring.\n","\n","If you don't need this, then it is simpler to start in development mode.\n",":::"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# \n","!python -m superduperdb local-cluster up"]},{"cell_type":"code","execution_count":null,"metadata":{},"outputs":[],"source":["# \n","!make testenv_image\n","!make testenv_init"]},{"cell_type":"markdown","id":"32f8484d-2e35-472a-9b24-1a30ec1d144b","metadata":{},"source":["\n","## Connect to SuperDuperDB"]},{"cell_type":"markdown","id":"06d66021-ce62-4021-a2c5-158dee92b3bb","metadata":{},"source":[":::note\n","Note that this is only relevant if you are running SuperDuperDB in development mode.\n","Otherwise refer to \"Configuring your production system\".\n",":::"]},{"cell_type":"code","execution_count":null,"id":"61976f44-8139-41c0-a73e-569c6d16c4b1","metadata":{},"outputs":[],"source":["# \n","from superduperdb import superduper\n","\n","db = superduper('mongodb://localhost:27017/documents')"]},{"cell_type":"code","execution_count":null,"id":"e981a457","metadata":{},"outputs":[],"source":["# \n","from superduperdb import superduper\n","db = superduper('sqlite://my_db.db')"]},{"cell_type":"code","execution_count":null,"id":"19ecf7c0-b730-4503-9b5d-e97697b3bcee","metadata":{},"outputs":[],"source":["# \n","from superduperdb import superduper\n","\n","user = 'superduper'\n","password = 'superduper'\n","port = 3306\n","host = 'localhost'\n","database = 'test_db'\n","\n","db = superduper(f\"mysql://{user}:{password}@{host}:{port}/{database}\")"]},{"cell_type":"code","execution_count":null,"id":"df208e8c-4fd0-438f-af29-22a763a2aebd","metadata":{},"outputs":[],"source":["# \n","from superduperdb import superduper\n","\n","user = 'sa'\n","password = 'Superduper#1'\n","port = 1433\n","host = 'localhost'\n","\n","db = superduper(f\"mssql://{user}:{password}@{host}:{port}\")"]},{"cell_type":"code","execution_count":null,"id":"d2297295","metadata":{},"outputs":[],"source":["# \n","!pip install psycopg2\n","from superduperdb import superduper\n","\n","user = 'postgres'\n","password = 'postgres'\n","port = 5432\n","host = 'localhost'\n","database = 'test_db'\n","db_uri = f\"postgres://{user}:{password}@{host}:{port}/{database}\"\n","\n","db = superduper(db_uri, metadata_store=db_uri.replace('postgres://', 'postgresql://'))"]},{"cell_type":"code","execution_count":null,"id":"cc6c8517","metadata":{},"outputs":[],"source":["# \n","from superduperdb import superduper\n","\n","user = \"superduperuser\"\n","password = \"superduperpassword\"\n","account = \"XXXX-XXXX\" # ORGANIZATIONID-USERID\n","database = \"FREE_COMPANY_DATASET/PUBLIC\"\n","\n","snowflake_uri = f\"snowflake://{user}:{password}@{account}/{database}\"\n","\n","db = superduper(\n"," snowflake_uri, \n"," metadata_store='sqlite:///your_database_name.db',\n",")"]},{"cell_type":"code","execution_count":null,"id":"05da45e3-d9e4-49ca-b9ee-db1b8bf4eb44","metadata":{},"outputs":[],"source":["# \n","from superduperdb 
import superduper\n","\n","user = 'default'\n","password = ''\n","port = 8123\n","host = 'localhost'\n","\n","db = superduper(f\"clickhouse://{user}:{password}@{host}:{port}\", metadata_store=f'mongomock://meta')"]},{"cell_type":"code","execution_count":null,"id":"0e89c8dd-d845-423a-9acc-97e3360d370c","metadata":{},"outputs":[],"source":["# \n","from superduperdb import superduper\n","\n","db = superduper('duckdb://mydb.duckdb')"]},{"cell_type":"code","execution_count":null,"id":"2de71562","metadata":{},"outputs":[],"source":["# \n","from superduperdb import superduper\n","\n","db = superduper(['my.csv'], metadata_store=f'mongomock://meta')"]},{"cell_type":"code","execution_count":null,"id":"cb029a5e-fedf-4f07-8a31-d220cfbfbb3d","metadata":{},"outputs":[],"source":["# \n","from superduperdb import superduper\n","\n","db = superduper('mongomock:///test_db')"]},{"cell_type":"markdown","id":"032c2e7b-3f54-4263-b778-0fef60596efb","metadata":{},"source":["\n","## Get useful sample data"]},{"cell_type":"code","execution_count":null,"id":"854358f9-a2f4-4a53-818f-f03819d5fb03","metadata":{},"outputs":[],"source":["from superduperdb.backends.ibis import dtype\n"]},{"cell_type":"code","execution_count":null,"id":"4e7902bd","metadata":{},"outputs":[],"source":["# \n","!curl -O https://superduperdb-public-demo.s3.amazonaws.com/text_classification.json\n","import json\n","\n","with open(\"text_classification.json\", \"r\") as f:\n"," data = json.load(f)\n","sample_datapoint = data[-1]"]},{"cell_type":"code","execution_count":null,"id":"0828031a","metadata":{},"outputs":[],"source":["# \n","!curl -O https://superduperdb-public-demo.s3.amazonaws.com/images_classification.zip && unzip images.zip\n","import json\n","from PIL import Image\n","\n","with open('images/images.json', 'r') as f:\n"," data = json.load(f)\n","\n","data = [{'x': Image.open(d['image_path']), 'y': d['label']} for d in data]\n","sample_datapoint = data[-1]"]},{"cell_type":"markdown","id":"4049140c","metadata":{},"source":["\n","## Setup tables or collections"]},{"cell_type":"code","execution_count":null,"id":"b822ed49","metadata":{},"outputs":[],"source":["# \n","# Note this is an optional step for MongoDB\n","# Users can also work directly with `DataType` if they want to add\n","# custom data\n","from superduperdb import Schema, DataType\n","from superduperdb.backends.mongodb import Collection\n","\n","table_or_collection = Collection('documents')\n","USE_SCHEMA = False\n","\n","if USE_SCHEMA and isinstance(datatype, DataType):\n"," schema = Schema(fields={'x': datatype})\n"," db.apply(schema)"]},{"cell_type":"code","execution_count":null,"id":"109d7fef","metadata":{},"outputs":[],"source":["# \n","from superduperdb.backends.ibis import Table\n","from superduperdb import Schema, DataType\n","from superduperdb.backends.ibis.field_types import dtype\n","\n","datatype = \"str\"\n","\n","if isinstance(datatype, DataType):\n"," schema = Schema(identifier=\"schema\", fields={\"id\": dtype(\"str\"), \"x\": datatype})\n","else:\n"," schema = Schema(\n"," identifier=\"schema\", fields={\"id\": dtype(\"str\"), \"x\": dtype(datatype)}\n"," )\n","\n","table_or_collection = Table('documents', schema=schema)\n","\n","db.apply(table_or_collection)"]},{"cell_type":"code","execution_count":null,"id":"b1e75053","metadata":{},"outputs":[],"source":[]},{"cell_type":"markdown","id":"b833fce6","metadata":{},"source":["\n","## Insert data\n","\n","In order to create data, we need to create a `Schema` for encoding our special `Datatype` column(s) in the 
databackend."]},{"cell_type":"code","execution_count":null,"id":"38d8c7c2","metadata":{},"outputs":[],"source":["# \n","from superduperdb import Document, DataType\n","\n","def do_insert(data, schema = None):\n"," \n"," if schema is None and (datatype is None or isinstance(datatype, str)):\n"," data = [Document({'x': x['x'], 'y': x['y']}) if isinstance(x, dict) and 'x' in x and 'y' in x else Document({'x': x}) for x in data]\n"," db.execute(table_or_collection.insert_many(data))\n"," elif schema is None and datatype is not None and isinstance(datatype, DataType):\n"," data = [Document({'x': datatype(x['x']), 'y': x['y']}) if isinstance(x, dict) and 'x' in x and 'y' in x else Document({'x': datatype(x)}) for x in data]\n"," db.execute(table_or_collection.insert_many(data))\n"," else:\n"," data = [Document({'x': x['x'], 'y': x['y']}) if isinstance(x, dict) and 'x' in x and 'y' in x else Document({'x': x}) for x in data]\n"," db.execute(table_or_collection.insert_many(data, schema=schema))\n"]},{"cell_type":"code","execution_count":null,"id":"1ddbc412","metadata":{},"outputs":[],"source":["# \n","from superduperdb import Document\n","\n","def do_insert(data):\n"," db.execute(table_or_collection.insert([Document({'id': str(idx), 'x': x['x'], 'y': x['y']}) if isinstance(x, dict) and 'x' in x and 'y' in x else Document({'id': str(idx), 'x': x}) for idx, x in enumerate(data)]))\n"]},{"cell_type":"code","execution_count":null,"id":"b5ba80bb-73c3-4894-b193-7ef05b22d3fb","metadata":{},"outputs":[],"source":["do_insert(data[:-len(data) // 4])"]},{"cell_type":"markdown","id":"9e703b58-a46d-4b1f-98fd-f50d46b168fe","metadata":{},"source":["\n","## Compute features"]},{"cell_type":"code","execution_count":null,"id":"ae2e1588-fec8-45a6-b678-fef05fc7b57f","metadata":{},"outputs":[],"source":["# \n","\n","key = 'txt'\n","\n","import sentence_transformers\n","from superduperdb import vector, Listener\n","from superduperdb.ext.sentence_transformers import SentenceTransformer\n","\n","superdupermodel = SentenceTransformer(\n"," identifier=\"embedding\",\n"," object=sentence_transformers.SentenceTransformer(\"sentence-transformers/all-MiniLM-L6-v2\"),\n"," datatype=vector(shape=(384,)),\n"," postprocess=lambda x: x.tolist(),\n",")\n","\n","jobs, listener = db.apply(\n"," Listener(\n"," model=superdupermodel,\n"," select=select,\n"," key=key,\n"," identifier=\"features\"\n"," )\n",")"]},{"cell_type":"code","execution_count":null,"id":"17de589c-4d75-4483-b2ca-77d5c25c2fb8","metadata":{},"outputs":[],"source":["# \n","\n","key = 'image'\n","\n","import torchvision.models as models\n","from torchvision import transforms\n","from superduperdb.ext.torch import TorchModel\n","from superduperdb import Listener\n","from PIL import Image\n","\n","class TorchVisionEmbedding:\n"," def __init__(self):\n"," # Load the pre-trained ResNet-18 model\n"," self.resnet = models.resnet18(pretrained=True)\n"," \n"," # Set the model to evaluation mode\n"," self.resnet.eval()\n"," \n"," def preprocess(self, image_array):\n"," # Preprocess the image\n"," image = Image.fromarray(image_array.astype(np.uint8))\n"," preprocess = preprocess = transforms.Compose([\n"," transforms.Resize(256),\n"," transforms.CenterCrop(224),\n"," transforms.ToTensor(),\n"," transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),\n"," ])\n"," tensor_image = preprocess(image)\n"," return tensor_image\n"," \n","model = TorchVisionEmbedding()\n","superdupermodel = TorchModel(identifier='my-vision-model-torch', object=model.resnet, 
preprocess=model.preprocess, postprocess=lambda x: x.numpy().tolist())\n","\n","jobs, listener = db.apply(\n"," Listener(\n"," model=superdupermodel,\n"," select=select,\n"," key=key,\n"," identifier=\"features\"\n"," )\n",")"]},{"cell_type":"markdown","id":"8bf1dc1c","metadata":{},"source":["## Choose input key from listener outputs"]},{"cell_type":"markdown","id":"08d6a342","metadata":{},"source":[":::note\n","This is useful if you have performed a first step, such as pre-computing \n","features, or chunking your data. You can use this query to \n","choose the input key for further models such as classification models.\n",":::"]},{"cell_type":"code","execution_count":null,"id":"b2684992","metadata":{},"outputs":[],"source":["# \n","input_key = listener.outputs\n","select = table_or_collection.find()"]},{"cell_type":"code","execution_count":null,"id":"2698b10e","metadata":{},"outputs":[],"source":["# \n","input_key = listener.outputs\n","select = table_or_collection.outputs(listener.predict_id).select(target_key, input_key)\n"]},{"cell_type":"markdown","id":"c2da0ab6-8fc0-41fc-b8c9-0f8a127d9e8d","metadata":{},"source":["\n","## Build and train classifier"]},{"cell_type":"code","execution_count":null,"id":"d3b94fca-3a0b-433f-88cf-aab5b71b8596","metadata":{},"outputs":[],"source":["# \n","from sklearn.linear_model import LogisticRegression\n","from superduperdb.ext.sklearn.model import SklearnTrainer, Estimator\n","\n","# Create a Logistic Regression model\n","model = LogisticRegression()\n","model = Estimator(\n"," object=model,\n"," identifier='my-model',\n"," trainer=SklearnTrainer(\n"," key=(input_key, 'y'),\n"," select=select,\n"," )\n",")"]},{"cell_type":"code","execution_count":null,"id":"5256e0fb-db16-411e-a1c1-8d44feb26c29","metadata":{},"outputs":[],"source":["# \n","from torch import nn\n","from superduperdb.ext.torch.model import TorchModel\n","from superduperdb.ext.torch.training import TorchTrainer\n","\n","\n","class SimpleModel(nn.Module):\n"," def __init__(self, input_size=16, hidden_size=32, num_classes=3):\n"," super(SimpleModel, self).__init__()\n"," self.fc1 = nn.Linear(input_size, hidden_size)\n"," self.relu = nn.ReLU()\n"," self.fc2 = nn.Linear(hidden_size, num_classes)\n","\n"," def forward(self, x):\n"," out = self.fc1(x)\n"," out = self.relu(out)\n"," out = self.fc2(out)\n"," return out\n","\n","# Loss function\n","def my_loss(X, y):\n"," return torch.nn.functional.binary_cross_entropy_with_logits(\n"," X[:, 0], y.type(torch.float)\n"," )\n","\n","\n","# Create a Logistic Regression model\n","model = SimpleModel()\n","model = TorchModel(\n"," identifier='my-model',\n"," object=model, \n"," trainer=TorchTrainer(\n"," key=(input_key, 'y'),\n"," identifier='my_trainer',\n"," objective=my_loss,\n"," loader_kwargs={'batch_size': 10},\n"," max_iterations=100,\n"," validation_interval=10,\n"," select=select,\n"," ),\n",")"]},{"cell_type":"markdown","id":"ac6fbe06-37d8-451c-a7ed-6ab217f73b7e","metadata":{},"source":["The following command adds the model to the system and trains the model in one command."]},{"cell_type":"code","execution_count":null,"id":"decad591-5934-45b6-a332-a47fc61a0aa8","metadata":{},"outputs":[],"source":["db.apply(model)"]}],"metadata":{"kernelspec":{"display_name":".venv","language":"python","name":".venv"},"language_info":{"codemirror_mode":{"name":"ipython","version":3},"file_extension":".py","mimetype":"text/x-python","name":"python","nbconvert_exporter":"python","pygments_lexer":"ipython3","version":"3.11.9"}},"nbformat":4,"nbformat_minor":5} 
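Since the notebook in the hunk above is serialized as a single JSON line, the workflow it defines is hard to follow in diff form. Below is a condensed, non-authoritative sketch of that workflow, stitched together only from cells that already appear in the notebook (development-mode MongoDB connection, the text-classification variant). Field names such as `'x'` and `'y'` follow the notebook's own `do_insert`, and the `select(target_key, input_key)` change in the markdown diff below picks up from the resulting `input_key`.

```python
# Condensed sketch of the notebook's flow (MongoDB development mode, text variant).
# All identifiers come from cells already present in transfer_learning.ipynb.
import json

import sentence_transformers
from superduperdb import superduper, vector, Listener, Document
from superduperdb.backends.mongodb import Collection
from superduperdb.ext.sentence_transformers import SentenceTransformer

db = superduper('mongodb://localhost:27017/documents')
table_or_collection = Collection('documents')

# Insert labelled text data, as in the notebook's `do_insert`
# (the JSON file is downloaded in the notebook via curl from the public demo bucket;
# each record is assumed to carry 'x' and 'y', which is what `do_insert` expects).
with open('text_classification.json') as f:
    data = json.load(f)
db.execute(table_or_collection.insert_many(
    [Document({'x': d['x'], 'y': d['y']}) for d in data]
))

# Compute features with a Listener, as in the "Compute features" section.
superdupermodel = SentenceTransformer(
    identifier='embedding',
    object=sentence_transformers.SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2'),
    datatype=vector(shape=(384,)),
    postprocess=lambda x: x.tolist(),
)
jobs, listener = db.apply(
    Listener(
        model=superdupermodel,
        select=table_or_collection.find(),
        key='x',  # field the inserted documents store the raw text under
        identifier='features',
    )
)

# The listener's outputs become the input key for the downstream classifier.
input_key = listener.outputs
```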
diff --git a/docs/hr/content/use_cases/transfer_learning.md b/docs/hr/content/use_cases/transfer_learning.md
index 1eaad64ce6..35f04e378e 100644
--- a/docs/hr/content/use_cases/transfer_learning.md
+++ b/docs/hr/content/use_cases/transfer_learning.md
@@ -504,7 +504,7 @@ choose the input key for further models such as classification models.
 ```python
 input_key = listener.outputs
-select = table_or_collection.outputs(listener.predict_id).select('y', input_key)
+select = table_or_collection.outputs(listener.predict_id).select(target_key, input_key)
 ```
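Note that `target_key` is not defined in the changed cells themselves. A minimal sketch of how the updated line is meant to be used, assuming `target_key` names the label field — `'y'` elsewhere in this notebook, whose trainers still pass `key=(input_key, 'y')`:

```python
# Hypothetical definition of `target_key`: the label field used for training.
# 'y' matches the documents inserted earlier and the trainers' key=(input_key, 'y').
target_key = 'y'

input_key = listener.outputs
select = table_or_collection.outputs(listener.predict_id).select(target_key, input_key)

# The same pair then drives training, e.g. with the notebook's sklearn variant:
# SklearnTrainer(key=(input_key, target_key), select=select)
```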