In [None]:
# restart your notebook if prompted on Colab
try:
    import verta
except ImportError:
    !pip install verta

In [None]:
# Hostname of the Verta backend; passed to `Client(...)` when the client is created.
HOST = "datasetint.dev.verta.ai"

In [None]:
import os
# os.environ['VERTA_EMAIL'] = ''
# os.environ['VERTA_DEV_KEY'] = ''

# Verify that the credentials named in the commented-out lines above are
# present, without echoing their values into the saved notebook output.
# An unset or empty variable is reported by name so the failure is readable.
missing_vars = [
    name for name in ('VERTA_EMAIL', 'VERTA_DEV_KEY')
    if not os.environ.get(name)
]
if missing_vars:
    raise KeyError(
        "Missing required environment variable(s): {}".format(", ".join(missing_vars))
    )

In [None]:
# `from __future__` imports must stay the first statement of the cell.
from __future__ import print_function

# Suppress scikit-learn ConvergenceWarning and all FutureWarning messages so
# they do not clutter cell output.
import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

# `requests` is used by the cells below for direct REST calls to the backend.
# NOTE(review): several of the remaining imports (itertools, time, six, numpy,
# pandas, sklearn, json) are not referenced in the cells visible here — confirm
# whether they are still needed.
import itertools
import os
import time
import requests

import six

import numpy as np
import pandas as pd

import sklearn
from sklearn import model_selection
from sklearn import linear_model
from sklearn import metrics
import json

In [None]:
try:
    import wget
except ImportError:
    !pip install wget
    import wget

In [None]:
from verta import Client
from verta.utils import ModelAPI

# Both values start out empty; `workspace` is later passed to `set_dataset`
# and `s3Bucket` to `create_version`.
# NOTE(review): presumably these are meant to be filled in before running — confirm.
workspace = ""
s3Bucket = ""

# Client bound to the backend configured in HOST.
client = Client(HOST)

### Create Datasets

In [None]:
# Three S3-type datasets: a bare one, one with tags and a description,
# and one created in an explicit workspace.
dataset1 = client.set_dataset(
    name="Demo Dataset S3",
    type="s3",
)
print("Dataset 1: {}".format(dataset1))

dataset2 = client.set_dataset(
    name="Demo Dataset S3 - 2",
    type="s3",
    tags=["tag-1", "tag-2", "tag-3"],
    desc="S3 Dataset with tags and description",
)
print("Dataset 2: {}".format(dataset2))

dataset3 = client.set_dataset(
    name="Demo Dataset S3 - 3",
    type="s3",
    workspace=workspace,
)
print("Dataset 3: {}".format(dataset3))

### Add dataset tags

In [None]:
# Endpoint is built from the client's own connection settings.
url = "{}://{}/v1/dataset/addDatasetTags".format(client._conn.scheme, client._conn.socket)
payload = {
    "id": dataset3.id,
    "tags": ["tag-1", "tag-2", "tag-3", "tag-4"],
}

# Reuse the client's auth headers for the raw REST call, then show the reply.
r = requests.post(url, json=payload, headers=client._conn.auth)
r.json()

### Get dataset tags (NOT_SUPPORTED)

In [None]:
# Endpoint is built from the client's own connection settings.
url = "{}://{}/v1/dataset/getDatasetTags".format(client._conn.scheme, client._conn.socket)
payload = {
    "id": dataset3.id,
}

# NOTE(review): the dataset id travels as a JSON body on a GET request —
# confirm the server reads it from the body rather than from query parameters.
r = requests.get(url, json=payload, headers=client._conn.auth)
r.json()

### Delete selected dataset tags

In [None]:
# Endpoint is built from the client's own connection settings.
url = "{}://{}/v1/dataset/deleteDatasetTags".format(client._conn.scheme, client._conn.socket)
payload = {
    "id": dataset3.id,
    "tags": ["tag-1", "tag-2"],
}

# Only the tags listed in the payload are requested for deletion.
r = requests.delete(url, json=payload, headers=client._conn.auth)
r.json()

### Delete all dataset tags using `delete_all` flag

In [None]:
# Endpoint is built from the client's own connection settings.
url = "{}://{}/v1/dataset/deleteDatasetTags".format(client._conn.scheme, client._conn.socket)
payload = {
    "id": dataset3.id,
    "delete_all": True,
}

# No explicit tag list here; the `delete_all` flag is sent instead.
r = requests.delete(url, json=payload, headers=client._conn.auth)
r.json()

### Create DatasetVersions

In [None]:
# Two versions of dataset1 against the configured bucket: one bare,
# one carrying tags and a description.
version1 = dataset1.create_version(
    bucket_name=s3Bucket,
)
print("version 1: {}".format(version1.id))

version2 = dataset1.create_version(
    bucket_name=s3Bucket,
    tags=["tag-1", "tag-2"],
    desc="S3 DatasetVersion with tags and description",
)
print("version 2: {}".format(version2.id))

### Find DatasetVersions by `dataset_id` & `dataset_version_ids`

In [None]:
# Endpoint is built from the client's own connection settings.
url = "{}://{}/v1/dataset-version/findDatasetVersions".format(client._conn.scheme, client._conn.socket)
payload = {
    'dataset_id': dataset1.id,
    'dataset_version_ids': [version1.id],
}

# Query by dataset id together with an explicit list of version ids.
r = requests.post(url, json=payload, headers=client._conn.auth)
r.json()

### Find DatasetVersions by `dataset_id`

In [None]:
# Build the endpoint in this cell instead of relying on whatever `url` was
# left behind by the previously executed cell, so the cell gives the same
# result regardless of which request cell ran last.
url = "{}://{}/v1/dataset-version/findDatasetVersions".format(
    client._conn.scheme,
    client._conn.socket,
)
payload = {'dataset_id': dataset1.id}

# Query by dataset id only (no version-id filter) and show the reply.
r = requests.post(url, json=payload, headers=client._conn.auth)
r.json()

### Find Datasets by Fuzzy `name` & `tags`

In [None]:
# The name field works with fuzzy matching.
find_dataset_output = client.find_datasets(
    name=dataset1.name,
    tags=["tag-1"],
)
for found_dataset in find_dataset_output:
    print("Find Datasets output: {}".format(found_dataset))

### Find Datasets (All) without predicates

In [None]:
# Endpoint is built from the client's own connection settings.
url = "{}://{}/v1/dataset/findDatasets".format(client._conn.scheme, client._conn.socket)

# An empty payload: no predicates are sent with the query.
payload = {}

r = requests.post(url, json=payload, headers=client._conn.auth)
r.json()

## Features not supported on traditional dataset versions

### S3Version ID

In [None]:
# Endpoint is built from the client's own connection settings.
url = "{}://{}/v1/dataset-version/createDatasetVersion".format(
    client._conn.scheme,
    client._conn.socket,
)

# A single S3 component that carries an explicit `s3_version_id`.
payload = {
    'dataset_id': dataset1.id,
    'description': 'version with s3versionID',
    'dataset_blob': {
        's3': {
            'components': [
                {
                    'path': {
                        'path': 's3://my-bucket/my-file.txt',
                        'size': 123,
                        'sha256': 'abcd',
                        'internal_versioned_path': s3Bucket,
                    },
                    's3_version_id': 'a1b2c3d4',
                },
            ],
        },
    },
}

r = requests.post(url, json=payload, headers=client._conn.auth)

# Parse the body once. If no dataset version came back, report the HTTP
# status and the server's reply instead of failing on a bare KeyError.
response_body = r.json()
if 'dataset_version' not in response_body:
    raise RuntimeError(
        "createDatasetVersion failed (HTTP {}): {}".format(r.status_code, response_body)
    )

# The id of the newly created version is reused by the following cell.
vertaManagedDatasetVersionId = response_body['dataset_version']['id']
print("vertaManagedDatasetVersionId: {}".format(vertaManagedDatasetVersionId))

### Verta managed Versioning (do not access dataset (protected repository))

In [None]:
# Build the endpoint from the client's connection settings, like every other
# request in this notebook. Previously the host was hard-coded and the
# `.format(...)` call applied only to the final string literal (which has no
# placeholders), so the scheme/socket arguments were silently ignored.
url = "{}://{}/v1/versioning/repositories/{}/commits/{}/getUrlForBlobVersioned".format(
    client._conn.scheme,
    client._conn.socket,
    dataset1.id,
    vertaManagedDatasetVersionId,
)

# The repository id and commit sha repeat the values already in the URL path.
payload = {
    "repository_id": {
        "repo_id": dataset1.id,
    },
    "commit_sha": vertaManagedDatasetVersionId,
    "location": [
        "version",
    ],
    "path_dataset_component_blob_path": "s3://my-bucket/my-file.txt",
    "method": "put",
    "part_number": 1,
}

r = requests.post(url, json=payload, headers=client._conn.auth)
r.json()


### Delete Dataset by ID

In [None]:
# Endpoint is built from the client's own connection settings.
url = "{}://{}/v1/dataset/deleteDataset".format(client._conn.scheme, client._conn.socket)
payload = {
    'id': dataset1.id,
}

# Request deletion of the single dataset identified by id.
r = requests.delete(url, json=payload, headers=client._conn.auth)
r.json()

### Delete datasets in bulk

In [None]:
# Endpoint is built from the client's own connection settings.
url = "{}://{}/v1/dataset/deleteDatasets".format(client._conn.scheme, client._conn.socket)
payload = {
    'ids': [dataset2.id, dataset3.id],
}

# Request deletion of the two remaining demo datasets in a single call.
r = requests.delete(url, json=payload, headers=client._conn.auth)
r.json()