From 6a1ba25445b00b7f7353cd8dd1006cbac8f81dfb Mon Sep 17 00:00:00 2001 From: Lalith Sagar Devagudi Date: Tue, 30 Apr 2024 06:38:07 +0200 Subject: [PATCH] Add labeled data and fix broken links --- .../get_useful_sample_data.ipynb | 43 +++++++++++++++++-- .../get_useful_sample_data.md | 31 +++++++++++-- 2 files changed, 68 insertions(+), 6 deletions(-) diff --git a/docs/hr/content/docs/reusable_snippets/get_useful_sample_data.ipynb b/docs/hr/content/docs/reusable_snippets/get_useful_sample_data.ipynb index fc5777790f..f70c00c3b8 100644 --- a/docs/hr/content/docs/reusable_snippets/get_useful_sample_data.ipynb +++ b/docs/hr/content/docs/reusable_snippets/get_useful_sample_data.ipynb @@ -62,7 +62,7 @@ "outputs": [], "source": [ "# \n", - "!curl -O s3://superduperdb-public-demo/images.zip && unzip images.zip\n", + "!curl -O https://superduperdb-public-demo.s3.amazonaws.com/images.zip && unzip images.zip\n", "import os\n", "from PIL import Image\n", "\n", @@ -82,7 +82,7 @@ "outputs": [], "source": [ "# \n", - "!curl -O s3://superduperdb-public-demo/videos.zip && unzip videos.zip\n", + "!curl -O https://superduperdb-public-demo.s3.amazonaws.com/videos.zip && unzip videos.zip\n", "import os\n", "\n", "data = [f'videos/{x}' for x in os.listdir('./videos')]\n", @@ -100,13 +100,50 @@ "outputs": [], "source": [ "# \n", - "!curl -O s3://superduperdb-public-demo/audio.zip && unzip audio.zip\n", + "!curl -O https://superduperdb-public-demo.s3.amazonaws.com/audio.zip && unzip audio.zip\n", "import os\n", "\n", "data = [f'audios/{x}' for x in os.listdir('./audios')]\n", "sample_datapoint = data[-1]\n", "chunked_model_datatype = dtype('str')" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ce66da2f", + "metadata": {}, + "outputs": [], + "source": [ + "# \n", + "!curl -O https://superduperdb-public-demo.s3.amazonaws.com/text_labeled.json\n", + "import json\n", + "\n", + "with open(\"text_labeled.json\", \"r\") as f:\n", + " data = json.load(f)\n", + "\n", + "sample_datapoint = data[0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9740f757", + "metadata": {}, + "outputs": [], + "source": [ + "# \n", + "!curl -O https://superduperdb-public-demo.s3.amazonaws.com/images_labeled.zip && unzip images_labeled.zip\n", + "import json\n", + "from PIL import Image\n", + "\n", + "with open('images_labeled/images_labeled.json', 'r') as f:\n", + " data = json.load(f)\n", + "\n", + "data = [{'x': Image.open(d['image_path']), 'y': d['label']} for d in data]\n", + "\n", + "sample_datapoint = data[0]" + ] } ], "metadata": { diff --git a/docs/hr/content/docs/reusable_snippets/get_useful_sample_data.md b/docs/hr/content/docs/reusable_snippets/get_useful_sample_data.md index 2a9fd14efa..e42d5f265e 100644 --- a/docs/hr/content/docs/reusable_snippets/get_useful_sample_data.md +++ b/docs/hr/content/docs/reusable_snippets/get_useful_sample_data.md @@ -39,7 +39,7 @@ from superduperdb import dtype ```python - !curl -O s3://superduperdb-public-demo/images.zip && unzip images.zip + !curl -O https://superduperdb-public-demo.s3.amazonaws.com/images.zip && unzip images.zip import os from PIL import Image @@ -53,7 +53,7 @@ from superduperdb import dtype ```python - !curl -O s3://superduperdb-public-demo/videos.zip && unzip videos.zip + !curl -O https://superduperdb-public-demo.s3.amazonaws.com/videos.zip && unzip videos.zip import os data = [f'videos/{x}' for x in os.listdir('./videos')] @@ -65,7 +65,7 @@ from superduperdb import dtype ```python - !curl -O s3://superduperdb-public-demo/audio.zip && unzip audio.zip + !curl -O https://superduperdb-public-demo.s3.amazonaws.com/audio.zip && unzip audio.zip import os data = [f'audios/{x}' for x in os.listdir('./audios')] @@ -73,4 +73,29 @@ from superduperdb import dtype chunked_model_datatype = dtype('str') ``` + + ```python + !curl -O https://superduperdb-public-demo.s3.amazonaws.com/text_labeled.json + import json + + with open("text_labeled.json", "r") as f: + data = json.load(f) + + sample_datapoint = data[0] + ``` + + + ```python + !curl -O https://superduperdb-public-demo.s3.amazonaws.com/images_labeled.zip && unzip images_labeled.zip + import json + from PIL import Image + + with open('images_labeled/images_labeled.json', 'r') as f: + data = json.load(f) + + data = [{'x': Image.open(d['image_path']), 'y': d['label']} for d in data] + + sample_datapoint = data[0] + ``` +