In [7]:
import os
from groclake.cataloglake import CatalogLake
from groclake.modellake import ModelLake
from groclake.datalake import DataLake
from groclake.vectorlake import VectorLake

# Credential setup.
# The Groclake API key and account ID must never be hardcoded in the notebook:
# anyone who can read the file (or its version-control history) could use them.
# Export GROCLAKE_API_KEY and GROCLAKE_ACCOUNT_ID in the environment before
# starting the kernel; they are only read and validated here.
# NOTE(review): credentials that were previously committed in this cell should
# be treated as compromised and rotated.
_missing_vars = [
    name
    for name in ('GROCLAKE_API_KEY', 'GROCLAKE_ACCOUNT_ID')
    if not os.environ.get(name)
]
if _missing_vars:
    # Fail fast with a clear message instead of an opaque error further down.
    raise EnvironmentError(
        "Missing required environment variable(s): " + ", ".join(_missing_vars)
    )

# Kept as module-level names so any later cell that references them still works.
GROCLAKE_API_KEY = os.environ['GROCLAKE_API_KEY']
GROCLAKE_ACCOUNT_ID = os.environ['GROCLAKE_ACCOUNT_ID']

# Initialize the Groclake clients used in this notebook (model, data, vector).
# Presumably each client picks up the credentials from the environment
# variables validated above -- confirm against the Groclake SDK.
model_lake = ModelLake()
data_lake = DataLake()
vector_lake = VectorLake()

In [8]:
# Create a DataLake client instance.
# NOTE(review): `data_lake` is already instantiated in the setup cell above, so
# this rebinds the name to a fresh instance -- confirm whether this cell is
# still needed.
data_lake = DataLake()

In [9]:
# Create the DataLake and remember its ID; the push and fetch cells below
# depend on `datalake_id`.
try:
    data_create = data_lake.create()
    print("DataLake Created Successfully:", data_create)
    # Store the DataLake ID for further operations
    datalake_id = data_create["datalake_id"]
except Exception as e:
    print("Error creating DataLake:", str(e))
    # Re-raise after reporting: swallowing the error would leave `datalake_id`
    # undefined (or stale from an earlier run), and the later cells would then
    # fail or silently target the wrong DataLake far away from the real cause.
    raise

DataLake Created Successfully: {'datalake_id': 'a6n31j7q6ufe6yc1', 'message': 'Datalake with same name already exists in this account.', 'datalake_name': ''}


In [10]:
# Describe the document to ingest. The target DataLake is the one returned by
# the create cell, rather than a hardcoded ID, so this cell stays correct when
# the notebook is run against another account or a newly created DataLake.
payload_push = {
    "datalake_id": datalake_id,  # Specify the target DataLake
    "document_type": "url",    # Document type can be 'url', 'text', etc.
    "document_data": "https://drive.google.com/uc?export=download&id=1lVln94gqJKUfHXh07iAw6PSnz85s9vUf"  # URL of the document to push
}

# Presumably an alternate sample document (currently unused):
# https://drive.google.com/uc?export=download&id=1cOYyJ5RuTjLph6Hjx_tAhGN_xH74tBtr

try:
    # Push the document to the DataLake
    data_push = data_lake.push(payload_push)
    print("Response from push:", data_push)
except Exception as e:
    print("Error pushing document:", str(e))
    # Re-raise so the notebook stops here instead of continuing without a
    # pushed document.
    raise

# Extract the document_id from the response, which will be used for retrieval.
# A response without it is treated as a failure: the fetch cell cannot run
# without `document_id`.
if "document_id" not in data_push:
    print("Error: 'document_id' not found in the response.")
    raise KeyError("document_id")

document_id = data_push["document_id"]
print("Document ID:", document_id)

Response from push: {'document_id': '793f51a7a7f54122'}
Document ID: 793f51a7a7f54122


In [11]:
# Request parameters for retrieving the previously pushed document in chunks.
payload_fetch = dict(
    document_id=document_id,   # document returned by the push cell
    datalake_id=datalake_id,   # DataLake that holds the document
    fetch_format="chunk",      # ask for the document split into chunks
    chunk_size="500",          # chunk size, passed as a string (unit not shown here)
)

try:
    # Retrieve the document; with the "chunk" format the response carries the
    # text as a list of pieces under "document_data" (see the output below).
    data_fetch = data_lake.fetch(payload_fetch)
    print("Document Fetched Successfully:\n", data_fetch)
except Exception as e:
    print("Error fetching document:", str(e))

Document Fetched Successfully:
 {'document_id': '793f51a7a7f54122', 'fetch_format': 'chunk', 'document_data': [':: Result No.Sample Reported On MR No. Sample Received On :Referred By:: Age / GenderName YASH SAMEER SAWANT 19 Year(s) /Male Dr. DIPANJAN  HALDAR - Reg No.  2015/08/449605:06:23 PM 26/02/2024 723733 LB-2024-0087054HAEMATOLOGY / IP No. 231677Whole Blood Specimen : Bed No. 0309303 Ward No. :DAY CARESample Registered On : :26/02/2024 03:34:40 PM : /26/02/2024 03:33:40 PM : : Investigations Result Methods Unit Biological Ref. Interval Complete Blood Count Hemoglobin SLS 4.7 g/dl 13.000 - 18.000 Red', ' Cell Count DC Detection 3.53 10^6/ul 4.500 - 6.500 Hematocrit Pulse Detection 20.6 % 40.000 - 54.000 MCV Calculated 58.4 fl 76.000 - 96.000 MCH Calculated 13.3 pg 27.000 - 32.000 MCHC Calculated 22.8 g/dl 32.000 - 35.000 RDW Calculated 22.5 % 11.500 - 14.500 WBC Count Flowcytometry 8.19 10^3/ul 4.000 - 11.000 Platelet Count Impedence 414 10^3/ul 150.000 - 450.000 MPV Impedence 9.6

In [12]:
try:
    # Walk the chunk list from the fetch response (empty list if the key is
    # absent) and print each chunk under a 1-based heading.
    separator = "-" * 50
    for number, piece in enumerate(data_fetch.get("document_data", []), start=1):
        print(f"Chunk {number}:")
        print(piece)
        print(separator)  # visual divider between chunks
except Exception as e:
    print("Error processing fetched data:", str(e))

Chunk 1:
:: Result No.Sample Reported On MR No. Sample Received On :Referred By:: Age / GenderName YASH SAMEER SAWANT 19 Year(s) /Male Dr. DIPANJAN  HALDAR - Reg No.  2015/08/449605:06:23 PM 26/02/2024 723733 LB-2024-0087054HAEMATOLOGY / IP No. 231677Whole Blood Specimen : Bed No. 0309303 Ward No. :DAY CARESample Registered On : :26/02/2024 03:34:40 PM : /26/02/2024 03:33:40 PM : : Investigations Result Methods Unit Biological Ref. Interval Complete Blood Count Hemoglobin SLS 4.7 g/dl 13.000 - 18.000 Red
--------------------------------------------------
Chunk 2:
 Cell Count DC Detection 3.53 10^6/ul 4.500 - 6.500 Hematocrit Pulse Detection 20.6 % 40.000 - 54.000 MCV Calculated 58.4 fl 76.000 - 96.000 MCH Calculated 13.3 pg 27.000 - 32.000 MCHC Calculated 22.8 g/dl 32.000 - 35.000 RDW Calculated 22.5 % 11.500 - 14.500 WBC Count Flowcytometry 8.19 10^3/ul 4.000 - 11.000 Platelet Count Impedence 414 10^3/ul 150.000 - 450.000 MPV Impedence 9.6 fl 4.000 - 11.000 Diff. WBC Count Neutrophils