In [29]:
import csv
import io
import os
import time
import uuid # <-- NEW: Import uuid
from langchain_chroma import Chroma
from langchain_ollama.embeddings import OllamaEmbeddings
from langchain_docling import DoclingLoader
from langchain_docling.loader import ExportType
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter

In [30]:
# 1. Define your data
CSV_DATA = """Domain,Service,PDF_URL
Analytics,sagemaker,https://docs.aws.amazon.com/pdfs/next-generation-sagemaker/latest/userguide/next-generation-sagemaker-ug.pdf
"""

In [34]:
# 2. Define constants
CHROMA_DB_PATH = "./chroma_db_AWSDocs"
EMBEDDING_MODEL_NAME = "nomic-embed-text" 
BATCH_SIZE = 50  
CHUNK_SIZE = 1000 
CHUNK_OVERLAP = 200

In [35]:
def parse_csv_data(csv_data):
    """Parses the in-memory CSV string into a list of dictionaries."""
    service_docs = []
    f = io.StringIO(csv_data)
    reader = csv.DictReader(f)
    for row in reader:
        service_docs.append(row)
    return service_docs

In [36]:
print("Initializing Ollama embeddings...")
try:
    embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL_NAME)
    embeddings.embed_query("Test embedding")
except Exception as e:
    print(f"Error connecting to Ollama. Is it running?")

2025-11-11 19:56:01,864 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


Initializing Ollama embeddings...


In [37]:
vector_store = Chroma(
    collection_name="AWSDocs",
    persist_directory=CHROMA_DB_PATH,
    embedding_function=embeddings
)

In [38]:
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]
markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on)

In [39]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP
)

In [40]:
loader = DoclingLoader(
    file_path='https://docs.aws.amazon.com/pdfs/next-generation-sagemaker/latest/userguide/next-generation-sagemaker-ug.pdf',
    export_type=ExportType.MARKDOWN,
)

In [41]:
docs_as_markdown = loader.load()

2025-11-11 19:56:11,972 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-11 19:56:12,053 - INFO - Going to convert document batch...
2025-11-11 19:56:12,054 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 44ae89a68fc272bc7889292e9b5a1bad
2025-11-11 19:56:12,059 - INFO - Auto OCR model selected ocrmac.
2025-11-11 19:56:12,062 - INFO - Accelerator device: 'mps'
2025-11-11 19:56:17,029 - INFO - Accelerator device: 'mps'
2025-11-11 19:56:18,364 - INFO - Processing document next-generation-sagemaker-ug.pdf
2025-11-11 19:56:42,120 - INFO - Finished converting document next-generation-sagemaker-ug.pdf in 30.61 sec.


In [42]:
docs_as_markdown



In [43]:
markdown_content = docs_as_markdown[0].page_content

In [44]:
semantic_chunks = markdown_splitter.split_text(markdown_content)


In [45]:
for chunk in semantic_chunks:
    chunk.metadata["domain"] = 'domain'
    chunk.metadata["service"] = 'service_name'
    chunk.metadata["source"] = 'url'
final_chunks = text_splitter.split_documents(semantic_chunks)
if final_chunks:
    print(f"Found {len(final_chunks)} final chunks to add.")    

Found 195 final chunks to add.


In [46]:
for chunk in final_chunks:
    print(chunk.page_content)
    print(chunk.metadata)
    print("-"*100)

aws
{'domain': 'domain', 'service': 'service_name', 'source': 'url'}
----------------------------------------------------------------------------------------------------
C  
&lt;/&gt;
{'Header 2': 'Amazon SageMaker', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}
----------------------------------------------------------------------------------------------------
Copyright © 2025 Amazon Web Services, Inc. and/or its affiliates. All rights reserved.  
Amazon's trademarks and trade dress may not be used in connection with any product or service that is not Amazon's, in any manner that is likely to cause confusion among customers, or in any manner that disparages or discredits Amazon. All other trademarks not owned by Amazon are the property of their respective owners, who may or may not be affiliated with, connected to, or sponsored by Amazon.
{'Header 2': 'Amazon SageMaker: User Guide', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}
---------------------

In [47]:
process_docs = final_chunks[20:]

In [53]:
chunk_ids = [str(uuid.uuid4()) for _ in final_chunks]

In [49]:
final_chunks = process_docs

In [51]:
final_chunks = final_chunks[20:]

In [52]:
final_chunks

[Document(metadata={'Header 2': 'Generative AI application development', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content="Access Amazon Bedrock's capabilities through SageMaker Unified Studio to quickly build and customize your generative AI applications. This intuitive interface lets you work with highperforming foundation models (FMs) from leading companies like Anthropic, Mistral, Meta, and Amazon, and use advanced features like Amazon Bedrock Knowledge Bases, Amazon Bedrock Guardrails, Amazon Bedrock Agents, and Amazon Bedrock Flows. You can develop generative AI applications faster within SageMaker Unified Studio's secure environment, ensuring alignment with your requirements and responsible AI guidelines."),
 Document(metadata={'Header 2': 'Note', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content="When should I use Bedrock in SageMaker Unified Studio versus the standalone Amazon Bedrock service?  \nAmazon Bedrock's capabil

In [54]:
for i in range(0, len(final_chunks)):
    batch_docs = final_chunks[i:i + 1]
    batch_ids = chunk_ids[i:i + 1]
    
    batch_num = (i) + 1
    total_batches = len(final_chunks) + 1

    print(f"  Adding batch {batch_num}/{total_batches}...")
    
    print(batch_docs)
    # 3. Pass both documents and ids to the vector store
    vector_store.add_documents(
        documents=batch_docs,
        ids=batch_ids
    )
    
    time.sleep(2)

  Adding batch 1/156...
[Document(metadata={'Header 2': 'Generative AI application development', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content="Access Amazon Bedrock's capabilities through SageMaker Unified Studio to quickly build and customize your generative AI applications. This intuitive interface lets you work with highperforming foundation models (FMs) from leading companies like Anthropic, Mistral, Meta, and Amazon, and use advanced features like Amazon Bedrock Knowledge Bases, Amazon Bedrock Guardrails, Amazon Bedrock Agents, and Amazon Bedrock Flows. You can develop generative AI applications faster within SageMaker Unified Studio's secure environment, ensuring alignment with your requirements and responsible AI guidelines.")]


2025-11-11 20:02:52,608 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
2025-11-11 20:02:54,901 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 2/156...
[Document(metadata={'Header 2': 'Note', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content="When should I use Bedrock in SageMaker Unified Studio versus the standalone Amazon Bedrock service?  \nAmazon Bedrock's capabilities in Amazon SageMaker Unified Studio are ideal for enterprise teams who need a governed low-code/no-code environment for collaboratively building and deploying generative AI applications, alongside unified analytics and machine learning capabilities.  \nCustomers can use the standalone Bedrock service from the AWS Management Console or Bedrock API when they want to leverage the full feature set of Bedrock including the latest agents, flow and guardrail enhancements, and the Bedrock SDK.")]


2025-11-11 20:02:56,976 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 3/156...
[Document(metadata={'Header 2': 'Get started with Amazon SageMaker', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='You can view demos of Amazon SageMaker and get started by setting up a domain and project.')]


2025-11-11 20:02:59,061 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 4/156...
[Document(metadata={'Header 2': 'View demos of Amazon SageMaker', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='To see Amazon SageMaker before using it yourself, you can review the following clickthrough demos:  \n- For an end-to-end demo, see the Amazon SageMaker detailed clickthrough experience. This demo includes SageMaker Lakehouse, Amazon SageMaker Catalog, and more in Amazon SageMaker Unified Studio.\n- For a demo of SageMaker Lakehouse, see Amazon SageMaker: Access data in your lakehouse. This demo includes SageMaker Lakehouse in Amazon SageMaker Unified Studio, including adding a data source and querying data.\n- For a demo of the Amazon SageMaker Catalog, see Amazon SageMaker: Catalog. This demo includes Amazon SageMaker Catalog in Amazon SageMaker Unified Studio, including browsing assets and subscribing to an asset.\n- For a demo of generative AI, see Amazon SageMaker: Generative AI playground and Gen AI app developmen

2025-11-11 20:03:01,130 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 5/156...
[Document(metadata={'Header 2': 'Get started with setting up Amazon SageMaker', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='To get started using Amazon SageMaker, go to Setting up Amazon SageMaker in this guide to set up a domain and create a project. This domain setup and project creation is a prerequisite for all other tasks in Amazon SageMaker.')]


2025-11-11 20:03:03,198 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 6/156...
[Document(metadata={'Header 2': 'Prerequisites for Amazon SageMaker', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='Complete the following prerequisite tasks before you can set up Amazon SageMaker and proceed with the use cases in this guide.')]


2025-11-11 20:03:05,269 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 7/156...
[Document(metadata={'Header 2': 'Topics', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='- Sign up for an AWS account\n- Create a user with administrative access')]


2025-11-11 20:03:07,341 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 8/156...
[Document(metadata={'Header 2': 'Sign up for an AWS account', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='If you do not have an AWS account, complete the following steps to create one.')]


2025-11-11 20:03:09,455 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 9/156...
[Document(metadata={'Header 2': 'To sign up for an AWS account', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='1. Open https://portal.aws.amazon.com/billing/signup.\n2. Follow the online instructions.  \nPart of the sign-up procedure involves receiving a phone call or text message and entering a verification code on the phone keypad.  \nWhen you sign up for an AWS account, an AWS account root user is created. The root user has access to all AWS services and resources in the account. As a security best practice, assign administrative access to a user, and use only the root user to perform tasks that require root user access.  \nAWS sends you a confirmation email after the sign-up process is complete. At any time, you can view your current account activity and manage your account by going to https://aws.amazon.com/ and choosing My Account .')]


2025-11-11 20:03:11,665 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 10/156...
[Document(metadata={'Header 2': 'Create a user with administrative access', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content="After you sign up for an AWS account, secure your AWS account root user, enable AWS IAM Identity Center, and create an administrative user so that you don't use the root user for everyday tasks.")]
  Adding batch 11/156...
[Document(metadata={'Header 2': 'Secure your AWS account root user', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='1. Sign in to the AWS Management Console as the account owner by choosing Root user and entering your AWS account email address. On the next page, enter your password.  \nFor help signing in by using root user, see Signing in as the root user in the AWS Sign-In User Guide .  \n2. Turn on multi-factor authentication (MFA) for your root user.  \nFor instructions, see Enable a virtual MFA device for your AWS account root user (console) in the IAM U

2025-11-11 20:03:13,919 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
2025-11-11 20:03:16,027 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 12/156...
[Document(metadata={'Header 2': 'Create a user with administrative access', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='1. Enable IAM Identity Center.\n2. For instructions, see Enabling AWS IAM Identity Center in the AWS IAM Identity Center User Guide .\n2. In IAM Identity Center, grant administrative access to a user.\n4. For a tutorial about using the IAM Identity Center directory as your identity source, see Configure user access with the default IAM Identity Center directory in the AWS IAM Identity Center User Guide .')]


2025-11-11 20:03:18,113 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 13/156...
[Document(metadata={'Header 2': 'Sign in as the user with administrative access', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='- To sign in with your IAM Identity Center user, use the sign-in URL that was sent to your email address when you created the IAM Identity Center user.\n- For help signing in using an IAM Identity Center user, see Signing in to the AWS access portal in the AWS Sign-In User Guide .')]


2025-11-11 20:03:20,181 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 14/156...
[Document(metadata={'Header 2': 'Assign access to additional users', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='1. In IAM Identity Center, create a permission set that follows the best practice of applying leastprivilege permissions.\n2. For instructions, see  Create a permission set in the AWS IAM Identity Center User Guide .  \n2. Assign users to a group, and then assign single sign-on access to the group.  \nFor instructions, see  Add groups in the AWS IAM Identity Center User Guide .')]


2025-11-11 20:03:22,361 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 15/156...
[Document(metadata={'Header 2': 'Setting up Amazon SageMaker', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='Complete the following tasks to set up Amazon SageMaker.')]


2025-11-11 20:03:24,484 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 16/156...
[Document(metadata={'Header 2': 'Topics', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='- Step 1 - Create an Amazon SageMaker unified domain\n- Step 2 - Create a new project')]


2025-11-11 20:03:26,559 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 17/156...
[Document(metadata={'Header 2': 'Step 1 - Create an Amazon SageMaker unified domain', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='Complete the following procedure to create an Amazon SageMaker unified domain with the Quick setup option.')]


2025-11-11 20:03:28,653 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 18/156...
[Document(metadata={'Header 2': 'Important', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content="Note that there is an additional charge for any VPC or resources that AWS sets up if you chose the Quick setup option for domain creation.  \n1. Navigate to the Amazon SageMaker management console at https:// console.aws.amazon.com/datazone and use the region selector in the top navigation bar to choose the appropriate AWS Region.\n2. Choose Create a Unified Studio domain and then choose Quick setup .  \nWith this option, you're choosing to create an Amazon SageMaker unified domain and you're letting Amazon SageMaker configure your domain with the following default capabilities that you can customize later:  \n- Data analytics, machine learning, SQL, and generative AI\n- Data and AI governance\n- Generative AI app development using Amazon Bedrock serverless models\n- Amazon Q - Free tier\n- Authentication via AWS IAM or AWS IAM Identity Ce

2025-11-11 20:03:30,759 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 19/156...
[Document(metadata={'Header 2': 'Important', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='- Data and AI governance\n- Generative AI app development using Amazon Bedrock serverless models\n- Amazon Q - Free tier\n- Authentication via AWS IAM or AWS IAM Identity Center  \n3. If you see the following note No VPC has been specifically set up for use with Amazon SageMaker Unified Studio , you can use the Choose VPC or Create VPC buttons to Create a new VPC (recommended) or choose an existing properly-configured VPC.  \nIf you plan to choose your own VPC, Amazon SageMaker Unified Studio enables you to choose VPCs within the same account as well as shared VPCs from other member accounts of the AWS organization. For more information, see Share your VPC subnets with other accounts.')]


2025-11-11 20:03:32,849 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 20/156...
[Document(metadata={'Header 2': 'Note', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content="If you choose to create a new VPC, note that the VPC template with which it is created is not intended for production use. You can use this template as a start and modify it for your organization's purposes.  \n4. If you see the following note No models accessible , you can use the Grant model access button to grant access to Amazon Bedrock serverless models for use in Amazon SageMaker.\n5. Expand the Quick setup settings section and review the specified configurations for the domain. Leave these defaults and then choose Continue to proceed with creating your domain.\n3. Note For more information, see IAM roles for Amazon SageMaker Unified Studio.")]


2025-11-11 20:03:34,934 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 21/156...
[Document(metadata={'Header 2': 'Note', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='3. Note For more information, see IAM roles for Amazon SageMaker Unified Studio.\n6. On the Create IAM Identity Center user page, create a new or select an existing SSO user that you want to enable to log in to Amazon SageMaker Unified Studio. This is done because IAM roles that are used to create Amazon SageMaker unified domains cannot log in to Amazon SageMaker Unified Studio. The SSO user specified here is used as the administrator in Amazon SageMaker Unified Studio.\n7. Choose Create domain .  \nAfter some time, an email will be sent to the address you provided as part of the IAM Identity Center user setup. The email will prompt you to set a password that you can use to access the domain.')]


2025-11-11 20:03:37,016 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 22/156...
[Document(metadata={'Header 2': 'Step 2 - Create a new project', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='In Amazon SageMaker, projects enable a group of users to collaborate on various business use cases. Within projects, you can manage data assets in the Amazon SageMaker catalog, perform data analysis, organize workflows, develop machine learning models, build generative AI apps, and more.')]


2025-11-11 20:03:39,107 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 23/156...
[Document(metadata={'Header 2': 'Navigate to Amazon SageMaker Unified Studio', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='To begin creating a project, navigate to Amazon SageMaker Unified Studio. You can do this by using the link in your email that you used to set an IAM Identity Center password, or by selecting the domain in the Amazon SageMaker management console and choosing Open unified studio .  \nSign in using your SSO credentials that you configured using the email from IAM Identity Center.  \nIf your IAM Identity Center is configured to require multi-factor authentication (MFA), set up and use an MFA device. Follow the instructions on the screen to register or use an MFA device as needed, or contact your admin for support. For more information about configuring MFA device enforcement, see Configure MFA device enforcement in the IAM Identity Center User Guide.')]
  Adding batch 24/156...
[Document(metadata={'Header 2':

2025-11-11 20:03:41,438 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
2025-11-11 20:03:43,519 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 25/156...
[Document(metadata={'Header 2': 'Review parameters', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='On the next page of project creation, you can review and optionally edit the names and values for different resources that are created when the project is created. You can leave all the defaults and then choose Continue .')]


2025-11-11 20:03:45,600 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 26/156...
[Document(metadata={'Header 2': 'Review', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='Use the last page of project creation to review the configurations you have selected. When everything is configured as desired on the project creation review page, choose Create project .  \nYou are then redirected to the project home page. The project will start building and a progress bar will appear with the status.')]


2025-11-11 20:03:47,672 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 27/156...
[Document(metadata={'Header 2': 'Get started with uploading and querying data in Amazon SageMaker Unified Studio', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='You can use the query editor to perform analysis using SQL. The query editor tool provides a place to write and run queries, view results, and share your work with your team.')]


2025-11-11 20:03:49,744 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 28/156...
[Document(metadata={'Header 2': 'Prerequisites', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='Before you get started with the query editor, access Amazon SageMaker and create a project with the SQL analytics or All Capabilities project profile. For more information, see Setting up Amazon SageMaker .  \nDownload the file sales-data.zip.')]


2025-11-11 20:03:51,849 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 29/156...
[Document(metadata={'Header 2': 'Query sample data using Amazon Athena in Amazon SageMaker', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='1. Navigate to Amazon SageMaker Unified Studio using the URL from the Amazon SageMaker management console and log in using your SSO or AWS credentials.\n2. Use the top center menu of the Amazon SageMaker home page to navigate to the project you want to use to query data.\n3. Expand the Build menu in the top navigation bar, then choose Query editor .\n4. In the left data explorer navigation, choose the three-dot action menu next to a database and choose Create table .\n5. Upload the sales-data.csv file from the prerequisites section.\n6. Choose Next .\n7. Choose Create table .\n8. Refresh the Data explorer navigation pane and navigate to the sales-data table in the explorer.\n9. Choose the three-dot action menu next to the table, then choose Preview data . A SQL command to select the first 10 

2025-11-11 20:03:53,963 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 30/156...
[Document(metadata={'Header 2': 'Get started with importing and querying data sets for AWS Glue Data Catalog and Amazon S3 in Amazon SageMaker Unified Studio', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='In this Getting Started tutorial for the next generation of Amazon SageMaker, you will use Amazon SageMaker Unified Studio, Amazon SageMaker Catalog, and Amazon SageMaker Lakehouse to import and query data sets. You will learn how to access and leverage your existing AWS Glue Data Catalog resources within Amazon SageMaker Unified Studio, allowing you to query and analyze your data without moving or duplicating it.  \nYou will need to have administrator access to a domain or create a domain.  \nA summary of the tasks in this getting started are as follows.  \n- Prerequisites and permissions setup\n- Setting up AWS Glue Data Catalog resources\n- Configuring S3 access and data import\n- Multiple query options (Spark and Athena) a

2025-11-11 20:03:56,054 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 31/156...
[Document(metadata={'Header 2': 'Get started with importing and querying data sets for AWS Glue Data Catalog and Amazon S3 in Amazon SageMaker Unified Studio', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='- Use Spark in Jupyter notebooks\n- Use Athena in the query editor\n- Create and modify tables using SQL\n- Visualize results using charts  \nThis getting started uses a .parquet file as sample S3 Raw file data to import that you can retrieve from the public bucket. There are other formats of data you can import into Lake Formation tables for AWS Glue Data Catalog, such as RDS tables, DynamoDB tables, or RedShift tables.')]


2025-11-11 20:03:58,178 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 32/156...
[Document(metadata={'Header 2': 'Topics', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='- Prerequisites\n- Step 1: Connect to an AWS Glue Data Catalog\n- Step 2: Get started with importing S3 data\n- Step 3: Get started with the query editor')]


2025-11-11 20:04:00,307 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 33/156...
[Document(metadata={'Header 2': 'Prerequisites', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='The following prerequisities are required for this getting started procedure.  \n- Create a project with an All capabilities project profile. This project profile sets up your project with access to S3 and Athena resources. There is more information about how to create a new project in the topic Setting up Amazon SageMaker AI.\n- A project role is created automatically when the project is created in SageMaker Unified Studio. You will make a note of the project role as detailed in the prerequisities below.\n- You can either use an existing AWS Glue database or create a new one. The Glue database must be Lake Formation managed.\n- You can either use an existing AWS Glue table or create a new one. The Glue table must be Lake Formation managed.\n- You also set up the Data lake administrator and revoke specified permissions.  \nSubsequent s

2025-11-11 20:04:02,405 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 34/156...
[Document(metadata={'Header 2': 'To set up the Lake Formation Data Lake administrator', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='You must set up a user or role as the Lake Formation Data Lake administrator for your catalog data. This administrator grants access to data-lake resources.  \n1. In Create a data lake administrator in the AWS Lake Formation Developer Guide , follow the instructions to add the AWSLakeFormationDataAdmin managed policy to the user in IAM.\n2. After you add the IAM permission, follow the steps in Create a data lake administrator to add the inline policy granting permission to create the service-linked role.')]


2025-11-11 20:04:04,499 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 35/156...
[Document(metadata={'Header 2': 'To add the Lake Formation Data Lake administrator in the Lake Formation console', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='After updating the policies in the previous step for the user or role you want to make the Data lake administrator, use the Lake Formation console to add that user or role to the list under Data lake administrators. Use the following steps to add the Data lake administrator on the console.  \n1. Open the AWS Lake Formation console.\n2. Under Administration , choose Administrative roles and tasks .\n3. Under Data lake administrators , choose Add .  \n4. For Access type , choose Data lake administrator .\n5. For IAM users and roles , choose the user or role that you want to make the Data lake administrator. Make sure it is the same user or role for which you updated the IAM permissions in the Prerequisites.\n6. Choose Confirm .')]


2025-11-11 20:04:06,580 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 36/156...
[Document(metadata={'Header 2': 'Revoke the IAMAllowedPrincipals group permission', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='You must revoke the IAMAllowedPrincipals group permission on both database and table to enforce AWS Lake Formation permission for access. For more information, see Revoking permission using the AWS Lake Formation console in the AWS Lake Formation Developer Guide .')]


2025-11-11 20:04:08,666 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 37/156...
[Document(metadata={'Header 2': 'Note', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='For the purposes of this topic, revoke the group permission as provided. This makes it so Lake Formation is the central point for managing fine-grained access control to your data lake resources. You can also use hybrid permissions in Lake Formation. For more information about hybrid permissions, see Hybrid access mode in the AWS Lake Formation Developer Guide .  \n1. Open the AWS Lake Formation console.\n2. Under Permissions , choose Data permissions .\n3. Choose the selector next to the IAMAllowedPrincipals group designated for Database .\n4. Choose Revoke .')]


2025-11-11 20:04:10,738 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 38/156...
[Document(metadata={'Header 2': 'Step 1: Connect to an AWS Glue Data Catalog', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='Complete the steps in this section to set up your resources and permissions for accessing AWS Glue Data Catalog and preparing to import data.')]


2025-11-11 20:04:12,838 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 39/156...
[Document(metadata={'Header 2': 'Make a note of your IAM project role', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='In the following sections of this topic, you will configure permissions using the project role in IAM that was created when you created your SageMaker Unified Studio project. The project role is an IAM role that is created and associated with a new project. This role grants the necessary  \npermissions for users working on the project to use AWS resources, such as Amazon S3, for instance. You will attach a resource-based bucket policy and configure permissions in the lakehouse. Use the following steps to make a note of the IAM project role for your SageMaker Unified Studio project. You will use the role in a procedure that follows when you configure and grant Lake Formation permissions.  \n1. Navigate to Amazon SageMaker Unified Studio using the URL from the Amazon SageMaker management console and log in using yo

2025-11-11 20:04:14,953 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 40/156...
[Document(metadata={'Header 2': 'Make a note of your IAM project role', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='2. Use the top center menu of the Amazon SageMaker home page to navigate to the project you want to use.\n3. Under the Overview , choose Project overview .\n4. Choose the Project details tab.\n5. Choose the project role that is associated with your Amazon SageMaker Unified Studio project. This role was created in IAM upon project creation and was copied in the steps above. In Project role ARN , copy the project role ARN.  \nThe Project IAM role will have the following format: arn:aws:iam::ACCOUNT\\_ID:role/ datazone\\_usr\\_role\\_xxxxxxxxxxxxxx\\_yyyyyyyyyyyyyy  \nDiscover v  \nOverview  \nProject overview  \nData  \nCompute  \nMembers  \nProject catalog  \nAssets  \nSubscription requests  \nData sources  \nMetadata entities  \nGovern v  \n› Project overview')]


2025-11-11 20:04:17,022 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 41/156...
[Document(metadata={'Header 2': 'Register the S3 location for AWS Glue Data Catalog tables in Amazon SageMaker Unified Studio', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='To access existing AWS Glue Data Catalog tables in Amazon SageMaker Unified Studio, complete the following steps to configure permissions.')]


2025-11-11 20:04:19,119 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 42/156...
[Document(metadata={'Header 2': 'To register the S3 location and configure access', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='1. Open the AWS Lake Formation console using the data lake administrator. Choose Data lake locations in the navigation pane, and then choose Register location .  \nBuild v  \nMy\\_Project\\_mbg  \nQ Search catalog  \n2. Enter the S3 prefix for Amazon S3 path. For this topic, you must register the following S3 location in order to allow it to be queried: s3://aws-bigdata-blog/ generated\\_synthetic\\_reviews/data/product\\_category=Video\\_Games .\n3. For IAM role , choose your Lake Formation data access IAM role, which is not a service linked role.\n4. Select Lake Formation for Permission mode , and then choose Register location .\n5. For Database permissions, choose Describe , and then choose Grant .')]


2025-11-11 20:04:21,224 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 43/156...
[Document(metadata={'Header 2': 'Grant permission on the databases to the project role', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='You will grant database access to the IAM role that is associated with your Amazon SageMaker Unified Studio project. This role is called the project role, and it was created in IAM upon project creation. To access existing AWS Glue Data Catalog databases in Amazon SageMaker Unified Studio, complete the following steps to configure permissions.  \n1. On the Lake Formation console, under Data Catalog in the navigation pane, choose Databases .\n2. Select the existing AWS Glue Data Catalog database.\n3. From the Actions menu, choose Grant to grant permissions to the project role.\n4. For IAM users and roles , choose the project role . This is the SageMaker Unified Studio project role that you noted previously in Make a note of your IAM project role.\n5. Select Named Data Catalog resources , and for C

2025-11-11 20:04:23,313 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 44/156...
[Document(metadata={'Header 2': 'Grant permission on the databases to the project role', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='5. Select Named Data Catalog resources , and for Catalogs , choose the default catalog or a catalog you want to use.\n6. For Databases , choose the default database or a database you want to use.\n7. For Database permissions , select Describe and choose Grant .  \nGranting these permissions provides the means to query the Lake Formation data in later steps.')]


2025-11-11 20:04:25,418 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 45/156...
[Document(metadata={'Header 2': 'Grant permission on the tables to the project role', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='You will grant table access to the IAM role that is associated with your Amazon SageMaker Unified Studio project. This role is called the project role, and it was created in IAM upon project creation. To grant permission on the tables to the project role, complete the following steps.  \n1. On the Lake Formation console, under Data Catalog in the navigation pane, choose Databases .\n2. Select the existing Data Catalog database.\n3. From the Actions menu, choose Grant to grant permissions to the project role.\n4. For IAM users and roles , choose the project role. This is the SageMaker Unified Studio project role that you noted previously in Make a note of your IAM project role.\n5. Select Named Data Catalog resources , and for Catalogs , choose the default catalog.\n6. For Databases , choose your Dat

2025-11-11 20:04:27,496 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 46/156...
[Document(metadata={'Header 2': 'Grant permission on the tables to the project role', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='7. For Tables , select the tables that you need to provide permission to the project role.\n8. For Table permissions , select Select and Describe .\n9. For Grantable permissions , choose Select and Describe .\n10. Choose Grant .')]


2025-11-11 20:04:29,575 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 47/156...
[Document(metadata={'Header 2': 'Important', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='You should revoke any existing permissions of IAMAllowedPrincipals on the databases and tables within Lake Formation as detailed in the prerequisites.')]


2025-11-11 20:04:31,667 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 48/156...
[Document(metadata={'Header 2': 'Create a new Lakehouse catalog', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='In your project, create a new Lakehouse catalog. If you plan to use the default catalog, you can skip these steps.')]


2025-11-11 20:04:33,764 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 49/156...
[Document(metadata={'Header 2': 'To create a Lakehouse catalog', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content="1. In the Amazon SageMaker Unified Studio, navigate to your project.\n2. On the project page, under Data , choose Lakehouse .\n3. Choose the + button.\n4. In the Add data section, choose Create Lakehouse catalog .\n5. Choose Next .\n6. In the Add catalog section, enter a name for your catalog.\n7. (Optional) Enter a description for the catalog.\n8. Choose Add catalog .  \nDiscover v  \nBuild v  \nGovern v  \nLakehouse  \nAdd data  \nThis resource will be shared between all project members.  \nAfter completing these steps, your database will appear under the catalog that you've created.  \nroject overview ata")]


2025-11-11 20:04:35,847 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 50/156...
[Document(metadata={'Header 2': 'Add data and create an AWS Glue table', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='ompute embers  \nAssets glue  \nor drag and drop  \nIn your project, create an AWS Glue table using sample data. To create a Glue table in Amazon SageMaker Unified Studio, complete the following steps.  \nTable type')]


2025-11-11 20:04:37,959 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 51/156...
[Document(metadata={'Header 2': 'To add data and create a Glue table', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='Metadata entities  \nRedshift  \n1. Access the public S3 bucket to download the sample data. Download the .parquet file named 97dbfb1466264fd993c8cf29ee3b2881\\_1.snappy.parquet to your local drive.\n2. In the Amazon SageMaker Unified Studio console, navigate to your project.\n3. On the project page, under Overview , choose Data . Choose Lakehouse .\n4. Next to your Glue database, choose the options menu (three dots), and choose Create table .\n5. Next, upload the file in .CSV, JSON, Parquet, or Delimiter formats. For this example, upload the Parquet file you downloaded from the public sample bucket.\n6. For Table type , External/S3 is selected by default as the type of source.\n7. Choose Add data . For Catalog name , choose the name from the drop-down menu.\n8. For Database , choose the database that you created 

2025-11-11 20:04:40,044 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 52/156...
[Document(metadata={'Header 2': 'To add data and create a Glue table', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='8. For Database , choose the database that you created in the Use or create a Glue database section from the drop-down menu.\n9. For Table name , enter a table name of your choice.\n10. For Data format , choose the data format from the drop-down menu. The format updates automatically when you upload a file.\n11. Choose Next . Allow a few minutes for the schema creation to display.  \nUpload file  \nDiscover v  \nBuild v  \nGovern v  \n› Data')]


2025-11-11 20:04:42,121 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 53/156...
[Document(metadata={'Header 2': '12. Choose Create table .', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='Project overview  \nData  \nLakehouse v • AwsDataCatalog  \nThe table appears under your database, such as in this example showing the new table myreviewtable added. Members Name myreviewtable Description Table created from uploading data in SageMaker Unified Last updated June 27, 2025, 11:13 (UTC07:00)  \nProject catalog  \nAssets  \nData sources')]


2025-11-11 20:04:44,193 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 54/156...
[Document(metadata={'Header 2': 'Verify access to your AWS Glue table from the Amazon SageMaker Unified Studio query editor', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='To verify that you can access the existing AWS Glue table from the Amazon SageMaker Unified Studio query editor, complete the following steps:')]


2025-11-11 20:04:46,290 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 55/156...
[Document(metadata={'Header 2': 'To verify that the Athena query can be accessed for the table', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='1. Navigate to Amazon SageMaker Unified Studio using the URL from the Amazon SageMaker management console and log in using your SSO or AWS credentials.\n2. Use the top center menu of the Amazon SageMaker home page to navigate to the project you want to use.\n3. On the project page, under Overview , choose Data , and then choose Lakehouse .  \nMy\\_Project\\_mb  \nQ Search catalog  \nAwsDataCatalog &gt; glue\\_db\\_c761kuvded0srr &gt; myreviewtable myreviewtable  \nActions v  \n4. Next to the new table, choose the options menu (three dots), and choose Query with Athena . You can also choose to preview the data.')]


2025-11-11 20:04:48,370 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 56/156...
[Document(metadata={'Header 2': 'Create or use an S3 bucket', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='In S3, create or have a bucket and note the bucket path, such as s3://amzn-s3-demo-bucket . You will upload your sample data to the existing bucket.')]


2025-11-11 20:04:50,450 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 57/156...
[Document(metadata={'Header 2': '(Optional) Use sample data in your existing S3 bucket', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='Configure your S3 data using an existing bucket and sample data to upload and import.  \nAlternately, you can use the public bucket with the sample data location and skip this step.')]


2025-11-11 20:04:52,541 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 58/156...
[Document(metadata={'Header 2': 'To upload sample S3 data', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='Use the S3 console to upload the sample .parquet file from your local drive to your S3 source bucket.  \n1. Sign in to Amazon Simple Storage Service.\n2. Navigate to the .parquet file that you downloaded from the public sample bucket.\n3. Navigate to your existing S3 bucket and choose Upload . Upload the file to your S3 bucket.\n4. Choose Save .')]


2025-11-11 20:04:54,631 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 59/156...
[Document(metadata={'Header 2': 'Edit your IAM project role and attach the S3 bucket policy', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='Configure your IAM role with a policy for S3 bucket permissions to allow the SageMaker project role to access your S3 source bucket. Use these steps to create and attach a resource-based bucket policy and configure permissions in Lakehouse.')]


2025-11-11 20:04:56,731 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 60/156...
[Document(metadata={'Header 2': 'To attach the S3 bucket policy to the project role', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='1. Using the account that is associated with the SageMaker domain, navigate to the IAM console, and choose Roles .\n2. Choose the project role that is associated with your Amazon SageMaker Unified Studio project. This role was created automatically when you created your project in Amazon SageMaker  \nUnified Studio. This is the project role that you made a note of previously in Make a note of your IAM project role. Open the project role, such as arn:aws:iam::ACCOUNT\\_ID:role/ &lt;datazone\\_usr\\_role\\_xxxxxxxxxxxxxx\\_yyyyyyyyyyyyyy&gt; .  \n3. Choose Add permissions , and then choose Create inline policy .')]


2025-11-11 20:04:58,828 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 61/156...
[Document(metadata={'Header 2': 'To attach the S3 bucket policy to the project role', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='3. Choose Add permissions , and then choose Create inline policy .\n4. Choose JSON , and then paste the following policy statement into the Policy editor. The following is an example bucket policy. Replace ACCOUNT\\_ID with the account ID where the domain resides, &lt;s3\\_bucket&gt; with the name of the S3 bucket that you intend to query in SageMaker Unified Studio, and &lt;datazone\\_usr\\_role\\_xxxxxxxxxxxxxx\\_yyyyyyyyyyyyyy&gt; with the project role in SageMaker Unified Studio. For the purposes of this topic, replace the amzns3-demo-bucket value with the bucket name for the sample data ( s3://aws-bigdatablog/generated\\_synthetic\\_reviews/* ).')]


2025-11-11 20:05:00,932 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 62/156...
[Document(metadata={'Header 2': 'Open a new notebook and start an Apache Spark session to import the data', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='Configure your SageMaker spark session to import and query the S3 data using a Jupyter notebook in the console. To access the data through the unified JupyterLab experience with a spark session, complete the following steps:  \n1. Sign in to your SageMaker project.\n2. Navigate to the Project overview page.\n3. Choose New , and then choose Notebook .\n4. Choose the default notebook titled Untitled.jpynb . Click the file name and type in the field to rename the file to mynotebok.jpynb .\n5. On the SageMaker Unified Studio project page, on the top menu, choose Build . Under IDE &amp; APPLICATIONS , choose JupyterLab .\n6. Wait for the space to be ready.\n7. Choose the plus sign and for Notebook , choose Python3 .\n8. In the notebook, switch the connection type to PySpark and choo

2025-11-11 20:05:03,203 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
2025-11-11 20:05:05,278 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 64/156...
[Document(metadata={'Header 2': 'Step 3: Get started with the query editor', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='You can use the query editor to perform analysis using SQL. The query editor tool provides a place to write and run queries, view results, and share your work with your team.')]


2025-11-11 20:05:07,372 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 65/156...
[Document(metadata={'Header 2': 'Prerequisites to access your project', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='Before you get started with the query editor, you must have access to Amazon SageMaker Unified Studio and create a project.  \nView  \nRun  \nKernel  \nGit  \nTabs  \nSettings  \nHelp  \n• mynotebook.ipynb  \n+  \n•  \n»  \nCode  \nE  \ngit  \nNotebook C  \nPython 3 (ipykernel) C  \nUse the following command in a cell to load the S3 source with the %%pyspark cell magic. This imports the S3 data. Make sure the second line is indented as shown.  \nspark = SparkSession.builder •getorCreate()  \n+  \n- Navigate to Amazon SageMaker Unified Studio using the URL from your admin and log in using your SSO or configure credentials with IAM Identity Center.')]


2025-11-11 20:05:09,462 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 66/156...
[Document(metadata={'Header 2': 'Query AWS Glue sample data using Amazon Athena in Amazon SageMaker Unified Studio', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='After you create a project, you can use the query editor to write and run queries. Use the following steps to create a table using a SQL query with Athena, query the table, and visualize the results.  \n1. In the Amazon SageMaker Unified Studio, navigate to your project.\n2. On the project page, under Overview , choose Data .\n3. Choose Lakehouse , Expand AwsDataCatalog , and then choose the three-dot action menu next to your database.\n4. Choose Query with Athena .\n5. Copy and paste the following SQL query into the editor. The following query will create a table synthetic\\_reviews\\_video\\_games and query it.  \n```')]


2025-11-11 20:05:11,596 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 67/156...
[Document(metadata={'Header 2': 'Query AWS Glue sample data using Amazon Athena in Amazon SageMaker Unified Studio', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content="4. Choose Query with Athena .\n5. Copy and paste the following SQL query into the editor. The following query will create a table synthetic\\_reviews\\_video\\_games and query it.  \n```\nCREATE EXTERNAL TABLE `synthetic_reviews_video_games`( `marketplace` string, `customer_id` string, `review_id` string, `product_id` string, `product_parent` string, `product_title` string, `star_rating` int, `helpful_votes` int, `total_votes` int, `vine` string, `verified_purchase` string, `review_headline` string, `review_body` string, `review_date` bigint, `year` int ) ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat' OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.parquet.M

2025-11-11 20:05:13,703 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 68/156...
[Document(metadata={'Header 2': 'Note ~ · AwsDataCatalog', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='v @ glue\\_db\\_c761kuvdedOsrr  \n- E table (5)  \nec review id  \n123 star\\_rating ma helpful\\_votes  \nma total\\_votes ec insight  \n• Untitled 6  \nActions v  \n( Untitled 5  \nDraft  \nLimit 100  \nCREATE EXTERNAL TABLE "synthetic\\_reviews\\_video\\_games" (  \n"marketplace" string,  \nFor simplicity, in this topic, these steps create a table under a specific partition folder instead of creating a top level table that includes all the partition folders. As a gene3ral recommendation, create tables at the top level. string string  \nbigint bigint  \nbigint string  \n12  \n13  \n14  \n15  \n16  \n17  \n"verified\\_purchase" string,  \n"review\\_headline"  \nstring,  \n"review body"  \nstring,  \n\'review\\_date bigint,  \n"year" int  \n) ROW FORMAT SERDE  \n\'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe')]

2025-11-11 20:05:15,812 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 69/156...
[Document(metadata={'Header 2': 'Note ~ · AwsDataCatalog', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='"review\\_headline"  \nstring,  \n"review body"  \nstring,  \n\'review\\_date bigint,  \n"year" int  \n) ROW FORMAT SERDE  \n\'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe  \nThe SQL query creates an external table named "synthetic\\_reviews\\_video\\_games" that maps to Amazon product review data stored in Parquet format. The table defines columns for marketplace, customer information, product details, ratings, and review content. * review\\_date timestamp  \n6. Choose the Run cell icon. Rac department  \nsac customer\\_id string  \nstring string  \n=  \n( Untitled 1  \nRun all  \nQuery executed successfully  \nAffected rows: 0  \nElapsed time: 577ms  \nWhen the query finishes running, a Result tab appears below the cell to display the outcome.  \n•c product\\_parent  \nstring  \n7. Refresh the Data explorer n

2025-11-11 20:05:17,937 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 70/156...
[Document(metadata={'Header 2': 'Note ~ · AwsDataCatalog', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='8. Choose Add SQL to add another cell to the querybook. Then enter the following script:  \n```\nWITH review_stats AS (\n```  \nM Untitled 4 x  \n( Untitled 2  \n+  \n#1  \n```\nSELECT product_title, ROUND(AVG(star_rating), 2) as avg_rating, COUNT(*) as review_count, COUNT(CASE WHEN star_rating >= 4 then 1 END) as positive_reviews, COUNT(CASE WHEN star_rating <= 2 then 1 END) as negative_reviews FROM "awsdatacatalog"."glue_db_<database-ID>"."synthetic_reviews_video_games" GROUP BY product_title HAVING COUNT(*) >= 5 ) SELECT product_title, avg_rating, review_count, ROUND((positive_reviews * 100.0 / review_count), 1) as positive_percentage, ROUND((negative_reviews * 100.0 / review_count), 1) as negative_percentage FROM review_stats WHERE avg_rating >= 2.5 ORDER BY review_count DESC, avg_rating DESC LIMIT 10;\n```  \nThis query 

2025-11-11 20:05:20,049 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 71/156...
[Document(metadata={'Header 2': 'Note ~ · AwsDataCatalog', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='```  \nThis query completes the following tasks:  \n- Creates a CTE (Common Table Expression) to calculate review statistics\n- Calculates average ratings, total review count, and counts of positive/negative reviews per game\n- Filters for games with at least 5 reviews\n- Computes the percentage of positive and negative reviews\n- Shows only games with an average rating of 2.5 or higher\n- Orders results by review count and average rating\n- Returns the top 10 most reviewed, highly-rated games  \nThe results will show you the most popular well-rated games in your dataset, along with meaningful metrics about their review distribution.  \nHome  \n•  \n13  \n14  \n15  \n16  \nResult 1  \nproduct\\_title single accessory, with thumb ...  \nboard sweatband, with thumb ..  \nfastest webcam, with elbow s...  \nergonomic touch panel,

2025-11-11 20:05:22,161 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 72/156...
[Document(metadata={'Header 2': 'Note ~ · AwsDataCatalog', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content="ergonomic touch panel, set  \n1 - 10 of 10  \n&gt; Style  \nProjects &gt;  \nMy\\_Project\\_mbgugib9 &gt; Query Editor  \nLimit 100  \nproduct\\_title,  \nRun all  \n9. Choose the Run cell icon.  \nIn the Results tab, the first ten rows of the table you created are displayed.  \n10. In the Results tab, you can choose the Chart view icon. This opens up a chart view with a line graph as a default.  \n11. Set up the chart to display a pie chart. Choose Trace .  \n- a. For Type , choose Pie .\n- b. For Values , choose avg\\_rating .\n- c. For Labels , choose product\\_title .\n- d. Choose the download arrow to view the chart.  \nThis displays a pie chart so you can visualize results.  \navg\\_rating,  \nActions V  \nUntitled 2 ×  \nDraft  \n+  \nEi Edit  \nG  \n#1  \nUntitled 2 (2).png  \nAfter you've finished querying the data, 

2025-11-11 20:05:24,254 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 73/156...
[Document(metadata={'Header 2': 'Overview', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content="Amazon EMR Serverless provide a powerful way to process data at scale without managing infrastructure. In addition to Amazon EMR on EC2 clusters, you can create and delete EMR Serverless applications directly from SageMaker Unified Studio. EMR Serverless applications operate similarly to traditional notebooks, letting you run queries and code while actively observing the output simultaneously.  \nUnlike traditional notebooks, the contents of an EMR notebook run in a client and are executed by a kernel in your EMR Serverless Application. This means you don't need to configure a cluster to run applications, and helps you avoid over or under provisioning resources for your jobs. EMR Serverless is ideal for applications that need responses quickly, such as interactive data analysis.")]


2025-11-11 20:05:26,336 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 74/156...
[Document(metadata={'Header 2': 'Overview', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='This architecture allows you to use a single EMR Serverless application on multiple clusters and run clusters on demand as it fits your use case and needs. These are seperate from Spark applications For more general information about EMR Serverless Notebooks and Applications, see the EMR Management Guide.')]


2025-11-11 20:05:28,443 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 75/156...
[Document(metadata={'Header 2': 'Getting started with EMR serverless applications', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content="SageMaker Unified Studio provides a straightforward interface for creating EMR Serverless applications. In order to create a new EMR Serverless Application your admin needs to enable blueprints. For more information about the blueprint setup process see  Enable or disable blueprints in the Amazon Sagemaker Unified Studio Guide . Once blueprints are enabled:  \n1. From the SageMaker Unified Studio UI, navigate to the Project Management view and then select your project from the project list.\n2. Select Compute from the navigation bar, then select Data processing. Select the Add Compute button. You'll be prompted to connect to an existing compute resource or create new compute resources. From there, select EMR Serverless.\n3. On the Add Compute screen, you'll add your compute resource's name, descriptio

2025-11-11 20:05:30,538 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 76/156...
[Document(metadata={'Header 2': 'Getting started with EMR serverless applications', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='• Untitled.ipynb  \nLocal Python  \n• +  \nCode v project.python  \n- Compatibility mode. This permission mode allows your project to be compatible with data managed using full-table access, meaning the compute engine can access all rows and columns in the data. Choosing this option configures your compute to work with data assets from AWS and from external systems that you connect to from your project.  \n[]:  \n- Fine-grained mode. This option is for data managed using fine-grained access, meaning the compute engine can only access specific rows and columns from the full dataset. Choosing this option configures your compute to work with data asset subscriptions from Amazon SageMaker catalog.  \nYour EMR Serverless compute will now be listed in your Data processing list. From here, you can connect to

2025-11-11 20:05:32,626 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 77/156...
[Document(metadata={'Header 2': 'Connecting to an EMR Serverless compute', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='Once you have an EMR serverless compute added, you can connect to the compute directly from the Sagemaker Unified Studio notebook workspace.  \nTo connect to an EMR Serverless compute:  \n1. Above a code block in your Jupyter Notebook, there will be two drop down boxes. One lets you select your connection type, the other your compute.\n2. Select the connection type "PySpark" and then click on the drop down for your Compute. From here you can select your EMR Serverless compute from the second drop down box.\n3. Run the code in your code block. The first time you run code, it will connect to the compute and start a session for your connection. This means that you are connected to the serverless compute, and all codeblocks using this EMR compute this session will use this connection.')]


2025-11-11 20:05:34,739 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 78/156...
[Document(metadata={'Header 2': 'Connecting to an EMR Serverless compute', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='For first-time users, we recommend starting with the EMR example notebook provided (getting\\_started.ipynb), which demonstrates basic operations and best practices. You can access this notebook from the Examples tab in the Unified Studio file browser, pictured below:  \nV E O sit m  \nNotebook [3 Python 3 (ipykernel) O =  \nFile name  \n•  \nView more  \nActions V  \nNew  \nDue to the nature of EMR Serverless applications, you can have multiple computes available at once. This allows you to maintain your notebook code while connecting to different EMR applications as needed for various workloads. You can switch between applications without modifying your notebook code, allowing you to test different configurations or work with different data processing requirements.')]


2025-11-11 20:05:36,847 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 79/156...
[Document(metadata={'Header 2': '(Optional) Remove or stop an EMR application', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content="While using EMR serverless compute, you may need to stop using a particular compute either for a period of time or permanently. You can remove or stop EMR serverless computes in those cases. Stopping a compute lets you pause a compute until you want to reactiveate it Applications in this paused state can be reactivated whenver you want, with all definition remaining. You only incure storage costs for stopped applications  \nFor applications you don't intend to use again, you can delete, or remove them. This will permanently delete the application, and cannot be undone. To access the application again you will need to recreate it. Deleting an application removes all costs associated with it including storage cost.")]


2025-11-11 20:05:38,931 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 80/156...
[Document(metadata={'Header 2': '(Optional) Remove or stop an EMR application', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content="EMR Serverless computes can be removed from projects via the Data processing tab in your project view. Simply click the menu to the right of the compute's name and click Remove. Removed EMR Serverless computes are deleted. You can also manually stop an EMR Serverless compute by using the EMR Studio page on the AWS Console. For more information see Manage applications from the EMR Studio console in the EMR Serverless user guide.")]


2025-11-11 20:05:41,026 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 81/156...
[Document(metadata={'Header 2': 'Get started using Amazon Bedrock in SageMaker Unified Studio', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='Amazon Bedrock in SageMaker Unified Studio offers multiple playgrounds that allow you to easily access and experiment with Amazon Bedrock models. With the chat playground, you can chat with a model through text and image prompts. With the image and video playground, you can use a compatible model to generate and edit images and videos.  \nIn addition to the playgrounds, you can also use Amazon Bedrock in SageMaker Unified Studio to create chat agent apps and flows apps. A chat agent app allows users to create a custom app that interacts with a Amazon Bedrock model through a conversational interface. You can enhance chat agent apps with Amazon Bedrock features such as data sources and guardrails and share the app with other users. A flows app allows users to link together prompts, foundatio

2025-11-11 20:05:43,107 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 82/156...
[Document(metadata={'Header 2': 'Get started using Amazon Bedrock in SageMaker Unified Studio', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='The following section will walk you through the basic functionalities of Amazon Bedrock in SageMaker Unified Studio. First, you will select a model from the model catalog and chat with it in the chat playground. Then, you will create a chat agent app that can create playlists for a rock and pop radio station. For more in-depth information on other Amazon Bedrock features you can use with Amazon Bedrock in SageMaker Unified Studio, see Amazon Bedrock in SageMaker Unified Studio.')]


2025-11-11 20:05:45,206 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 83/156...
[Document(metadata={'Header 2': 'Step 1: Explore Amazon Bedrock foundation models', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content="The following section shows how to select a model from the model catalog in the Amazon Bedrock in SageMaker Unified Studio playground. You can also access the model catalog from inside your projects. The models you have access to in your projects might be different from those you can access in the playground, based on your administrator's settings. To check which models you can access in a project, open or create a project, and then select Models in the navigation pane to open the model catalog.")]


2025-11-11 20:05:47,311 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 84/156...
[Document(metadata={'Header 2': 'To open the model catalog in the playground', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content="1. Navigate to the Amazon SageMaker landing page by using the URL from your admininstrator.\n2. Access Amazon SageMaker using your IAM or single sign-on (SSO) credentials. For more information, see Access Amazon SageMaker Unified Studio.\n3. At the top of the page, choose Discover .  \n4. Under Data and model catalog , choose Amazon Bedrock models . This opens the model catalog in the Amazon Bedrock in SageMaker Unified Studio playground.\n5. (Optional) Choose Group by: Modality and select Provider to sort the list by model provider.\n6. Choose a model from the list of models that you have access to. For information about a model, choose View full model details in the information panel. If you don't have access to an appropriate model, contact your administrator. Some features may not be supported by all m

2025-11-11 20:05:49,401 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 85/156...
[Document(metadata={'Header 2': 'Step 2: Chat with a model in the chat playground', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='In this section you will chat with your selected model in the chat playground. You chat by sending a prompt to the model and receiving a response. For more information, see Experiment with the Amazon Bedrock playgrounds.')]


2025-11-11 20:05:51,469 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 86/156...


2025-11-11 20:05:53,570 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 87/156...
[Document(metadata={'Header 2': 'To chat with a model', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='1. In the chat playground, enter What is Avebury stone circle? in the Enter prompt text box.\n2. (Optional) If the model you chose is a reasoning model, you can choose Reason to have the model include its reasoning in the reponse. For more information, see Enhance model responses with model reasoning in the Amazon Bedrock user guide .\n3. Press Enter on your keyboard, or choose the run button, to send the prompt to the model. The response from the model will be generated in the playground.\n4. Continue chatting with the model by entering the prompt Is there a museum there? .  \nThe model will use the previous prompt as context for generating its response to this question.  \n5. (Optional) Compare the output from multiple models, or shared apps.\n- a. In the playground, turn on Compare mode . This will open two panes side-by-side

2025-11-11 20:05:55,667 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 88/156...
[Document(metadata={'Header 2': 'To chat with a model', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='- b. In each panes, select a model that you want to compare. If you want to use a shared app, select App in Type and then select the app in App .\n- c. Enter a prompt in the text box and run the prompt. The output from each model is shown in their respective panes. You can choose the copy icon to copy the prompt or model response to the clipboard.\n- d. (Optional) Choose Add chat window to add a third window. You can compare up to 3 models or apps.\n- e. Turn off Compare mode to stop comparing models.\n6. Choose Reset to start a new chat with the model.')]


2025-11-11 20:05:57,749 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 89/156...
[Document(metadata={'Header 2': 'Step 3: Create a chat agent app', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='In this section you will learn how to create a simple Amazon Bedrock in SageMaker Unified Studio chat agent app that creates playlists for a radio station and shares the dates and locations of upcoming shows.')]


2025-11-11 20:05:59,844 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 90/156...
[Document(metadata={'Header 2': 'To create an Amazon Bedrock chat agent app', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='1. On the Amazon SageMaker home page, choose Build chat agent app to create a new chat agent app. The Select or create a new project to continue dialog box opens.\n2. In the Select or create a new project to continue dialog box, do one of the following:\n- If you want to use a new project, follow the instructions at Step 2 - Create a new project. For the Project profile in step 1, choose Generative AI application development .\n- If you want to use an existing project, select the project that you want to use and then choose Continue .\n3. On the app creation page, an untitled app will automatically be created for you. In Untitled App - nnnn , enter Radio show as the name for your app.\n4. In the Configs pane, do the following:')]


2025-11-11 20:06:01,944 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 91/156...
[Document(metadata={'Header 2': 'To create an Amazon Bedrock chat agent app', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content="3. On the app creation page, an untitled app will automatically be created for you. In Untitled App - nnnn , enter Radio show as the name for your app.\n4. In the Configs pane, do the following:\n- a. For Model , select a model that supports Guardrails, Data, and Function components. The description of the model tells you the components that a model supports. For full information about the model, choose View full model details in the information panel. For more information, see Find serverless models with the model catalog. If you don't have access to an appropriate model, contact your administrator. Different models might not support all features.  \n- b. For Enter a system instruction in Instructions for chat agent &amp; examples , enter You are a chat agent app that creates 2 hour long playlists for a ra

2025-11-11 20:06:04,082 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 92/156...
[Document(metadata={'Header 2': 'To create an Amazon Bedrock chat agent app', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content="- c. In the UI section, update the user interface for the app by doing the following:\n- i. In Hint text for empty chat enter Hi! I'm your radio show playlist creator. .\n- ii. In Hint text for user input enter Enter a prompt that describes the playlist that you want. .\n- iii. In Quick start prompts choose Edit .\n- iv. Choose Reset to clear the list of quick start prompts\n- v. For Quick-start prompt 1 , enter Create a playlist of pop music songs. .\n- vi. (Optional). Enter quick start prompts of your choice in the remaining quick start prompt text boxes.\n- vii. Choose Back to configs .\n5. Choose Save to save the current working draft of your app.\n6. In the Quick start prompts section of the Preview pane, run the quick start prompt that you just created by choosing the prompt.  \nThe app shows the prom

2025-11-11 20:06:06,176 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 93/156...
[Document(metadata={'Header 2': 'To create an Amazon Bedrock chat agent app', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='The app shows the prompt and the response from the model in the Preview pane.  \n7. In the prompt text box (the text should read Enter a prompt that describes the playlist that you want ), enter Create a playlist of songs where each song on the list is related to the next song, by musician, bands, or other connections. Be sure to explain the connection from one song to the next. .\n8. Choose the run button (or press Enter on your keyboard) to send the prompt to the model.  \nYou have now created a basic chat agent app that can create playlists for a rock and pop radio station. You can experiment with sending prompts and receiving responses from your chat agent app.')]


2025-11-11 20:06:08,304 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 94/156...
[Document(metadata={'Header 2': 'Additional capabilities', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='Amazon Bedrock in SageMaker Unified Studio offers many additional capabilities to the ones covered in this walkthrough, including the following.  \n- You can customize and influence model behavior using inference parameters and system prompts. For more information, see What is a prompt?.\n- You can enhance your chat agent app by adding data sources and guardrails. For more information, see Build a chat agent app.\n- You can share your chat agent app with other users and use it as a component in a flows app. For more information, see Share a chat agent app and Deploy a chat agent app.\n- You can create a flows app to link together different components such as knowledge bases and reusable prompts. For more information, see Build a flow app.')]


2025-11-11 20:06:10,412 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 95/156...
[Document(metadata={'Header 2': 'Get started with Amazon S3 Tables in Amazon SageMaker Unified Studio', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content="Amazon SageMaker Unified Studio provides integrated support for S3 Tables, allowing you to create S3 table buckets and Apache Iceberg tables in those buckets.  \nAmazon S3 Tables provide S3 storage that's optimized for analytics workloads, with built-in Apache Iceberg support and features designed to continuously improve query performance and reduce storage costs for tables. Data in S3 Tables is stored in table buckets, which are specialized buckets for storing tabular data. For more information, see Working with Amazon S3 Tables and table buckets.  \nYou can begin working with S3 Tables directly by creating an S3 table bucket as a new data source within Amazon SageMaker Unified Studio.")]


2025-11-11 20:06:12,590 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 96/156...
[Document(metadata={'Header 2': 'Integrating S3 with AWS analytics services through Amazon SageMaker Unified Studio', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content="Amazon S3 table buckets integrate with AWS Glue Data Catalog and AWS Lake Formation to allow AWS analytics services to automatically discover and access your table data. For more information, see Integrating Amazon S3 Tables with AWS analytics services.  \nIf you've never used S3 Tables before in the current Region, you can allow Amazon SageMaker to enable the S3 Tables analytics integration when you create a new S3 Tables catalog in the Amazon SageMaker Unified Studio console.  \nWhen you allow Amazon SageMaker Unified Studio to perform the integration, Amazon SageMaker takes the following actions on your behalf in your account:  \n- Creates a new AWS AWS Identity and Access Management (IAM) service role that gives Lake Formation access to all your tables and table b

2025-11-11 20:06:14,735 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 97/156...
[Document(metadata={'Header 2': 'Integrating S3 with AWS analytics services through Amazon SageMaker Unified Studio', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='- Creates the S3tablescatalog in the AWS Glue Data Catalog in your current Region without privileged access.  \n- Adds the Amazon Redshift service role ( AWSServiceRoleForRedshift ) as a Lake Formation Read-only administrator. This allows Amazon Redshift to automatically mount all tables in S3 table buckets in the Region.\n- Note  \nIntegration will be performed in the current Region only.')]


2025-11-11 20:06:16,832 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 98/156...
[Document(metadata={'Header 2': 'Prerequisites', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='- Create a Amazon SageMaker domain and project. For more information, see Setting up Amazon SageMaker.')]


2025-11-11 20:06:18,950 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 99/156...
[Document(metadata={'Header 2': 'Creating S3 Tables catalogs in Amazon SageMaker Unified Studio', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='To get started using S3 Tables in Amazon SageMaker Unified Studio you create a new Lakehouse catalog with S3 table bucket source using the following steps.  \n1. Open the Amazon SageMaker at https://console.aws.amazon.com/sagemaker/ and use the Region selector in the top navigation bar to choose the appropriate AWS Region.\n2. Select your Amazon SageMaker domain.\n3. Select the project you want to create a table bucket in.\n4. In the navigation menu select Data , then select + to add a new data source.\n5. select Create Lakehouse catalog .\n6. In the add catalog menu, choose S3 Tables as the source.\n7. Enter a name for the catalog, and a database name.\n8. Choose Create catalog . This creates the following resources in your account:\n- a. A new S3 Table bucket and the corresponding AWS 

2025-11-11 20:06:21,086 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 100/156...
[Document(metadata={'Header 2': 'Creating S3 Tables catalogs in Amazon SageMaker Unified Studio', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='8. Choose Create catalog . This creates the following resources in your account:\n- a. A new S3 Table bucket and the corresponding AWS Glue child catalog under the parent catalog s3tablescatalog .\n- b. A new database within that AWS Glue child catalog. The database name will match the database name you provided. In S3 tables, this is the table namespace.  \n9. Begin creating tables in your database and querying them using query editor or Jupyter notebook.')]


2025-11-11 20:06:23,211 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 101/156...
[Document(metadata={'Header 2': 'Creating and Querying S3 Tables', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='After you add an S3 Tables catalog it can be queried as s3tablescatalog/ your-bucket-name . You can begin creating S3 tables in the catalog and querying them in Amazon SageMaker Unified Studio with the Query editor and Jupyterlab.')]


2025-11-11 20:06:25,292 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 102/156...
[Document(metadata={'Header 2': 'Note', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='You can only create S3 tables in Amazon SageMaker Unified Studio with Athena engine or Spark. Once created, you can query tables with Athena, Amazon Redshift, or Spark.')]


2025-11-11 20:06:27,396 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 103/156...
[Document(metadata={'Header 2': 'Using the Query Editor', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='1. Navigate to the project you created in the top center menu of the Amazon SageMaker Unified Studio home page.\n2. Expand the Build menu in the top navigation bar, then choose Query editor .\n3. Create a new querybook tab. A querybook is a kind of SQL notebook where you can draw from multiple engines to design and visualize data analytics solutions.\n4. Select a data source for your queries by using the menu in the upper-right corner of the querybook.\n- a. Under Connections , choose Lakehouse (Athena) to connect to your Lakehouse resources.\n- b. Under Catalogs , choose s3tablescatalog/{your-table-bucket}\n- c. Under Databases , choose the name of the database for your S3 tables.\n5. Select Choose to connect to the database and query engine.\n6. Enter SQL to create your first table, the following is an example SQL query:  \

2025-11-11 20:06:29,510 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 104/156...
[Document(metadata={'Header 2': 'Using the Query Editor', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content="```\nCREATE TABLE daily_sales ( sale_date date, product_category string, sales_price double )\n```  \n```\nPARTITIONED BY (month(sale_date))\n```  \n```\nTBLPROPERTIES ('table_type' = 'iceberg')\n```  \n- After you create the table you can browse to it in the Data explorer by choosing S3tablescatalog → your-bucket-name → example\\_database → example\\_table\n7. Insert data into a table with the following query.\n8. Select data from a table with the following query.  \n```\nINSERT INTO daily_sales VALUES (DATE '2024-01-15', 'Monitor', 900.00), (DATE '2024-01-14', 'Keyboard', 250.00), (DATE '2024-01-16', 'CPU', 1350.00) ;\n```  \n```\nWHERE sale_date BETWEEN DATE '2024-01-14' AND DATE '2024-01-16' ORDER BY\n```  \n```\nSELECT * FROM daily_sales sale_date;\n```  \nTo learn more about the query editor and see more SQL examples, s

2025-11-11 20:06:31,626 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 105/156...
[Document(metadata={'Header 2': 'Using JupyterLab', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='1. Navigate to the project you created in the top center menu of the Amazon SageMaker Unified Studio home page.\n2. Expand the Build menu in the top navigation bar, then choose JupyterLab .\n3. Create a new notebook.\n4. Select engine you want to use\n5. Select your table bucket and namespace as the data source for your queries:\n- a. For Spark engine, execute query USE s3tablescatalog\\_ example-table-bucket\n- b. For Athena or Amazon Redshift engine, use the following configure magic. For more information, see Configure compute resources in JupyterLab in the SageMaker AI Unified Studio User Guide .  \n```\n%%configure -n project.athena -f\n```  \n```\n{ "catalog_name": "s3tablescatalog/ examples-table-bucket ", "schema_name": " example-namespace " }\n```  \n6. Enter SQL queries into the notebook cell to create a table in the data

2025-11-11 20:06:33,739 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 106/156...
[Document(metadata={'Header 2': 'Important', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content="When using the Spark engine through a Spark connection, the S3TableFullAccess permission is required for table creation. For more information, refer to Considerations for enabling Lake Formation permissions in the AWS Glue Developer Guide .  \nThe following are examples of basic SQL queries you can use to start working with tables.  \nCreate a new table  \n```\nCREATE TABLE daily_sales ( sale_date date, product_category string, sales_price double ) PARTITIONED BY (month(sale_date)) TBLPROPERTIES ('table_type' = 'iceberg')\n```  \nAfter you create the table you can browse to it in the Data explorer by choosing S3tablescatalog → your-bucket-name → your-database-name → daily\\_sales Insert data into a table  \n```\nINSERT INTO daily_sales VALUES (DATE '2024-01-15', 'Monitor', 900.00), (DATE '2024-01-14', 'Keyboard', 250.00), (DATE '2024-01-1

2025-11-11 20:06:35,816 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 107/156...
[Document(metadata={'Header 2': 'Important', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content="```  \nSelect data from a table  \n```\nSELECT * FROM\n```  \n```\ndaily_sales\n```  \n```\nWHERE sale_date BETWEEN DATE '2024-01-14' AND DATE '2024-01-16' ORDER BY sale_date;\n```")]


2025-11-11 20:06:37,888 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 108/156...
[Document(metadata={'Header 2': 'Drop a table', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='DROP TABLE IF EXISTS sample\\_table ;')]


2025-11-11 20:06:39,986 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 109/156...
[Document(metadata={'Header 2': 'Get started with SageMaker Lakehouse integrated access controls for Athena federated queries in Amazon SageMaker Unified Studio', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='Scaling data infrastructure creates challenges with data silos, fragmented access controls, and complex connectivity requirements. Data analysts need to access information across multiple storage systems but are frequently hindered by:  \n- Complex connectivity setup - Configuring connections to various data sources requires technical expertise and access to configuration details that analysts may not have.\n- Fragmented governance - Different data sources have their own access control mechanisms, making consistent security policies difficult to implement.\n- Data duplication - Copying data between systems for analysis increases costs and creates data consistency risks.  \nTo address the challenges of data silos and fragme

2025-11-11 20:06:42,062 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 110/156...
[Document(metadata={'Header 2': 'Get started with SageMaker Lakehouse integrated access controls for Athena federated queries in Amazon SageMaker Unified Studio', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='- Streamlining the creation of connections to diverse data sources through a unified interface\n- Centralizing access control management through AWS Lake Formation\n- Enabling in-place querying through federated catalogs without data movement\n- Providing fine-grained permissions at the catalog, database, table, and column levels\n- Exploring data for ad hoc reporting and proof of concept before setting up new zero-ETL pipelines')]


2025-11-11 20:06:44,168 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 111/156...
[Document(metadata={'Header 2': 'Get started with SageMaker Lakehouse integrated access controls for Athena federated queries in Amazon SageMaker Unified Studio', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='- Providing fine-grained permissions at the catalog, database, table, and column levels\n- Exploring data for ad hoc reporting and proof of concept before setting up new zero-ETL pipelines  \nSageMaker Lakehouse provides a unified environment for accessing, discovering, preparing, and analyzing data from various sources for machine learning (ML) and analytics workloads. Athena complements this as a serverless query service that analyzes data lake and federated data sources such as Amazon DynamoDB and PostgreSQL, through using SQL without extract, transform, and load (ETL) scripts. Federated connections in SageMaker Lakehouse establish secure links to external data sources, enabling access without data movement. Federated c

2025-11-11 20:06:46,249 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 112/156...
[Document(metadata={'Header 2': 'Get started with SageMaker Lakehouse integrated access controls for Athena federated queries in Amazon SageMaker Unified Studio', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='SageMaker Lakehouse interface. Federated queries use these connections to run SQL statements across multiple data sources simultaneously, breaking down data silos for comprehensive analysis.')]


2025-11-11 20:06:48,355 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 113/156...
[Document(metadata={'Header 2': "What you'll learn", 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='This guide shows you how to use SageMaker Lakehouse with integrated access controls for Athena federated queries. In this guide, you create an environment where data analysts can discover and query data across sources while administrators maintain consistent governance and appropriate security controls. This guide includes the following steps:  \n1. Set up federated connections between SageMaker Lakehouse and DynamoDB.\n- Create connections that serve as bridges between your SageMaker Lakehouse and external data sources.\n- Enable seamless data access while maintaining security boundaries.\n- Learn how connections eliminate the need for data movement or duplication.\n2. Create federated catalogs for data discovery.\n- Establish catalogs that contains metadata and views about tables from your connected data sources.\n- Access data 

2025-11-11 20:06:50,446 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 114/156...
[Document(metadata={'Header 2': "What you'll learn", 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='- Access data from the connected data source within your SageMaker Lakehouse environment.\n- Make external tables queryable through the Lakehouse interface.\n- Use catalogs as directories of available data assets to simplify discovery and access.\n3. Implement column-level security using AWS Lake Formation\n- Configure fine-grained permissions for sensitive data.\n- Apply data access controls based on user roles and responsibilities.\n- Ensure consistent security policies across all data sources.\n4. Validate security controls through Athena queries\n- Test access permissions with different user personas.\n- Verify that you properly protect sensitive data.\n- Confirm that authorized users can access appropriate data.')]


2025-11-11 20:06:52,533 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 115/156...
[Document(metadata={'Header 2': 'Prerequisites', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='Before you begin, make sure you have the following:  \n- An AWS account with permission to create IAM roles and IAM policies.\n- An AWS Identity and Access Management (IAM) user with an access key and secret key to configure the AWS Command Line Interface (AWS CLI).\n- Your administrator role added as a data lake administrator in AWS Lake Formation. For more information about how to create and add a data lake administrator, see Create a data lake administrator in AWS Lake Formation Developer Guide .')]


2025-11-11 20:06:54,638 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 116/156...
[Document(metadata={'Header 2': 'Prerequisites', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='- Administrator access to Amazon SageMaker Unified Studio. For more information about permissions of the administrator role, see Lake Formation personas and IAM permissions reference in the AWS Lake Formation Developer Guide . For more information about using the IAM Identity Center directory as your identity source, see Configure user access with the default IAM Identity Center directory in the AWS IAM Identity Center User Guide . For more information about how to access SageMaker, see Accessing Amazon SageMaker Unified Studio in the Amazon SageMaker Unified Studio Administrator Guide .\n- A SageMaker Unified Studio domain with the SQL Analytics profile enabled. For more information about creating an Amazon SageMaker Unified Studio domain and a project, see Setting up Amazon SageMaker in the Amazon SageMaker User Guide . For more inf

2025-11-11 20:06:56,727 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 117/156...
[Document(metadata={'Header 2': 'Note', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='Add your administrator as an SSO user to your domain. For more information about how to add an SSO user as a root domain owner, see Step 1 - Create an Amazon SageMaker unified domain in the Amazon SageMaker User Guide and Managing users in Amazon SageMaker Unified Studio in the Amazon SageMaker Unified Studio Administrator Guide .  \n- Two SageMaker Unified Studio projects set up for this guide:\n- An Admin project for creating connections. This project has a SQL analytics project profile.\n- A Data Analyst project for analyzing data, which includes both administrator and analysts as members. This project has a SQL analytics project profile.  \nFor more information about how to create a project in SageMaker Unified Studio, see Setting up Amazon SageMaker in the Amazon SageMaker User Guide .')]


2025-11-11 20:06:58,827 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 118/156...
[Document(metadata={'Header 2': 'Note', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='For more information about how to create a project in SageMaker Unified Studio, see Setting up Amazon SageMaker in the Amazon SageMaker User Guide .  \n- To find the project role ARN for each project, in the SageMaker Unified Studio, choose the name of the project, choose Project overview , and find Project role ARN under Project details . For more information, see Get project details in the Amazon SageMaker Unified Studio User Guide .\n- For more information about how to add members to your projects, see Add project members in the Amazon SageMaker Unified Studio User Guide .\n- Administrator access to a data source. SageMaker Lakehouse connections support several popular data sources, such as Amazon DynamoDB, PostgreSQL, and Amazon DocumentDB. In this guide, we use DynamoDB as the data source.\n- To set up data sources in DynamoDB:')]


2025-11-11 20:07:00,940 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 119/156...
[Document(metadata={'Header 2': 'Note', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='- To set up data sources in DynamoDB:\n- You can create a new table in DynamoDB with the partition key cust\\_id and the sort key zipcode and another column mobile through AWS CloudShell by using the following command:\n- You can populate the DynamoDB table with sample data by using the following commands:  \n```\naws dynamodb create-table \\ --table-name customer_ddb \\ --attribute-definitions \\ AttributeName=cust_id,AttributeType=N \\ AttributeName=zipcode,AttributeType=N \\ --key-schema \\ AttributeName=cust_id,KeyType=HASH \\ AttributeName=zipcode,KeyType=RANGE \\ --provisioned-throughput \\ ReadCapacityUnits=5,WriteCapacityUnits=5 \\ --table-class STANDARD\n```  \n```\n# First item aws dynamodb put-item \\ --table-name customer_ddb \\\n```  \n```')]


2025-11-11 20:07:03,068 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 120/156...
[Document(metadata={'Header 2': 'Note', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='```  \n```\n# First item aws dynamodb put-item \\ --table-name customer_ddb \\\n```  \n```\n--item \'{"cust_id": {"N": "11"}, "zipcode": {"N": "2000"}, "mobile": {"N": "11113333"}}\' # Second item aws dynamodb put-item \\ --table-name customer_ddb \\ --item \'{"cust_id": {"N": "12"}, "zipcode": {"N": "2000"}, "mobile": {"N": "22224444"}}\' # Third item aws dynamodb put-item \\ --table-name customer_ddb \\ --item \'{"cust_id": {"N": "13"}, "zipcode": {"N": "3000"}, "mobile": {"N": "33335555"}}\' # Fourth item aws dynamodb put-item \\ --table-name customer_ddb \\ --item \'{"cust_id": {"N": "14"}, "zipcode": {"N": "4000"}, "mobile": {"N": "55556666"}}\'\n```  \nFor more information about setting up a DynamoDB data source by using AWS CloudShell, see Amazon DynamoDB tutorial for AWS Cloud9 in the AWS Cloud9 User Guide .')]


2025-11-11 20:07:05,195 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 121/156...
[Document(metadata={'Header 2': 'Note', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='```  \nFor more information about setting up a DynamoDB data source by using AWS CloudShell, see Amazon DynamoDB tutorial for AWS Cloud9 in the AWS Cloud9 User Guide .  \n- To allow the appropriate actions for the SageMaker Unified Studio projects to take on your DynamoDB data source, add a resource-based policy to your DynamoDB data source. Attach the following policy for the table customer\\_ddb .  \nJSON  \n```\n{ "Version":"2012-10-17", "Statement": [ { "Effect": "Allow", "Principal": { "AWS": [ "arn:aws:iam:: 111122223333 :role/ datazone_usr_role_xxxxxxxxxxxxxx_yyyyyyyyyyyyyy ",\n```  \n```\n"arn:aws:iam:: 111122223333 :role/ datazone_usr_role_zzzzzzzzzzzzzz_aaaaaaaaaaaaaa " ] }, "Action": [ "dynamodb:Query", "dynamodb:Scan", "dynamodb:DescribeTable", "dynamodb:PartiQLSelect", "dynamodb:BatchWriteItem" ], "Resource": "arn:aws:dynamodb: us

2025-11-11 20:07:07,297 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 122/156...
[Document(metadata={'Header 2': 'Note', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='```  \nThis example policy allows connecting to DynamoDB tables as a federated source. Replace us-west-2 with your AWS Region, 111122223333 with the AWS account ID where DynamoDB is deployed, customer\\_ddb with the DynamoDB table that you intend to query from SageMaker Unified Studio, datazone\\_usr\\_role\\_xxxxxxxxxxxxxx\\_yyyyyyyyyyyyyy with the admin project role, and datazone\\_usr\\_role\\_zzzzzzzzzzzzzz\\_aaaaaaaaaaaaaa with the data analyst project role in SageMaker Unified Studio. For more information about how to attach a policy to a DynamoDB data source, see Attach a policy to a DynamoDB existing table in the Amazon DynamoDB Developer Guide .')]


2025-11-11 20:07:09,376 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 123/156...
[Document(metadata={'Header 2': 'Step 1: Set up federated catalogs', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='The first step is to set up federated catalogs for our data sources using an administrator account.')]


2025-11-11 20:07:11,473 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 124/156...
[Document(metadata={'Header 2': 'To set up federated catalogs', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='1. On the SageMaker Unified Studio console, for the domain you created in the prerequisite, choose Open unified studio .\n2. Choose your admin project name under Your projects .\n3. Choose Data in the navigation pane.  \n4. In the Data explorer , choose the plus icon to add a data source.\n5. Under Add data , choose Add connection , choose Next .\n6. Choose Amazon DynamoDB , and choose Next .\n7. For Name , enter the name for your data source of DynamoDB.\n8. Choose Add data .  \nSageMaker Unified Studio connects to the DynamoDB data source that you created in the prerequisites, registers the data source as a federated catalog with SageMaker Lakehouse, and displays it in your data explorer. The catalog references your DynamoDB data source.')]


2025-11-11 20:07:13,568 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 125/156...
[Document(metadata={'Header 2': 'To explore and query your data', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='1. Choose your admin project from SageMaker Unified Studio.\n2. Choose Data in the navigation pane.\n3. Choose the SageMaker Lakehouse catalog that you just created to view its contents. Use the data explorer to drill down to a table and choose Query with Athena .\n4. In the query editor, run a sample SQL query to understand your data.  \nFor example, run the following query. Replace your\\_federated\\_catalog\\_name with the name of the federated catalog that you just created, default with the name of your database, and your\\_table\\_name with the name of your DynamoDB table. To learn more, see SQL analytics in the Amazon SageMaker Unified Studio User Guide .  \n```\nselect * from your_federated_catalog_name . default . your_table_name limit 10;\n```')]


2025-11-11 20:07:15,704 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 126/156...
[Document(metadata={'Header 2': 'Note', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='Access to the data source in the SageMaker Unified Studio project is governed by the policies for the project role. Users whoever become the member of this admin project use the same project role ARN and have the same full access level permissions to the data source. For more information about how to add members to your projects, see Add project members in the Amazon SageMaker Unified Studio User Guide . To grant fine-grained access permissions to different user personas, such as data analysts, create a separate data analyst  \nproject and add the data analyst users as project members of the data analyst project. Step 2 shows how to set up the fine-grained permissions.  \nFor more information about creating connections in SageMaker Lakehouse, see Creating a connection in SageMaker Lakehouse in the Amazon SageMaker Unified Studio User Guide . F

2025-11-11 20:07:17,822 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 127/156...
[Document(metadata={'Header 2': 'Step 2: Set up fine-grained access permissions on federated catalogs', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='Security is a critical aspect of data access. SageMaker Lakehouse provides integrated access controls that work with federated queries in Athena to ensure proper governance. You can manage permissions at the catalog, database, and table levels. Administrators can apply access controls at different levels of granularity to ensure sensitive data remains protected while expanding data access.')]


2025-11-11 20:07:19,918 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 128/156...
[Document(metadata={'Header 2': 'Step 2: Set up fine-grained access permissions on federated catalogs', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='This step is to delegate access permissions on your DynamoDB federated catalogs to other users. You grant permissions to the data analyst persona. To set up the fine-grained access permissions to the data analyst persona, you need to add permissions on your DynamoDB federated catalogs to the SageMaker Unified Studio data analyst project role that you created in the prerequisites section. This will ensure that access controls that you specify are enforced when the data is queried. For more information about the Lake Formation personas and IAM permissions, see Lake Formation personas and IAM permissions reference in the AWS Lake Formation Developer Guide .')]


2025-11-11 20:07:22,029 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 129/156...
[Document(metadata={'Header 2': 'To set up fine-grained access permissions on federated catalog and database', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content="1. Navigate to Lake Formation in the AWS Management Console as an administrator.\n2. In the Lake Formation console, under Data Catalog in the navigation pane, choose Catalogs .\n3. Choose the federated catalog name that you set up in Step 1: Set up federated catalogs. You'll see the databases.\n4. Choose the database name in the catalog. You can see details for the database and manage permissions.\n5. To set up permissions for the federated catalog and database to your SageMaker Unified Studio data analyst project (the data analyst project that you set up in prerequisites), from the Actions menu, choose Grant .  \n6. For Principal type , choose Principals .\n7. For Principals , choose IAM users and roles .\n8. For IAM users and roles , choose the project role ARN that you g

2025-11-11 20:07:24,124 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 130/156...
[Document(metadata={'Header 2': 'To set up fine-grained access permissions on federated catalog and database', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='9. For LF-Tags or catalog resources , choose Named Data Catalog resources .\n10. For Catalogs , choose the federated catalog name for the source (the federated catalog that you set up in Step 1) to grant permissions on.\n11. For Databases , the console populates the databases for your DynamoDB data source.\n12. For Database permissions - Database permissions , select Describe .\n13. Choose Grant .')]


2025-11-11 20:07:26,244 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 131/156...
[Document(metadata={'Header 2': 'To set up fine-grained access permissions on the tables', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='For example, if you wish to restrict access to a sensitive column containing the mobile phone number for each customer, the steps are as follows.  \n1. Navigate to Lake Formation in the AWS Management Console as an administrator.\n2. In the Lake Formation console, under Data Catalog in the navigation pane, choose Tables .\n3. Under Choose catalog , choose the federated catalog name that you set up in Step 1.\n4. Choose the table name in the catalog. You can see details for the table and manage permissions.\n5. From the Actions menu, choose Grant .\n6. For Principal type , choose Principals .\n7. For Principals , choose IAM users and roles .\n8. For IAM users and roles , choose the project role ARN that you got from your data analyst project in the prerequisites section.\n9. For LF-Tags or cata

2025-11-11 20:07:28,388 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 132/156...
[Document(metadata={'Header 2': 'To set up fine-grained access permissions on the tables', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='10. For Catalogs , choose the federated catalog name for the source (the federated catalog that you set up in Step 1) to grant permissions on.\n11. For Databases , the console populates the databases for our DynamoDB data source.\n12. For Tables , the console populates the tables for your DynamoDB data source.\n13. For Table permissions - Table permissions , select Select .\n14. For Data permissions , choose Column-based access .  \n15. For Choose permission filter , choose Include columns .\n16. For Select columns , choose columns zipcode and cust\\_id .\n17. Choose Grant .')]


2025-11-11 20:07:30,497 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 133/156...
[Document(metadata={'Header 2': 'To set up fine-grained access permissions on the tables', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='15. For Choose permission filter , choose Include columns .\n16. For Select columns , choose columns zipcode and cust\\_id .\n17. Choose Grant .  \nIn this example, we demonstrate how to set up a basic column-level filter to restrict access to sensitive data. However, SageMaker Lakehouse supports a broad range of fine-grained access control scenarios beyond column filters that allow you to meet complex security and compliance requirements across diverse data sources. For more information about managing permissions on catalogs, see Adding existing databases and catalogs using AWS Lake Formation permissions in the Amazon SageMaker Unified Studio User Guide and Managing Lake Formation Permissions in the AWS Lake Formation Developer Guide .')]


2025-11-11 20:07:32,586 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 134/156...
[Document(metadata={'Header 2': 'To set up fine-grained access permissions on the tables', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content="By implementing these fine-grained access controls, you can ensure that users only access data they're authorized to see, maintaining compliance with your organization's security policies. This creates a consistent security model across your data sources. Now, you have successfully set up fine-grained access permissions on your DynamoDB federated catalog.")]


2025-11-11 20:07:34,673 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 135/156...
[Document(metadata={'Header 2': 'Step 3: Validate fine-grained access permissions on federated catalogs', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='After you set up federated catalogs with fine-grained access permissions in Step 2, run queries to confirm access permissions are working as expected.')]


2025-11-11 20:07:36,778 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 136/156...
[Document(metadata={'Header 2': 'To validate fine-grained access permissions on federated catalogs', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='1. On the SageMaker Unified Studio console, for the domain you created in the prerequisite, choose Open unified studio .\n2. Choose your data analyst project name under Your projects .\n3. From the Build menu, choose Query Editor .\n4. In the Data explorer , expand Lakehouse , choose the DynamoDB catalog that you created in Step 1.\n5. Drill down to the table that you set up fine-grained access permissions in Step 2, and choose Query with Athena to run a sample query.  \nFor example, run the following query. Replace your\\_federated\\_catalog\\_name with the name of your catalog, default with the name of your database, and your\\_table\\_name with  \nthe name of your DynamoDB table. To learn more, see SQL analytics in the Amazon SageMaker Unified Studio User Guide .  \n```\nselect * 

2025-11-11 20:07:38,859 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 137/156...
[Document(metadata={'Header 2': 'To validate fine-grained access permissions on federated catalogs', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content="```\nselect * from your_federated_catalog_name . default . your_table_name limit 10;\n```  \nNote how permissions are working as expected because the query result doesn't include the mobile phone number column that was visible in the admin project view.")]


2025-11-11 20:07:41,005 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 138/156...
[Document(metadata={'Header 2': 'To have other users under the data analyst persona get the fine-grained access permissions', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='1. Create data analyst SSO users or groups. For more information about how to add an SSO user to your domain, see Managing users in Amazon SageMaker Unified Studio in the Amazon SageMaker Unified Studio Administrator Guide .\n2. Add these SSO users to your SageMaker Unified Studio domain. For more information about how to add an SSO user to your domain, see Managing users in Amazon SageMaker Unified Studio in the Amazon SageMaker Unified Studio Administrator Guide .\n3. Add these users as members ( "Contributor" ) to your SageMaker Unified Studio data analyst project. The data analyst users can have access to this data analyst project and will only have access to a subset of data that\'s defined by the data lake administrator in Step 2. For more information a

2025-11-11 20:07:43,106 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 139/156...
[Document(metadata={'Header 2': 'Step 4: Clean up', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='Make sure you remove the SageMaker Lakehouse resources to mitigate any unexpected costs. Delete the following resources:  \n- The connections and catalogs that you created in Step 1.\n- Specifically, choose your project from SageMaker Unified Studio. Choose Data in the navigation pane. Choose the SageMaker Lakehouse catalog that you created in Step 1. Choose the Actions menu and choose Remove . Type " Confirm " and choose Remove connection .\n- The underlying DynamoDB data sources that you created in the prerequisites. For more information about deleting a DynamoDB table, see Delete your DynamoDB table to clean up resources in the Amazon DynamoDB Developer Guide .  \n- The SageMaker Unified Studio admin and data analyst projects that you created in the prerequisites. For more information about deleting projects, see Delete a projec

2025-11-11 20:07:45,207 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 140/156...
[Document(metadata={'Header 2': 'Next steps', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content="Now that you've successfully set up SageMaker Lakehouse integrated access controls for Athena federated queries, consider these next steps to further enhance your data governance and analytics capabilities:  \n- Expand your data sources - Connect supported data sources such as PostgreSQL, MySQL, or Amazon DocumentDB, to create a unified data ecosystem with consistent access controls.\n- Implement advanced security patterns - Explore row-level security, cell-level filtering, and attribute-based access control to meet complex compliance requirements across your organization. For more information, see Managing Lake Formation Permissions in the AWS Lake Formation Developer Guide .\n- Build analytics workflows - Create end-to-end analytics pipelines that leverage federated queries for data preparation and ML model training.\n- Integrate with 

2025-11-11 20:07:47,299 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 141/156...
[Document(metadata={'Header 2': 'Next steps', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='- Integrate with visualization tools - Connect Amazon QuickSight to your federated catalogs to create dashboards and visualizations with the same security controls.\n- Automate governance processes - Use the Amazon Athena REST API (CreateDataCatalog), AWS CloudFormation ( AWS::Athena::DataCatalog ) or the AWS CDK (CfnDataCatalog) to automate the creation and management of federated connections and access controls. After creating a data catalog, you need to create a data source connection and register your connection as a Glue Data Catalog.')]


2025-11-11 20:07:49,393 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 142/156...
[Document(metadata={'Header 2': 'Next steps', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='This integration between SageMaker Lakehouse and Athena federated queries provides significant benefits for organizations with diverse data ecosystems. Data scientists can now analyze customer behavior by combining transaction data from PostgreSQL with clickstream data in Amazon S3. Financial analysts can query historical market data alongside real-time trading information without complex ETL processes. Healthcare researchers can analyze patient records stored in different systems while maintaining compliance with privacy regulations.  \nFor more information about federated queries in Athena and the data sources that support finegrained access controls, see Register your connection as a Glue Data Catalog in the Athena User')]


2025-11-11 20:07:51,483 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 143/156...
[Document(metadata={'Header 2': 'Next steps', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='For more information about federated queries in Athena and the data sources that support finegrained access controls, see Register your connection as a Glue Data Catalog in the Athena User  \nGuide . For more information about extending your SageMaker Lakehouse environment, see Add Data to SageMaker Lakehouse and Publishing Data in the Amazon SageMaker Unified Studio User Guide . For more information about specific use cases and implementation examples, see Simplify data access for your enterprise using SageMaker Lakehouse, Simplify analytics and AI/ML with new SageMaker Lakehouse, and Catalog and govern Amazon Athena federated queries with SageMaker Lakehouse in the AWS Blog posts .')]


2025-11-11 20:07:53,575 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 144/156...
[Document(metadata={'Header 2': 'Get started fine-tuning foundation models in Amazon SageMaker Unified Studio', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='Amazon SageMaker Unified Studio provides a large collection of state-of-the-art foundation models. These models support use cases such as content writing, code generation, question answering, copywriting, summarization, classification, information retrieval, and more. You can find and deploy these foundation models in the JumpStart model catalog. In some cases, you can also customize them. You can use the foundation models to build your own generative AI solutions for a wide range of applications.  \nA foundation model is a large pre-trained model that is adaptable to many downstream tasks and often serves as the starting point for developing more specialized models. Examples of foundation models include Meta Llama 4 Maverick 17B, DeepSeek-R1, or Stable Diffusion 3.5 Large

2025-11-11 20:07:55,687 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 145/156...
[Document(metadata={'Header 2': 'Model customization', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='You might need to customize a base foundation model to better align it with your specific use cases. The recommended way to first customize a foundation model is through prompt engineering. Providing your foundation model with well-engineered, context-rich prompts can help achieve desired results without any fine-tuning or changing of model weights. For more information, see Prompt engineering for foundation models in the Amazon SageMaker AI Developer Guide .  \nIf prompt engineering alone is not enough to customize your foundation model to a specific task, you can fine-tune a foundation model on additional domain-specific data. The fine-tuning process involves changing model weights.')]


2025-11-11 20:07:57,775 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 146/156...
[Document(metadata={'Header 2': 'Model customization', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content="To help you learn how to fine-tune foundation models, Amazon SageMaker Unified Studio provides an example training dataset for each model that's eligible for training. You can also choose to finetune the model with your own data set. Before you can do that, you must prepare your data set and store it in an Amazon S3 bucket. The required format for the data set varies between models. You can learn about the required format in the model details page in Amazon SageMaker Unified Studio.")]


2025-11-11 20:07:59,890 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 147/156...
[Document(metadata={'Header 2': 'Fine-tuning a foundation model', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content="One way to fine-tune a model in Amazon SageMaker Unified Studio is to use JumpStart. First, you browse the model catalog to find a model that's eligible for fine-tuning. Then, you train the model with a training data set. Follow these steps to learn how to fine-tune with this approach.  \n1. Sign in to Amazon SageMaker Unified Studio using the link that your administrator gave you.\n2. Choose a model to train by doing the following:\n- a. From the main menu, choose Build .\n- b. From the drop-down menu, under Model Development , choose Jumpstart Models .\n- c. If the Select or create project to continue window appears, select a project that you've created, and choose Continue .  \nThe JumpStart page lists the model providers.  \n- d. Choose a provider to see the available models.")]


2025-11-11 20:08:01,974 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 148/156...
[Document(metadata={'Header 2': 'Fine-tuning a foundation model', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='The JumpStart page lists the model providers.  \n- d. Choose a provider to see the available models.  \nNot all providers have models that you can fine-tune in JumpStart. If you want to quickly find an eligible model so that you can get familiar with fine-tuning, choose Meta . It has many trainable models to choose from.')]


2025-11-11 20:08:04,078 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 149/156...
[Document(metadata={'Header 2': 'Tip', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content="For some providers, you can filter the list of models so that you see only the trainable ones. Choose the Trainable checkbox if it's present.  \n- e. From the provider's list of models, choose the model you want to train.  \nAmazon SageMaker Unified Studio shows the model details page, which provides information from the model provider. If you want to prepare a custom fine-tuning data set, use this page to learn the required format.  \n3. From the model details page, if the model is trainable, choose Train to create a training job.  \nIf the model isn't trainable, the button is disabled. In that case, return to the JumpStart page, find a different model that's trainable, and try again.  \n4. On the Fine-tune model page, under Artifacts , do one of the following:")]


2025-11-11 20:08:06,183 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 150/156...
[Document(metadata={'Header 2': 'Tip', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content="4. On the Fine-tune model page, under Artifacts , do one of the following:  \n- a. Keep the default selection of Example training dataset . This dataset is useful when you want to learn how to fine-tune with Amazon SageMaker Unified Studio. However, it won't be effective for customizing the model for your specific needs.\n- b. If you've prepared a custom training dataset, choose Enter training dataset , and provide the URI that locates it in Amazon S3.\n5. For Output artifact location (S3 URI) , specify where Amazon SageMaker Unified Studio uploads the fine-tuned model. You can choose to use the default bucket, or you can specify a custom location in Amazon S3.\n6. (Optional) Under Hyperparameters , update the hyperparameters you want to change.")]


2025-11-11 20:08:08,275 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 151/156...
[Document(metadata={'Header 2': 'Tip', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='6. (Optional) Under Hyperparameters , update the hyperparameters you want to change.  \nThe hyperparameters available for each trainable model differ depending on the model. Review the help text and additional information in the model details pages in Amazon SageMaker Unified Studio to learn more about hyperparameters specific to the model of your choice.  \nFor more information on available hyperparameters, see Commonly supported fine-tuning hyperparameters in the Amazon SageMaker AI Developer Guide .  \n7. Under Compute , for Training Instance , specify the training instance type for your training job. You can choose only from instances that are compatible with the chosen model.')]


2025-11-11 20:08:10,429 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 152/156...
[Document(metadata={'Header 2': 'Important', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='Choose an instance type that fits within the service quotas for your AWS account. When you submit your training job, Amazon SageMaker Unified Studio attempts to provision the chosen instance type in your account. This attempt succeeds only if your quotas have remaining capacity for the instance type.  \nTo see the quotas for your account, open the Service Quotas console at https:// console.aws.amazon.com/servicequotas/.  \nIf you want to use a specific instance type but lack the required quota capacity, you can request a quota increase with Support. For more information, see Requesting a quota increase in the Service Quotas User Guide .  \n8. (Optional) Under Information , for Training Job Name , you can edit the default name.\n9. (Optional) For Tags , you can add and remove tags in the form of key-value pairs to help organize and categor

2025-11-11 20:08:12,533 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 153/156...
[Document(metadata={'Header 2': 'Note', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='Some models require acceptance of an end-user license agreement (EULA). If this applies to the model that you choose to fine-tune, Amazon SageMaker Unified Studio prompts you with a window that contains the EULA content. You are responsible for reviewing and complying with any applicable license terms and making sure they are acceptable for your use case before using the model.  \nAmazon SageMaker Unified Studio shows a page with details about the training job. Here, you can observe the status of the job as it executes.  \nThe training job might take a long time to complete. You can view it at any time from the Training jobs page.  \nWhen the training job completes, the status becomes Completed . After the job completes, you can choose Deploy to deploy the fine-tuned model to an inference endpoint.')]


2025-11-11 20:08:14,618 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 154/156...
[Document(metadata={'Header 2': 'API Documentation for Amazon SageMaker', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='You can use the following guides to work with Amazon SageMaker programmatically:  \n- SQL analytics\n- Amazon Redshift API reference\n- Data processing\n- Amazon EMR API reference\n- AWS Glue API reference\n- Amazon Athena API reference\n- Model development\n- Amazon SageMaker AI API reference\n- Gen AI app development\n- Amazon Bedrock API reference\n- Data and AI Governance\n- Amazon DataZone API Reference\n- SageMaker lakehouse architecture\n- Lake Formation API reference\n- Catalog objects API reference')]


2025-11-11 20:08:16,713 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


  Adding batch 155/156...
[Document(metadata={'Header 2': 'Document history for the Amazon SageMaker User Guide', 'domain': 'domain', 'service': 'service_name', 'source': 'url'}, page_content='The following table describes the documentation releases for Amazon SageMaker.  \n| Change          | Description                   | Date          |\n|-----------------|-------------------------------|---------------|\n| Initial release | Initial release of the Amazon | June 13, 2025 |')]


In [27]:
import csv
import io
import os
import time
import uuid # <-- NEW: Import uuid
from langchain_chroma import Chroma
from langchain_ollama.embeddings import OllamaEmbeddings
from langchain_docling import DoclingLoader
from langchain_docling.loader import ExportType
from langchain_text_splitters import MarkdownHeaderTextSplitter, RecursiveCharacterTextSplitter

# 1. Define your data
CSV_DATA = """Domain,Service,PDF_URL
Analytics,sagemaker,https://docs.aws.amazon.com/pdfs/next-generation-sagemaker/latest/userguide/next-generation-sagemaker-ug.pdf
"""

# 2. Define constants
CHROMA_DB_PATH = "./chroma_db_cloudyintel"
EMBEDDING_MODEL_NAME = "nomic-embed-text" 
BATCH_SIZE = 1  
CHUNK_SIZE = 2000 # Safety net: Max characters per *final* chunk
CHUNK_OVERLAP = 200

def parse_csv_data(csv_data):
    """Parses the in-memory CSV string into a list of dictionaries."""
    service_docs = []
    f = io.StringIO(csv_data)
    reader = csv.DictReader(f)
    for row in reader:
        service_docs.append(row)
    return service_docs

def load_and_process_pdfs(service_docs):
    """
    Downloads, loads, splits, and embeds all PDFs using Docling,
    then saves them to Chroma.
    """
    
    # 3. Initialize components
    
    # Embedding model (Using Ollama)
    print("Initializing Ollama embeddings...")
    try:
        embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL_NAME)
        embeddings.embed_query("Test embedding")
    except Exception as e:
        print(f"Error connecting to Ollama. Is it running?")
        return

    # Chroma vector store
    vector_store = Chroma(
        persist_directory=CHROMA_DB_PATH,
        embedding_function=embeddings
    )
    
    # Safety net splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP
    )
    
    print(f"Initialized Chroma DB at {CHROMA_DB_PATH}")
    
    # 4. Process each service document
    total_services = len(service_docs)
    for i, service in enumerate(service_docs):
        domain = service['Domain']
        service_name = service['Service']
        url = service['PDF_URL']
        
        print(f"\n--- Processing {i+1}/{total_services}: {domain} - {service_name} ---")
        print(f"URL: {url}")
        
        try:
            start_time = time.time()
            
            # A. Load PDF using Docling's default AI chunker
            print("Initializing DoclingLoader (using default AI chunking)...")
            loader = DoclingLoader(
                file_path=url,
                # NO export_type specified, so it uses default ExportType.DOC_CHUNKS
            )
            
            print("Loading and parsing with Docling (this may take a while)...")
            # 'docs' is now a list of AI-powered semantic chunks
            docs = loader.load()
            
            if not docs:
                print("Docling returned no content. Skipping.")
                continue
            
            # B. Add our custom metadata to each semantic chunk
            for doc in docs:
                doc.metadata["domain"] = domain
                doc.metadata["service"] = service_name
                doc.metadata["source"] = url

            # C. Stage 2 Split: Safety net
            print(f"Applying safety split to {len(docs)} AI chunks...")
            final_chunks = text_splitter.split_documents(docs)
            
            # D. Embed and add final chunks to Chroma
            if final_chunks:
                print(f"Found {len(final_chunks)} final chunks to add.")
                
                # Create a list of unique IDs
                chunk_ids = [str(uuid.uuid4()) for _ in final_chunks]
                
                # Batch both the documents and the IDs
                for i in range(0, len(final_chunks), BATCH_SIZE):
                    batch_docs = final_chunks[i:i + BATCH_SIZE]
                    batch_ids = chunk_ids[i:i + BATCH_SIZE]
                    
                    batch_num = (i // BATCH_SIZE) + 1
                    total_batches = (len(final_chunks) // BATCH_SIZE) + 1

                    print(f"  Adding batch {batch_num}/{total_batches}...")
                    
                    vector_store.add_documents(
                        documents=batch_docs,
                        ids=batch_ids
                    )
                
                end_time = time.time()
                print(f"Successfully processed {service_name} in {end_time - start_time:.2f} seconds.")
            else:
                print(f"No text chunks extracted from {service_name}.")

        except Exception as e:
            print(f"Error: Failed to process {service_name} from {url}. Skipping. Details: {e}")

    print("\n--- All documents processed! ---")
    print(f"Vector database is persistent and saved in '{CHROMA_DB_PATH}'")
    
    return vector_store


In [24]:
OllamaEmbeddings(model=EMBEDDING_MODEL_NAME).embed_query("Initial connection test")
print("Ollama connection successful.")

2025-11-11 19:52:47,981 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


Ollama connection successful.


In [None]:
if os.path.exists(CHROMA_DB_PATH):
    print(f"Database already exists at {CHROMA_DB_PATH}.")
    print("To re-build, please delete this directory and run again.")
else:


Database already exists at ./chroma_db_AWSDocs.
To re-build, please delete this directory and run again.


In [28]:
service_docs = parse_csv_data(CSV_DATA)
load_and_process_pdfs(service_docs)

Initializing Ollama embeddings...


2025-11-11 19:53:43,159 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"
2025-11-11 19:53:43,182 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


Initialized Chroma DB at ./chroma_db_cloudyintel

--- Processing 1/1: Analytics - sagemaker ---
URL: https://docs.aws.amazon.com/pdfs/next-generation-sagemaker/latest/userguide/next-generation-sagemaker-ug.pdf
Initializing DoclingLoader (using default AI chunking)...
Loading and parsing with Docling (this may take a while)...


2025-11-11 19:53:45,772 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-11 19:53:45,834 - INFO - Going to convert document batch...
2025-11-11 19:53:45,837 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 44ae89a68fc272bc7889292e9b5a1bad
2025-11-11 19:53:45,847 - INFO - Auto OCR model selected ocrmac.
2025-11-11 19:53:45,849 - INFO - Accelerator device: 'mps'
2025-11-11 19:53:50,863 - INFO - Accelerator device: 'mps'
2025-11-11 19:53:52,338 - INFO - Processing document next-generation-sagemaker-ug.pdf
2025-11-11 19:54:28,595 - INFO - Finished converting document next-generation-sagemaker-ug.pdf in 44.15 sec.
Token indices sequence length is longer than the specified maximum sequence length for this model (3596 > 512). Running this sequence through the model will result in indexing errors


Applying safety split to 203 AI chunks...
Found 203 final chunks to add.
  Adding batch 1/204...


2025-11-11 19:54:29,815 - INFO - HTTP Request: POST http://127.0.0.1:11434/api/embed "HTTP/1.1 200 OK"


Error: Failed to process sagemaker from https://docs.aws.amazon.com/pdfs/next-generation-sagemaker/latest/userguide/next-generation-sagemaker-ug.pdf. Skipping. Details: Expected metadata value to be a str, int, float, bool, SparseVector, or None, got {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/0', 'parent': {'$ref': '#/body'}, 'children': [], 'content_layer': 'body', 'label': 'text', 'prov': [{'page_no': 1, 'bbox': {'l': 9.27027010472973, 't': 782.0892856775794, 'r': 39.06756740202703, 'b': 771.5178571061508, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 3]}]}], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 16562198080799201207, 'filename': 'next-generation-sagemaker-ug.pdf'}} which is a dict in upsert.

Try filtering complex metadata from the document using langchain_community.vectorstores.utils.filter_complex_metadata.

--- All documents processed! ---
Vector database is persistent and saved in './chro

<langchain_chroma.vectorstores.Chroma at 0x1458c0cb0>

In [None]:

def main():
    print("Checking Ollama connection...")
    try:
        OllamaEmbeddings(model=EMBEDDING_MODEL_NAME).embed_query("Initial connection test")
        print("Ollama connection successful.")
    except Exception as e:
        print("Error: Could not connect to Ollama.")
        print("Please make sure the Ollama application is running and you have run:")
        print(f"ollama pull {EMBEDDING_MODEL_NAME}")
        return

    if os.path.exists(CHROMA_DB_PATH):
        print(f"Database already exists at {CHROMA_DB_PATH}.")
        print("To re-build, please delete this directory and run again.")
    else:
        service_docs = parse_csv_data(CSV_DATA)
        load_and_process_pdfs(service_docs)

if __name__ == "__main__":
    main()