In [0]:
from databricks.sdk import WorkspaceClient

# Store the Tavily API key in a Databricks secret scope so that later cells and
# the HTTP connection can reference it without embedding the key itself.
w = WorkspaceClient()

# create_scope fails when the scope already exists, which would break a re-run
# of this notebook, so only create the scope when it is missing.
if not any(existing.name == "agentic" for existing in w.secrets.list_scopes()):
    w.secrets.create_scope(scope="agentic")

# Replace <TAVILY_KEY> with the real key when running; never commit the real value.
w.secrets.put_secret("agentic", "tavily_key", string_value="<TAVILY_KEY>")

In [0]:
# Read the key back once to confirm the secret is readable from this notebook.
tavily_api_key = dbutils.secrets.get(scope="agentic", key="tavily_key")

# Report only whether a value came back; secret material should never be
# written to cell output.
print(f"tavily_key loaded: {bool(tavily_api_key)}")

In [0]:
%sql
-- HTTP connection to the Tavily API, authenticated with the key stored in the
-- 'agentic' secret scope.
CREATE CONNECTION IF NOT EXISTS tavily_connect_bearer
  TYPE HTTP
  OPTIONS (
    -- Tavily API base host
    host 'https://api.tavily.com',
    -- standard HTTPS port
    port '443',
    -- root base path; each request supplies its own endpoint path (e.g. '/search')
    base_path '/',
    -- token that is sent as "Authorization: Bearer <token>"
    bearer_token secret('agentic', 'tavily_key')
  );

In [0]:
%sql
-- SQL function that POSTs a search request to Tavily's /search endpoint through
-- the tavily_connect_bearer connection. The parameter and function COMMENT
-- strings are the function's visible metadata, so they must match the actual
-- defaults and request fields.
CREATE OR REPLACE FUNCTION agentic_ai.databricks.search_web(
    query_text STRING COMMENT 'The search query text',
    search_depth_val STRING DEFAULT 'basic' COMMENT 'The depth of the search, e.g., basic or advanced',
    include_raw_content_val BOOLEAN DEFAULT true COMMENT 'Whether to include the raw content of each result in the response',
    max_results_val INTEGER DEFAULT 10 COMMENT 'The maximum number of results to return',
    topic_val STRING DEFAULT 'general' COMMENT 'The topic category for the search'
)
RETURNS STRING
LANGUAGE SQL
COMMENT 'Runs a Tavily search and returns the JSON response, defaults to 10 results max.'
RETURN (
  http_request(
    conn => "tavily_connect_bearer", -- connection supplies the bearer token
    method => "POST",
    path => "/search",                 -- endpoint path for Tavily search
    json => to_json(named_struct(
      'query', query_text,
      'search_depth', search_depth_val,
      'include_raw_content', include_raw_content_val,
      'max_results', max_results_val,
      'topic', topic_val
      -- add other Tavily parameters here as needed
    )),
    headers => map(
      'Content-Type', 'application/json' -- body type of the POST request
      -- no 'Authorization' header here; the connection handles it
    )
  )
);

In [0]:
# %sql
# SELECT 
# http_request(
#     conn => "tavily_connect_bearer", -- Use the new connection that handles bearer token
#     method => "POST",
#     path => "/search",                 -- Endpoint path for Tavily search
#     json => to_json(named_struct(
#       'query', 'Latest news on Apple',
#       'search_depth', 'advanced',
#       'include_answer', true,
#       'max_results', 10,
#       'topic', 'news'
#       -- Add other Tavily parameters as needed
#     )),
#     headers => map(
#       'Content-Type', 'application/json' -- Still needed for the POST request body type
#       -- No 'Authorization' header here, the connection handles it
#     )
#   )

In [0]:
# %sql
# SELECT 
# http_request(
#     conn => "tavily_connect_bearer", -- Use the new connection that handles bearer token
#     method => "POST",
#     path => "/search",                 -- Endpoint path for Tavily search
#     json => to_json(named_struct(
#       'query', 'Latest news on Apple',
#       'search_depth', 'advanced',
#       'include_raw_content', true,
#       'max_results', 10,
#       'topic', 'news'
#       -- Add other Tavily parameters as needed
#     )),
#     headers => map(
#       'Content-Type', 'application/json' -- Still needed for the POST request body type
#       -- No 'Authorization' header here, the connection handles it
#     )
#   )

In [0]:
%sql
-- Smoke test: news search with raw content; remaining arguments use their defaults.
SELECT
  agentic_ai.databricks.search_web(
    query_text              => 'Latest news on Apple',
    topic_val               => 'news',
    include_raw_content_val => true
  ) AS tavily_response;

In [0]:
%sql
-- Minimal call: only the query text is passed, everything else is left to defaults.
-- NOTE(review): cankoklu.toolbox.tavily_search is not created in this notebook
-- (the search function created above is agentic_ai.databricks.search_web) —
-- confirm it exists before running.
SELECT
  cankoklu.toolbox.tavily_search('Tell me about Databricks SQL')
    AS tavily_default_response;

## Extract

In [0]:
%sql
-- SQL function that POSTs a page URL to Tavily's /extract endpoint through the
-- tavily_connect_bearer connection and returns the response as a string.
CREATE OR REPLACE FUNCTION cankoklu.toolbox.tavily_extract(
  url_to_extract     STRING
    COMMENT 'The URL of the page to extract content from',
  include_images_val BOOLEAN DEFAULT false
    COMMENT 'Flag to include images in the extraction',
  extract_depth_val  STRING DEFAULT 'advanced'
    COMMENT 'Depth of extraction: "basic" or "advanced"'
)
RETURNS STRING
LANGUAGE SQL
COMMENT 'Sends a URL to Tavily to extract the contents of the page into JSON.'
RETURN (
  http_request(
    conn   => "tavily_connect_bearer",  -- connection supplies the bearer token
    method => "POST",
    path   => "/extract",               -- endpoint path for Tavily extract
    json   => to_json(
      named_struct(
        'urls',           url_to_extract,
        'include_images', include_images_val,
        'extract_depth',  extract_depth_val
      )
    ),
    headers => map('Content-Type', 'application/json')
  )
);

In [0]:
%sql
-- Extract a page passing only the URL; the optional parameters keep their defaults.
SELECT
  cankoklu.toolbox.tavily_extract(
    url_to_extract => 'https://en.wikipedia.org/wiki/Databricks'
  ) AS extract_response_default;



In [0]:
%sql
-- Extract a page with every optional parameter set explicitly:
-- images included, advanced extraction depth.
SELECT
  cankoklu.toolbox.tavily_extract(
    extract_depth_val  => 'advanced',
    include_images_val => true,
    url_to_extract     => 'https://www.databricks.com/blog'
  ) AS extract_response_advanced;

# Calling Search, then Extract in Python

In [0]:
import json
import re
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json, lit

def _parse_http_response_json(response_str):
    """Pull the JSON body out of a stringified ``{200, {...}}`` response.

    Args:
        response_str: The string returned by the SQL function; ``None`` is
            treated like an empty string.

    Returns:
        The decoded JSON object, or ``None`` when the string does not have the
        ``{200, {...}}`` shape (for example, a non-200 status).

    Raises:
        json.JSONDecodeError: If the captured body is not valid JSON.
    """
    # DOTALL lets the body span several lines.
    wrapped = re.search(r'\{200,\s*(\{.*\})\}', response_str or "", re.DOTALL)
    return json.loads(wrapped.group(1)) if wrapped else None


# Execute the search query.
# NOTE(review): cankoklu.toolbox.tavily_search is not created in this notebook
# (the search function created above is agentic_ai.databricks.search_web, which
# has no include_answer_val parameter) — confirm it exists before running.
search_response = spark.sql("""
  SELECT cankoklu.toolbox.tavily_search(
    query_text => 'Latest product or feature releases from Databricks',
    include_answer_val => true,
    max_results_val => 1
  ) AS search_response
""")

# Bring the single result value back to the driver as a Python string.
search_response_str = search_response.collect()[0]["search_response"]
search_response_dict = _parse_http_response_json(search_response_str)

if search_response_dict is None:
    print("Could not extract JSON from the response")
    print(f"Raw response: {search_response_str}")
else:
    # Take the URL of the first search hit, if there is one.
    results = search_response_dict.get("results") or []
    url_from_search = results[0].get("url", "") if results else ""

    if not url_from_search:
        print("No URL found in the search results")
    else:
        print(f"Extracted URL: {url_from_search}")

        # The URL comes from an external API response, so it is bound as a
        # named parameter instead of being interpolated into the SQL text.
        extract_response_advanced = spark.sql(
            """
              SELECT cankoklu.toolbox.tavily_extract(
                url_to_extract => :url,
                include_images_val => true,
                extract_depth_val => 'advanced'
              ) AS extract_response_advanced
            """,
            args={"url": url_from_search},
        )

        # Display the extract response.
        # NOTE(review): show() and collect() are separate Spark actions, so the
        # extract request may be issued more than once — confirm this is acceptable.
        print("Extract Response Schema:")
        extract_response_advanced.printSchema()

        print("Extract Response Content:")
        extract_response_advanced.show(truncate=False)

        # Get the actual extracted content.
        extract_content = extract_response_advanced.collect()[0]["extract_response_advanced"]

        print("Raw Extract Content:")
        print(extract_content)

        # Try the {200, {...}} wrapper first, then fall back to plain JSON.
        try:
            extract_dict = _parse_http_response_json(extract_content)
            if extract_dict is None:
                extract_dict = json.loads(extract_content)
            print("Parsed Extract Content:")
            print(json.dumps(extract_dict, indent=2))
        except (json.JSONDecodeError, TypeError):
            # TypeError covers a NULL (None) response value.
            print("Extract content is not in JSON format")

# TODO: LangChain implementation

In [0]:
# Add tools via Playground, then get & run notebook.
# serve the newly created model through provisioned throughput.

## Original Python Implementation

In [0]:
import requests

# Reference implementation: call Tavily's /search endpoint directly with requests.
url = "https://api.tavily.com/search"

payload = {
    "query": "who is Leo Messi?",
    "topic": "general",
    "search_depth": "basic",
    "chunks_per_source": 3,
    "max_results": 1,
    "time_range": None,
    "days": 7,
    "include_answer": True,
    "include_raw_content": False,
    "include_images": False,
    "include_image_descriptions": False,
    "include_domains": [],
    "exclude_domains": []
}

# Read the key from the secret scope instead of pasting a token into the notebook.
api_key = dbutils.secrets.get(scope="agentic", key="tavily_key")
headers = {
    "Authorization": f"Bearer {api_key}",
    "Content-Type": "application/json"
}

# The timeout keeps the cell from hanging indefinitely if the API is unreachable.
response = requests.post(url, json=payload, headers=headers, timeout=30)

print(response.text)

# Fail the cell on a non-2xx status; the body is printed first so the error
# detail stays visible.
response.raise_for_status()
